Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::auth::column_policy_gate::ColumnAccessRequest;
3use crate::auth::UserId;
4use crate::replication::cdc::ChangeRecord;
5use crate::storage::query::ast::TableSource;
6
7/// Read a numeric score column out of a result record as `f64`, matching
8/// the column name case-insensitively. Used by the leaderboard-rank head
9/// walk (#918) to compare scores; non-numeric / missing columns yield
10/// `None` so a row with no comparable score never shifts a rank.
11fn record_column_f64(
12    rec: &crate::storage::query::unified::UnifiedRecord,
13    column: &str,
14) -> Option<f64> {
15    let value = rec
16        .get(column)
17        .or_else(|| rec.get(&column.to_lowercase()))?;
18    match value {
19        Value::Integer(n) => Some(*n as f64),
20        Value::UnsignedInteger(n) => Some(*n as f64),
21        Value::Float(n) => Some(*n),
22        Value::Timestamp(n) | Value::Duration(n) => Some(*n as f64),
23        _ => None,
24    }
25}
26
27fn record_rid_u64(rec: &crate::storage::query::unified::UnifiedRecord) -> Option<u64> {
28    match rec.get("rid") {
29        Some(Value::UnsignedInteger(n)) => Some(*n),
30        Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
31        _ => None,
32    }
33}
34
35fn seed_storage_deploy_config(
36    store: &crate::storage::UnifiedStore,
37    selection: crate::storage::StorageProfileSelection,
38) {
39    store.set_config_tree(
40        "storage.deploy",
41        &crate::json!({
42            "profile": selection.deploy_profile.as_str(),
43            "packaging": selection.packaging.as_str(),
44            "preset": selection.preset_name(),
45            "replica_count": selection.replica_count,
46            "managed_backup": selection.managed_backup,
47            "wal_retention": selection.wal_retention,
48        }),
49    );
50}
51
/// One row of a ranking's head together with the rank assigned to it by
/// `compute_ranked_head_entries`.
struct RankedHeadEntry {
    /// 1-based rank; rows with equal scores share the same value.
    rank: u64,
    /// Copy of the result record the rank was computed for.
    record: crate::storage::query::unified::UnifiedRecord,
}
56
57fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
58    match value {
59        Value::Text(s) => Ok(s.to_string()),
60        Value::Integer(n) => Ok(n.to_string()),
61        Value::UnsignedInteger(n) => Ok(n.to_string()),
62        Value::Float(n) => Ok(n.to_string()),
63        Value::Boolean(b) => Ok(b.to_string()),
64        Value::Null => Err(RedDBError::Query(
65            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
66                .to_string(),
67        )),
68        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
69            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
70                .to_string(),
71        )),
72        _ => Err(RedDBError::Query(format!(
73            "SET SECRET does not support value type {:?} yet",
74            value.data_type()
75        ))),
76    }
77}
78
79fn insert_config_json_path(
80    root: &mut crate::serde_json::Value,
81    path: &str,
82    value: crate::serde_json::Value,
83) {
84    let segments: Vec<&str> = path
85        .split('.')
86        .filter(|segment| !segment.is_empty())
87        .collect();
88    insert_config_json_segments(root, &segments, value);
89}
90
91fn insert_config_json_segments(
92    root: &mut crate::serde_json::Value,
93    segments: &[&str],
94    value: crate::serde_json::Value,
95) {
96    if segments.is_empty() {
97        *root = value;
98        return;
99    }
100
101    if !matches!(root, crate::serde_json::Value::Object(_)) {
102        *root = crate::serde_json::Value::Object(crate::serde_json::Map::new());
103    }
104
105    let crate::serde_json::Value::Object(map) = root else {
106        return;
107    };
108    if segments.len() == 1 {
109        map.insert(segments[0].to_string(), value);
110        return;
111    }
112    let entry = map
113        .entry(segments[0].to_string())
114        .or_insert_with(|| crate::serde_json::Value::Object(crate::serde_json::Map::new()));
115    insert_config_json_segments(entry, &segments[1..], value);
116}
117
118fn show_config_json_result(
119    query: &str,
120    mode: crate::storage::query::modes::QueryMode,
121    prefix: &Option<String>,
122    value: crate::serde_json::Value,
123) -> RuntimeQueryResult {
124    let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
125    let mut record = UnifiedRecord::new();
126    record.set(
127        "key",
128        prefix
129            .as_ref()
130            .map(|key| Value::text(key.clone()))
131            .unwrap_or(Value::Null),
132    );
133    record.set("value", Value::Json(value.to_string_compact().into_bytes()));
134    result.push(record);
135    RuntimeQueryResult {
136        query: query.to_string(),
137        mode,
138        statement: "show_config_json",
139        engine: "runtime-config",
140        result,
141        affected_rows: 0,
142        statement_type: "select",
143        bookmark: None,
144    }
145}
146
/// Description of one control event derived from a query expression by
/// `query_control_event_specs`.
#[derive(Clone)]
struct QueryControlEventSpec {
    /// Event category, e.g. `SchemaDdl`, `TenantGovernance`, `RlsGovernance`.
    kind: crate::runtime::control_events::EventKind,
    /// Static action label such as `"create_table"` or `"enable_rls"`.
    action: &'static str,
    /// Affected resource written as `<type>:<name>` (e.g. `table:users`), if any.
    resource: Option<String>,
    /// Extra named fields attached to the event, each wrapped in a `Sensitivity`.
    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
}
154
/// Audit summary of a query as produced by `query_audit_plan`.
#[derive(Clone)]
struct QueryAuditPlan {
    /// One of `"select"`, `"insert"`, `"update"` or `"delete"`.
    statement_kind: &'static str,
    /// Distinct collection names the statement touches, in first-seen order,
    /// with internal `red*` collections filtered out.
    collections: Vec<String>,
}
160
161fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
162    let mut collections = Vec::new();
163    let statement_kind = match expr {
164        QueryExpr::Table(table) => {
165            push_query_audit_collection(&mut collections, &table.table);
166            "select"
167        }
168        QueryExpr::Join(join) => {
169            collect_query_audit_collections(&join.left, &mut collections);
170            collect_query_audit_collections(&join.right, &mut collections);
171            "select"
172        }
173        QueryExpr::Insert(insert) => {
174            push_query_audit_collection(&mut collections, &insert.table);
175            "insert"
176        }
177        QueryExpr::Update(update) => {
178            push_query_audit_collection(&mut collections, &update.table);
179            "update"
180        }
181        QueryExpr::Delete(delete) => {
182            push_query_audit_collection(&mut collections, &delete.table);
183            "delete"
184        }
185        _ => return None,
186    };
187    if collections.is_empty() {
188        None
189    } else {
190        Some(QueryAuditPlan {
191            statement_kind,
192            collections,
193        })
194    }
195}
196
197fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
198    match expr {
199        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
200        QueryExpr::Join(join) => {
201            collect_query_audit_collections(&join.left, collections);
202            collect_query_audit_collections(&join.right, collections);
203        }
204        _ => {}
205    }
206}
207
/// Append `name` to `collections` unless it is internal or already present.
///
/// Internal means exactly `red`, anything starting with `red.`, or anything
/// starting with `__red_schema_`.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_internal =
        name == "red" || name.starts_with("red.") || name.starts_with("__red_schema_");
    let already_listed = collections.iter().any(|existing| existing == name);
    if is_internal || already_listed {
        return;
    }
    collections.push(name.to_string());
}
216
// NOTE(review): presumably the name of the collection that backs the runtime
// index registry; nothing in the visible part of this file reads it — confirm
// against its users.
const RUNTIME_INDEX_REGISTRY_COLLECTION: &str = "red_index_registry";
218
219impl RedDBRuntime {
220    fn execute_create_metric(
221        &self,
222        raw_query: &str,
223        query: &crate::storage::query::ast::CreateMetricQuery,
224    ) -> RedDBResult<RuntimeQueryResult> {
225        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
226        let store = self.inner.db.store();
227        super::metric_descriptor_catalog::create(
228            store.as_ref(),
229            &query.path,
230            &query.kind,
231            &query.role,
232            super::metric_descriptor_catalog::DerivedSpec {
233                source: query.source.clone(),
234                query: query.query.clone(),
235                window_ms: query.window_ms,
236                time_field: query.time_field.clone(),
237            },
238        )?;
239        self.invalidate_result_cache();
240        Ok(RuntimeQueryResult::ok_message(
241            raw_query.to_string(),
242            &format!("metric descriptor '{}' created", query.path),
243            "create",
244        ))
245    }
246
247    /// `CREATE RANKING <name> ON <table> (<column> [ASC|DESC]) [TOP <k>]`
248    /// — declare a Ranking capability over an ordinary table's score
249    /// column (issue #918 / ADR 0035). Persists a WAL-backed catalog
250    /// record; no new Collection model is introduced. Authorized through
251    /// the same DDL write gate as `CREATE METRIC`/`CREATE INDEX`.
252    fn execute_create_ranking(
253        &self,
254        raw_query: &str,
255        req: super::ranking_descriptor_catalog::CreateRankingRequest,
256    ) -> RedDBResult<RuntimeQueryResult> {
257        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
258        let store = self.inner.db.store();
259        let descriptor = super::ranking_descriptor_catalog::create(store.as_ref(), &req)?;
260        self.invalidate_result_cache();
261        Ok(RuntimeQueryResult::ok_message(
262            raw_query.to_string(),
263            &format!(
264                "ranking '{}' created on {}({})",
265                descriptor.name, descriptor.table, descriptor.column
266            ),
267            "create",
268        ))
269    }
270
271    /// `SHOW RANKINGS` — project the declared Ranking capabilities back as
272    /// rows, so a declared capability is observable (the Analytics
273    /// "prefer SELECT over admin verbs" rule).
274    fn execute_show_rankings(&self, raw_query: &str) -> RedDBResult<RuntimeQueryResult> {
275        let store = self.inner.db.store();
276        let entries = super::ranking_descriptor_catalog::list(store.as_ref());
277        let columns = vec![
278            "name".to_string(),
279            "table".to_string(),
280            "column".to_string(),
281            "direction".to_string(),
282            "top_k".to_string(),
283        ];
284        let rows = entries
285            .into_iter()
286            .map(|e| {
287                vec![
288                    ("name".to_string(), Value::text(e.name)),
289                    ("table".to_string(), Value::text(e.table)),
290                    ("column".to_string(), Value::text(e.column)),
291                    (
292                        "direction".to_string(),
293                        Value::text(if e.descending { "DESC" } else { "ASC" }.to_string()),
294                    ),
295                    ("top_k".to_string(), Value::UnsignedInteger(e.top_k)),
296                ]
297            })
298            .collect();
299        let mut result =
300            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
301        result.statement = "rank_of";
302        result.engine = "runtime-rank";
303        Ok(result)
304    }
305
306    /// `RANK OF <id> IN <name>` — exact, MVCC-correct rank of a specific
307    /// row within the capability's bounded top-K head (issue #918).
308    ///
309    /// Returns a single `rank` row when the row is visible *and* falls
310    /// inside the exact head; an empty result otherwise (not visible, or
311    /// in the approximate tail — a separate slice). The computation runs
312    /// entirely over the regular read pipeline so it inherits MVCC
313    /// visibility, RLS/policy, and tenant scope from ordinary reads.
314    fn execute_rank_of(
315        &self,
316        raw_query: &str,
317        req: &crate::storage::query::ast::RankOfQuery,
318    ) -> RedDBResult<RuntimeQueryResult> {
319        let store = self.inner.db.store();
320        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
321            .ok_or_else(|| {
322                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
323            })?;
324        let rank = self.compute_exact_head_rank(&descriptor, req.entity_id)?;
325        let columns = vec!["rank".to_string()];
326        let rows = match rank {
327            Some(rank) => vec![vec![("rank".to_string(), Value::UnsignedInteger(rank))]],
328            None => Vec::new(),
329        };
330        let mut result =
331            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
332        result.statement = "rank_range";
333        result.engine = "runtime-rank";
334        Ok(result)
335    }
336
337    /// `RANK RANGE <lo> TO <hi> IN <name>` — exact, MVCC-correct entries
338    /// occupying a contiguous rank range within the bounded top-K head.
339    ///
340    /// The output is in leaderboard order and includes `rank` plus the
341    /// row columns returned by the canonical exact-head SQL read.
342    fn execute_rank_range(
343        &self,
344        raw_query: &str,
345        req: &crate::storage::query::ast::RankRangeQuery,
346    ) -> RedDBResult<RuntimeQueryResult> {
347        let store = self.inner.db.store();
348        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
349            .ok_or_else(|| {
350                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
351            })?;
352        let (head_columns, entries) = self.compute_ranked_head_entries(&descriptor)?;
353
354        let mut columns = Vec::with_capacity(head_columns.len() + 1);
355        columns.push("rank".to_string());
356        for column in &head_columns {
357            if column != "rank" {
358                columns.push(column.clone());
359            }
360        }
361
362        let rows = entries
363            .into_iter()
364            .filter(|entry| entry.rank >= req.lo && entry.rank <= req.hi)
365            .map(|entry| {
366                let mut row = Vec::with_capacity(columns.len());
367                row.push(("rank".to_string(), Value::UnsignedInteger(entry.rank)));
368                for column in &head_columns {
369                    if column == "rank" {
370                        continue;
371                    }
372                    if let Some(value) = entry.record.get(column) {
373                        row.push((column.clone(), value.clone()));
374                    }
375                }
376                row
377            })
378            .collect();
379        let mut result =
380            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
381        result.statement = "approx_rank_of";
382        result.engine = "runtime-rank";
383        Ok(result)
384    }
385
386    /// Compute the exact rank of `target_id` within the descriptor's
387    /// bounded top-K head, or `None` if the row is invisible to the
388    /// querying snapshot or beyond the exact head.
389    ///
390    /// Faithful to ADR 0035: it walks the sorted index head
391    /// (`ORDER BY <col> {DESC|ASC} LIMIT k`, served by
392    /// `try_sorted_index_lookup` + the per-row MVCC visibility re-check)
393    /// and counts only rows visible to the current snapshot. Running the
394    /// head scan through `execute_query_inner` keeps it on the same
395    /// snapshot/tenant/policy frame as ordinary reads, so the rank agrees
396    /// with `ORDER BY <col> {DESC|ASC} LIMIT` under that snapshot by
397    /// construction. RANK semantics: tied scores share a rank, so the
398    /// rank is `1 + (number of strictly-better visible rows)`.
399    fn compute_exact_head_rank(
400        &self,
401        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
402        target_id: u64,
403    ) -> RedDBResult<Option<u64>> {
404        let (_columns, entries) = self.compute_ranked_head_entries(descriptor)?;
405        Ok(entries
406            .into_iter()
407            .find(|entry| record_rid_u64(&entry.record) == Some(target_id))
408            .map(|entry| entry.rank))
409    }
410
411    /// Return the exact head rows in deterministic rank order.
412    fn compute_ranked_head_entries(
413        &self,
414        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
415    ) -> RedDBResult<(Vec<String>, Vec<RankedHeadEntry>)> {
416        let table = &descriptor.table;
417        let column = &descriptor.column;
418
419        // The exact head: top-K rows in rank order. Each row here already
420        // passed MVCC visibility *and* RLS/tenant filtering during the
421        // scan, so identifying the target *within* this result (rather
422        // than via a separate `red_entity_id` lookup, which takes the
423        // direct entity-fetch path that bypasses the RLS gate) is what
424        // makes the rank honor policy/tenant scope (criterion 5).
425        let dir = if descriptor.descending { "DESC" } else { "ASC" };
426        let head_sql = format!(
427            "SELECT * FROM {table} ORDER BY {column} {dir}, rid ASC LIMIT {}",
428            descriptor.top_k
429        );
430        let head_result = self.execute_query_inner(&head_sql)?;
431
432        let mut entries = Vec::with_capacity(head_result.result.records.len());
433        let mut row_position = 0u64;
434        let mut current_rank = 0u64;
435        let mut previous_score: Option<f64> = None;
436        for rec in &head_result.result.records {
437            let Some(score) = record_column_f64(rec, column) else {
438                continue;
439            };
440            row_position += 1;
441            current_rank = if previous_score == Some(score) {
442                current_rank
443            } else {
444                row_position
445            };
446            previous_score = Some(score);
447            entries.push(RankedHeadEntry {
448                rank: current_rank,
449                record: rec.clone(),
450            });
451        }
452        Ok((head_result.result.columns, entries))
453    }
454
455    /// `APPROX RANK OF <id> IN <name>` — the *approximate tail* read
456    /// (issue #923 / ADR 0035). Serves an explicitly-approximate
457    /// percentile / rank for an entry below the exact top-K head from a
458    /// per-`(table, column)` score sketch.
459    ///
460    /// The result is always labeled approximate (`approximate = true`,
461    /// distinct from the exact `RANK OF` surface which returns only a bare
462    /// `rank`) so a caller never reads a tail estimate as an exact head
463    /// position. An invisible / non-existent row yields no row, exactly
464    /// like the exact surface.
465    fn execute_approx_rank_of(
466        &self,
467        raw_query: &str,
468        req: &crate::storage::query::ast::RankOfQuery,
469    ) -> RedDBResult<RuntimeQueryResult> {
470        let store = self.inner.db.store();
471        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
472            .ok_or_else(|| {
473                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
474            })?;
475
476        let approx = self.compute_approx_rank(&descriptor, req.entity_id)?;
477        let columns = vec![
478            "rank".to_string(),
479            "percentile".to_string(),
480            "approximate".to_string(),
481        ];
482        let rows = match approx {
483            Some(approx) => vec![vec![
484                ("rank".to_string(), Value::UnsignedInteger(approx.rank)),
485                ("percentile".to_string(), Value::Float(approx.percentile)),
486                ("approximate".to_string(), Value::Boolean(true)),
487            ]],
488            None => Vec::new(),
489        };
490        Ok(RuntimeQueryResult::ok_records(
491            raw_query.to_string(),
492            columns,
493            rows,
494            "select",
495        ))
496    }
497
498    /// Refresh the per-`(table, column)` score sketch from the rows visible
499    /// to the current snapshot and return the target's approximate rank, or
500    /// `None` if the target row is invisible to this snapshot / tenant.
501    ///
502    /// The sketch is rebuilt from the live column on each read and persisted
503    /// back to `red_config` keyed by `(table, column)` — so it is maintained
504    /// per `(collection, score column)` and stays current as scores change
505    /// (criterion 4). The scan runs through `execute_query_inner`, inheriting
506    /// the same MVCC snapshot, RLS/tenant scope, and policy as ordinary
507    /// reads. The *approximation* is the histogram bucketing in
508    /// [`super::score_sketch::ScoreSketch`], not the data freshness, so the
509    /// estimate carries the documented error band even though it is built
510    /// from a full scan in this v0 (incremental maintenance is an ADR-0035
511    /// implementation detail, left open and reversible).
512    fn compute_approx_rank(
513        &self,
514        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
515        target_id: u64,
516    ) -> RedDBResult<Option<super::score_sketch::ApproxRank>> {
517        let table = &descriptor.table;
518        let column = &descriptor.column;
519
520        // Scan the visible rows once: it both feeds the sketch and locates
521        // the target's score under the same snapshot/tenant/policy frame.
522        let scan_sql = format!("SELECT * FROM {table}");
523        let scan = self.execute_query_inner(&scan_sql)?;
524        let records = &scan.result.records;
525
526        let mut scores: Vec<f64> = Vec::with_capacity(records.len());
527        let mut target_score: Option<f64> = None;
528        for rec in records {
529            let Some(score) = record_column_f64(rec, column) else {
530                continue;
531            };
532            scores.push(score);
533            let rid = match rec.get("rid") {
534                Some(Value::UnsignedInteger(n)) => Some(*n),
535                Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
536                _ => None,
537            };
538            if rid == Some(target_id) {
539                target_score = Some(score);
540            }
541        }
542
543        let sketch = super::score_sketch::ScoreSketch::from_scores(&scores);
544        // Persist the refreshed sketch per (table, column).
545        super::ranking_descriptor_catalog::save_sketch(
546            self.inner.db.store().as_ref(),
547            table,
548            column,
549            &sketch,
550        );
551
552        let Some(target_score) = target_score else {
553            // Not visible to this snapshot/tenant ⇒ no rank (matches exact).
554            return Ok(None);
555        };
556        Ok(sketch.approx_rank(target_score, descriptor.descending))
557    }
558
559    fn execute_alter_metric(
560        &self,
561        raw_query: &str,
562        query: &crate::storage::query::ast::AlterMetricQuery,
563    ) -> RedDBResult<RuntimeQueryResult> {
564        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
565        let store = self.inner.db.store();
566        super::metric_descriptor_catalog::update(
567            store.as_ref(),
568            &query.path,
569            query.set_role.as_deref(),
570            query.attempted_kind.as_deref(),
571            query.attempted_path.as_deref(),
572        )?;
573        self.invalidate_result_cache();
574        Ok(RuntimeQueryResult::ok_message(
575            raw_query.to_string(),
576            &format!("metric descriptor '{}' updated", query.path),
577            "alter",
578        ))
579    }
580
581    fn execute_create_slo(
582        &self,
583        raw_query: &str,
584        query: &crate::storage::query::ast::CreateSloQuery,
585    ) -> RedDBResult<RuntimeQueryResult> {
586        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
587        let store = self.inner.db.store();
588        super::slo_descriptor_catalog::create(
589            store.as_ref(),
590            &query.path,
591            &query.metric_path,
592            query.target,
593            query.window_ms,
594        )?;
595        self.invalidate_result_cache();
596        Ok(RuntimeQueryResult::ok_message(
597            raw_query.to_string(),
598            &format!("SLO descriptor '{}' created", query.path),
599            "create",
600        ))
601    }
602
603    fn execute_create_analytics_source(
604        &self,
605        raw_query: &str,
606        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
607    ) -> RedDBResult<RuntimeQueryResult> {
608        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
609        let store = self.inner.db.store();
610        let profile = super::analytics_source_catalog::create(
611            store.as_ref(),
612            &self.inner.db.collection_contracts(),
613            query,
614        )?;
615        self.invalidate_result_cache();
616        Ok(RuntimeQueryResult::ok_message(
617            raw_query.to_string(),
618            &format!("analytics source '{}' created", profile.name),
619            "create",
620        ))
621    }
622}
623
624fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
625    use crate::runtime::control_events::{EventKind, Sensitivity};
626
627    let mut specs = Vec::new();
628    let mut schema = |action: &'static str, resource: Option<String>| {
629        specs.push(QueryControlEventSpec {
630            kind: EventKind::SchemaDdl,
631            action,
632            resource,
633            fields: Vec::new(),
634        });
635    };
636    match expr {
637        QueryExpr::CreateTable(q) => {
638            schema("create_table", Some(format!("table:{}", q.name)));
639            if let Some(column) = &q.tenant_by {
640                specs.push(QueryControlEventSpec {
641                    kind: EventKind::TenantGovernance,
642                    action: "create_table_tenant_by",
643                    resource: Some(format!("table:{}", q.name)),
644                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
645                });
646            }
647        }
648        QueryExpr::CreateCollection(q) => {
649            schema("create_collection", Some(format!("collection:{}", q.name)));
650        }
651        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
652        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
653        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
654        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
655        QueryExpr::DropDocument(q) => {
656            schema("drop_document", Some(format!("document:{}", q.name)));
657        }
658        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
659        QueryExpr::DropCollection(q) => {
660            schema("drop_collection", Some(format!("collection:{}", q.name)));
661        }
662        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
663        QueryExpr::AlterTable(q) => {
664            schema("alter_table", Some(format!("table:{}", q.name)));
665            for op in &q.operations {
666                match op {
667                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
668                        specs.push(QueryControlEventSpec {
669                            kind: EventKind::RlsGovernance,
670                            action: "enable_rls",
671                            resource: Some(format!("table:{}", q.name)),
672                            fields: Vec::new(),
673                        });
674                    }
675                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
676                        specs.push(QueryControlEventSpec {
677                            kind: EventKind::RlsGovernance,
678                            action: "disable_rls",
679                            resource: Some(format!("table:{}", q.name)),
680                            fields: Vec::new(),
681                        });
682                    }
683                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
684                        specs.push(QueryControlEventSpec {
685                            kind: EventKind::TenantGovernance,
686                            action: "enable_tenancy",
687                            resource: Some(format!("table:{}", q.name)),
688                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
689                        });
690                    }
691                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
692                        specs.push(QueryControlEventSpec {
693                            kind: EventKind::TenantGovernance,
694                            action: "disable_tenancy",
695                            resource: Some(format!("table:{}", q.name)),
696                            fields: Vec::new(),
697                        });
698                    }
699                    _ => {}
700                }
701            }
702        }
703        QueryExpr::CreateIndex(q) => {
704            schema(
705                "create_index",
706                Some(format!("index:{}:{}", q.table, q.name)),
707            );
708        }
709        QueryExpr::DropIndex(q) => {
710            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
711        }
712        QueryExpr::CreateTimeSeries(q) => {
713            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
714        }
715        QueryExpr::CreateMetric(q) => {
716            schema("create_metric", Some(format!("metric:{}", q.path)));
717        }
718        QueryExpr::AlterMetric(q) => {
719            schema("alter_metric", Some(format!("metric:{}", q.path)));
720        }
721        QueryExpr::CreateSlo(q) => {
722            schema("create_slo", Some(format!("slo:{}", q.path)));
723        }
724        QueryExpr::DropTimeSeries(q) => {
725            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
726        }
727        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
728        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
729        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
730        QueryExpr::CreateTree(q) => {
731            schema(
732                "create_tree",
733                Some(format!("tree:{}:{}", q.collection, q.name)),
734            );
735        }
736        QueryExpr::DropTree(q) => {
737            schema(
738                "drop_tree",
739                Some(format!("tree:{}:{}", q.collection, q.name)),
740            );
741        }
742        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
743        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
744        QueryExpr::CreateSequence(q) => {
745            schema("create_sequence", Some(format!("sequence:{}", q.name)));
746        }
747        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
748        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
749        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
750        QueryExpr::RefreshMaterializedView(q) => {
751            schema(
752                "refresh_materialized_view",
753                Some(format!("view:{}", q.name)),
754            );
755        }
756        QueryExpr::CreatePolicy(q) => {
757            specs.push(QueryControlEventSpec {
758                kind: EventKind::RlsGovernance,
759                action: "create_policy",
760                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
761                fields: vec![(
762                    "target_kind".to_string(),
763                    Sensitivity::raw(q.target_kind.as_ident()),
764                )],
765            });
766        }
767        QueryExpr::DropPolicy(q) => {
768            specs.push(QueryControlEventSpec {
769                kind: EventKind::RlsGovernance,
770                action: "drop_policy",
771                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
772                fields: Vec::new(),
773            });
774        }
775        QueryExpr::SetTenant(value) => {
776            let mut fields = Vec::new();
777            if let Some(value) = value {
778                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
779            }
780            specs.push(QueryControlEventSpec {
781                kind: EventKind::TenantGovernance,
782                action: "set_tenant",
783                resource: Some("tenant:session".to_string()),
784                fields,
785            });
786        }
787        QueryExpr::SetConfig { key, .. } => {
788            specs.push(QueryControlEventSpec {
789                kind: EventKind::ConfigWrite,
790                action: "config:write",
791                resource: Some(format!("config:{key}")),
792                fields: vec![("key".to_string(), Sensitivity::raw(key))],
793            });
794        }
795        QueryExpr::ConfigCommand(cmd) => match cmd {
796            crate::storage::query::ast::ConfigCommand::Put {
797                collection, key, ..
798            }
799            | crate::storage::query::ast::ConfigCommand::Rotate {
800                collection, key, ..
801            } => {
802                let target = format!("{collection}/{key}");
803                specs.push(QueryControlEventSpec {
804                    kind: EventKind::ConfigWrite,
805                    action: "config:write",
806                    resource: Some(format!("config:{target}")),
807                    fields: vec![
808                        ("collection".to_string(), Sensitivity::raw(collection)),
809                        ("key".to_string(), Sensitivity::raw(key)),
810                    ],
811                });
812            }
813            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
814                let target = format!("{collection}/{key}");
815                specs.push(QueryControlEventSpec {
816                    kind: EventKind::ConfigDelete,
817                    action: "config:write",
818                    resource: Some(format!("config:{target}")),
819                    fields: vec![
820                        ("collection".to_string(), Sensitivity::raw(collection)),
821                        ("key".to_string(), Sensitivity::raw(key)),
822                    ],
823                });
824            }
825            _ => {}
826        },
827        QueryExpr::AlterUser(stmt) => {
828            let disables = stmt.attributes.iter().any(|attr| {
829                matches!(
830                    attr,
831                    crate::storage::query::ast::AlterUserAttribute::Disable
832                )
833            });
834            specs.push(QueryControlEventSpec {
835                kind: if disables {
836                    EventKind::UserDisable
837                } else {
838                    EventKind::UserUpdate
839                },
840                action: "alter_user",
841                resource: Some(format!("user:{}", stmt.username)),
842                fields: Vec::new(),
843            });
844        }
845        QueryExpr::CreateUser(stmt) => {
846            specs.push(QueryControlEventSpec {
847                kind: EventKind::UserCreate,
848                action: "create_user",
849                resource: Some(format!("user:{}", stmt.username)),
850                fields: Vec::new(),
851            });
852        }
853        _ => {}
854    }
855    specs
856}
857
858pub(crate) fn control_event_outcome_for_error(
859    err: &RedDBError,
860) -> crate::runtime::control_events::Outcome {
861    match err {
862        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
863        RedDBError::Query(msg)
864            if msg.contains("permission denied")
865                || msg.contains("cannot issue")
866                || msg.contains("lacks") =>
867        {
868            crate::runtime::control_events::Outcome::Denied
869        }
870        _ => crate::runtime::control_events::Outcome::Error,
871    }
872}
873
874/// Convert the rows produced by a materialized-view body into
875/// `UnifiedEntity` table rows targeting the backing collection.
876/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
877///
878/// Graph fragments and vector hits are ignored: a materialized view
879/// is a relational result set (SELECT-shaped); slices 11+ may extend
880/// this once we have a richer view body shape. Each row materialises
881/// the union of its schema-bound columns + overflow.
882fn view_records_to_entities(
883    table: &str,
884    records: &[crate::storage::query::unified::UnifiedRecord],
885) -> Vec<crate::storage::UnifiedEntity> {
886    use std::collections::HashMap;
887    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
888    let mut out = Vec::with_capacity(records.len());
889    for record in records {
890        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
891        for (name, value) in record.iter_fields() {
892            named.insert(name.to_string(), value.clone());
893        }
894        let entity = crate::storage::UnifiedEntity::new(
895            crate::storage::EntityId::new(0),
896            crate::storage::EntityKind::TableRow {
897                table: std::sync::Arc::clone(&table_arc),
898                row_id: 0,
899            },
900            crate::storage::EntityData::Row(crate::storage::RowData {
901                columns: Vec::new(),
902                named: Some(named),
903                schema: None,
904            }),
905        );
906        out.push(entity);
907    }
908    out
909}
910
911fn system_keyed_collection_contract(
912    name: &str,
913    model: crate::catalog::CollectionModel,
914) -> crate::physical::CollectionContract {
915    let now = crate::utils::now_unix_millis() as u128;
916    crate::physical::CollectionContract {
917        name: name.to_string(),
918        declared_model: model,
919        schema_mode: crate::catalog::SchemaMode::Dynamic,
920        origin: crate::physical::ContractOrigin::Implicit,
921        version: 1,
922        created_at_unix_ms: now,
923        updated_at_unix_ms: now,
924        default_ttl_ms: None,
925        vector_dimension: None,
926        vector_metric: None,
927        context_index_fields: Vec::new(),
928        declared_columns: Vec::new(),
929        table_def: None,
930        timestamps_enabled: false,
931        context_index_enabled: false,
932        metrics_raw_retention_ms: None,
933        metrics_rollup_policies: Vec::new(),
934        metrics_tenant_identity: None,
935        metrics_namespace: None,
936        append_only: false,
937        subscriptions: Vec::new(),
938        analytics_config: Vec::new(),
939        session_key: None,
940        session_gap_ms: None,
941        retention_duration_ms: None,
942        analytical_storage: None,
943    }
944}
945
946pub use super::execution_context::{
947    capture_current_snapshot, clear_current_auth_identity, clear_current_connection_id,
948    clear_current_snapshot, clear_current_tenant, current_auth_identity_for_audit,
949    current_connection_id, current_tenant, entity_visible_under_current_snapshot,
950    entity_visible_with_context, set_current_auth_identity, set_current_connection_id,
951    set_current_snapshot, set_current_tenant, snapshot_bundle, with_snapshot_bundle,
952    SnapshotBundle, SnapshotContext,
953};
954pub(crate) use super::execution_context::{
955    current_auth_identity, current_config_value, current_role_projected, current_scope_override,
956    current_secret_value, current_snapshot_requires_index_fallback, current_user_projected,
957    has_scope_override_active, parse_set_local_tenant, update_current_config_value,
958    update_current_secret_value, xids_visible_under_current_snapshot, ConfigSnapshotGuard,
959    CurrentSnapshotGuard, ScopeOverrideGuard, SecretStoreGuard, TxLocalTenantGuard,
960};
961
962fn table_row_index_fields(
963    entity: &crate::storage::unified::entity::UnifiedEntity,
964) -> Vec<(String, crate::storage::schema::Value)> {
965    let crate::storage::EntityData::Row(row) = &entity.data else {
966        return Vec::new();
967    };
968    if let Some(named) = &row.named {
969        return named
970            .iter()
971            .map(|(name, value)| (name.clone(), value.clone()))
972            .collect();
973    }
974    if let Some(schema) = &row.schema {
975        return schema
976            .iter()
977            .zip(row.columns.iter())
978            .map(|(name, value)| (name.clone(), value.clone()))
979            .collect();
980    }
981    Vec::new()
982}
983
984fn named_text(
985    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
986    key: &str,
987) -> Option<String> {
988    match named.get(key) {
989        Some(crate::storage::schema::Value::Text(value)) => Some(value.to_string()),
990        _ => None,
991    }
992}
993
994fn named_bool(
995    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
996    key: &str,
997) -> Option<bool> {
998    match named.get(key) {
999        Some(crate::storage::schema::Value::Boolean(value)) => Some(*value),
1000        _ => None,
1001    }
1002}
1003
1004fn index_method_kind_as_str(method: super::index_store::IndexMethodKind) -> &'static str {
1005    match method {
1006        super::index_store::IndexMethodKind::Hash => "hash",
1007        super::index_store::IndexMethodKind::Bitmap => "bitmap",
1008        super::index_store::IndexMethodKind::Spatial => "spatial",
1009        super::index_store::IndexMethodKind::BTree => "btree",
1010    }
1011}
1012
1013fn index_method_kind_from_str(raw: &str) -> Option<super::index_store::IndexMethodKind> {
1014    match raw {
1015        "hash" => Some(super::index_store::IndexMethodKind::Hash),
1016        "bitmap" => Some(super::index_store::IndexMethodKind::Bitmap),
1017        "spatial" | "rtree" => Some(super::index_store::IndexMethodKind::Spatial),
1018        "btree" => Some(super::index_store::IndexMethodKind::BTree),
1019        _ => None,
1020    }
1021}
1022
1023fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1024    runtime
1025        .inner
1026        .pool
1027        .lock()
1028        .unwrap_or_else(|poisoned| poisoned.into_inner())
1029}
1030
/// Whether `name` is one of the graph-analytics table-valued functions
/// recognized in FROM position. The comparison ignores ASCII case. Both the
/// graph-collection form and the inline `nodes => / edges =>` form
/// (issue #799) accept these names.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];

    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1043
1044/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
1045/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
1046/// consumes (issue #800). The `using` option selects the algorithm inside the
1047/// output family; unsupported algorithms and the options that do not apply to
1048/// the chosen algorithm are rejected so a view never silently ignores a
1049/// declared parameter.
1050fn analytics_view_algorithm(
1051    graph: &str,
1052    view: &crate::catalog::AnalyticsViewDescriptor,
1053) -> RedDBResult<(String, Vec<(String, f64)>)> {
1054    use crate::catalog::AnalyticsOutput;
1055
1056    let mut named_args: Vec<(String, f64)> = Vec::new();
1057    let algorithm = match view.output {
1058        AnalyticsOutput::Communities => {
1059            let algo = view.algorithm.as_deref().unwrap_or("louvain");
1060            if !algo.eq_ignore_ascii_case("louvain") {
1061                return Err(RedDBError::Query(format!(
1062                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
1063                )));
1064            }
1065            if let Some(resolution) = view.resolution {
1066                named_args.push(("resolution".to_string(), resolution));
1067            }
1068            "louvain".to_string()
1069        }
1070        AnalyticsOutput::Components => {
1071            if let Some(algo) = view.algorithm.as_deref() {
1072                if !algo.eq_ignore_ascii_case("components")
1073                    && !algo.eq_ignore_ascii_case("connected_components")
1074                {
1075                    return Err(RedDBError::Query(format!(
1076                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1077                    )));
1078                }
1079            }
1080            "components".to_string()
1081        }
1082        AnalyticsOutput::Centrality => {
1083            let algo = view
1084                .algorithm
1085                .as_deref()
1086                .unwrap_or("pagerank")
1087                .to_ascii_lowercase();
1088            match algo.as_str() {
1089                "pagerank" => {
1090                    if let Some(max_iterations) = view.max_iterations {
1091                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1092                    }
1093                }
1094                "eigenvector" => {
1095                    if let Some(max_iterations) = view.max_iterations {
1096                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1097                    }
1098                    if let Some(tolerance) = view.tolerance {
1099                        named_args.push(("tolerance".to_string(), tolerance));
1100                    }
1101                }
1102                "betweenness" => {}
1103                other => {
1104                    return Err(RedDBError::Query(format!(
1105                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1106                    )));
1107                }
1108            }
1109            algo
1110        }
1111    };
1112    Ok((algorithm, named_args))
1113}
1114
1115/// Reject any named arguments for a TVF that accepts none.
1116fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1117    if let Some((key, _)) = named_args.first() {
1118        return Err(RedDBError::Query(format!(
1119            "table function '{name}' has no named argument '{key}'"
1120        )));
1121    }
1122    Ok(())
1123}
1124
1125/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1126/// other named key, or a non-finite / non-positive resolution, is rejected.
1127fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1128    let mut resolution = 1.0_f64;
1129    for (key, value) in named_args {
1130        if key.eq_ignore_ascii_case("resolution") {
1131            if !value.is_finite() || *value <= 0.0 {
1132                return Err(RedDBError::Query(format!(
1133                    "table function 'louvain' resolution must be > 0, got {value}"
1134                )));
1135            }
1136            resolution = *value;
1137        } else {
1138            return Err(RedDBError::Query(format!(
1139                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1140            )));
1141        }
1142    }
1143    Ok(resolution)
1144}
1145
1146/// Undirected degree centrality over abstract inputs: each edge contributes
1147/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1148/// in ascending node-id order, so identical input always yields identical
1149/// rows.
1150fn abstract_degree_centrality(
1151    nodes: &[String],
1152    edges: &[(
1153        String,
1154        String,
1155        crate::storage::engine::graph_algorithms::Weight,
1156    )],
1157) -> Vec<(String, usize)> {
1158    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1159    for n in nodes {
1160        degree.entry(n.clone()).or_insert(0);
1161    }
1162    for (a, b, _w) in edges {
1163        *degree.entry(a.clone()).or_insert(0) += 1;
1164        *degree.entry(b.clone()).or_insert(0) += 1;
1165    }
1166    degree.into_iter().collect()
1167}
1168
1169/// Ordered column names for a materialized subquery result: the projection
1170/// columns when present, else the first record's field order.
1171fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1172    if !result.columns.is_empty() {
1173        return result.columns.clone();
1174    }
1175    result
1176        .records
1177        .first()
1178        .map(|record| {
1179            record
1180                .column_names()
1181                .iter()
1182                .map(|column| column.to_string())
1183                .collect()
1184        })
1185        .unwrap_or_default()
1186}
1187
1188/// Canonical node-id string for a cell value, so the node universe (from the
1189/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1190/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1191fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1192    use crate::storage::schema::Value;
1193    match value {
1194        Value::Null => None,
1195        Value::Text(s) => Some(s.to_string()),
1196        Value::Integer(n) => Some(n.to_string()),
1197        Value::UnsignedInteger(n) => Some(n.to_string()),
1198        Value::NodeRef(s) => Some(s.clone()),
1199        other => Some(other.to_string()),
1200    }
1201}
1202
1203/// Numeric edge weight from a cell value (the optional third `edges` column).
1204fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1205    use crate::storage::schema::Value;
1206    match value {
1207        Value::Float(f) => Some(*f as f32),
1208        Value::Integer(n) => Some(*n as f32),
1209        Value::UnsignedInteger(n) => Some(*n as f32),
1210        _ => None,
1211    }
1212}
1213
1214/// Build the node universe from a materialized `nodes` subquery result: the
1215/// first projected column of each row is the node id (issue #799). Zero rows
1216/// is a valid empty node set; a row set with no columns is a shape error.
1217fn inline_node_ids(
1218    name: &str,
1219    result: &crate::storage::query::unified::UnifiedResult,
1220) -> RedDBResult<Vec<String>> {
1221    if result.records.is_empty() {
1222        return Ok(Vec::new());
1223    }
1224    let columns = ordered_result_columns(result);
1225    let Some(first_col) = columns.first() else {
1226        return Err(RedDBError::Query(format!(
1227            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1228        )));
1229    };
1230    let mut ids = Vec::with_capacity(result.records.len());
1231    for record in &result.records {
1232        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1233            ids.push(id);
1234        }
1235    }
1236    Ok(ids)
1237}
1238
1239/// Build the edge list from a materialized `edges` subquery result: the first
1240/// two projected columns are `(source, target)` and an optional third column
1241/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1242/// shape error (issue #799).
1243fn inline_edges(
1244    name: &str,
1245    result: &crate::storage::query::unified::UnifiedResult,
1246) -> RedDBResult<
1247    Vec<(
1248        String,
1249        String,
1250        crate::storage::engine::graph_algorithms::Weight,
1251    )>,
1252> {
1253    if result.records.is_empty() {
1254        return Ok(Vec::new());
1255    }
1256    let columns = ordered_result_columns(result);
1257    if columns.len() < 2 {
1258        return Err(RedDBError::Query(format!(
1259            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1260            columns.len()
1261        )));
1262    }
1263    let src_col = &columns[0];
1264    let dst_col = &columns[1];
1265    let weight_col = columns.get(2);
1266    let mut edges = Vec::with_capacity(result.records.len());
1267    for record in &result.records {
1268        let (Some(src), Some(dst)) = (
1269            record.get(src_col).and_then(value_to_node_id),
1270            record.get(dst_col).and_then(value_to_node_id),
1271        ) else {
1272            // A null/absent endpoint is not a valid edge; skip it.
1273            continue;
1274        };
1275        let weight = match weight_col {
1276            Some(col) => match record.get(col) {
1277                None | Some(crate::storage::schema::Value::Null) => 1.0,
1278                Some(value) => value_to_weight(value).ok_or_else(|| {
1279                    RedDBError::Query(format!(
1280                        "table function '{name}' inline form: `edges` weight column must be numeric"
1281                    ))
1282                })?,
1283            },
1284            None => 1.0,
1285        };
1286        edges.push((src, dst, weight));
1287    }
1288    Ok(edges)
1289}
1290
1291fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1292    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1293        return;
1294    }
1295    scopes.insert(name.to_string());
1296}
1297
1298fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1299    match query.source.as_ref() {
1300        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1301            cache_scope_insert(scopes, name)
1302        }
1303        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1304            collect_query_expr_result_cache_scopes(scopes, subquery);
1305        }
1306        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1307        // read-only. The result is now cached (issue #802) and scoped to the
1308        // graph collection named in the first argument, so any mutation on
1309        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1310        // entry via `invalidate_result_cache_for_table`. Non-graph or
1311        // zero-arg functions contribute no scope.
1312        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1313            if is_graph_tvf_name(name) {
1314                if let Some(graph) = args.first() {
1315                    cache_scope_insert(scopes, graph);
1316                }
1317            }
1318        }
1319        // The inline-graph form reads ordinary tables/docs through its
1320        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1321        // those source collections — mutating any of them invalidates the
1322        // cached result (issue #799).
1323        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1324            nodes, edges, ..
1325        }) => {
1326            collect_query_expr_result_cache_scopes(scopes, nodes);
1327            collect_query_expr_result_cache_scopes(scopes, edges);
1328        }
1329        None => cache_scope_insert(scopes, &query.table),
1330    }
1331}
1332
1333fn collect_vector_source_scopes(
1334    scopes: &mut HashSet<String>,
1335    source: &crate::storage::query::ast::VectorSource,
1336) {
1337    match source {
1338        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1339            cache_scope_insert(scopes, collection);
1340        }
1341        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1342            collect_query_expr_result_cache_scopes(scopes, subquery);
1343        }
1344        crate::storage::query::ast::VectorSource::Literal(_)
1345        | crate::storage::query::ast::VectorSource::Text(_) => {}
1346    }
1347}
1348
1349fn collect_path_selector_scopes(
1350    scopes: &mut HashSet<String>,
1351    selector: &crate::storage::query::ast::NodeSelector,
1352) {
1353    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1354        cache_scope_insert(scopes, table);
1355    }
1356}
1357
1358fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1359    match expr {
1360        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1361        QueryExpr::Join(query) => {
1362            collect_query_expr_result_cache_scopes(scopes, &query.left);
1363            collect_query_expr_result_cache_scopes(scopes, &query.right);
1364        }
1365        QueryExpr::Path(query) => {
1366            collect_path_selector_scopes(scopes, &query.from);
1367            collect_path_selector_scopes(scopes, &query.to);
1368        }
1369        QueryExpr::Vector(query) => {
1370            cache_scope_insert(scopes, &query.collection);
1371            collect_vector_source_scopes(scopes, &query.query_vector);
1372        }
1373        QueryExpr::Hybrid(query) => {
1374            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1375            cache_scope_insert(scopes, &query.vector.collection);
1376            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1377        }
1378        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1379        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1380        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1381        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1382        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1383        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1384        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1385        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1386        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1387        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1388        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1389        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1390        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1391        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1392        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1393        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1394        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1395        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
1396        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
1397        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
1398        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1399        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1400        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1401        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1402        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1403        QueryExpr::QueueCommand(query) => match query {
1404            QueueCommand::Push { queue, .. }
1405            | QueueCommand::Pop { queue, .. }
1406            | QueueCommand::Peek { queue, .. }
1407            | QueueCommand::Len { queue }
1408            | QueueCommand::Purge { queue }
1409            | QueueCommand::GroupCreate { queue, .. }
1410            | QueueCommand::GroupRead { queue, .. }
1411            | QueueCommand::Pending { queue, .. }
1412            | QueueCommand::Claim { queue, .. }
1413            | QueueCommand::Ack { queue, .. }
1414            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1415            QueueCommand::Move {
1416                source,
1417                destination,
1418                ..
1419            } => {
1420                cache_scope_insert(scopes, source);
1421                cache_scope_insert(scopes, destination);
1422            }
1423        },
1424        QueryExpr::EventsBackfill(query) => {
1425            cache_scope_insert(scopes, &query.collection);
1426            cache_scope_insert(scopes, &query.target_queue);
1427        }
1428        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1429        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1430        QueryExpr::TreeCommand(query) => match query {
1431            TreeCommand::Insert { collection, .. }
1432            | TreeCommand::Move { collection, .. }
1433            | TreeCommand::Delete { collection, .. }
1434            | TreeCommand::Validate { collection, .. }
1435            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1436        },
1437        QueryExpr::SearchCommand(query) => match query {
1438            SearchCommand::Similar { collection, .. }
1439            | SearchCommand::Hybrid { collection, .. }
1440            | SearchCommand::SpatialRadius { collection, .. }
1441            | SearchCommand::SpatialBbox { collection, .. }
1442            | SearchCommand::SpatialNearest { collection, .. } => {
1443                cache_scope_insert(scopes, collection);
1444            }
1445            SearchCommand::Text { collection, .. }
1446            | SearchCommand::Multimodal { collection, .. }
1447            | SearchCommand::Index { collection, .. }
1448            | SearchCommand::Context { collection, .. } => {
1449                if let Some(collection) = collection.as_deref() {
1450                    cache_scope_insert(scopes, collection);
1451                }
1452            }
1453        },
1454        QueryExpr::Ask(query) => {
1455            if let Some(collection) = query.collection.as_deref() {
1456                cache_scope_insert(scopes, collection);
1457            }
1458        }
1459        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1460        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1461            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1462            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1463                if let Some(t) = target {
1464                    cache_scope_insert(scopes, t);
1465                }
1466            }
1467        },
1468        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1469        QueryExpr::CreateView(cmd) => {
1470            cache_scope_insert(scopes, &cmd.name);
1471            // Invalidating the view should also invalidate its dependencies.
1472            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1473        }
1474        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1475        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1476        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1477        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1478        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1479        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1480        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1481        QueryExpr::Graph(_)
1482        | QueryExpr::GraphCommand(_)
1483        | QueryExpr::ProbabilisticCommand(_)
1484        | QueryExpr::SetConfig { .. }
1485        | QueryExpr::ShowConfig { .. }
1486        | QueryExpr::SetSecret { .. }
1487        | QueryExpr::DeleteSecret { .. }
1488        | QueryExpr::ShowSecrets { .. }
1489        | QueryExpr::SetTenant(_)
1490        | QueryExpr::ShowTenant
1491        | QueryExpr::TransactionControl(_)
1492        | QueryExpr::CreateSchema(_)
1493        | QueryExpr::DropSchema(_)
1494        | QueryExpr::CreateSequence(_)
1495        | QueryExpr::DropSequence(_)
1496        | QueryExpr::Grant(_)
1497        | QueryExpr::Revoke(_)
1498        | QueryExpr::AlterUser(_)
1499        | QueryExpr::CreateUser(_)
1500        | QueryExpr::CreateIamPolicy { .. }
1501        | QueryExpr::DropIamPolicy { .. }
1502        | QueryExpr::AttachPolicy { .. }
1503        | QueryExpr::DetachPolicy { .. }
1504        | QueryExpr::ShowPolicies { .. }
1505        | QueryExpr::ShowEffectivePermissions { .. }
1506        | QueryExpr::RankOf(_)
1507        | QueryExpr::ApproxRankOf(_)
1508        | QueryExpr::RankRange(_)
1509        | QueryExpr::SimulatePolicy { .. }
1510        | QueryExpr::LintPolicy { .. }
1511        | QueryExpr::MigratePolicyMode { .. }
1512        | QueryExpr::CreateMigration(_)
1513        | QueryExpr::ApplyMigration(_)
1514        | QueryExpr::RollbackMigration(_)
1515        | QueryExpr::ExplainMigration(_)
1516        | QueryExpr::EventsBackfillStatus { .. } => {}
1517        QueryExpr::KvCommand(cmd) => {
1518            use crate::storage::query::ast::KvCommand;
1519            match cmd {
1520                KvCommand::Put { collection, .. }
1521                | KvCommand::InvalidateTags { collection, .. }
1522                | KvCommand::Get { collection, .. }
1523                | KvCommand::Unseal { collection, .. }
1524                | KvCommand::Rotate { collection, .. }
1525                | KvCommand::History { collection, .. }
1526                | KvCommand::List { collection, .. }
1527                | KvCommand::Purge { collection, .. }
1528                | KvCommand::Watch { collection, .. }
1529                | KvCommand::Delete { collection, .. }
1530                | KvCommand::Incr { collection, .. }
1531                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1532            }
1533        }
1534        QueryExpr::ConfigCommand(cmd) => {
1535            use crate::storage::query::ast::ConfigCommand;
1536            match cmd {
1537                ConfigCommand::Put { collection, .. }
1538                | ConfigCommand::Get { collection, .. }
1539                | ConfigCommand::Resolve { collection, .. }
1540                | ConfigCommand::Rotate { collection, .. }
1541                | ConfigCommand::Delete { collection, .. }
1542                | ConfigCommand::History { collection, .. }
1543                | ConfigCommand::List { collection, .. }
1544                | ConfigCommand::Watch { collection, .. }
1545                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1546                    cache_scope_insert(scopes, collection)
1547                }
1548            }
1549        }
1550    }
1551}
1552
1553/// Combine matching RLS policies for a table + action into a single
1554/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1555///
1556/// Returns `None` when RLS is disabled or no policy admits the caller's
1557/// role — callers use that to short-circuit the mutation (for DELETE /
1558/// UPDATE we simply skip the operation, which PG expresses as "no rows
1559/// match the policy + predicate combination").
1560pub(crate) fn rls_policy_filter(
1561    runtime: &RedDBRuntime,
1562    table: &str,
1563    action: crate::storage::query::ast::PolicyAction,
1564) -> Option<crate::storage::query::ast::Filter> {
1565    rls_policy_filter_for_kind(
1566        runtime,
1567        table,
1568        action,
1569        crate::storage::query::ast::PolicyTargetKind::Table,
1570    )
1571}
1572
1573/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1574/// Graph / vector / queue / timeseries scans pass the concrete kind;
1575/// policies targeting other kinds are ignored. Legacy Table-scoped
1576/// policies still apply cross-kind — callers register auto-tenancy
1577/// policies as Table today.
1578pub(crate) fn rls_policy_filter_for_kind(
1579    runtime: &RedDBRuntime,
1580    table: &str,
1581    action: crate::storage::query::ast::PolicyAction,
1582    kind: crate::storage::query::ast::PolicyTargetKind,
1583) -> Option<crate::storage::query::ast::Filter> {
1584    use crate::storage::query::ast::Filter;
1585
1586    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1587        return None;
1588    }
1589    let role = current_auth_identity().map(|(_, role)| role);
1590    let role_str = role.map(|r| r.as_str().to_string());
1591    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1592    if policies.is_empty() {
1593        return None;
1594    }
1595    policies
1596        .into_iter()
1597        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1598}
1599
1600/// Returns true when the table has RLS enforcement enabled. Convenience
1601/// shortcut so DML paths can gate the AND-combine work without reaching
1602/// into `runtime.inner.rls_enabled_tables` directly.
1603pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1604    runtime.inner.rls_enabled_tables.read().contains(table)
1605}
1606
1607/// Per-entity gate used by the graph materialiser for `GraphNode`
1608/// entities. RLS is checked against the source collection with
1609/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1610/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1611/// (for back-compat with auto-tenancy declarations). Cached per
1612/// collection so big graphs only resolve the policy chain once.
1613fn node_passes_rls(
1614    runtime: &RedDBRuntime,
1615    collection: &str,
1616    role: Option<&str>,
1617    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1618    entity: &crate::storage::unified::entity::UnifiedEntity,
1619) -> bool {
1620    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1621
1622    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1623        return true;
1624    }
1625    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1626        let policies = runtime.matching_rls_policies_for_kind(
1627            collection,
1628            role,
1629            PolicyAction::Select,
1630            PolicyTargetKind::Nodes,
1631        );
1632        if policies.is_empty() {
1633            None
1634        } else {
1635            policies
1636                .into_iter()
1637                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1638        }
1639    });
1640    let Some(filter) = filter else {
1641        return false;
1642    };
1643    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1644        Some(&runtime.inner.db),
1645        entity,
1646        filter,
1647        collection,
1648        collection,
1649    )
1650}
1651
1652/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1653/// `kind = Edges`.
1654fn edge_passes_rls(
1655    runtime: &RedDBRuntime,
1656    collection: &str,
1657    role: Option<&str>,
1658    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1659    entity: &crate::storage::unified::entity::UnifiedEntity,
1660) -> bool {
1661    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1662
1663    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1664        return true;
1665    }
1666    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1667        let policies = runtime.matching_rls_policies_for_kind(
1668            collection,
1669            role,
1670            PolicyAction::Select,
1671            PolicyTargetKind::Edges,
1672        );
1673        if policies.is_empty() {
1674            None
1675        } else {
1676            policies
1677                .into_iter()
1678                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1679        }
1680    });
1681    let Some(filter) = filter else {
1682        return false;
1683    };
1684    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1685        Some(&runtime.inner.db),
1686        entity,
1687        filter,
1688        collection,
1689        collection,
1690    )
1691}
1692
1693/// RLS policy injection (Phase 2.5.2 PG parity).
1694///
1695/// Fetch every matching policy for the current thread-local role and
1696/// fold them into the query's filter. Semantics mirror PostgreSQL:
1697///
1698/// * Multiple policies on the same table combine with **OR** — a row is
1699///   visible if *any* policy admits it.
1700/// * The combined policy predicate is **AND**-ed into the caller's
1701///   existing `WHERE` clause so explicit predicates continue to trim
1702///   the policy-allowed set.
1703/// * No matching policies + RLS enabled = zero rows (PG's
1704///   restrictive-default). Callers get `None` and return an empty
1705///   `UnifiedResult` without ever dispatching the scan.
1706///
1707/// This runs only when `RuntimeInner::rls_enabled_tables` already
1708/// contains the table name — callers gate the hot path upfront to
1709/// avoid the lock acquisition on tables without RLS.
1710///
1711/// Returns `None` when no policy admits the current role; returns
1712/// `Some(mutated_table)` with policy filters folded in otherwise.
1713fn inject_rls_filters(
1714    runtime: &RedDBRuntime,
1715    frame: &dyn super::statement_frame::ReadFrame,
1716    mut table: crate::storage::query::ast::TableQuery,
1717) -> Option<crate::storage::query::ast::TableQuery> {
1718    use crate::storage::query::ast::{Filter, PolicyAction};
1719
1720    // `None` role falls through to policies with no `TO role` clause.
1721    let role = frame.identity().map(|(_, role)| role);
1722    let role_str = role.map(|r| r.as_str().to_string());
1723    let policies =
1724        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1725
1726    if policies.is_empty() {
1727        // RLS enabled + no policy match = deny everything. Signal the
1728        // caller to short-circuit with an empty result set.
1729        return None;
1730    }
1731
1732    // Combine policy predicates with OR (PG's permissive default).
1733    let combined = policies
1734        .into_iter()
1735        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1736        .expect("policies non-empty");
1737
1738    // AND into the caller's existing predicate. The predicate may live
1739    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1740    // nulls `filter` whenever `where_expr` is present (the case for a
1741    // view body rewritten into `SELECT … WHERE …`). Folding only into
1742    // `filter` here would silently drop that `where_expr` predicate at
1743    // eval time because `effective_table_filter` prefers `filter` —
1744    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1745    // tenant policy but lose the view's own WHERE (#635).
1746    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1747    let had_where_expr = table.where_expr.is_some();
1748    let existing = table
1749        .filter
1750        .take()
1751        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1752    let new_filter = match existing {
1753        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1754        None => combined,
1755    };
1756    // Keep `where_expr` in lock-step with the merged `filter` so
1757    // whichever the executor consults sees the full predicate.
1758    if had_where_expr {
1759        table.where_expr = Some(filter_to_expr(&new_filter));
1760    }
1761    table.filter = Some(new_filter);
1762    Some(table)
1763}
1764
1765/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1766/// predicate into the join's outer filter. Walking the merged record
1767/// at the join layer (rather than mutating the per-side scan filter)
1768/// keeps the planner's strategy choice and per-side index selection
1769/// undisturbed — the policy predicate uses the qualified `t.col` form
1770/// that resolves cleanly against the merged record's keys.
1771///
1772/// Returns `None` when any leaf has RLS enabled and no policy admits
1773/// the caller — the join short-circuits to an empty result.
1774fn inject_rls_into_join(
1775    runtime: &RedDBRuntime,
1776    frame: &dyn super::statement_frame::ReadFrame,
1777    mut join: crate::storage::query::ast::JoinQuery,
1778) -> Option<crate::storage::query::ast::JoinQuery> {
1779    use crate::storage::query::ast::Filter;
1780
1781    let mut policy_filters: Vec<Filter> = Vec::new();
1782    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1783        return None;
1784    }
1785    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1786        return None;
1787    }
1788
1789    if policy_filters.is_empty() {
1790        return Some(join);
1791    }
1792
1793    let combined = policy_filters
1794        .into_iter()
1795        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1796        .expect("policy_filters non-empty");
1797
1798    join.filter = Some(match join.filter.take() {
1799        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1800        None => combined,
1801    });
1802
1803    Some(join)
1804}
1805
1806/// For each `Table` leaf reachable through nested joins, append the
1807/// RLS-policy filter (combined with OR across that side's matching
1808/// policies) into `out`. Returns `false` when a side has RLS enabled
1809/// but no policy admits the caller — the join must short-circuit.
1810fn collect_join_side_policy(
1811    runtime: &RedDBRuntime,
1812    frame: &dyn super::statement_frame::ReadFrame,
1813    expr: &crate::storage::query::ast::QueryExpr,
1814    out: &mut Vec<crate::storage::query::ast::Filter>,
1815) -> bool {
1816    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1817    match expr {
1818        QueryExpr::Table(t) => {
1819            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1820                return true;
1821            }
1822            let role = frame.identity().map(|(_, role)| role);
1823            let role_str = role.map(|r| r.as_str().to_string());
1824            let policies =
1825                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1826            if policies.is_empty() {
1827                return false;
1828            }
1829            let combined = policies
1830                .into_iter()
1831                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1832                .expect("policies non-empty");
1833            out.push(combined);
1834            true
1835        }
1836        QueryExpr::Join(inner) => {
1837            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1838                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1839        }
1840        _ => true,
1841    }
1842}
1843
1844/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1845///
1846/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1847/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1848/// materialises all rows. Projections are best-effort — when the query
1849/// lists explicit columns we keep only those; a `SELECT *` keeps every
1850/// wrapper-emitted field verbatim.
1851///
1852/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1853/// the runtime will pass the compiled filter down instead of post-filtering.
1854fn apply_foreign_table_filters(
1855    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1856    query: &crate::storage::query::ast::TableQuery,
1857) -> crate::storage::query::unified::UnifiedResult {
1858    use crate::storage::query::sql_lowering::{
1859        effective_table_filter, effective_table_projections,
1860    };
1861    use crate::storage::query::unified::UnifiedResult;
1862
1863    let filter = effective_table_filter(query);
1864    let projections = effective_table_projections(query);
1865
1866    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1867    // match native-collection queries (same operators, same NULL handling).
1868    let mut filtered: Vec<_> = records
1869        .into_iter()
1870        .filter(|record| match &filter {
1871            Some(f) => {
1872                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1873            }
1874            None => true,
1875        })
1876        .collect();
1877
1878    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1879    if let Some(offset) = query.offset {
1880        let offset = offset as usize;
1881        if offset >= filtered.len() {
1882            filtered.clear();
1883        } else {
1884            filtered.drain(0..offset);
1885        }
1886    }
1887    if let Some(limit) = query.limit {
1888        filtered.truncate(limit as usize);
1889    }
1890
1891    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1892    // the wrapper's column set; an explicit list trims to those names.
1893    let columns: Vec<String> = if projections.is_empty() {
1894        filtered
1895            .first()
1896            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1897            .unwrap_or_default()
1898    } else {
1899        projections
1900            .iter()
1901            .map(super::join_filter::projection_name)
1902            .collect()
1903    };
1904
1905    let mut result = UnifiedResult::empty();
1906    result.columns = columns;
1907    result.records = filtered;
1908    result
1909}
1910
1911/// Collect every concrete table reference inside a `QueryExpr`.
1912///
1913/// Used by view bookkeeping (dependency tracking for materialised
1914/// invalidation) and any other rewriter that needs to know the base
1915/// tables a query pulls from. Does not descend into projections/filters;
1916/// only the `FROM` side.
1917pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1918    let mut scopes: HashSet<String> = HashSet::new();
1919    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1920    scopes.into_iter().collect()
1921}
1922
1923fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1924    let mut scopes = HashSet::new();
1925    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1926    scopes
1927}
1928
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive,
/// leading whitespace ignored), return the statement that follows it with
/// its leading whitespace removed; otherwise return `None`.
///
/// `None` is also returned when:
///
/// * nothing follows `EXPLAIN`, or
/// * the next token is `ALTER` — `EXPLAIN ALTER FOR CREATE TABLE ...` is a
///   separate schema-diff command handled inside the normal SQL parser, or
/// * the next token is `ASK` — `EXPLAIN ASK` is an executable read path
///   (it runs retrieval and provider selection, then short-circuits before
///   the LLM call), so it is also left to the normal parser.
///
/// Tokens are delimited by whitespace only.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    let statement = sql.trim_start();
    let keyword_len = statement
        .find(char::is_whitespace)
        .unwrap_or(statement.len());
    let (keyword, tail) = statement.split_at(keyword_len);
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    // `inner` has no leading whitespace, so the first split piece is empty
    // only when `inner` itself is empty (a bare `EXPLAIN`).
    let next_token = inner
        .split(char::is_whitespace)
        .next()
        .filter(|token| !token.is_empty())?;

    let deferred_to_parser = ["ALTER", "ASK"]
        .iter()
        .any(|keyword| next_token.eq_ignore_ascii_case(keyword));
    if deferred_to_parser {
        None
    } else {
        Some(inner)
    }
}
1967
1968/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1969/// CTE-aware parse in `execute_query` without paying for a full
1970/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1971/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1972pub(super) fn has_with_prefix(sql: &str) -> bool {
1973    let trimmed = sql.trim_start();
1974    let head_end = trimmed
1975        .find(|c: char| c.is_whitespace() || c == '(')
1976        .unwrap_or(trimmed.len());
1977    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1978}
1979
1980/// If the query is a plain SELECT whose top-level `TableQuery`
1981/// carries an `AS OF` clause, return a typed spec that the runtime
1982/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1983/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1984/// back to the connection's regular MVCC snapshot. A cheap textual
1985/// prefilter skips the parse entirely when the source doesn't
1986/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1987fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1988    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1989}
1990
1991/// Same as `peek_top_level_as_of` but also returns the table name
1992/// targeted by the AS OF clause (when the FROM clause names a
1993/// concrete table). `None` for the table slot means scalar SELECT
1994/// or a subquery source — callers treat those as "no enforcement".
1995pub(super) fn peek_top_level_as_of_with_table(
1996    sql: &str,
1997) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1998    if !sql
1999        .as_bytes()
2000        .windows(5)
2001        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2002    {
2003        return None;
2004    }
2005    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2006    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2007        return None;
2008    };
2009    let clause = table.as_of?;
2010    let table_name = if table.table.is_empty() || table.table == "any" {
2011        None
2012    } else {
2013        Some(table.table.clone())
2014    };
2015    let spec = match clause {
2016        crate::storage::query::ast::AsOfClause::Commit(h) => {
2017            crate::application::vcs::AsOfSpec::Commit(h)
2018        }
2019        crate::storage::query::ast::AsOfClause::Branch(b) => {
2020            crate::application::vcs::AsOfSpec::Branch(b)
2021        }
2022        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2023        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2024            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2025        }
2026        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2027            crate::application::vcs::AsOfSpec::Snapshot(x)
2028        }
2029    };
2030    Some((spec, table_name))
2031}
2032
2033pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2034    // Lowercase the bytes up to the first null/newline into a small
2035    // stack buffer for cheap contains() checks. Most SQL fits in the
2036    // buffer; longer queries fall back to owned lowercase.
2037    const VOLATILE_TOKENS: &[&str] = &[
2038        "pg_advisory_lock",
2039        "pg_try_advisory_lock",
2040        "pg_advisory_unlock",
2041        "random()",
2042        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2043        // omitted for now — they ARE volatile but today's tests rely
2044        // on caching them. Revisit once a tighter volatility story
2045        // lands.
2046    ];
2047    let lowered = sql.to_ascii_lowercase();
2048    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2049}
2050
2051pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2052    let trimmed = sql.trim_start();
2053    let head_end = trimmed
2054        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2055        .unwrap_or(trimmed.len());
2056    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2057}
2058
2059/// Pick the `(global_mode, collection_mode)` pair for an expression,
2060/// or `None` for variants that opt out of intent-locking entirely
2061/// (admin statements like `SHOW CONFIG`, transaction control, tenant
2062/// toggles).
2063///
2064/// Phase-1 contract:
2065/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
2066/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
2067/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
2068pub(super) fn intent_lock_modes_for(
2069    expr: &QueryExpr,
2070) -> Option<(
2071    crate::storage::transaction::lock::LockMode,
2072    crate::storage::transaction::lock::LockMode,
2073)> {
2074    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
2075
2076    match expr {
2077        // Reads — IS / IS.
2078        QueryExpr::Table(_)
2079        | QueryExpr::Join(_)
2080        | QueryExpr::Vector(_)
2081        | QueryExpr::Hybrid(_)
2082        | QueryExpr::Graph(_)
2083        | QueryExpr::Path(_)
2084        | QueryExpr::Ask(_)
2085        | QueryExpr::SearchCommand(_)
2086        | QueryExpr::GraphCommand(_)
2087        | QueryExpr::RankOf(_)
2088        | QueryExpr::ApproxRankOf(_)
2089        | QueryExpr::RankRange(_)
2090        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2091
2092        // Writes — IX / IX. Non-tabular mutations (vector insert,
2093        // graph node insert, queue push, timeseries point insert)
2094        // don't carry their own dispatch arm here; they ride through
2095        // the Insert variant or a command variant covered by the
2096        // read-side arm above. P1.T4 expands only the TableQuery-ish
2097        // writes; non-tabular kinds inherit when their DML variants
2098        // land in later phases.
2099        QueryExpr::Insert(_)
2100        | QueryExpr::Update(_)
2101        | QueryExpr::Delete(_)
2102        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2103            Some((IntentExclusive, IntentExclusive))
2104        }
2105        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2106
2107        // DDL — IX / X. A DDL against collection `c` blocks all
2108        // other writers + readers on `c` but leaves other collections
2109        // running (because Global stays IX, not X).
2110        QueryExpr::CreateTable(_)
2111        | QueryExpr::CreateCollection(_)
2112        | QueryExpr::CreateVector(_)
2113        | QueryExpr::DropTable(_)
2114        | QueryExpr::DropGraph(_)
2115        | QueryExpr::DropVector(_)
2116        | QueryExpr::DropDocument(_)
2117        | QueryExpr::DropKv(_)
2118        | QueryExpr::DropCollection(_)
2119        | QueryExpr::Truncate(_)
2120        | QueryExpr::AlterTable(_)
2121        | QueryExpr::CreateIndex(_)
2122        | QueryExpr::DropIndex(_)
2123        | QueryExpr::CreateTimeSeries(_)
2124        | QueryExpr::CreateMetric(_)
2125        | QueryExpr::AlterMetric(_)
2126        | QueryExpr::CreateSlo(_)
2127        | QueryExpr::DropTimeSeries(_)
2128        | QueryExpr::CreateQueue(_)
2129        | QueryExpr::AlterQueue(_)
2130        | QueryExpr::DropQueue(_)
2131        | QueryExpr::CreateTree(_)
2132        | QueryExpr::DropTree(_)
2133        | QueryExpr::CreatePolicy(_)
2134        | QueryExpr::DropPolicy(_)
2135        | QueryExpr::CreateView(_)
2136        | QueryExpr::DropView(_)
2137        | QueryExpr::RefreshMaterializedView(_)
2138        | QueryExpr::CreateSchema(_)
2139        | QueryExpr::DropSchema(_)
2140        | QueryExpr::CreateSequence(_)
2141        | QueryExpr::DropSequence(_)
2142        | QueryExpr::CreateServer(_)
2143        | QueryExpr::DropServer(_)
2144        | QueryExpr::CreateForeignTable(_)
2145        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2146
2147        // Admin / control — skip intent locks. `SET TENANT`,
2148        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2149        // `VACUUM`, etc. don't touch collection data the same way
2150        // and the existing transaction layer already serialises the
2151        // pieces that matter.
2152        _ => None,
2153    }
2154}
2155
2156/// Best-effort collection inventory for an expression. Used to pick
2157/// `Collection(...)` resources for the intent-lock guard. Overshoots
2158/// are fine (take an extra IS, benign); undershoots leak writes past
2159/// DDL X locks, so err on the side of listing more names.
2160pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2161    let mut out = Vec::new();
2162    walk_collections(expr, &mut out);
2163    out.sort();
2164    out.dedup();
2165    out
2166}
2167
2168fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2169    match expr {
2170        QueryExpr::Table(t) => out.push(t.table.clone()),
2171        QueryExpr::Join(j) => {
2172            walk_collections(&j.left, out);
2173            walk_collections(&j.right, out);
2174        }
2175        QueryExpr::Insert(i) => out.push(i.table.clone()),
2176        QueryExpr::Update(u) => out.push(u.table.clone()),
2177        QueryExpr::Delete(d) => out.push(d.table.clone()),
2178        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2179
2180        // DDL — include the target collection so DDL takes
2181        // `(Collection, X)` and blocks concurrent readers / writers
2182        // on the same collection. Other collections stay live
2183        // because Global is still IX.
2184        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2185        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2186        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2187        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2188        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2189        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2190        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2191        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2192        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2193        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2194        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2195        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2196        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2197        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2198        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
2199        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
2200        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
2201        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2202        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2203        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2204        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2205        QueryExpr::QueueCommand(QueueCommand::Move {
2206            source,
2207            destination,
2208            ..
2209        }) => {
2210            out.push(source.clone());
2211            out.push(destination.clone());
2212        }
2213        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2214        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2215        QueryExpr::DropView(q) => out.push(q.name.clone()),
2216        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2217
2218        // Vector / Hybrid / Graph / Path / commands reference
2219        // collections through fields whose shape varies; without a
2220        // uniform accessor we fall back to the global lock only —
2221        // benign because every runtime path still holds the global
2222        // mode.
2223        _ => {}
2224    }
2225}
2226
2227impl RedDBRuntime {
2228    pub fn in_memory() -> RedDBResult<Self> {
2229        Self::with_options(RedDBOptions::in_memory())
2230    }
2231
2232    pub fn flush(&self) -> RedDBResult<()> {
2233        self.inner
2234            .db
2235            .flush()
2236            .map_err(|err| RedDBError::Internal(err.to_string()))
2237    }
2238
2239    /// Handle to the intent-lock manager for tests + introspection.
2240    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2241    /// rather than touching the manager directly.
2242    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2243        self.inner.lock_manager.clone()
2244    }
2245
2246    /// Process-local governance registry for managed policy/config guardrails.
2247    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2248        self.inner.config_registry.clone()
2249    }
2250
2251    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2252        self.inner.query_audit.clone()
2253    }
2254
2255    pub fn control_events_require_persistence(&self) -> bool {
2256        self.inner.control_event_config.require_persistence()
2257    }
2258
2259    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
2260        self.inner.control_event_config
2261    }
2262
2263    pub fn control_event_ledger(
2264        &self,
2265    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2266        self.inner.control_event_ledger.read().clone()
2267    }
2268
2269    #[doc(hidden)]
2270    pub fn replace_control_event_ledger_for_tests(
2271        &self,
2272        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2273    ) {
2274        *self.inner.control_event_ledger.write() = ledger;
2275    }
2276
2277    #[inline(never)]
2278    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2279        Self::with_pool(options, ConnectionPoolConfig::default())
2280    }
2281
2282    pub fn with_pool(
2283        options: RedDBOptions,
2284        pool_config: ConnectionPoolConfig,
2285    ) -> RedDBResult<Self> {
2286        // PLAN.md Phase 9.1 — capture wall-clock before storage
2287        // open so the cold-start phase markers can be backfilled
2288        // once Lifecycle is constructed below. Storage open
2289        // encapsulates auto-restore + WAL replay; we treat the
2290        // whole window as one combined "restore" + "wal_replay"
2291        // phase split at the same boundary because the storage
2292        // layer doesn't yet emit a finer signal.
2293        let boot_open_start_ms = std::time::SystemTime::now()
2294            .duration_since(std::time::UNIX_EPOCH)
2295            .map(|d| d.as_millis() as u64)
2296            .unwrap_or(0);
2297        let embedded_single_file = options.storage_profile.deploy_profile
2298            == crate::storage::DeployProfile::Embedded
2299            && options.storage_profile.packaging == crate::storage::StoragePackaging::SingleFile;
2300        let db = Arc::new(
2301            RedDB::open_with_options(&options)
2302                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2303        );
2304        let result_blob_cache_config = if embedded_single_file {
2305            crate::storage::cache::BlobCacheConfig::default()
2306        } else {
2307            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2308                reddb_file::layout::result_cache_l2_path(
2309                    &options.resolved_path(reddb_file::default_database_path()),
2310                ),
2311            )
2312        };
2313        let result_blob_cache =
2314            crate::storage::cache::BlobCache::open_with_l2(result_blob_cache_config).map_err(
2315                |err| RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}")),
2316            )?;
2317        let storage_ready_ms = std::time::SystemTime::now()
2318            .duration_since(std::time::UNIX_EPOCH)
2319            .map(|d| d.as_millis() as u64)
2320            .unwrap_or(0);
2321
2322        let runtime = Self {
2323            inner: Arc::new(RuntimeInner {
2324                db: db.clone(),
2325                layout: PhysicalLayout::from_options(&options),
2326                embedded_single_file,
2327                indices: IndexCatalog::register_default_vector_graph(
2328                    options.has_capability(crate::api::Capability::Table),
2329                    options.has_capability(crate::api::Capability::Graph),
2330                ),
2331                pool_config,
2332                pool: Mutex::new(PoolState::default()),
2333                started_at_unix_ms: SystemTime::now()
2334                    .duration_since(UNIX_EPOCH)
2335                    .unwrap_or_default()
2336                    .as_millis(),
2337                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2338                index_store: super::index_store::IndexStore::new(),
2339                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2340                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2341                query_cache: parking_lot::RwLock::new(
2342                    crate::storage::query::planner::cache::PlanCache::new(1000),
2343                ),
2344                result_cache: parking_lot::RwLock::new((
2345                    HashMap::new(),
2346                    std::collections::VecDeque::new(),
2347                )),
2348                result_blob_cache,
2349                result_blob_entries: parking_lot::RwLock::new((
2350                    HashMap::new(),
2351                    std::collections::VecDeque::new(),
2352                )),
2353                ask_answer_cache_entries: parking_lot::RwLock::new((
2354                    HashSet::new(),
2355                    std::collections::VecDeque::new(),
2356                )),
2357                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2358                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
2359                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
2360                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
2361                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2362                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2363                rmw_locks: RmwLockTable::new(),
2364                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2365                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2366                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2367                ec_worker: crate::ec::worker::EcWorker::new(),
2368                auth_store: parking_lot::RwLock::new(None),
2369                oauth_validator: parking_lot::RwLock::new(None),
2370                browser_token_authority: parking_lot::RwLock::new(None),
2371                views: parking_lot::RwLock::new(HashMap::new()),
2372                materialized_views: parking_lot::RwLock::new(
2373                    crate::storage::cache::result::MaterializedViewCache::new(),
2374                ),
2375                retention_sweeper: parking_lot::RwLock::new(
2376                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2377                ),
2378                snapshot_manager: Arc::new(
2379                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2380                ),
2381                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2382                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2383                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2384                lock_manager: Arc::new({
2385                    // Sourced from the matrix: Tier B key
2386                    // `concurrency.locking.deadlock_timeout_ms`
2387                    // (default 5000). Env var wins at boot so
2388                    // operators can tune without touching red_config.
2389                    let env = crate::runtime::config_overlay::collect_env_overrides();
2390                    let timeout_ms = env
2391                        .get("concurrency.locking.deadlock_timeout_ms")
2392                        .and_then(|raw| raw.parse::<u64>().ok())
2393                        .unwrap_or_else(|| {
2394                            match crate::runtime::config_matrix::default_for(
2395                                "concurrency.locking.deadlock_timeout_ms",
2396                            ) {
2397                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2398                                _ => 5000,
2399                            }
2400                        });
2401                    let cfg = crate::storage::transaction::lock::LockConfig {
2402                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2403                        ..Default::default()
2404                    };
2405                    crate::storage::transaction::lock::LockManager::new(cfg)
2406                }),
2407                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2408                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2409                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2410                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2411                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2412                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2413                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2414                queue_wait_registry: std::sync::Arc::new(
2415                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
2416                ),
2417                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
2418                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2419                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2420                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2421                    &options,
2422                )),
2423                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2424                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2425                audit_log: {
2426                    // Default audit-log path for the in-memory case
2427                    // sits in the system temp dir; persistent runs
2428                    // place it next to the resolved data file.
2429                    //
2430                    // gh-471 iter 2: route through the resolved
2431                    // `LogDestination`. Performance/Max tiers emit a
2432                    // file-backed log destination under the file-owned
2433                    // support-directory logs tier;
2434                    // lower tiers / ephemeral runs report `Stderr`
2435                    // and we keep the legacy file-next-to-data sink.
2436                    let data_path = options.data_path.clone().unwrap_or_else(|| {
2437                        if embedded_single_file {
2438                            std::env::temp_dir()
2439                                .join("reddb-embedded-runtime")
2440                                .join(format!("audit-{}", std::process::id()))
2441                        } else {
2442                            std::env::temp_dir().join("reddb")
2443                        }
2444                    });
2445                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2446                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2447                        &audit_dest,
2448                        &data_path,
2449                    ))
2450                },
2451                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2452                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2453                )),
2454                control_event_config: options.control_events,
2455                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2456                    db.store(),
2457                    options.query_audit.clone(),
2458                )),
2459                lease_lifecycle: std::sync::OnceLock::new(),
2460                replica_apply_metrics: std::sync::Arc::new(
2461                    crate::replication::logical::ReplicaApplyMetrics::default(),
2462                ),
2463                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2464                schema_vocabulary: parking_lot::RwLock::new(
2465                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2466                ),
2467                slow_query_logger: {
2468                    // Issue #205 — slow-query sink lives in the same
2469                    // directory the audit log uses, so backup/restore
2470                    // ships them together. Threshold + sample-pct
2471                    // default conservatively (1 s, 100% sampling) so
2472                    // emitted lines are rare and complete. Operators
2473                    // tune via env / config matrix in a follow-up.
2474                    //
2475                    // gh-471 iter 2: same routing as the audit log —
2476                    // `LogDestination::File(...)` for Performance/Max
2477                    // lands under the file-owned support-directory logs tier;
2478                    // lower tiers fall back to `red-slow.log` in the
2479                    // data directory.
2480                    let fallback_dir = options
2481                        .data_path
2482                        .as_ref()
2483                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2484                        .unwrap_or_else(|| {
2485                            if embedded_single_file {
2486                                std::env::temp_dir()
2487                                    .join("reddb-embedded-runtime")
2488                                    .join(format!("slow-{}", std::process::id()))
2489                            } else {
2490                                std::env::temp_dir().join("reddb")
2491                            }
2492                        });
2493                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2494                        .ok()
2495                        .and_then(|s| s.parse::<u64>().ok())
2496                        .unwrap_or(1000);
2497                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2498                        .ok()
2499                        .and_then(|s| s.parse::<u8>().ok())
2500                        .unwrap_or(100);
2501                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2502                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2503                        &slow_dest,
2504                        &fallback_dir,
2505                        threshold_ms,
2506                        sample_pct,
2507                    )
2508                },
2509                kv_stats: crate::runtime::KvStatsCounters::default(),
2510                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2511                metrics_tenant_activity_stats:
2512                    crate::runtime::MetricsTenantActivityCounters::default(),
2513                queue_telemetry: Arc::new(
2514                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2515                ),
2516                queue_presence: Arc::new(
2517                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
2518                ),
2519                vector_introspection: Arc::new(
2520                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
2521                ),
2522                kv_tag_index: crate::runtime::KvTagIndex::default(),
2523                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2524                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2525                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
2526                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
2527            }),
2528        };
2529
2530        // Issue #205 — install the process-wide OperatorEvent sink so
2531        // emit sites buried in storage / replication / signal handlers
2532        // can record without threading an `&AuditLogger` through every
2533        // call stack. First registration wins; subsequent in-memory
2534        // runtimes (test harnesses) fall through to tracing+eprintln.
2535        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2536            &runtime.inner.audit_log,
2537        ));
2538
2539        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2540        // from the wall-clock captured before storage open. The
2541        // entire `RedDB::open_with_options` call covers both
2542        // auto-restore (when configured) and WAL replay. We
2543        // record both phases against the same boundary today;
2544        // a follow-up will split them once the storage layer
2545        // surfaces a finer-grained event.
2546        runtime
2547            .inner
2548            .lifecycle
2549            .set_restore_started_at_ms(boot_open_start_ms);
2550        runtime
2551            .inner
2552            .lifecycle
2553            .set_restore_ready_at_ms(storage_ready_ms);
2554        runtime
2555            .inner
2556            .lifecycle
2557            .set_wal_replay_started_at_ms(boot_open_start_ms);
2558        runtime
2559            .inner
2560            .lifecycle
2561            .set_wal_replay_ready_at_ms(storage_ready_ms);
2562
2563        let restored_cdc_lsn = runtime
2564            .inner
2565            .db
2566            .replication
2567            .as_ref()
2568            .map(|repl| {
2569                repl.logical_wal_spool
2570                    .as_ref()
2571                    .map(|spool| spool.current_lsn())
2572                    .unwrap_or(0)
2573            })
2574            .unwrap_or(0)
2575            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2576        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2577        runtime.rehydrate_snapshot_xid_floor();
2578        runtime
2579            .bootstrap_system_keyed_collections()
2580            .map_err(|err| RedDBError::Internal(format!("bootstrap system collections: {err}")))?;
2581        runtime.rehydrate_declared_column_schemas();
2582        runtime.rehydrate_runtime_index_registry()?;
2583        runtime
2584            .load_probabilistic_state()
2585            .map_err(|err| RedDBError::Internal(format!("load probabilistic state: {err}")))?;
2586
2587        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2588        // tables declared via `TENANT BY (col)` survive restart. Each
2589        // entry re-registers the auto-policy and flips RLS on again.
2590        runtime.rehydrate_tenant_tables();
2591        // Issue #593 slice 9a — replay persisted materialized-view
2592        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2593        // restart. Runs after the system-keyed collections bootstrap
2594        // and before the API opens.
2595        runtime.rehydrate_materialized_view_descriptors();
2596        if let Some(repl) = &runtime.inner.db.replication {
2597            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2598        }
2599
2600        // Save system info to red_config on boot
2601        {
2602            let sys = SystemInfo::collect();
2603            runtime.inner.db.store().set_config_tree(
2604                "red.system",
2605                &crate::serde_json::json!({
2606                    "pid": sys.pid,
2607                    "cpu_cores": sys.cpu_cores,
2608                    "total_memory_bytes": sys.total_memory_bytes,
2609                    "available_memory_bytes": sys.available_memory_bytes,
2610                    "os": sys.os,
2611                    "arch": sys.arch,
2612                    "hostname": sys.hostname,
2613                    "started_at": SystemTime::now()
2614                        .duration_since(UNIX_EPOCH)
2615                        .unwrap_or_default()
2616                        .as_millis() as u64
2617                }),
2618            );
2619
2620            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2621            let store = runtime.inner.db.store();
2622            if store
2623                .get_collection("red_config")
2624                .map(|m| m.query_all(|_| true).len())
2625                .unwrap_or(0)
2626                <= 10
2627            {
2628                store.set_config_tree("red.ai", &crate::json!({
2629                    "default": crate::json!({
2630                        "provider": "openai",
2631                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2632                    }),
2633                    "max_embedding_inputs": 256,
2634                    "max_prompt_batch": 256,
2635                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2636                }));
2637                store.set_config_tree(
2638                    "red.server",
2639                    &crate::json!({
2640                        "max_scan_limit": 1000,
2641                        "max_body_size": 1048576,
2642                        "read_timeout_ms": 5000,
2643                        "write_timeout_ms": 5000
2644                    }),
2645                );
2646                store.set_config_tree(
2647                    "red.storage",
2648                    &crate::json!({
2649                        "page_size": 4096,
2650                        "page_cache_capacity": 100000,
2651                        "auto_checkpoint_pages": 1000,
2652                        "snapshot_retention": 16,
2653                        "verify_checksums": true,
2654                        "segment": crate::json!({
2655                            "max_entities": 100000,
2656                            "max_bytes": 268435456_u64,
2657                            "compression_level": 6
2658                        }),
2659                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2660                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2661                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2662                    }),
2663                );
2664                store.set_config_tree(
2665                    "red.search",
2666                    &crate::json!({
2667                        "rag": crate::json!({
2668                            "max_chunks_per_source": 10,
2669                            "max_total_chunks": 25,
2670                            "similarity_threshold": 0.8,
2671                            "graph_depth": 2,
2672                            "min_relevance": 0.3
2673                        }),
2674                        "fusion": crate::json!({
2675                            "vector_weight": 0.5,
2676                            "graph_weight": 0.3,
2677                            "table_weight": 0.2,
2678                            "dedup_threshold": 0.85
2679                        })
2680                    }),
2681                );
2682                store.set_config_tree(
2683                    "red.auth",
2684                    &crate::json!({
2685                        "enabled": false,
2686                        "session_ttl_secs": 3600,
2687                        "require_auth": false
2688                    }),
2689                );
2690                store.set_config_tree(
2691                    "red.query",
2692                    &crate::json!({
2693                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2694                        "max_recursion_depth": 1000
2695                    }),
2696                );
2697                store.set_config_tree(
2698                    "red.indexes",
2699                    &crate::json!({
2700                        "auto_select": true,
2701                        "bloom_filter": crate::json!({
2702                            "enabled": true,
2703                            "false_positive_rate": 0.01,
2704                            "prune_on_scan": true
2705                        }),
2706                        "hash": crate::json!({ "enabled": true }),
2707                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2708                        "spatial": crate::json!({ "enabled": true })
2709                    }),
2710                );
2711                store.set_config_tree(
2712                    "red.memtable",
2713                    &crate::json!({
2714                        "enabled": true,
2715                        "max_bytes": 67108864_u64,
2716                        "flush_threshold": 0.75
2717                    }),
2718                );
2719                store.set_config_tree(
2720                    "red.probabilistic",
2721                    &crate::json!({
2722                        "hll_registers": 16384,
2723                        "sketch_default_width": 1000,
2724                        "sketch_default_depth": 5,
2725                        "filter_default_capacity": 100000
2726                    }),
2727                );
2728                store.set_config_tree(
2729                    "red.timeseries",
2730                    &crate::json!({
2731                        "default_chunk_size": 1024,
2732                        "compression": crate::json!({
2733                            "timestamps": "delta_of_delta",
2734                            "values": "gorilla_xor"
2735                        }),
2736                        "default_retention_days": 0
2737                    }),
2738                );
2739                store.set_config_tree(
2740                    "red.queue",
2741                    &crate::json!({
2742                        "default_max_size": 0,
2743                        "default_max_attempts": 3,
2744                        "visibility_timeout_ms": 30000,
2745                        "consumer_idle_timeout_ms": 60000
2746                    }),
2747                );
2748                store.set_config_tree(
2749                    "red.backup",
2750                    &crate::json!({
2751                        "enabled": false,
2752                        "interval_secs": 3600,
2753                        "retention_count": 24,
2754                        "upload": false,
2755                        "backend": "local"
2756                    }),
2757                );
2758                store.set_config_tree(
2759                    "red.wal",
2760                    &crate::json!({
2761                        "archive": crate::json!({
2762                            "enabled": false,
2763                            "retention_hours": 168,
2764                            "prefix": reddb_file::backup_wal_prefix("")
2765                        })
2766                    }),
2767                );
2768                store.set_config_tree(
2769                    "red.cdc",
2770                    &crate::json!({
2771                        "enabled": true,
2772                        "buffer_size": 100000
2773                    }),
2774                );
2775                store.set_config_tree(
2776                    "red.config.secret",
2777                    &crate::json!({
2778                        "auto_encrypt": true,
2779                        "auto_decrypt": true
2780                    }),
2781                );
2782            }
2783
2784            // Perf-parity config matrix: heal the Tier A (critical)
2785            // keys unconditionally on every boot. Idempotent — only
2786            // writes the default when the key is missing. Keeps
2787            // `SHOW CONFIG` showing every guarantee the operator has
2788            // (durability.mode, concurrency.locking.enabled, …) even
2789            // on long-running datadirs that predate the matrix.
2790            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2791            seed_storage_deploy_config(store.as_ref(), options.storage_profile);
2792
2793            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2794            // `storage.btree.lehman_yao` value from the matrix (env
2795            // > file > red_config > default) and publish it to the
2796            // storage layer's atomic so the B-tree read / split
2797            // paths can branch without re-reading the config on
2798            // every hot-path call.
2799            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2800            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2801            if lehman_yao {
2802                tracing::info!(
2803                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2804                );
2805            }
2806
2807            // Config file overlay — mounted `/etc/reddb/config.json`
2808            // (override path via REDDB_CONFIG_FILE). Writes keys with
2809            // write-if-absent semantics so a later user `SET CONFIG`
2810            // always wins. Missing file = silent no-op.
2811            let overlay_path = crate::runtime::config_overlay::config_file_path();
2812            let _ =
2813                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2814        }
2815
2816        // VCS ("Git for Data") — create the `red_*` metadata
2817        // collections on first boot. Idempotent: `get_or_create_collection`
2818        // is a no-op if the collection already exists.
2819        {
2820            let store = runtime.inner.db.store();
2821            for name in crate::application::vcs_collections::ALL {
2822                let _ = store.get_or_create_collection(*name);
2823            }
2824            // Seed VCS config namespace with sensible defaults on first
2825            // boot, matching the pattern used by red.ai / red.storage.
2826            store.set_config_tree(
2827                crate::application::vcs_collections::CONFIG_NAMESPACE,
2828                &crate::json!({
2829                    "default_branch": "main",
2830                    "author": crate::json!({
2831                        "name": "reddb",
2832                        "email": "reddb@localhost"
2833                    }),
2834                    "protected_branches": crate::json!(["main"]),
2835                    "closure": crate::json!({
2836                        "enabled": true,
2837                        "lazy": true
2838                    }),
2839                    "merge": crate::json!({
2840                        "default_strategy": "auto",
2841                        "fast_forward": true
2842                    })
2843                }),
2844            );
2845        }
2846
2847        // Migrations — create the `red_migrations` / `red_migration_deps`
2848        // system collections on first boot. Idempotent.
2849        {
2850            let store = runtime.inner.db.store();
2851            for name in crate::application::migration_collections::ALL {
2852                let _ = store.get_or_create_collection(*name);
2853            }
2854        }
2855
2856        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
2857        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
2858        // exist. Idempotent and survives restarts via the WAL-backed contract.
2859        let _ = crate::application::topology_collections::ensure(&runtime);
2860
2861        // Start background maintenance thread (context index refresh +
2862        // session purge). Held by a WEAK reference to `RuntimeInner`
2863        // so dropping the last `RedDBRuntime` handle actually releases
2864        // the underlying Arc<Pager> (and its file lock). Polling at
2865        // 200ms means shutdown latency is bounded; the real 60-second
2866        // work cadence is tracked independently via a `last_work`
2867        // timestamp.
2868        //
2869        // The previous version captured `rt = runtime.clone()` by
2870        // strong reference and ran an unterminated `loop`, which held
2871        // Arc<RuntimeInner> forever — reopening a persistent database
2872        // in the same process failed with "Database is locked" because
2873        // the pager could never drop. See the regression test
2874        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2875        {
2876            let weak = Arc::downgrade(&runtime.inner);
2877            std::thread::Builder::new()
2878                .name("reddb-maintenance".into())
2879                .spawn(move || {
2880                    let tick = std::time::Duration::from_millis(200);
2881                    let work_interval = std::time::Duration::from_secs(60);
2882                    let mut last_work = std::time::Instant::now();
2883                    loop {
2884                        std::thread::sleep(tick);
2885                        let Some(inner) = weak.upgrade() else {
2886                            // All strong references dropped — the
2887                            // runtime is gone, exit cleanly.
2888                            break;
2889                        };
2890                        if last_work.elapsed() >= work_interval {
2891                            let _stats = inner.db.store().context_index().stats();
2892                            last_work = std::time::Instant::now();
2893                        }
2894                    }
2895                })
2896                .ok();
2897        }
2898
2899        // Start backup scheduler if enabled via red_config
2900        {
2901            let store = runtime.inner.db.store();
2902            let mut backup_enabled = false;
2903            let mut backup_interval = 3600u64;
2904
2905            if let Some(manager) = store.get_collection("red_config") {
2906                manager.for_each_entity(|entity| {
2907                    if let Some(row) = entity.data.as_row() {
2908                        let key = row.get_field("key").and_then(|v| match v {
2909                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2910                            _ => None,
2911                        });
2912                        let val = row.get_field("value");
2913                        if key == Some("red.config.backup.enabled") {
2914                            backup_enabled = match val {
2915                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2916                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2917                                _ => false,
2918                            };
2919                        } else if key == Some("red.config.backup.interval_secs") {
2920                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2921                                backup_interval = *n as u64;
2922                            }
2923                        }
2924                    }
2925                    true
2926                });
2927            }
2928
2929            if backup_enabled {
2930                runtime.inner.backup_scheduler.set_interval(backup_interval);
2931                let rt = runtime.clone();
2932                runtime
2933                    .inner
2934                    .backup_scheduler
2935                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2936            }
2937        }
2938
2939        // Load EC registry from red_config and start worker
2940        {
2941            runtime
2942                .inner
2943                .ec_registry
2944                .load_from_config_store(runtime.inner.db.store().as_ref());
2945            if !runtime.inner.ec_registry.async_configs().is_empty() {
2946                runtime.inner.ec_worker.start(
2947                    Arc::clone(&runtime.inner.ec_registry),
2948                    Arc::clone(&runtime.inner.db.store()),
2949                );
2950            }
2951        }
2952
2953        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2954            runtime.inner.db.options().replication.role.clone()
2955        {
2956            let rt = runtime.clone();
2957            std::thread::Builder::new()
2958                .name("reddb-replica".into())
2959                .spawn(move || rt.run_replica_loop(primary_addr))
2960                .ok();
2961        }
2962
2963        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2964        // boot stage above has completed (WAL replay, restore-from-
2965        // remote, replica-loop spawn). Health probes flip from 503 to
2966        // 200 here; shutdown begins from this state.
2967        runtime.inner.lifecycle.mark_ready();
2968
2969        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2970        // Low-priority background ticker that drains the cache's
2971        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2972        // so the thread exits cleanly when the runtime drops (≤50ms
2973        // latency between drop and exit). Materialized views without
2974        // a `REFRESH EVERY` clause stay on the manual-refresh path
2975        // and are skipped by `claim_due_at`, so the loop is a no-op
2976        // when no scheduled views exist.
2977        {
2978            let weak_inner = Arc::downgrade(&runtime.inner);
2979            std::thread::Builder::new()
2980                .name("reddb-mv-scheduler".into())
2981                .spawn(move || loop {
2982                    std::thread::sleep(std::time::Duration::from_millis(50));
2983                    let Some(inner) = weak_inner.upgrade() else {
2984                        break;
2985                    };
2986                    let rt = RedDBRuntime { inner };
2987                    rt.refresh_due_materialized_views();
2988                })
2989                .ok();
2990        }
2991
2992        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2993        // Low-priority ticker that physically reclaims rows whose
2994        // timestamp has fallen beyond the retention window. Holds a
2995        // `Weak<RuntimeInner>` so the thread exits within one tick of
2996        // the runtime drop (graceful shutdown leaves storage consistent
2997        // because each tick goes through the standard DELETE path —
2998        // there is no half-finished mutation state to clean up). The
2999        // tick interval is intentionally longer than the MV scheduler
3000        // (500ms) because retention is order-of-seconds at minimum.
3001        if !runtime.write_gate().is_read_only() {
3002            let weak_inner = Arc::downgrade(&runtime.inner);
3003            std::thread::Builder::new()
3004                .name("reddb-retention-sweeper".into())
3005                .spawn(move || loop {
3006                    std::thread::sleep(std::time::Duration::from_millis(500));
3007                    let Some(inner) = weak_inner.upgrade() else {
3008                        break;
3009                    };
3010                    let rt = RedDBRuntime { inner };
3011                    rt.sweep_retention_tick(
3012                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3013                    );
3014                })
3015                .ok();
3016        }
3017
3018        Ok(runtime)
3019    }
3020
3021    fn rehydrate_snapshot_xid_floor(&self) {
3022        let store = self.inner.db.store();
3023        for collection in store.list_collections() {
3024            let Some(manager) = store.get_collection(&collection) else {
3025                continue;
3026            };
3027            for entity in manager.query_all(|_| true) {
3028                self.inner
3029                    .snapshot_manager
3030                    .observe_committed_xid(entity.xmin);
3031                self.inner
3032                    .snapshot_manager
3033                    .observe_committed_xid(entity.xmax);
3034            }
3035        }
3036    }
3037
3038    /// Provision an empty Table-shaped collection that backs a
3039    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3040    /// `SELECT FROM v` reads this collection directly; the rewriter is
3041    /// configured to skip materialized views so the body is no longer
3042    /// substituted. REFRESH still writes to the cache slot — wiring it
3043    /// into this backing collection is the job of slice 9c.
3044    ///
3045    /// Idempotent: re-running for the same name leaves the existing
3046    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3047    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3048    /// cheap — the body change does not invalidate already-buffered
3049    /// rows. Until 9c lands the backing is always empty anyway.
3050    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3051        let store = self.inner.db.store();
3052        let mut changed = false;
3053        if store.get_collection(name).is_none() {
3054            store.get_or_create_collection(name);
3055            changed = true;
3056        }
3057        if self.inner.db.collection_contract(name).is_none() {
3058            self.inner
3059                .db
3060                .save_collection_contract(system_keyed_collection_contract(
3061                    name,
3062                    crate::catalog::CollectionModel::Table,
3063                ))
3064                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3065            changed = true;
3066        }
3067        if changed {
3068            self.inner
3069                .db
3070                .persist_metadata()
3071                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3072        }
3073        Ok(())
3074    }
3075
3076    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3077    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3078    /// the collection was never created (e.g. a `DROP MATERIALIZED
3079    /// VIEW IF EXISTS v` against an unknown name).
3080    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3081        let store = self.inner.db.store();
3082        if store.get_collection(name).is_none() {
3083            return Ok(());
3084        }
3085        store
3086            .drop_collection(name)
3087            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3088        // The contract may have been dropped already (DROP TABLE path)
3089        // — ignore "not found" errors by checking presence first.
3090        if self.inner.db.collection_contract(name).is_some() {
3091            self.inner
3092                .db
3093                .remove_collection_contract(name)
3094                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3095        }
3096        self.invalidate_result_cache();
3097        self.inner
3098            .db
3099            .persist_metadata()
3100            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3101        Ok(())
3102    }
3103
3104    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3105        let mut changed = false;
3106        for (name, model) in [
3107            ("red.config", crate::catalog::CollectionModel::Config),
3108            ("red.vault", crate::catalog::CollectionModel::Vault),
3109            // Issue #593 — materialized-view catalog. One row per
3110            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3111            // the API opens.
3112            (
3113                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3114                crate::catalog::CollectionModel::Config,
3115            ),
3116        ] {
3117            if self.inner.db.store().get_collection(name).is_none() {
3118                self.inner.db.store().get_or_create_collection(name);
3119                changed = true;
3120            }
3121            if self.inner.db.collection_contract(name).is_none() {
3122                self.inner
3123                    .db
3124                    .save_collection_contract(system_keyed_collection_contract(name, model))
3125                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3126                changed = true;
3127            }
3128        }
3129        if changed {
3130            self.inner
3131                .db
3132                .persist_metadata()
3133                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3134        }
3135        Ok(())
3136    }
3137
3138    pub fn db(&self) -> Arc<RedDB> {
3139        Arc::clone(&self.inner.db)
3140    }
3141
3142    /// Direct access to the runtime's secondary-index store.
3143    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3144    /// wire bulk) that need to push new rows through the per-index
3145    /// maintenance hook after `store.bulk_insert` returns.
3146    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3147        &self.inner.index_store
3148    }
3149
3150    /// Apply a DDL event to the schema-vocabulary reverse index
3151    /// (issue #120). Called by DDL execution paths after the catalog
3152    /// mutation has succeeded so the index never holds entries for
3153    /// half-applied DDL.
3154    pub(crate) fn schema_vocabulary_apply(
3155        &self,
3156        event: crate::runtime::schema_vocabulary::DdlEvent,
3157    ) {
3158        self.inner.schema_vocabulary.write().on_ddl(event);
3159    }
3160
3161    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3162    /// an owned `Vec<VocabHit>` because the underlying read lock
3163    /// cannot be borrowed across the call boundary; the slice from
3164    /// `SchemaVocabulary::lookup` is cloned per hit.
3165    pub fn schema_vocabulary_lookup(
3166        &self,
3167        token: &str,
3168    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3169        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3170    }
3171
3172    /// Inject an AuthStore into the runtime. Called by server boot
3173    /// after the vault has been bootstrapped, so that `Value::Secret`
3174    /// auto-encrypt/decrypt can reach the vault AES key.
3175    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3176        *self.inner.auth_store.write() = Some(store);
3177    }
3178
3179    /// Snapshot the current AuthStore (if any). Used by the wire listener
3180    /// to validate bearer tokens issued via HTTP `/auth/login`.
3181    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3182        self.inner.auth_store.read().clone()
3183    }
3184
3185    /// Read a vault KV secret from the configured AuthStore, if present.
3186    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3187        self.inner
3188            .auth_store
3189            .read()
3190            .as_ref()
3191            .and_then(|store| store.vault_kv_get(key))
3192    }
3193
3194    /// Write a vault KV secret and fail if the encrypted vault write is
3195    /// unavailable or cannot be made durable.
3196    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3197        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3198            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3199        })?;
3200        store
3201            .vault_kv_try_set(key, value)
3202            .map_err(|err| RedDBError::Query(err.to_string()))
3203    }
3204
3205    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3206    /// wire transports try OAuth JWT validation before falling back to
3207    /// the local AuthStore lookup. Pass `None` to disable.
3208    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3209        *self.inner.oauth_validator.write() = validator;
3210    }
3211
3212    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3213    /// Hot path: called per HTTP request when an Authorization header
3214    /// is present, so we hand back a cheap Arc clone.
3215    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3216        self.inner.oauth_validator.read().clone()
3217    }
3218
3219    /// Inject the browser-token authority (issue #936). When set, the
3220    /// RedWire WS handshake accepts the short-lived access JWT it mints
3221    /// (alongside, and tried before, the federated OAuth validator), and
3222    /// the `/auth/browser/*` HTTP endpoints can issue/rotate the pair.
3223    /// `None` leaves the browser credential flow inert.
3224    pub fn set_browser_token_authority(
3225        &self,
3226        authority: Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>>,
3227    ) {
3228        *self.inner.browser_token_authority.write() = authority;
3229    }
3230
3231    /// Snapshot the browser-token authority, if wired. Read on the WS
3232    /// handshake path and by the `/auth/browser/*` handlers; a cheap Arc
3233    /// clone keeps the lock hold short.
3234    pub fn browser_token_authority(
3235        &self,
3236    ) -> Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>> {
3237        self.inner.browser_token_authority.read().clone()
3238    }
3239
3240    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3241    /// store is wired and a key has been generated. Used by the
3242    /// `Value::Secret` encrypt/decrypt pipeline.
3243    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3244        let guard = self.inner.auth_store.read();
3245        guard.as_ref().and_then(|s| s.vault_secret_key())
3246    }
3247
3248    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3249    /// when the key is missing or not coercible. If the same key has
3250    /// been written multiple times (SET CONFIG appends new rows), the
3251    /// most recent entity wins. Env-var overrides
3252    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3253    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3254        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3255            if let Some(crate::storage::schema::Value::Boolean(b)) =
3256                crate::runtime::config_overlay::coerce_env_value(key, raw)
3257            {
3258                return b;
3259            }
3260        }
3261        let store = self.inner.db.store();
3262        let Some(manager) = store.get_collection("red_config") else {
3263            return default;
3264        };
3265        let mut result = default;
3266        let mut latest_id: u64 = 0;
3267        manager.for_each_entity(|entity| {
3268            if let Some(row) = entity.data.as_row() {
3269                let entry_key = row.get_field("key").and_then(|v| match v {
3270                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3271                    _ => None,
3272                });
3273                if entry_key == Some(key) {
3274                    let id = entity.id.raw();
3275                    if id >= latest_id {
3276                        latest_id = id;
3277                        result = match row.get_field("value") {
3278                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3279                            Some(crate::storage::schema::Value::Text(s)) => {
3280                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3281                            }
3282                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3283                            _ => default,
3284                        };
3285                    }
3286                }
3287            }
3288            true
3289        });
3290        result
3291    }
3292
3293    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3294        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3295            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3296                crate::runtime::config_overlay::coerce_env_value(key, raw)
3297            {
3298                return n;
3299            }
3300        }
3301        let store = self.inner.db.store();
3302        let Some(manager) = store.get_collection("red_config") else {
3303            return default;
3304        };
3305        let mut result = default;
3306        let mut latest_id: u64 = 0;
3307        manager.for_each_entity(|entity| {
3308            if let Some(row) = entity.data.as_row() {
3309                let entry_key = row.get_field("key").and_then(|v| match v {
3310                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3311                    _ => None,
3312                });
3313                if entry_key == Some(key) {
3314                    let id = entity.id.raw();
3315                    if id >= latest_id {
3316                        latest_id = id;
3317                        result = match row.get_field("value") {
3318                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3319                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3320                            Some(crate::storage::schema::Value::Text(s)) => {
3321                                s.parse::<u64>().unwrap_or(default)
3322                            }
3323                            _ => default,
3324                        };
3325                    }
3326                }
3327            }
3328            true
3329        });
3330        result
3331    }
3332
3333    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3334        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3335            if let Ok(n) = raw.parse::<f64>() {
3336                return n;
3337            }
3338        }
3339        let store = self.inner.db.store();
3340        let Some(manager) = store.get_collection("red_config") else {
3341            return default;
3342        };
3343        let mut result = default;
3344        let mut latest_id: u64 = 0;
3345        manager.for_each_entity(|entity| {
3346            if let Some(row) = entity.data.as_row() {
3347                let entry_key = row.get_field("key").and_then(|v| match v {
3348                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3349                    _ => None,
3350                });
3351                if entry_key == Some(key) {
3352                    let id = entity.id.raw();
3353                    if id >= latest_id {
3354                        latest_id = id;
3355                        result = match row.get_field("value") {
3356                            Some(crate::storage::schema::Value::Float(n)) => *n,
3357                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3358                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3359                            Some(crate::storage::schema::Value::Text(s)) => {
3360                                s.parse::<f64>().unwrap_or(default)
3361                            }
3362                            _ => default,
3363                        };
3364                    }
3365                }
3366            }
3367            true
3368        });
3369        result
3370    }
3371
3372    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3373        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3374            return raw.clone();
3375        }
3376        let store = self.inner.db.store();
3377        let Some(manager) = store.get_collection("red_config") else {
3378            return default.to_string();
3379        };
3380        let mut result = default.to_string();
3381        let mut latest_id: u64 = 0;
3382        manager.for_each_entity(|entity| {
3383            if let Some(row) = entity.data.as_row() {
3384                let entry_key = row.get_field("key").and_then(|v| match v {
3385                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3386                    _ => None,
3387                });
3388                if entry_key == Some(key) {
3389                    let id = entity.id.raw();
3390                    if id >= latest_id {
3391                        latest_id = id;
3392                        if let Some(crate::storage::schema::Value::Text(value)) =
3393                            row.get_field("value")
3394                        {
3395                            result = value.to_string();
3396                        }
3397                    }
3398                }
3399            }
3400            true
3401        });
3402        result
3403    }
3404
3405    /// Whether `SECRET('...')` literals should be encrypted with the
3406    /// vault AES key on INSERT. Default `true`.
3407    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3408        self.config_bool("red.config.secret.auto_encrypt", true)
3409    }
3410
3411    /// Whether `Value::Secret` columns should be decrypted back to
3412    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3413    /// Turning this off keeps secrets masked as `***` even while the
3414    /// vault is open — useful for audit trails or read-only exports.
3415    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3416        self.config_bool("red.config.secret.auto_decrypt", true)
3417    }
3418
3419    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3420    /// for the decrypted plaintext when the runtime has the vault
3421    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3422    /// key is missing, the vault is sealed, or auto_decrypt is off,
3423    /// secrets are left as `Value::Secret` which every formatter
3424    /// (Display, JSON) already masks as `***`.
3425    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3426        if !self.secret_auto_decrypt() {
3427            return;
3428        }
3429        let Some(key) = self.secret_aes_key() else {
3430            return;
3431        };
3432        for record in result.result.records.iter_mut() {
3433            for value in record.values_mut() {
3434                if let Value::Secret(ref bytes) = value {
3435                    if let Some(plain) =
3436                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3437                    {
3438                        if let Ok(text) = String::from_utf8(plain) {
3439                            *value = Value::text(text);
3440                        }
3441                    }
3442                }
3443            }
3444        }
3445    }
3446
3447    /// Emit a CDC change event and replicate to WAL buffer.
3448    /// Create a `MutationEngine` bound to this runtime.
3449    ///
3450    /// The engine is cheap to construct (no allocation) and should be
3451    /// dropped after `apply` returns. Use this from application-layer
3452    /// `create_row` / `create_rows_batch` instead of calling
3453    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3454    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3455        crate::runtime::mutation::MutationEngine::new(self)
3456    }
3457
3458    /// Public-mutation gate snapshot (PLAN.md W1).
3459    ///
3460    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3461    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3462    /// maintenance, serverless lifecycle) call `check_write` before
3463    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3464    /// instance running as a replica or with `options.read_only =
3465    /// true`. The replica internal logical-WAL apply path reaches into
3466    /// the store directly and never calls this method, so legitimate
3467    /// replica catch-up still works.
3468    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3469        self.inner.write_gate.check(kind)
3470    }
3471
3472    /// Read-only handle to the gate, useful for transports that want
3473    /// to surface the policy in health/status output without taking on
3474    /// a dependency on the concrete enum.
3475    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3476        &self.inner.write_gate
3477    }
3478
3479    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3480    /// admin/shutdown, and signal handlers consult this single
3481    /// state machine.
3482    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3483        &self.inner.lifecycle
3484    }
3485
3486    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3487    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3488        &self.inner.resource_limits
3489    }
3490
3491    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3492    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3493        &self.inner.audit_log
3494    }
3495
3496    /// Shared `Arc` to the audit logger — used by collaborators (the
3497    /// lease lifecycle, future request-context plumbing) that need to
3498    /// keep the logger alive past the runtime's stack frame.
3499    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3500        Arc::clone(&self.inner.audit_log)
3501    }
3502
3503    pub(crate) fn emit_control_event(
3504        &self,
3505        kind: crate::runtime::control_events::EventKind,
3506        outcome: crate::runtime::control_events::Outcome,
3507        action: &'static str,
3508        resource: Option<String>,
3509        reason: Option<String>,
3510        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
3511    ) -> RedDBResult<()> {
3512        use crate::runtime::control_events::{
3513            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
3514        };
3515
3516        let tenant = current_tenant();
3517        let principal = current_auth_identity();
3518        let actor_user = principal
3519            .as_ref()
3520            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
3521        let actor = actor_user
3522            .as_ref()
3523            .map(ActorRef::User)
3524            .unwrap_or(ActorRef::Anonymous);
3525        let ctx = ControlEventCtx {
3526            actor,
3527            scope: tenant
3528                .as_ref()
3529                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
3530            request_id: Some(std::borrow::Cow::Owned(format!(
3531                "conn-{}",
3532                current_connection_id()
3533            ))),
3534            trace_id: None,
3535        };
3536        let mut fields = std::collections::HashMap::new();
3537        fields.insert(
3538            "connection_id".to_string(),
3539            Sensitivity::raw(current_connection_id().to_string()),
3540        );
3541        if let Some((_, role)) = principal {
3542            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
3543        }
3544        for (key, value) in extra_fields {
3545            fields.insert(key, value);
3546        }
3547        let event = ControlEvent {
3548            kind,
3549            outcome,
3550            action: std::borrow::Cow::Borrowed(action),
3551            resource,
3552            reason,
3553            matched_policy_id: None,
3554            fields,
3555        };
3556        let ledger = self.inner.control_event_ledger.read();
3557        match ledger.emit(&ctx, event) {
3558            Ok(_) => Ok(()),
3559            Err(err) if self.inner.control_event_config.require_persistence() => {
3560                Err(RedDBError::Internal(err.to_string()))
3561            }
3562            Err(_) => Ok(()),
3563        }
3564    }
3565
3566    fn policy_mutation_control_ctx<'a>(
3567        &self,
3568        actor: &'a crate::auth::UserId,
3569        tenant: Option<&'a str>,
3570    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
3571        crate::runtime::control_events::ControlEventCtx {
3572            actor: crate::runtime::control_events::ActorRef::User(actor),
3573            scope: tenant.map(std::borrow::Cow::Borrowed),
3574            request_id: Some(std::borrow::Cow::Owned(format!(
3575                "conn-{}",
3576                current_connection_id()
3577            ))),
3578            trace_id: None,
3579        }
3580    }
3581
3582    fn emit_query_audit(
3583        &self,
3584        query: &str,
3585        plan: &QueryAuditPlan,
3586        duration_ms: u64,
3587        result: &RuntimeQueryResult,
3588    ) {
3589        if !self.inner.query_audit.has_rules() {
3590            return;
3591        }
3592        let actor = current_auth_identity().map(|(principal, _)| principal);
3593        let tenant = current_tenant();
3594        let row_count = if result.statement_type == "select" {
3595            result.result.records.len() as u64
3596        } else {
3597            result.affected_rows
3598        };
3599        self.inner
3600            .query_audit
3601            .emit(crate::runtime::query_audit::QueryAuditEvent {
3602                actor,
3603                tenant,
3604                statement_kind: plan.statement_kind,
3605                touched_collections: plan.collections.clone(),
3606                duration_ms,
3607                row_count,
3608                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
3609                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
3610            });
3611    }
3612
3613    /// Shared queue telemetry counters (delivered/acked/nacked).
3614    pub(crate) fn queue_telemetry(
3615        &self,
3616    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3617        &self.inner.queue_telemetry
3618    }
3619
3620    /// Snapshots of the queue telemetry counters in label-deterministic
3621    /// order for `/metrics` rendering and the integration test.
3622    pub fn queue_telemetry_snapshot(
3623        &self,
3624    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3625        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3626            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3627            acked: self.inner.queue_telemetry.acked_snapshot(),
3628            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3629            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
3630            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
3631            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
3632            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
3633            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
3634        }
3635    }
3636
3637    /// Issue #742 — consumer presence registry. Heartbeats land here
3638    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
3639    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
3640    /// read snapshots through `queue_consumer_presence_snapshot`.
3641    pub(crate) fn queue_presence(
3642        &self,
3643    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
3644        &self.inner.queue_presence
3645    }
3646
3647    /// Issue #742 — point-in-time presence snapshot, classifying each
3648    /// `(queue, group, consumer)` as active/stale/expired against the
3649    /// supplied TTL. Wall-clock is read once here so the lifecycle
3650    /// flags inside the snapshot are internally consistent.
3651    pub fn queue_consumer_presence_snapshot(
3652        &self,
3653        ttl_ms: u64,
3654    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
3655        let now_ns = std::time::SystemTime::now()
3656            .duration_since(std::time::UNIX_EPOCH)
3657            .map(|d| d.as_nanos() as u64)
3658            .unwrap_or(0);
3659        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
3660    }
3661
3662    /// Issue #742 — active-consumer count per `(queue, group)` for the
3663    /// queue-metadata surface. Stale/expired entries are excluded by
3664    /// definition; they are still visible in the per-row snapshot.
3665    pub fn queue_active_consumer_counts(
3666        &self,
3667        ttl_ms: u64,
3668    ) -> std::collections::HashMap<(String, String), u32> {
3669        let now_ns = std::time::SystemTime::now()
3670            .duration_since(std::time::UNIX_EPOCH)
3671            .map(|d| d.as_nanos() as u64)
3672            .unwrap_or(0);
3673        self.inner
3674            .queue_presence
3675            .count_active_by_group(now_ns, ttl_ms)
3676    }
3677
3678    /// Issue #743 — vector + TurboQuant introspection registry. Engine
3679    /// publish points (collection create, artifact build start /
3680    /// finish, fallback toggle, drop) update this; Red UI and
3681    /// `red.*` vector virtual tables read snapshots through
3682    /// `vector_introspection_snapshot` / `vector_introspection_get`.
3683    pub(crate) fn vector_introspection_registry(
3684        &self,
3685    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
3686        &self.inner.vector_introspection
3687    }
3688
3689    /// Issue #743 — full snapshot of every tracked vector collection's
3690    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
3691    /// by collection name so Red UI tables and tests both see a
3692    /// stable shape.
3693    pub fn vector_introspection_snapshot(
3694        &self,
3695    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
3696        self.inner.vector_introspection.snapshot()
3697    }
3698
3699    /// Issue #743 — single-collection lookup, for the per-collection
3700    /// metadata endpoint Red UI hits when an operator opens one
3701    /// vector's toolbar.
3702    pub fn vector_introspection_get(
3703        &self,
3704        collection: &str,
3705    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
3706        self.inner.vector_introspection.get(collection)
3707    }
3708
3709    /// Slice 10 of issue #527 — render-time scan of pending entries
3710    /// per (queue, group) for the `queue_pending_gauge` exposition.
3711    /// Walks `red_queue_meta` live so the gauge cannot drift from
3712    /// the source of truth.
3713    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3714        let store = self.inner.db.store();
3715        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3716            .into_iter()
3717            .collect()
3718    }
3719
3720    /// Shared `Arc` to the write gate. Same rationale as
3721    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3722    /// thread) need a clone-cheap handle they can move into a
3723    /// background thread.
3724    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3725        Arc::clone(&self.inner.write_gate)
3726    }
3727
3728    /// Serverless writer-lease state machine. `None` when the operator
3729    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3730    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3731        self.inner.lease_lifecycle.get()
3732    }
3733
3734    /// Install the lease lifecycle. Idempotent; subsequent calls
3735    /// return the previously stored value untouched.
3736    pub fn set_lease_lifecycle(
3737        &self,
3738        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3739    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3740        self.inner.lease_lifecycle.set(lifecycle)
3741    }
3742
3743    /// Reject the call when the requested batch size exceeds
3744    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3745    /// shaped so the HTTP layer can map it to 413 Payload Too
3746    /// Large (PLAN.md Phase 4.1).
3747    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3748        if self.inner.resource_limits.batch_size_exceeded(requested) {
3749            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3750            return Err(RedDBError::QuotaExceeded(format!(
3751                "max_batch_size:{requested}:{max}"
3752            )));
3753        }
3754        Ok(())
3755    }
3756
3757    /// Reject the call when the local DB file exceeds
3758    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3759    /// the cost is a single `stat()` syscall, negligible against the
3760    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3761    /// for HTTP 507 Insufficient Storage.
3762    pub fn check_db_size(&self) -> RedDBResult<()> {
3763        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3764            return Ok(());
3765        };
3766        if limit == 0 {
3767            return Ok(());
3768        }
3769        let Some(path) = self.inner.db.path() else {
3770            return Ok(());
3771        };
3772        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3773        if current > limit {
3774            return Err(RedDBError::QuotaExceeded(format!(
3775                "max_db_size_bytes:{current}:{limit}"
3776            )));
3777        }
3778        Ok(())
3779    }
3780
3781    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3782    ///
3783    /// Steps, in order, all idempotent across re-entrant calls:
3784    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3785    ///      observe `Stopped` after first finishes).
3786    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3787    ///      every acked write is durable on disk.
3788    ///   3. If `backup_on_shutdown == true` and a remote backend is
3789    ///      configured, run a synchronous `trigger_backup()` so the
3790    ///      remote head reflects the final state.
3791    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3792    ///      return the cached report without re-running anything.
3793    ///
3794    /// On any error, the runtime is still marked `Stopped` so the
3795    /// process can exit; the caller logs the error context but does
3796    /// not retry the same shutdown — the operator can inspect the
3797    /// report fields to see which step failed.
3798    pub fn graceful_shutdown(
3799        &self,
3800        backup_on_shutdown: bool,
3801    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3802        if !self.inner.lifecycle.begin_shutdown() {
3803            // Someone else already shut down (or is in flight). Return
3804            // the cached report so the HTTP caller and SIGTERM handler
3805            // get the same idempotent answer.
3806            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3807        }
3808
3809        let started_ms = std::time::SystemTime::now()
3810            .duration_since(std::time::UNIX_EPOCH)
3811            .map(|d| d.as_millis() as u64)
3812            .unwrap_or(0);
3813        let mut report = crate::runtime::lifecycle::ShutdownReport {
3814            started_at_ms: started_ms,
3815            ..Default::default()
3816        };
3817
3818        // Flush WAL + run any pending checkpoint. Local fsync is
3819        // unconditional — even a lease-lost replica needs its WAL on
3820        // disk before exit so a future restore has the latest tail.
3821        // The remote upload is gated separately so a lost-lease writer
3822        // doesn't clobber the new holder's state on its way out.
3823        let flush_res = self.inner.db.flush_local_only();
3824        report.flushed_wal = flush_res.is_ok();
3825        report.final_checkpoint = flush_res.is_ok();
3826        if let Err(err) = &flush_res {
3827            tracing::error!(
3828                target: "reddb::lifecycle",
3829                error = %err,
3830                "graceful_shutdown: local flush failed"
3831            );
3832        } else if let Err(lease_err) =
3833            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3834        {
3835            tracing::warn!(
3836                target: "reddb::serverless::lease",
3837                error = %lease_err,
3838                "graceful_shutdown: remote upload skipped — lease not held"
3839            );
3840        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3841            tracing::error!(
3842                target: "reddb::lifecycle",
3843                error = %err,
3844                "graceful_shutdown: remote upload failed"
3845            );
3846        }
3847
3848        // Optional final backup. Skipped silently when no remote
3849        // backend is configured — `trigger_backup()` returns Err
3850        // anyway in that case, but logging it as a shutdown failure
3851        // would be misleading on a standalone (no-backend) runtime.
3852        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3853            // The trigger_backup gate now reads `WriteKind::Backup`,
3854            // which a replica/read_only instance refuses. That's
3855            // intentional — replicas don't drive backups; only the
3856            // primary does. We still want shutdown to flush its WAL
3857            // even if the backup branch is gated off.
3858            match self.trigger_backup() {
3859                Ok(result) => {
3860                    report.backup_uploaded = result.uploaded;
3861                }
3862                Err(err) => {
3863                    tracing::warn!(
3864                        target: "reddb::lifecycle",
3865                        error = %err,
3866                        "graceful_shutdown: final backup skipped"
3867                    );
3868                }
3869            }
3870        }
3871
3872        let completed_ms = std::time::SystemTime::now()
3873            .duration_since(std::time::UNIX_EPOCH)
3874            .map(|d| d.as_millis() as u64)
3875            .unwrap_or(started_ms);
3876        report.completed_at_ms = completed_ms;
3877        report.duration_ms = completed_ms.saturating_sub(started_ms);
3878
3879        self.inner.lifecycle.finish_shutdown(report.clone());
3880        Ok(report)
3881    }
3882
3883    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3884    /// returned; `is_configured()` lets callers short-circuit.
3885    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3886        &self.inner.quota_bucket
3887    }
3888
3889    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3890    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3891    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3892    /// when the operator set the env but it doesn't parse, and
3893    /// `("disabled", None)` when no key is configured. The pager
3894    /// hookup is deferred — this accessor surfaces the operator's
3895    /// intent for /admin/status without yet using the key in writes.
3896    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3897        match crate::crypto::page_encryption::key_from_env() {
3898            Ok(Some(_)) => ("enabled", None),
3899            Ok(None) => ("disabled", None),
3900            Err(err) => ("error", Some(err)),
3901        }
3902    }
3903
3904    /// PLAN.md Phase 11.5 — current replica apply health label
3905    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3906    /// `stalled_gap`). Read from the persisted `red.replication.state`
3907    /// config key updated by the replica loop. Returns `None` on
3908    /// non-replica instances or when no apply has run yet.
3909    pub fn replica_apply_health(&self) -> Option<String> {
3910        let state = self.config_string("red.replication.state", "");
3911        if state.is_empty() {
3912            None
3913        } else {
3914            Some(state)
3915        }
3916    }
3917
3918    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
3919        let mut pool = self
3920            .inner
3921            .pool
3922            .lock()
3923            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
3924        if pool.active >= self.inner.pool_config.max_connections {
3925            return Err(RedDBError::Internal(
3926                "connection pool exhausted".to_string(),
3927            ));
3928        }
3929
3930        let id = if let Some(id) = pool.idle.pop() {
3931            id
3932        } else {
3933            let id = pool.next_id;
3934            pool.next_id += 1;
3935            id
3936        };
3937        pool.active += 1;
3938        pool.total_checkouts += 1;
3939        drop(pool);
3940
3941        Ok(RuntimeConnection {
3942            id,
3943            inner: Arc::clone(&self.inner),
3944        })
3945    }
3946
3947    pub fn checkpoint(&self) -> RedDBResult<()> {
3948        // Local fsync always allowed — losing the lease shouldn't
3949        // prevent us from durably persisting what's already in memory.
3950        // The remote upload is the side-effect that risks clobbering a
3951        // peer's state, so it's behind the lease gate.
3952        self.inner.db.flush_local_only().map_err(|err| {
3953            // Issue #205 — local flush failure is a CheckpointFailed
3954            // operator-grade event. The local-flush path also covers
3955            // the WAL fsync we depend on, so a failure here doubles as
3956            // the WalFsyncFailed signal for the runtime entry point.
3957            let msg = err.to_string();
3958            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
3959                lsn: 0,
3960                error: msg.clone(),
3961            }
3962            .emit_global();
3963            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
3964                path: "<flush_local_only>".to_string(),
3965                error: msg.clone(),
3966            }
3967            .emit_global();
3968            RedDBError::Engine(msg)
3969        })?;
3970        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
3971            tracing::warn!(
3972                target: "reddb::serverless::lease",
3973                error = %err,
3974                "checkpoint: skipping remote upload — lease not held"
3975            );
3976            return Ok(());
3977        }
3978        self.inner
3979            .db
3980            .upload_to_remote_backend()
3981            .map_err(|err| RedDBError::Engine(err.to_string()))
3982    }
3983
3984    /// Guard remote-mutating operations on the writer lease.
3985    /// Returns `Ok(())` when no remote backend is configured (the
3986    /// lease is irrelevant) or the lease state is `NotRequired` /
3987    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
3988    /// `NotHeld`, with an audit-friendly action label so the caller
3989    /// can record the rejection.
3990    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
3991        if self.inner.db.remote_backend.is_none() {
3992            return Ok(());
3993        }
3994        match self.inner.write_gate.lease_state() {
3995            crate::runtime::write_gate::LeaseGateState::NotHeld => {
3996                self.inner.audit_log.record(
3997                    action,
3998                    "system",
3999                    "remote_backend",
4000                    "err: writer lease not held",
4001                    crate::json::Value::Null,
4002                );
4003                Err(RedDBError::ReadOnly(format!(
4004                    "writer lease not held — {action} blocked (serverless fence)"
4005                )))
4006            }
4007            _ => Ok(()),
4008        }
4009    }
4010
4011    pub fn run_maintenance(&self) -> RedDBResult<()> {
4012        self.inner
4013            .db
4014            .run_maintenance()
4015            .map_err(|err| RedDBError::Internal(err.to_string()))
4016    }
4017
4018    pub fn scan_collection(
4019        &self,
4020        collection: &str,
4021        cursor: Option<ScanCursor>,
4022        limit: usize,
4023    ) -> RedDBResult<ScanPage> {
4024        let store = self.inner.db.store();
4025        let manager = store
4026            .get_collection(collection)
4027            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4028
4029        let mut entities = manager.query_all(|_| true);
4030        entities.sort_by_key(|entity| entity.id.raw());
4031
4032        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4033        let total = entities.len();
4034        let end = total.min(offset.saturating_add(limit.max(1)));
4035        let items = if offset >= total {
4036            Vec::new()
4037        } else {
4038            entities[offset..end].to_vec()
4039        };
4040        let next = (end < total).then_some(ScanCursor { offset: end });
4041
4042        Ok(ScanPage {
4043            collection: collection.to_string(),
4044            items,
4045            next,
4046            total,
4047        })
4048    }
4049
4050    pub fn catalog(&self) -> CatalogModelSnapshot {
4051        self.inner.db.catalog_model_snapshot()
4052    }
4053
4054    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4055        self.inner.db.catalog_consistency_report()
4056    }
4057
4058    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4059        crate::catalog::attention_summary(&self.catalog())
4060    }
4061
4062    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4063        crate::catalog::collection_attention(&self.catalog())
4064    }
4065
4066    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4067        crate::catalog::index_attention(&self.catalog())
4068    }
4069
4070    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4071        crate::catalog::graph_projection_attention(&self.catalog())
4072    }
4073
4074    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4075        crate::catalog::analytics_job_attention(&self.catalog())
4076    }
4077
4078    pub fn stats(&self) -> RuntimeStats {
4079        let pool = runtime_pool_lock(self);
4080        RuntimeStats {
4081            active_connections: pool.active,
4082            idle_connections: pool.idle.len(),
4083            total_checkouts: pool.total_checkouts,
4084            paged_mode: self.inner.db.is_paged(),
4085            started_at_unix_ms: self.inner.started_at_unix_ms,
4086            store: self.inner.db.stats(),
4087            system: SystemInfo::collect(),
4088            result_blob_cache: self.inner.result_blob_cache.stats(),
4089            kv: self.inner.kv_stats.snapshot(),
4090            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4091        }
4092    }
4093
4094    pub(crate) fn record_metrics_ingest(
4095        &self,
4096        accepted_samples: u64,
4097        accepted_series: u64,
4098        rejected_samples: u64,
4099        rejected_series: u64,
4100    ) {
4101        self.inner.metrics_ingest_stats.record(
4102            accepted_samples,
4103            accepted_series,
4104            rejected_samples,
4105            rejected_series,
4106        );
4107    }
4108
4109    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4110        self.inner
4111            .metrics_ingest_stats
4112            .record_cardinality_budget_rejections(rejected_series);
4113    }
4114
4115    pub(crate) fn record_metrics_tenant_activity(
4116        &self,
4117        tenant: &str,
4118        namespace: &str,
4119        operation: &str,
4120    ) {
4121        self.inner
4122            .metrics_tenant_activity_stats
4123            .record(tenant, namespace, operation);
4124    }
4125
4126    pub(crate) fn metrics_tenant_activity_snapshot(
4127        &self,
4128    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4129        self.inner.metrics_tenant_activity_stats.snapshot()
4130    }
4131
4132    /// Execute a query under a typed scope override without embedding
4133    /// the tenant / user / role values into the SQL string. Use this
4134    /// from transport middleware (HTTP / gRPC / worker loops) where the
4135    /// scope is resolved from auth claims and the SQL is a parameterised
4136    /// template — avoids the string-concat injection risk of building
4137    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4138    /// prepared statements that didn't know about tenancy.
4139    ///
4140    /// Precedence matches the `WITHIN` clause: the passed `scope`
4141    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4142    /// The override is pushed on the thread-local scope stack for the
4143    /// duration of the call and popped on return — pool-shared
4144    /// connections cannot leak it across requests.
4145    pub fn execute_query_with_scope(
4146        &self,
4147        query: &str,
4148        scope: crate::runtime::within_clause::ScopeOverride,
4149    ) -> RedDBResult<RuntimeQueryResult> {
4150        if scope.is_empty() {
4151            return self.execute_query(query);
4152        }
4153        let _scope_guard = ScopeOverrideGuard::install(scope);
4154        self.execute_query(query)
4155    }
4156
4157    /// Issue #205 — single lifecycle exit for slow-query logging.
4158    ///
4159    /// `execute_query_inner` does the real work; this wrapper times it
4160    /// and, if elapsed exceeds the configured threshold, hands the
4161    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4162    /// SlowQueryLogger. The threshold + sample_pct were captured at
4163    /// SlowQueryLogger construction (runtime startup), so the per-call
4164    /// cost on below-threshold paths is one relaxed atomic load.
4165    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4166        let started = std::time::Instant::now();
4167        let result = self.execute_query_inner(query);
4168        self.finish_query_lifecycle(query, started, result)
4169    }
4170
4171    /// Execute a SQL statement with already-decoded positional bind
4172    /// parameters. Transports should call this instead of parsing +
4173    /// binding on their side and then reaching for `execute_query_expr`:
4174    /// this entry keeps parameterized statements inside the same
4175    /// statement lifecycle as textual SQL (snapshot guard, config/secret
4176    /// guards, coarse auth, intent locks, slow-query logging, integrity
4177    /// tombstone filtering, and causal bookmarks).
4178    pub fn execute_query_with_params(
4179        &self,
4180        query: &str,
4181        params: &[Value],
4182    ) -> RedDBResult<RuntimeQueryResult> {
4183        if params.is_empty() {
4184            return self.execute_query(query);
4185        }
4186        let started = std::time::Instant::now();
4187        let result = self.execute_query_with_params_inner(query, params);
4188        self.finish_query_lifecycle(query, started, result)
4189    }
4190
4191    fn finish_query_lifecycle(
4192        &self,
4193        query: &str,
4194        started: std::time::Instant,
4195        mut result: RedDBResult<RuntimeQueryResult>,
4196    ) -> RedDBResult<RuntimeQueryResult> {
4197        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
4198        // results before they reach any consumer. Fast no-op (one relaxed
4199        // atomic load) unless an input-stream digest mismatch has tombstoned
4200        // a RID range on this store.
4201        if let Ok(ref mut query_result) = result {
4202            if query_result.statement_type == "select" {
4203                self.filter_integrity_tombstoned(&mut query_result.result);
4204            }
4205        }
4206        let elapsed_ms = started.elapsed().as_millis() as u64;
4207
4208        // Build EffectiveScope from the same thread-locals frame-build
4209        // consults — keeps the slow-log row consistent with the audit /
4210        // RLS view of "this statement". `ai_scope()` is the canonical
4211        // builder.
4212        let scope = self.ai_scope();
4213        let kind = match result
4214            .as_ref()
4215            .map(|r| r.statement_type)
4216            .unwrap_or("select")
4217        {
4218            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4219            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4220            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4221            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4222            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4223        };
4224        // SQL redaction: pass the raw query through. The slow-query
4225        // logger writes structured JSON so embedded literals stay
4226        // escape-safe at the JSON boundary (proven by
4227        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4228        // PII redaction (e.g. literal masking) is a follow-up.
4229        self.inner
4230            .slow_query_logger
4231            .record(kind, elapsed_ms, query.to_string(), &scope);
4232
4233        if let Ok(ref mut query_result) = result {
4234            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
4235                let bookmark = crate::replication::CausalBookmark::new(
4236                    self.current_replication_term(),
4237                    self.cdc_current_lsn(),
4238                );
4239                query_result.bookmark = Some(bookmark.encode());
4240            }
4241        }
4242
4243        result
4244    }
4245
4246    fn execute_query_with_params_inner(
4247        &self,
4248        query: &str,
4249        params: &[Value],
4250    ) -> RedDBResult<RuntimeQueryResult> {
4251        let parsed = parse_multi(query).map_err(|err| RedDBError::Query(err.to_string()))?;
4252        let bound = crate::storage::query::user_params::bind(&parsed, params).map_err(|err| {
4253            RedDBError::Validation {
4254                message: err.to_string(),
4255                validation: crate::json!({
4256                    "code": "INVALID_PARAMS",
4257                    "surface": "query.params",
4258                }),
4259            }
4260        })?;
4261        self.execute_bound_query_expr_in_frame(query, bound)
4262    }
4263
4264    fn execute_bound_query_expr_in_frame(
4265        &self,
4266        query: &str,
4267        expr: QueryExpr,
4268    ) -> RedDBResult<RuntimeQueryResult> {
4269        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4270        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4271        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4272        let _frame_guards = frame.install(self);
4273        let _log_span = crate::telemetry::span::query_span(query).entered();
4274
4275        let expr = self.rewrite_view_refs(expr);
4276        let mode = detect_mode(execution_query);
4277        let control_event_specs = query_control_event_specs(&expr);
4278        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4279            Ok(guard) => guard,
4280            Err(err) => {
4281                let outcome = control_event_outcome_for_error(&err);
4282                for spec in &control_event_specs {
4283                    self.emit_control_event(
4284                        spec.kind,
4285                        outcome,
4286                        spec.action,
4287                        spec.resource.clone(),
4288                        Some(err.to_string()),
4289                        spec.fields.clone(),
4290                    )?;
4291                }
4292                return Err(err);
4293            }
4294        };
4295
4296        let mut result = self.dispatch_expr(expr, query, mode)?;
4297        if result.statement_type == "select" {
4298            self.apply_secret_decryption(&mut result);
4299        }
4300        Ok(result)
4301    }
4302
4303    pub fn causal_session(&self) -> crate::runtime::CausalSession {
4304        crate::runtime::CausalSession {
4305            runtime: self.clone(),
4306            bookmark: None,
4307            wait_timeout: std::time::Duration::from_secs(5),
4308        }
4309    }
4310
4311    pub fn wait_for_bookmark(
4312        &self,
4313        bookmark: &crate::replication::CausalBookmark,
4314        timeout: std::time::Duration,
4315    ) -> RedDBResult<()> {
4316        let deadline = std::time::Instant::now() + timeout;
4317        loop {
4318            let applied_lsn = self.local_contiguous_applied_lsn();
4319            if applied_lsn >= bookmark.commit_lsn() {
4320                return Ok(());
4321            }
4322            let now = std::time::Instant::now();
4323            if now >= deadline {
4324                return Err(RedDBError::InvalidOperation(format!(
4325                    "timed out waiting for causal bookmark lsn {}; applied={}",
4326                    bookmark.commit_lsn(),
4327                    applied_lsn
4328                )));
4329            }
4330            let remaining = deadline.saturating_duration_since(now);
4331            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
4332        }
4333    }
4334
4335    fn local_contiguous_applied_lsn(&self) -> u64 {
4336        match self.inner.db.options().replication.role {
4337            crate::replication::ReplicationRole::Replica { .. } => {
4338                self.config_u64("red.replication.last_applied_lsn", 0)
4339            }
4340            _ => self.cdc_current_lsn(),
4341        }
4342    }
4343
4344    #[inline(never)]
4345    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4346        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4347        //
4348        // Moved above every boot-cost the normal path pays (WITHIN
4349        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4350        // guard, tracing span, tx_contexts read) because the bench's
4351        // `select_point` scenario was observed at 28× vs PostgreSQL —
4352        // the dominant cost wasn't the entity fetch but the ceremony
4353        // before it. Only fires when there's no ambient transaction
4354        // context or WITHIN override, so the snapshot install we skip
4355        // truly is a no-op for this query.
4356        if !has_scope_override_active()
4357            && !query.trim_start().starts_with("WITHIN")
4358            && !query.trim_start().starts_with("within")
4359            && !self.inner.query_audit.has_rules()
4360            && !self
4361                .inner
4362                .tx_contexts
4363                .read()
4364                .contains_key(&current_connection_id())
4365        {
4366            if let Some(result) = self.try_fast_entity_lookup(query) {
4367                return result;
4368            }
4369        }
4370
4371        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4372        // strip the prefix, push a stack-scoped override, recurse on
4373        // the inner statement, pop on return. Stack lives in a
4374        // thread-local but is balanced by the RAII guard, so a
4375        // pool-shared connection cannot leak the override across
4376        // requests and an early `?` return still pops cleanly.
4377        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4378            Ok(Some((scope, inner))) => {
4379                let _scope_guard = ScopeOverrideGuard::install(scope);
4380                // Re-enter the inner path, NOT `execute_query`, so the
4381                // slow-query lifecycle hook records exactly one row per
4382                // top-level statement (the WITHIN-stripped form would
4383                // double-record).
4384                return self.execute_query_inner(inner);
4385            }
4386            Ok(None) => {}
4387            Err(msg) => return Err(RedDBError::Query(msg)),
4388        }
4389
4390        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4391        // inner statement (WITHOUT executing it) and returns the
4392        // CanonicalLogicalNode tree as rows so the caller can see the
4393        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4394        // is a distinct schema-diff command and continues down the
4395        // regular SQL path.
4396        if let Some(inner) = strip_explain_prefix(query) {
4397            return self.explain_as_rows(query, inner);
4398        }
4399
4400        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4401        // override and return. Outside a transaction the statement is
4402        // an error (matches PG semantics: SET LOCAL only takes effect
4403        // within an active transaction).
4404        if let Some(value) = parse_set_local_tenant(query)? {
4405            let conn_id = current_connection_id();
4406            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4407                return Err(RedDBError::Query(
4408                    "SET LOCAL TENANT requires an active transaction".to_string(),
4409                ));
4410            }
4411            self.inner
4412                .tx_local_tenants
4413                .write()
4414                .insert(conn_id, value.clone());
4415            return Ok(RuntimeQueryResult::ok_message(
4416                query.to_string(),
4417                &match &value {
4418                    Some(id) => format!("local tenant set: {id}"),
4419                    None => "local tenant cleared".to_string(),
4420                },
4421                "set_local_tenant",
4422            ));
4423        }
4424
4425        if super::red_schema::is_system_schema_write(query) {
4426            return Err(RedDBError::Query(
4427                super::red_schema::READ_ONLY_ERROR.to_string(),
4428            ));
4429        }
4430
4431        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
4432        {
4433            return self.execute_create_analytics_source(query, create_source);
4434        }
4435
4436        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
4437        // v0. The descriptor itself is readable through
4438        // `red.analytics.metrics`; the *output* read returns a
4439        // structured error so callers can tell "execution engine not yet
4440        // built" apart from "metric does not exist".
4441        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
4442            return Err(super::metric_descriptor_catalog::read_output_unsupported(
4443                &path,
4444            ));
4445        }
4446
4447        // Issue #918 / ADR 0035 — leaderboard rank capability catalog
4448        // declarations are still recognised before the general parser.
4449        // Rank reads themselves are parser AST nodes, including Redis-flavor
4450        // Z* sugar that desugars to the same canonical rank shapes.
4451        if let Some(parsed) = super::ranking_descriptor_catalog::parse_create_ranking(query) {
4452            return self.execute_create_ranking(query, parsed?);
4453        }
4454        if super::ranking_descriptor_catalog::parse_show_rankings(query) {
4455            return self.execute_show_rankings(query);
4456        }
4457
4458        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4459        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4460
4461        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4462        let _frame_guards = frame.install(self);
4463
4464        // Phase 6 logging: enter a span stamped with conn_id / tenant
4465        // / query_len. Every downstream tracing::info!/warn!/error!
4466        // inherits these fields — no need to thread them manually
4467        // through storage/scan layers. Entered AFTER the WITHIN /
4468        // SET LOCAL TENANT resolution above so the span reflects the
4469        // effective scope for this statement.
4470        let _log_span = crate::telemetry::span::query_span(query).entered();
4471
4472        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4473        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4474            return self.execute_query_expr(rewritten);
4475        }
4476
4477        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4478        if !self.inner.query_audit.has_rules() {
4479            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4480                return result;
4481            }
4482        }
4483
4484        // ── Result cache: return cached result if still fresh (30s TTL) ──
4485        if !self.inner.query_audit.has_rules() {
4486            if let Some(result) = frame.read_result_cache(self) {
4487                return Ok(result);
4488            }
4489        }
4490
4491        let prepared = frame.prepare_statement(self, execution_query)?;
4492        let mode = prepared.mode;
4493        let expr = prepared.expr;
4494
4495        let statement = query_expr_name(&expr);
4496        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4497        let control_event_specs = query_control_event_specs(&expr);
4498        let query_audit_plan = query_audit_plan(&expr);
4499
4500        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4501            Ok(guard) => guard,
4502            Err(err) => {
4503                let outcome = control_event_outcome_for_error(&err);
4504                for spec in &control_event_specs {
4505                    self.emit_control_event(
4506                        spec.kind,
4507                        outcome,
4508                        spec.action,
4509                        spec.resource.clone(),
4510                        Some(err.to_string()),
4511                        spec.fields.clone(),
4512                    )?;
4513                }
4514                return Err(err);
4515            }
4516        };
4517        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4518        let query_audit_started = std::time::Instant::now();
4519
4520        let query_result = match expr {
4521            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4522                // Apply MVCC visibility + RLS gate while materialising the
4523                // graph: every node entity is screened against the source
4524                // collection's policy chain (basic and `Nodes`-targeted)
4525                // and dropped when the caller's tenant / role doesn't
4526                // admit it. Edges are pruned automatically because the
4527                // graph builder skips edges whose endpoints aren't in
4528                // `allowed_nodes`.
4529                let (graph, node_properties, edge_properties) =
4530                    self.materialize_graph_with_rls()?;
4531                let result =
4532                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4533                        &graph,
4534                        &expr,
4535                        node_properties,
4536                        edge_properties,
4537                    )
4538                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4539
4540                Ok(RuntimeQueryResult {
4541                    query: query.to_string(),
4542                    mode,
4543                    statement,
4544                    engine: "materialized-graph",
4545                    result,
4546                    affected_rows: 0,
4547                    statement_type: "select",
4548                    bookmark: None,
4549                })
4550            }
4551            QueryExpr::Table(table) => {
4552                let table = self.resolve_table_expr_subqueries(
4553                    table,
4554                    &frame as &dyn super::statement_frame::ReadFrame,
4555                )?;
4556                // Table-valued functions (e.g. components(g)) dispatch to a
4557                // read-only executor before any catalog/virtual-table routing
4558                // (issue #795).
4559                if let Some(TableSource::Function {
4560                    name,
4561                    args,
4562                    named_args,
4563                }) = table.source.clone()
4564                {
4565                    // The graph-collection form is cacheable (issue #802): the
4566                    // result-cache read at the top of this function keys on the
4567                    // query string, and `result_cache_scopes` carries the graph
4568                    // collection (see `collect_table_source_scopes`) so a write
4569                    // to it invalidates the entry. Deterministic algorithm
4570                    // output is worth caching at any row count, so the write
4571                    // bypasses the generic ≤5-row payload heuristic.
4572                    let tvf_result = RuntimeQueryResult {
4573                        query: query.to_string(),
4574                        mode,
4575                        statement,
4576                        engine: "runtime-graph-tvf",
4577                        result: self.execute_table_function(&name, &args, &named_args)?,
4578                        affected_rows: 0,
4579                        statement_type: "select",
4580                        bookmark: None,
4581                    };
4582                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
4583                    return Ok(tvf_result);
4584                }
                // Inline-graph TVF (issue #799): the graph is supplied by two
                // subqueries instead of a collection reference. Like the
                // graph-collection form above, the result is cacheable — its
                // cache key is the query string (the result-cache read at the
                // top of `execute_query_inner` keys on it) and
                // `result_cache_scopes` already carries the `nodes`/`edges`
                // source collections, so a write to any of them invalidates
                // the entry.
4592                if let Some(TableSource::InlineGraphFunction {
4593                    name,
4594                    nodes,
4595                    edges,
4596                    named_args,
4597                }) = table.source.clone()
4598                {
4599                    let inline_result = RuntimeQueryResult {
4600                        query: query.to_string(),
4601                        mode,
4602                        statement,
4603                        engine: "runtime-graph-tvf-inline",
4604                        result: self.execute_inline_graph_function(
4605                            &name,
4606                            &nodes,
4607                            &edges,
4608                            &named_args,
4609                        )?,
4610                        affected_rows: 0,
4611                        statement_type: "select",
4612                        bookmark: None,
4613                    };
4614                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
4615                    return Ok(inline_result);
4616                }
4617                if super::red_schema::is_virtual_table(&table.table) {
4618                    return Ok(RuntimeQueryResult {
4619                        query: query.to_string(),
4620                        mode,
4621                        statement,
4622                        engine: "runtime-red-schema",
4623                        result: super::red_schema::red_query(
4624                            self,
4625                            &table.table,
4626                            &table,
4627                            &frame as &dyn super::statement_frame::ReadFrame,
4628                        )?,
4629                        affected_rows: 0,
4630                        statement_type: "select",
4631                        bookmark: None,
4632                    });
4633                }
4634
4635                // `<graph>.<output>` analytics virtual view (issue #800).
4636                // Recomputed on demand — intentionally not result-cached, so it
4637                // always reflects the current graph data.
4638                if let Some(view_result) = self.try_resolve_analytics_view(
4639                    &table,
4640                    &frame as &dyn super::statement_frame::ReadFrame,
4641                )? {
4642                    return Ok(RuntimeQueryResult {
4643                        query: query.to_string(),
4644                        mode,
4645                        statement,
4646                        engine: "runtime-graph-analytics-view",
4647                        result: view_result,
4648                        affected_rows: 0,
4649                        statement_type: "select",
4650                        bookmark: None,
4651                    });
4652                }
4653
4654                if let Some(result) = self.execute_probabilistic_select(&table)? {
4655                    return Ok(RuntimeQueryResult {
4656                        query: query.to_string(),
4657                        mode,
4658                        statement,
4659                        engine: "runtime-probabilistic",
4660                        result,
4661                        affected_rows: 0,
4662                        statement_type: "select",
4663                        bookmark: None,
4664                    });
4665                }
4666
4667                // Foreign-table intercept (Phase 3.2.2 PG parity).
4668                //
4669                // When the referenced table matches a `CREATE FOREIGN TABLE`
4670                // registration, short-circuit into the FDW scan. Phase 3.2
4671                // wrappers don't yet support pushdown, so filters/projections
4672                // apply post-scan via `apply_foreign_table_filters` — good
4673                // enough for correctness; perf work lands in 3.2.3.
4674                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4675                    let records = self
4676                        .inner
4677                        .foreign_tables
4678                        .scan(&table.table)
4679                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4680                    let result = apply_foreign_table_filters(records, &table);
4681                    return Ok(RuntimeQueryResult {
4682                        query: query.to_string(),
4683                        mode,
4684                        statement,
4685                        engine: "runtime-fdw",
4686                        result,
4687                        affected_rows: 0,
4688                        statement_type: "select",
4689                        bookmark: None,
4690                    });
4691                }
4692
4693                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4694                //
4695                // When RLS is enabled on this table, fetch every policy
4696                // that applies to the current (role, SELECT) pair and
4697                // fold them into the query's WHERE clause: policies
4698                // OR-combine (any of them admitting the row is enough),
4699                // then AND into the caller's existing filter.
4700                //
4701                // Anonymous callers (no thread-local identity) pass
4702                // `role = None`; policies with a specific `TO role`
4703                // clause skip, but `TO PUBLIC` policies still apply.
4704                //
4705                // When `inject_rls_filters` returns `None` the table has
4706                // RLS enabled but no policy admits the caller's role —
4707                // short-circuit with an empty result set instead of
4708                // synthesising a contradiction filter.
4709                let Some(table_with_rls) = self.authorize_relational_table_select(
4710                    table,
4711                    &frame as &dyn super::statement_frame::ReadFrame,
4712                )?
4713                else {
4714                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4715                    return Ok(RuntimeQueryResult {
4716                        query: query.to_string(),
4717                        mode,
4718                        statement,
4719                        engine: "runtime-table-rls",
4720                        result: empty,
4721                        affected_rows: 0,
4722                        statement_type: "select",
4723                        bookmark: None,
4724                    });
4725                };
4726                Ok(RuntimeQueryResult {
4727                    query: query.to_string(),
4728                    mode,
4729                    statement,
4730                    engine: "runtime-table",
4731                    // #885: lend the frame-owned row-buffer arena to the
4732                    // streaming path so chunk buffers are reused across
4733                    // this statement's chunk-fetches instead of allocated
4734                    // fresh per chunk. This is the table-query dispatch
4735                    // that runs under a `StatementExecutionFrame`; the
4736                    // frameless prepared/subquery paths keep `None`.
4737                    result: execute_runtime_table_query_in(
4738                        &self.inner.db,
4739                        &table_with_rls,
4740                        Some(&self.inner.index_store),
4741                        Some(frame.row_arena()),
4742                    )?,
4743                    affected_rows: 0,
4744                    statement_type: "select",
4745                    bookmark: None,
4746                })
4747            }
4748            QueryExpr::Join(join) => {
4749                // Fold per-table RLS filters into each `QueryExpr::Table`
4750                // leaf of the join tree before executing. Without this
4751                // the join executor scans both tables raw and ignores
4752                // policies — a `WITHIN TENANT 'x'` against a join of
4753                // two tenant-scoped tables would leak cross-tenant rows.
4754                // When any leaf has RLS enabled and zero matching policy,
4755                // short-circuit to an empty join result instead of
4756                // emitting a contradiction filter.
4757                let join_with_rls = match self.authorize_relational_join_select(
4758                    join,
4759                    &frame as &dyn super::statement_frame::ReadFrame,
4760                )? {
4761                    Some(j) => j,
4762                    None => {
4763                        return Ok(RuntimeQueryResult {
4764                            query: query.to_string(),
4765                            mode,
4766                            statement,
4767                            engine: "runtime-join-rls",
4768                            result: crate::storage::query::unified::UnifiedResult::empty(),
4769                            affected_rows: 0,
4770                            statement_type: "select",
4771                            bookmark: None,
4772                        });
4773                    }
4774                };
4775                Ok(RuntimeQueryResult {
4776                    query: query.to_string(),
4777                    mode,
4778                    statement,
4779                    engine: "runtime-join",
4780                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4781                    affected_rows: 0,
4782                    statement_type: "select",
4783                    bookmark: None,
4784                })
4785            }
4786            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4787                query: query.to_string(),
4788                mode,
4789                statement,
4790                engine: "runtime-vector",
4791                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4792                affected_rows: 0,
4793                statement_type: "select",
4794                bookmark: None,
4795            }),
4796            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4797                query: query.to_string(),
4798                mode,
4799                statement,
4800                engine: "runtime-hybrid",
4801                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4802                affected_rows: 0,
4803                statement_type: "select",
4804                bookmark: None,
4805            }),
4806            QueryExpr::RankOf(ref rank) => self.execute_rank_of(query, rank),
4807            QueryExpr::ApproxRankOf(ref rank) => self.execute_approx_rank_of(query, rank),
4808            QueryExpr::RankRange(ref range) => self.execute_rank_range(query, range),
4809            // DML execution
4810            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4811                Err(RedDBError::Query(
4812                    super::red_schema::READ_ONLY_ERROR.to_string(),
4813                ))
4814            }
4815            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4816                Err(RedDBError::Query(
4817                    super::red_schema::READ_ONLY_ERROR.to_string(),
4818                ))
4819            }
4820            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4821                Err(RedDBError::Query(
4822                    super::red_schema::READ_ONLY_ERROR.to_string(),
4823                ))
4824            }
4825            QueryExpr::Insert(ref insert) => self
4826                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4827                    self.execute_insert(query, insert)
4828                }),
4829            QueryExpr::Update(ref update) => self
4830                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4831                    self.execute_update(query, update)
4832                }),
4833            QueryExpr::Delete(ref delete) => self
4834                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4835                    self.execute_delete(query, delete)
4836                }),
4837            // DDL execution
4838            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4839            QueryExpr::CreateCollection(ref create) => {
4840                self.execute_create_collection(query, create)
4841            }
4842            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4843            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4844            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4845            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4846            QueryExpr::DropDocument(ref drop_document) => {
4847                self.execute_drop_document(query, drop_document)
4848            }
4849            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4850            QueryExpr::DropCollection(ref drop_collection) => {
4851                self.execute_drop_collection(query, drop_collection)
4852            }
4853            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4854            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4855            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4856            // Graph analytics commands
4857            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4858            // Search commands
4859            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4860            // ASK: RAG query with LLM synthesis
4861            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4862            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4863            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4864            QueryExpr::ProbabilisticCommand(ref cmd) => {
4865                self.execute_probabilistic_command(query, cmd)
4866            }
4867            // Time-series DDL
4868            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4869            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
4870            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
4871            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
4872            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4873            // Queue DDL and commands
4874            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4875            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4876            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4877            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4878            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4879            QueryExpr::EventsBackfill(ref backfill) => {
4880                self.execute_events_backfill(query, backfill)
4881            }
4882            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4883                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4884            ))),
4885            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4886            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4887            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4888            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4889            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4890            // SET CONFIG key = value
4891            QueryExpr::SetConfig { ref key, ref value } => {
4892                if key.starts_with("red.secret.") {
4893                    return Err(RedDBError::Query(
4894                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4895                    ));
4896                }
4897                if key.starts_with("red.secrets.") {
4898                    return Err(RedDBError::Query(
4899                        "red.secrets.* is reserved for vault secrets; use SET SECRET".to_string(),
4900                    ));
4901                }
4902                match self.check_managed_config_write_for_set_config(key) {
4903                    Err(err) => Err(err),
4904                    Ok(()) => {
4905                        let store = self.inner.db.store();
4906                        let json_val = match value {
4907                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4908                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4909                            Value::Float(n) => crate::serde_json::Value::Number(*n),
4910                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4911                            _ => crate::serde_json::Value::String(value.to_string()),
4912                        };
4913                        store.set_config_tree(key, &json_val);
4914                        update_current_config_value(key, value.clone());
4915                        // Config changes can flip runtime behavior mid-session
4916                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4917                        // result cache so subsequent reads re-execute against
4918                        // the new config.
4919                        self.invalidate_result_cache();
4920                        Ok(RuntimeQueryResult::ok_message(
4921                            query.to_string(),
4922                            &format!("config set: {key}"),
4923                            "set",
4924                        ))
4925                    }
4926                }
4927            }
4928            // SET SECRET key = value
4929            QueryExpr::SetSecret { ref key, ref value } => {
4930                if key.starts_with("red.config.") {
4931                    return Err(RedDBError::Query(
4932                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
4933                    ));
4934                }
4935                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4936                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
4937                })?;
4938                if matches!(value, Value::Null) {
4939                    auth_store
4940                        .vault_kv_try_delete(key)
4941                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4942                    update_current_secret_value(key, None);
4943                    self.invalidate_result_cache();
4944                    return Ok(RuntimeQueryResult::ok_message(
4945                        query.to_string(),
4946                        &format!("secret deleted: {key}"),
4947                        "delete_secret",
4948                    ));
4949                }
4950                let value = secret_sql_value_to_string(value)?;
4951                auth_store
4952                    .vault_kv_try_set(key.clone(), value.clone())
4953                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4954                update_current_secret_value(key, Some(value));
4955                self.invalidate_result_cache();
4956                Ok(RuntimeQueryResult::ok_message(
4957                    query.to_string(),
4958                    &format!("secret set: {key}"),
4959                    "set_secret",
4960                ))
4961            }
4962            // DELETE SECRET key
4963            QueryExpr::DeleteSecret { ref key } => {
4964                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4965                    RedDBError::Query(
4966                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4967                    )
4968                })?;
4969                let deleted = auth_store
4970                    .vault_kv_try_delete(key)
4971                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4972                if deleted {
4973                    update_current_secret_value(key, None);
4974                }
4975                self.invalidate_result_cache();
4976                Ok(RuntimeQueryResult::ok_message(
4977                    query.to_string(),
4978                    &format!("secret deleted: {key}"),
4979                    if deleted {
4980                        "delete_secret"
4981                    } else {
4982                        "delete_secret_not_found"
4983                    },
4984                ))
4985            }
4986            // SHOW SECRET[S] [prefix]
4987            QueryExpr::ShowSecrets { ref prefix } => {
4988                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4989                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
4990                })?;
4991                if !auth_store.is_vault_backed() {
4992                    return Err(RedDBError::Query(
4993                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
4994                    ));
4995                }
4996                let mut keys = auth_store.vault_kv_keys();
4997                keys.sort();
4998                let mut result = UnifiedResult::with_columns(vec![
4999                    "key".into(),
5000                    "value".into(),
5001                    "status".into(),
5002                ]);
5003                for key in keys {
5004                    if let Some(ref pfx) = prefix {
5005                        if !key.starts_with(pfx) {
5006                            continue;
5007                        }
5008                    }
5009                    let mut record = UnifiedRecord::new();
5010                    record.set("key", Value::text(key));
5011                    record.set("value", Value::text("***"));
5012                    record.set("status", Value::text("active"));
5013                    result.push(record);
5014                }
5015                Ok(RuntimeQueryResult {
5016                    query: query.to_string(),
5017                    mode,
5018                    statement: "show_secrets",
5019                    engine: "runtime-secret",
5020                    result,
5021                    affected_rows: 0,
5022                    statement_type: "select",
5023                    bookmark: None,
5024                })
5025            }
5026            // SHOW CONFIG [prefix] [AS JSON|FORMAT JSON]
5027            QueryExpr::ShowConfig {
5028                ref prefix,
5029                as_json,
5030            } => {
5031                let store = self.inner.db.store();
5032                let all_collections = store.list_collections();
5033                if !all_collections.contains(&"red_config".to_string()) {
5034                    if as_json {
5035                        return Ok(show_config_json_result(
5036                            query,
5037                            mode,
5038                            prefix,
5039                            crate::serde_json::Value::Object(crate::serde_json::Map::new()),
5040                        ));
5041                    }
5042                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5043                    return Ok(RuntimeQueryResult {
5044                        query: query.to_string(),
5045                        mode,
5046                        statement: "show_config",
5047                        engine: "runtime-config",
5048                        result,
5049                        affected_rows: 0,
5050                        statement_type: "select",
5051                        bookmark: None,
5052                    });
5053                }
5054                let manager = store
5055                    .get_collection("red_config")
5056                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5057                let entities = manager.query_all(|_| true);
5058                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5059                for entity in entities {
5060                    if let EntityData::Row(ref row) = entity.data {
5061                        if let Some(ref named) = row.named {
5062                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5063                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5064                            let key_str = match &key_val {
5065                                Value::Text(s) => s.as_ref(),
5066                                _ => continue,
5067                            };
5068                            if let Some(ref pfx) = prefix {
5069                                if !key_str.starts_with(pfx.as_str()) {
5070                                    continue;
5071                                }
5072                            }
5073                            let entity_id = entity.id.raw();
5074                            match latest.get(key_str) {
5075                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5076                                _ => {
5077                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5078                                }
5079                            }
5080                        }
5081                    }
5082                }
5083                if as_json {
5084                    let mut tree = crate::serde_json::Value::Object(crate::serde_json::Map::new());
5085                    for (key, (_, _, val)) in latest {
5086                        let relative = match prefix {
5087                            Some(pfx) if key == *pfx => "",
5088                            Some(pfx) => key
5089                                .strip_prefix(pfx.as_str())
5090                                .and_then(|tail| tail.strip_prefix('.'))
5091                                .unwrap_or(key.as_str()),
5092                            None => key.as_str(),
5093                        };
5094                        insert_config_json_path(
5095                            &mut tree,
5096                            relative,
5097                            crate::presentation::entity_json::storage_value_to_json(&val),
5098                        );
5099                    }
5100                    return Ok(show_config_json_result(query, mode, prefix, tree));
5101                }
5102                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5103                for (_, key_val, val) in latest.into_values() {
5104                    let mut record = UnifiedRecord::new();
5105                    record.set("key", key_val);
5106                    record.set("value", val);
5107                    result.push(record);
5108                }
5109                Ok(RuntimeQueryResult {
5110                    query: query.to_string(),
5111                    mode,
5112                    statement: "show_config",
5113                    engine: "runtime-config",
5114                    result,
5115                    affected_rows: 0,
5116                    statement_type: "select",
5117                    bookmark: None,
5118                })
5119            }
5120            // Session-local multi-tenancy handle (Phase 2.5.3).
5121            //
5122            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5123            // the thread-local; SHOW TENANT returns it. Paired with the
5124            // CURRENT_TENANT() scalar for use in RLS policies.
5125            QueryExpr::SetTenant(ref value) => {
5126                match value {
5127                    Some(id) => set_current_tenant(id.clone()),
5128                    None => clear_current_tenant(),
5129                }
5130                Ok(RuntimeQueryResult::ok_message(
5131                    query.to_string(),
5132                    &match value {
5133                        Some(id) => format!("tenant set: {id}"),
5134                        None => "tenant cleared".to_string(),
5135                    },
5136                    "set_tenant",
5137                ))
5138            }
5139            QueryExpr::ShowTenant => {
5140                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5141                let mut record = UnifiedRecord::new();
5142                record.set(
5143                    "tenant",
5144                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5145                );
5146                result.push(record);
5147                Ok(RuntimeQueryResult {
5148                    query: query.to_string(),
5149                    mode,
5150                    statement: "show_tenant",
5151                    engine: "runtime-tenant",
5152                    result,
5153                    affected_rows: 0,
5154                    statement_type: "select",
5155                    bookmark: None,
5156                })
5157            }
5158            // Transaction control (Phase 2.3 PG parity).
5159            //
5160            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5161            // the current connection's id. COMMIT/ROLLBACK release it through
5162            // the `SnapshotManager` so future snapshots see the correct set of
5163            // active/aborted transactions.
5164            //
5165            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5166            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5167            // registry. Statements running outside a TxnContext still behave
5168            // as autocommit (xid=0 → visible to every snapshot).
5169            QueryExpr::TransactionControl(ref ctl) => {
5170                use crate::storage::query::ast::TxnControl;
5171                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5172                use crate::storage::transaction::IsolationLevel;
5173
5174                // Phase 2.3 keys transactions by a thread-local connection id.
5175                // The stdio/gRPC paths wire a real per-connection id later;
5176                // for embedded use (one RedDBRuntime per process-ish caller)
5177                // we fall back to a deterministic placeholder.
5178                let conn_id = current_connection_id();
5179
5180                let (kind, msg) = match ctl {
5181                    TxnControl::Begin => {
5182                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5183                        let xid = mgr.begin();
5184                        let snapshot = mgr.snapshot(xid);
5185                        let ctx = TxnContext {
5186                            xid,
5187                            isolation: IsolationLevel::SnapshotIsolation,
5188                            snapshot,
5189                            savepoints: Vec::new(),
5190                            released_sub_xids: Vec::new(),
5191                        };
5192                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5193                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5194                    }
5195                    TxnControl::Commit => {
5196                        // SET LOCAL TENANT ends with the transaction.
5197                        self.inner.tx_local_tenants.write().remove(&conn_id);
5198                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5199                        match ctx {
5200                            Some(ctx) => {
5201                                let mut own_xids = std::collections::HashSet::new();
5202                                own_xids.insert(ctx.xid);
5203                                for (_, sub) in &ctx.savepoints {
5204                                    own_xids.insert(*sub);
5205                                }
5206                                for sub in &ctx.released_sub_xids {
5207                                    own_xids.insert(*sub);
5208                                }
5209                                if let Err(err) = self.check_table_row_write_conflicts(
5210                                    conn_id,
5211                                    &ctx.snapshot,
5212                                    &own_xids,
5213                                ) {
5214                                    for (_, sub) in &ctx.savepoints {
5215                                        self.inner.snapshot_manager.rollback(*sub);
5216                                    }
5217                                    for sub in &ctx.released_sub_xids {
5218                                        self.inner.snapshot_manager.rollback(*sub);
5219                                    }
5220                                    self.inner.snapshot_manager.rollback(ctx.xid);
5221                                    self.revive_pending_versioned_updates(conn_id);
5222                                    self.revive_pending_tombstones(conn_id);
5223                                    self.discard_pending_kv_watch_events(conn_id);
5224                                    self.discard_pending_queue_wakes(conn_id);
5225                                    self.discard_pending_store_wal_actions(conn_id);
5226                                    return Err(err);
5227                                }
5228                                self.restore_pending_write_stamps(conn_id);
5229                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5230                                    for (_, sub) in &ctx.savepoints {
5231                                        self.inner.snapshot_manager.rollback(*sub);
5232                                    }
5233                                    for sub in &ctx.released_sub_xids {
5234                                        self.inner.snapshot_manager.rollback(*sub);
5235                                    }
5236                                    self.inner.snapshot_manager.rollback(ctx.xid);
5237                                    self.revive_pending_versioned_updates(conn_id);
5238                                    self.revive_pending_tombstones(conn_id);
5239                                    self.discard_pending_kv_watch_events(conn_id);
5240                                    return Err(err);
5241                                }
5242                                // Phase 2.3.2e: commit every open sub-xid
5243                                // so they also become visible. Their
5244                                // work is promoted to the parent txn's
5245                                // result exactly like a RELEASE would
5246                                // have done.
5247                                for (_, sub) in &ctx.savepoints {
5248                                    self.inner.snapshot_manager.commit(*sub);
5249                                }
5250                                for sub in &ctx.released_sub_xids {
5251                                    self.inner.snapshot_manager.commit(*sub);
5252                                }
5253                                self.inner.snapshot_manager.commit(ctx.xid);
5254                                self.finalize_pending_versioned_updates(conn_id);
5255                                self.finalize_pending_tombstones(conn_id);
5256                                self.finalize_pending_kv_watch_events(conn_id);
5257                                self.finalize_pending_queue_wakes(conn_id);
5258                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5259                            }
5260                            None => (
5261                                "commit",
5262                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5263                            ),
5264                        }
5265                    }
5266                    TxnControl::Rollback => {
5267                        self.inner.tx_local_tenants.write().remove(&conn_id);
5268                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5269                        match ctx {
5270                            Some(ctx) => {
5271                                // Phase 2.3.2e: abort every open sub-xid
5272                                // too so their writes stay hidden.
5273                                for (_, sub) in &ctx.savepoints {
5274                                    self.inner.snapshot_manager.rollback(*sub);
5275                                }
5276                                for sub in &ctx.released_sub_xids {
5277                                    self.inner.snapshot_manager.rollback(*sub);
5278                                }
5279                                self.inner.snapshot_manager.rollback(ctx.xid);
5280                                // Phase 2.3.2b: tuples that the txn had
5281                                // xmax-stamped become live again — wipe xmax
5282                                // back to 0 so later snapshots see them.
5283                                self.revive_pending_versioned_updates(conn_id);
5284                                self.revive_pending_tombstones(conn_id);
5285                                self.discard_pending_kv_watch_events(conn_id);
5286                                self.discard_pending_queue_wakes(conn_id);
5287                                self.discard_pending_store_wal_actions(conn_id);
5288                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5289                            }
5290                            None => (
5291                                "rollback",
5292                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5293                            ),
5294                        }
5295                    }
5296                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5297                    // SAVEPOINT allocates a fresh xid and pushes it
5298                    // onto the per-txn stack so subsequent writes can
5299                    // be selectively rolled back. RELEASE pops without
5300                    // aborting; ROLLBACK TO aborts the sub-xid (and
5301                    // any nested ones) + revives their tombstones.
5302                    TxnControl::Savepoint(name) => {
5303                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5304                        let mut guard = self.inner.tx_contexts.write();
5305                        match guard.get_mut(&conn_id) {
5306                            Some(ctx) => {
5307                                let sub = mgr.begin();
5308                                ctx.savepoints.push((name.clone(), sub));
5309                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5310                            }
5311                            None => (
5312                                "savepoint",
5313                                "SAVEPOINT outside transaction — no-op".to_string(),
5314                            ),
5315                        }
5316                    }
5317                    TxnControl::ReleaseSavepoint(name) => {
5318                        let mut guard = self.inner.tx_contexts.write();
5319                        match guard.get_mut(&conn_id) {
5320                            Some(ctx) => {
5321                                let pos = ctx
5322                                    .savepoints
5323                                    .iter()
5324                                    .position(|(n, _)| n == name)
5325                                    .ok_or_else(|| {
5326                                        RedDBError::Internal(format!(
5327                                            "savepoint {name} does not exist"
5328                                        ))
5329                                    })?;
5330                                // RELEASE pops the named savepoint and
5331                                // any nested ones. Their sub-xids move
5332                                // to `released_sub_xids` so they commit
5333                                // (or roll back) alongside the parent
5334                                // xid — PG semantics: released
5335                                // savepoints still contribute their
5336                                // work, but their names are gone.
5337                                let released = ctx.savepoints.len() - pos;
5338                                let popped: Vec<Xid> = ctx
5339                                    .savepoints
5340                                    .split_off(pos)
5341                                    .into_iter()
5342                                    .map(|(_, x)| x)
5343                                    .collect();
5344                                ctx.released_sub_xids.extend(popped);
5345                                (
5346                                    "release_savepoint",
5347                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5348                                )
5349                            }
5350                            None => (
5351                                "release_savepoint",
5352                                "RELEASE outside transaction — no-op".to_string(),
5353                            ),
5354                        }
5355                    }
5356                    TxnControl::RollbackToSavepoint(name) => {
5357                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5358                        // Splice out the savepoint + nested ones under
5359                        // a narrow lock, then run the snapshot-manager
5360                        // + tombstone side-effects without the tx map
5361                        // held so nothing re-enters.
5362                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5363                            let mut guard = self.inner.tx_contexts.write();
5364                            if let Some(ctx) = guard.get_mut(&conn_id) {
5365                                let pos = ctx
5366                                    .savepoints
5367                                    .iter()
5368                                    .position(|(n, _)| n == name)
5369                                    .ok_or_else(|| {
5370                                        RedDBError::Internal(format!(
5371                                            "savepoint {name} does not exist"
5372                                        ))
5373                                    })?;
5374                                let savepoint_xid = ctx.savepoints[pos].1;
5375                                let aborted: Vec<Xid> = ctx
5376                                    .savepoints
5377                                    .split_off(pos)
5378                                    .into_iter()
5379                                    .map(|(_, x)| x)
5380                                    .collect();
5381                                Some((savepoint_xid, aborted))
5382                            } else {
5383                                None
5384                            }
5385                        };
5386
5387                        match drop_result {
5388                            Some((savepoint_xid, aborted)) => {
5389                                for x in &aborted {
5390                                    mgr.rollback(*x);
5391                                }
5392                                let reverted_updates =
5393                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5394                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5395                                (
5396                                    "rollback_to_savepoint",
5397                                    format!(
5398                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5399                                        aborted.len(),
5400                                    ),
5401                                )
5402                            }
5403                            None => (
5404                                "rollback_to_savepoint",
5405                                "ROLLBACK TO outside transaction — no-op".to_string(),
5406                            ),
5407                        }
5408                    }
5409                };
5410                Ok(RuntimeQueryResult::ok_message(
5411                    query.to_string(),
5412                    &msg,
5413                    kind,
5414                ))
5415            }
5416            // Schema + Sequence DDL (Phase 1.3 PG parity).
5417            //
5418            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5419            // just registers the name in `red_config` under `schema.{name}`.
5420            // Table lookups still happen by collection name; clients using
5421            // `schema.table` qualified names collapse to collection `schema.table`.
5422            //
5423            // Sequences persist a 64-bit counter + metadata (start, increment)
5424            // in `red_config` under `sequence.{name}.*`. Scalar callers
5425            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5426            // once we have a proper mutating-function dispatch path; for now the
5427            // DDL just establishes the catalog entry so clients don't error.
5428            QueryExpr::CreateSchema(ref q) => {
5429                let store = self.inner.db.store();
5430                let key = format!("schema.{}", q.name);
5431                if store.get_config(&key).is_some() {
5432                    if q.if_not_exists {
5433                        return Ok(RuntimeQueryResult::ok_message(
5434                            query.to_string(),
5435                            &format!("schema {} already exists — skipped", q.name),
5436                            "create_schema",
5437                        ));
5438                    }
5439                    return Err(RedDBError::Internal(format!(
5440                        "schema {} already exists",
5441                        q.name
5442                    )));
5443                }
5444                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5445                Ok(RuntimeQueryResult::ok_message(
5446                    query.to_string(),
5447                    &format!("schema {} created", q.name),
5448                    "create_schema",
5449                ))
5450            }
5451            QueryExpr::DropSchema(ref q) => {
5452                let store = self.inner.db.store();
5453                let key = format!("schema.{}", q.name);
5454                let existed = store.get_config(&key).is_some();
5455                if !existed && !q.if_exists {
5456                    return Err(RedDBError::Internal(format!(
5457                        "schema {} does not exist",
5458                        q.name
5459                    )));
5460                }
5461                // Remove the marker from red_config by setting it to null.
5462                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5463                let suffix = if q.cascade {
5464                    " (CASCADE accepted — tables untouched)"
5465                } else {
5466                    ""
5467                };
5468                Ok(RuntimeQueryResult::ok_message(
5469                    query.to_string(),
5470                    &format!("schema {} dropped{}", q.name, suffix),
5471                    "drop_schema",
5472                ))
5473            }
5474            QueryExpr::CreateSequence(ref q) => {
5475                let store = self.inner.db.store();
5476                let base = format!("sequence.{}", q.name);
5477                let start_key = format!("{base}.start");
5478                let incr_key = format!("{base}.increment");
5479                let curr_key = format!("{base}.current");
5480                if store.get_config(&start_key).is_some() {
5481                    if q.if_not_exists {
5482                        return Ok(RuntimeQueryResult::ok_message(
5483                            query.to_string(),
5484                            &format!("sequence {} already exists — skipped", q.name),
5485                            "create_sequence",
5486                        ));
5487                    }
5488                    return Err(RedDBError::Internal(format!(
5489                        "sequence {} already exists",
5490                        q.name
5491                    )));
5492                }
5493                // Persist start + increment, and set current so the first
5494                // nextval returns `start`.
5495                let initial_current = q.start - q.increment;
5496                store.set_config_tree(
5497                    &start_key,
5498                    &crate::serde_json::Value::Number(q.start as f64),
5499                );
5500                store.set_config_tree(
5501                    &incr_key,
5502                    &crate::serde_json::Value::Number(q.increment as f64),
5503                );
5504                store.set_config_tree(
5505                    &curr_key,
5506                    &crate::serde_json::Value::Number(initial_current as f64),
5507                );
5508                Ok(RuntimeQueryResult::ok_message(
5509                    query.to_string(),
5510                    &format!(
5511                        "sequence {} created (start={}, increment={})",
5512                        q.name, q.start, q.increment
5513                    ),
5514                    "create_sequence",
5515                ))
5516            }
5517            QueryExpr::DropSequence(ref q) => {
5518                let store = self.inner.db.store();
5519                let base = format!("sequence.{}", q.name);
5520                let existed = store.get_config(&format!("{base}.start")).is_some();
5521                if !existed && !q.if_exists {
5522                    return Err(RedDBError::Internal(format!(
5523                        "sequence {} does not exist",
5524                        q.name
5525                    )));
5526                }
5527                for k in ["start", "increment", "current"] {
5528                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5529                }
5530                Ok(RuntimeQueryResult::ok_message(
5531                    query.to_string(),
5532                    &format!("sequence {} dropped", q.name),
5533                    "drop_sequence",
5534                ))
5535            }
5536            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5537            //
5538            // The view definition is stored in-memory on RuntimeInner (not
5539            // persisted). SELECTs that reference the view name will substitute
5540            // the stored `QueryExpr` via `resolve_view_reference` during
5541            // planning (same entry point used by table-name resolution).
5542            //
5543            // Materialized views additionally allocate a slot in
5544            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5545            QueryExpr::CreateView(ref q) => {
5546                let mut views = self.inner.views.write();
5547                if views.contains_key(&q.name) && !q.or_replace {
5548                    if q.if_not_exists {
5549                        return Ok(RuntimeQueryResult::ok_message(
5550                            query.to_string(),
5551                            &format!("view {} already exists — skipped", q.name),
5552                            "create_view",
5553                        ));
5554                    }
5555                    return Err(RedDBError::Internal(format!(
5556                        "view {} already exists",
5557                        q.name
5558                    )));
5559                }
5560                views.insert(q.name.clone(), Arc::new(q.clone()));
5561                drop(views);
5562
5563                // Materialized view: register cache slot (data is empty until REFRESH).
5564                if q.materialized {
5565                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5566                    let refresh = match q.refresh_every_ms {
5567                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5568                        None => RefreshPolicy::Manual,
5569                    };
5570                    let dependencies = collect_table_refs(&q.query);
5571                    let def = MaterializedViewDef {
5572                        name: q.name.clone(),
5573                        query: format!("<parsed view {}>", q.name),
5574                        dependencies: dependencies.clone(),
5575                        refresh,
5576                        retention_duration_ms: q.retention_duration_ms,
5577                    };
5578                    self.inner.materialized_views.write().register(def);
5579
5580                    // Issue #593 slice 9a — persist the descriptor to
5581                    // the system catalog so the definition survives a
5582                    // restart. Upsert semantics (delete-then-insert by
5583                    // name) keep the catalog free of duplicate rows
5584                    // across `CREATE OR REPLACE` churn.
5585                    let descriptor =
5586                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5587                            name: q.name.clone(),
5588                            source_sql: query.to_string(),
5589                            source_collections: dependencies,
5590                            refresh_every_ms: q.refresh_every_ms,
5591                            retention_duration_ms: q.retention_duration_ms,
5592                        };
5593                    let store = self.inner.db.store();
5594                    crate::runtime::continuous_materialized_view::persist_descriptor(
5595                        store.as_ref(),
5596                        &descriptor,
5597                    )?;
5598
5599                    // Issue #594 slice 9b — provision a Table-shaped
5600                    // backing collection named after the view. The
5601                    // rewriter skips materialized views (see
5602                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5603                    // resolves to this collection directly. Empty
5604                    // until REFRESH wires through it in 9c.
5605                    self.ensure_materialized_view_backing(&q.name)?;
5606                }
5607                // Plan cache may have cached a plan that didn't know about this
5608                // view — invalidate so future references pick up the new binding.
5609                // Result cache gets flushed too: OR REPLACE must not serve a
5610                // prior execution of the obsolete body.
5611                self.invalidate_plan_cache();
5612                self.invalidate_result_cache();
5613
5614                Ok(RuntimeQueryResult::ok_message(
5615                    query.to_string(),
5616                    &format!(
5617                        "{}view {} created",
5618                        if q.materialized { "materialized " } else { "" },
5619                        q.name
5620                    ),
5621                    "create_view",
5622                ))
5623            }
5624            QueryExpr::DropView(ref q) => {
5625                let mut views = self.inner.views.write();
5626                let removed = views.remove(&q.name);
5627                let existed = removed.is_some();
5628                let removed_materialized =
5629                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5630                drop(views);
5631                if q.materialized || existed {
5632                    // Try the materialised cache too — silent if absent.
5633                    self.inner.materialized_views.write().remove(&q.name);
5634                    // Issue #593 slice 9a — remove any persisted
5635                    // catalog row. Idempotent: a no-op when the view
5636                    // was never materialized (no row was ever written).
5637                    let store = self.inner.db.store();
5638                    crate::runtime::continuous_materialized_view::remove_by_name(
5639                        store.as_ref(),
5640                        &q.name,
5641                    )?;
5642                }
5643                // Issue #594 slice 9b — drop the backing collection
5644                // that was provisioned at CREATE time. Only mat views
5645                // ever had one; regular views never did.
5646                if removed_materialized || q.materialized {
5647                    self.drop_materialized_view_backing(&q.name)?;
5648                }
5649                // Drop any plan / result cache entries that baked the
5650                // view body into their QueryExpr.
5651                self.invalidate_plan_cache();
5652                self.invalidate_result_cache();
5653                if !existed && !q.if_exists {
5654                    return Err(RedDBError::Internal(format!(
5655                        "view {} does not exist",
5656                        q.name
5657                    )));
5658                }
5659                self.invalidate_plan_cache();
5660                Ok(RuntimeQueryResult::ok_message(
5661                    query.to_string(),
5662                    &format!("view {} dropped", q.name),
5663                    "drop_view",
5664                ))
5665            }
5666            QueryExpr::RefreshMaterializedView(ref q) => {
5667                // Look up the view definition, execute its underlying query,
5668                // and stash the serialized result in the materialised cache.
5669                let view = {
5670                    let views = self.inner.views.read();
5671                    views.get(&q.name).cloned()
5672                };
5673                let view = match view {
5674                    Some(v) => v,
5675                    None => {
5676                        return Err(RedDBError::Internal(format!(
5677                            "view {} does not exist",
5678                            q.name
5679                        )))
5680                    }
5681                };
5682                if !view.materialized {
5683                    return Err(RedDBError::Internal(format!(
5684                        "view {} is not materialized — REFRESH requires \
5685                         CREATE MATERIALIZED VIEW",
5686                        q.name
5687                    )));
5688                }
5689                // Execute the underlying query fresh.
5690                let started = std::time::Instant::now();
5691                let now_ms = std::time::SystemTime::now()
5692                    .duration_since(std::time::UNIX_EPOCH)
5693                    .map(|d| d.as_millis() as u64)
5694                    .unwrap_or(0);
5695                match self.execute_query_expr((*view.query).clone()) {
5696                    Ok(inner_result) => {
5697                        // Issue #595 slice 9c — atomically replace the
5698                        // backing collection's contents under a single
5699                        // WAL group. Concurrent SELECT from the view
5700                        // sees either the prior or new contents, never
5701                        // partial. A crash before the WAL commit lands
5702                        // leaves the prior contents intact on recovery.
5703                        let entities =
5704                            view_records_to_entities(&q.name, &inner_result.result.records);
5705                        let row_count = entities.len() as u64;
5706                        let store = self.inner.db.store();
5707                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5708                            Ok(records) => records,
5709                            Err(err) => {
5710                                let duration_ms = started.elapsed().as_millis() as u64;
5711                                let msg = err.to_string();
5712                                self.inner
5713                                    .materialized_views
5714                                    .write()
5715                                    .record_refresh_failure(
5716                                        &q.name,
5717                                        msg.clone(),
5718                                        duration_ms,
5719                                        now_ms,
5720                                    );
5721                                return Err(RedDBError::Internal(format!(
5722                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5723                                    q.name
5724                                )));
5725                            }
5726                        };
5727
5728                        // Issue #596 slice 9d — emit a Refresh
5729                        // ChangeRecord into the logical-WAL spool so
5730                        // replicas deterministically replay the same
5731                        // backing-collection contents via
5732                        // `LogicalChangeApplier::apply_record`.
5733                        if let Some(ref primary) = self.inner.db.replication {
5734                            let lsn = self.inner.cdc.emit(
5735                                crate::replication::cdc::ChangeOperation::Refresh,
5736                                &q.name,
5737                                0,
5738                                "refresh",
5739                            );
5740                            self.invalidate_result_cache_for_table(&q.name);
5741                            let timestamp = std::time::SystemTime::now()
5742                                .duration_since(std::time::UNIX_EPOCH)
5743                                .unwrap_or_default()
5744                                .as_millis() as u64;
5745                            let record = ChangeRecord::for_refresh(
5746                                lsn,
5747                                timestamp,
5748                                q.name.clone(),
5749                                serialized_records,
5750                            )
5751                            .with_term(self.current_replication_term());
5752                            let encoded = record.encode();
5753                            primary.append_logical_record(record.lsn, encoded);
5754                        }
5755
5756                        let duration_ms = started.elapsed().as_millis() as u64;
5757                        let serialized = format!("{:?}", inner_result.result);
5758                        self.inner
5759                            .materialized_views
5760                            .write()
5761                            .record_refresh_success(
5762                                &q.name,
5763                                serialized.into_bytes(),
5764                                row_count,
5765                                duration_ms,
5766                                now_ms,
5767                            );
5768                        // SELECT FROM v now reads through the rewriter
5769                        // skip into the backing collection — drop the
5770                        // result cache so prior empty-backing reads
5771                        // don't shadow the new contents.
5772                        self.invalidate_result_cache();
5773                        Ok(RuntimeQueryResult::ok_message(
5774                            query.to_string(),
5775                            &format!("materialized view {} refreshed", q.name),
5776                            "refresh_materialized_view",
5777                        ))
5778                    }
5779                    Err(err) => {
5780                        let duration_ms = started.elapsed().as_millis() as u64;
5781                        let msg = err.to_string();
5782                        self.inner
5783                            .materialized_views
5784                            .write()
5785                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
5786                        Err(err)
5787                    }
5788                }
5789            }
5790            // Row Level Security (Phase 2.5 PG parity).
5791            //
5792            // Policies live in an in-memory registry keyed by (table, name).
5793            // Enforcement (AND-ing the policy's USING clause into every
5794            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5795            // filter compiler; this dispatch only manages the catalog.
5796            QueryExpr::CreatePolicy(ref q) => {
5797                let key = (q.table.clone(), q.name.clone());
5798                self.inner
5799                    .rls_policies
5800                    .write()
5801                    .insert(key, Arc::new(q.clone()));
5802                self.invalidate_plan_cache();
5803                // Issue #120 — surface policy names in the
5804                // schema-vocabulary so AskPipeline (#121) can resolve
5805                // a policy reference back to its table.
5806                self.schema_vocabulary_apply(
5807                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5808                        collection: q.table.clone(),
5809                        policy: q.name.clone(),
5810                    },
5811                );
5812                Ok(RuntimeQueryResult::ok_message(
5813                    query.to_string(),
5814                    &format!("policy {} on {} created", q.name, q.table),
5815                    "create_policy",
5816                ))
5817            }
5818            QueryExpr::DropPolicy(ref q) => {
5819                let removed = self
5820                    .inner
5821                    .rls_policies
5822                    .write()
5823                    .remove(&(q.table.clone(), q.name.clone()))
5824                    .is_some();
5825                if !removed && !q.if_exists {
5826                    return Err(RedDBError::Internal(format!(
5827                        "policy {} on {} does not exist",
5828                        q.name, q.table
5829                    )));
5830                }
5831                self.invalidate_plan_cache();
5832                // Issue #120 — keep the schema-vocabulary policy
5833                // entry in sync.
5834                self.schema_vocabulary_apply(
5835                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5836                        collection: q.table.clone(),
5837                        policy: q.name.clone(),
5838                    },
5839                );
5840                Ok(RuntimeQueryResult::ok_message(
5841                    query.to_string(),
5842                    &format!("policy {} on {} dropped", q.name, q.table),
5843                    "drop_policy",
5844                ))
5845            }
5846            // Foreign Data Wrappers (Phase 3.2 PG parity).
5847            //
5848            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5849            // `ForeignTableRegistry`. The read path consults that registry
5850            // before dispatching a SELECT — when the table name matches a
5851            // registered foreign table, we forward the scan to the wrapper
5852            // and skip the normal collection lookup.
5853            //
5854            // Phase 3.2 is in-memory only; persistence across restarts is a
5855            // 3.2.2 follow-up that mirrors the view registry pattern.
5856            QueryExpr::CreateServer(ref q) => {
5857                use crate::storage::fdw::FdwOptions;
5858                let registry = Arc::clone(&self.inner.foreign_tables);
5859                if registry.server(&q.name).is_some() {
5860                    if q.if_not_exists {
5861                        return Ok(RuntimeQueryResult::ok_message(
5862                            query.to_string(),
5863                            &format!("server {} already exists — skipped", q.name),
5864                            "create_server",
5865                        ));
5866                    }
5867                    return Err(RedDBError::Internal(format!(
5868                        "server {} already exists",
5869                        q.name
5870                    )));
5871                }
5872                let mut opts = FdwOptions::new();
5873                for (k, v) in &q.options {
5874                    opts.values.insert(k.clone(), v.clone());
5875                }
5876                registry
5877                    .create_server(&q.name, &q.wrapper, opts)
5878                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5879                Ok(RuntimeQueryResult::ok_message(
5880                    query.to_string(),
5881                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5882                    "create_server",
5883                ))
5884            }
5885            QueryExpr::DropServer(ref q) => {
5886                let existed = self.inner.foreign_tables.drop_server(&q.name);
5887                if !existed && !q.if_exists {
5888                    return Err(RedDBError::Internal(format!(
5889                        "server {} does not exist",
5890                        q.name
5891                    )));
5892                }
5893                Ok(RuntimeQueryResult::ok_message(
5894                    query.to_string(),
5895                    &format!(
5896                        "server {} dropped{}",
5897                        q.name,
5898                        if q.cascade { " (cascade)" } else { "" }
5899                    ),
5900                    "drop_server",
5901                ))
5902            }
5903            QueryExpr::CreateForeignTable(ref q) => {
5904                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5905                let registry = Arc::clone(&self.inner.foreign_tables);
5906                if registry.foreign_table(&q.name).is_some() {
5907                    if q.if_not_exists {
5908                        return Ok(RuntimeQueryResult::ok_message(
5909                            query.to_string(),
5910                            &format!("foreign table {} already exists — skipped", q.name),
5911                            "create_foreign_table",
5912                        ));
5913                    }
5914                    return Err(RedDBError::Internal(format!(
5915                        "foreign table {} already exists",
5916                        q.name
5917                    )));
5918                }
5919                let mut opts = FdwOptions::new();
5920                for (k, v) in &q.options {
5921                    opts.values.insert(k.clone(), v.clone());
5922                }
5923                let columns: Vec<ForeignColumn> = q
5924                    .columns
5925                    .iter()
5926                    .map(|c| ForeignColumn {
5927                        name: c.name.clone(),
5928                        data_type: c.data_type.clone(),
5929                        not_null: c.not_null,
5930                    })
5931                    .collect();
5932                registry
5933                    .create_foreign_table(ForeignTable {
5934                        name: q.name.clone(),
5935                        server_name: q.server.clone(),
5936                        columns,
5937                        options: opts,
5938                    })
5939                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5940                self.invalidate_plan_cache();
5941                Ok(RuntimeQueryResult::ok_message(
5942                    query.to_string(),
5943                    &format!("foreign table {} created (server {})", q.name, q.server),
5944                    "create_foreign_table",
5945                ))
5946            }
5947            QueryExpr::DropForeignTable(ref q) => {
5948                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5949                if !existed && !q.if_exists {
5950                    return Err(RedDBError::Internal(format!(
5951                        "foreign table {} does not exist",
5952                        q.name
5953                    )));
5954                }
5955                self.invalidate_plan_cache();
5956                Ok(RuntimeQueryResult::ok_message(
5957                    query.to_string(),
5958                    &format!("foreign table {} dropped", q.name),
5959                    "drop_foreign_table",
5960                ))
5961            }
5962            // COPY table FROM 'path' (Phase 1.5 PG parity).
5963            //
5964            // Stream CSV rows through the shared `CsvImporter`. The collection
5965            // is auto-created on first insert (via `insert_auto`-style path);
5966            // VACUUM/ANALYZE afterwards is up to the caller.
5967            QueryExpr::CopyFrom(ref q) => {
5968                use crate::storage::import::{CsvConfig, CsvImporter};
5969                let store = self.inner.db.store();
5970                let cfg = CsvConfig {
5971                    collection: q.table.clone(),
5972                    has_header: q.has_header,
5973                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5974                    ..CsvConfig::default()
5975                };
5976                let importer = CsvImporter::new(cfg);
5977                let stats = importer
5978                    .import_file(&q.path, store.as_ref())
5979                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5980                // Tables are written → invalidate cached plans / result cache.
5981                self.note_table_write(&q.table);
5982                Ok(RuntimeQueryResult::ok_message(
5983                    query.to_string(),
5984                    &format!(
5985                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5986                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5987                    ),
5988                    "copy_from",
5989                ))
5990            }
5991            // Maintenance commands (Phase 1.2 PG parity).
5992            //
5993            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5994            //   collection(s) and — when FULL — triggers a full pager persist
5995            //   (flushes dirty pages + fsync). Also invalidates the result cache
5996            //   so subsequent reads re-execute against the freshly compacted
5997            //   storage. RedDB's segment/btree GC runs continuously via the
5998            //   background lifecycle; explicit space reclamation for sealed
5999            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6000            // - ANALYZE [table]: reruns `analyze_collection` +
6001            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6002            //   planner has fresh histograms, distinct estimates, null counts.
6003            //
6004            // Both commands accept an optional target; omitting the target
6005            // iterates every collection in the store.
6006            QueryExpr::MaintenanceCommand(ref cmd) => {
6007                use crate::storage::query::ast::MaintenanceCommand as Mc;
6008                let store = self.inner.db.store();
6009                let (kind, msg) = match cmd {
6010                    Mc::Analyze { target } => {
6011                        let targets: Vec<String> = match target {
6012                            Some(t) => vec![t.clone()],
6013                            None => store.list_collections(),
6014                        };
6015                        for t in &targets {
6016                            self.refresh_table_planner_stats(t);
6017                        }
6018                        (
6019                            "analyze",
6020                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6021                        )
6022                    }
6023                    Mc::Vacuum { target, full } => {
6024                        let targets: Vec<String> = match target {
6025                            Some(t) => vec![t.clone()],
6026                            None => store.list_collections(),
6027                        };
6028                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6029                        let mut vacuum_stats =
6030                            crate::storage::unified::store::MvccVacuumStats::default();
6031                        for t in &targets {
6032                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6033                                RedDBError::Internal(format!(
6034                                    "VACUUM MVCC history failed for {t}: {e}"
6035                                ))
6036                            })?;
6037                            if stats.reclaimed_versions > 0 {
6038                                self.rebuild_runtime_indexes_for_table(t)?;
6039                            }
6040                            vacuum_stats.add(&stats);
6041                        }
6042                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6043                        // Stats refresh covers every target (same as ANALYZE).
6044                        for t in &targets {
6045                            self.refresh_table_planner_stats(t);
6046                        }
6047                        // FULL forces a pager persist (dirty-page flush + fsync).
6048                        // Regular VACUUM relies on the background writer / segment
6049                        // lifecycle so the command is non-blocking.
6050                        let persisted = if *full {
6051                            match store.persist() {
6052                                Ok(()) => true,
6053                                Err(e) => {
6054                                    return Err(RedDBError::Internal(format!(
6055                                        "VACUUM FULL persist failed: {e:?}"
6056                                    )));
6057                                }
6058                            }
6059                        } else {
6060                            false
6061                        };
6062                        // Result cache depended on pre-vacuum state.
6063                        self.invalidate_result_cache();
6064                        (
6065                            "vacuum",
6066                            format!(
6067                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6068                                if *full { " FULL" } else { "" },
6069                                targets.len(),
6070                                vacuum_stats.scanned_versions,
6071                                vacuum_stats.retained_versions,
6072                                vacuum_stats.reclaimed_versions,
6073                                vacuum_stats.retained_history_versions,
6074                                vacuum_stats.reclaimed_history_versions,
6075                                vacuum_stats.retained_tombstones,
6076                                vacuum_stats.reclaimed_tombstones,
6077                                if persisted {
6078                                    " (pages flushed to disk)"
6079                                } else {
6080                                    ""
6081                                }
6082                            ),
6083                        )
6084                    }
6085                };
6086                Ok(RuntimeQueryResult::ok_message(
6087                    query.to_string(),
6088                    &msg,
6089                    kind,
6090                ))
6091            }
6092            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6093            //
6094            // These hit the AuthStore directly. The statement frame /
6095            // privilege gate has already decided whether the caller may
6096            // even run the statement; here we just translate the AST into
6097            // AuthStore calls.
6098            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6099            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6100            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6101            QueryExpr::CreateUser(ref u) => self.execute_create_user_statement(query, u),
6102            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6103                self.execute_create_iam_policy(query, id, json)
6104            }
6105            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6106            QueryExpr::AttachPolicy {
6107                ref policy_id,
6108                ref principal,
6109            } => self.execute_attach_policy(query, policy_id, principal),
6110            QueryExpr::DetachPolicy {
6111                ref policy_id,
6112                ref principal,
6113            } => self.execute_detach_policy(query, policy_id, principal),
6114            QueryExpr::ShowPolicies { ref filter } => {
6115                self.execute_show_policies(query, filter.as_ref())
6116            }
6117            QueryExpr::ShowEffectivePermissions {
6118                ref user,
6119                ref resource,
6120            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6121            QueryExpr::SimulatePolicy {
6122                ref user,
6123                ref action,
6124                ref resource,
6125            } => self.execute_simulate_policy(query, user, action, resource),
6126            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
6127            QueryExpr::MigratePolicyMode {
6128                ref target,
6129                dry_run,
6130            } => self.execute_migrate_policy_mode(query, target, dry_run),
6131            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6132            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6133            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6134            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6135        };
6136
6137        if !control_event_specs.is_empty() {
6138            let (outcome, reason) = match &query_result {
6139                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
6140                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
6141            };
6142            for spec in &control_event_specs {
6143                self.emit_control_event(
6144                    spec.kind,
6145                    outcome,
6146                    spec.action,
6147                    spec.resource.clone(),
6148                    reason.clone(),
6149                    spec.fields.clone(),
6150                )?;
6151            }
6152        }
6153
6154        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
6155            self.emit_query_audit(
6156                query,
6157                plan,
6158                query_audit_started.elapsed().as_millis() as u64,
6159                result,
6160            );
6161        }
6162
6163        // Decrypt Value::Secret columns in-place before caching, so
6164        // cached results match the post-decrypt shape and repeat
6165        // queries skip the per-row AES-GCM pass.
6166        let mut query_result = query_result;
6167        if let Ok(ref mut result) = query_result {
6168            if result.statement_type == "select" {
6169                self.apply_secret_decryption(result);
6170            }
6171        }
6172
6173        // Cache SELECT results for 30s.
6174        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6175        // Large multi-row results (range scans, filtered scans) are rarely
6176        // repeated with the same literal values so the cache hit rate is near
6177        // zero while the clone cost (100 records × ~16 fields each) is high.
6178        // Aggregations (1 row) and point lookups (1 row) still benefit.
6179        if let Ok(ref result) = query_result {
6180            frame.write_result_cache(self, result, result_cache_scopes);
6181        }
6182
6183        query_result
6184    }
6185
6186    /// Snapshot of every registered materialized view's runtime
6187    /// state — feeds the `red.materialized_views` virtual table.
6188    /// Issue #583 slice 10.
6189    pub fn materialized_view_metadata(
6190        &self,
6191    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6192        // Issue #595 slice 9c — `current_row_count` is now scraped
6193        // live from the backing collection rather than read from the
6194        // cache slot. Mirrors the slice-10 invariant on
6195        // `queue_pending_gauge` in #527: the live store is the source
6196        // of truth, the cache slot only carries last-refresh telemetry
6197        // (timing, error, refresh cadence).
6198        let store = self.inner.db.store();
6199        let mut entries = self.inner.materialized_views.read().metadata();
6200        for entry in &mut entries {
6201            if let Some(manager) = store.get_collection(&entry.name) {
6202                entry.current_row_count = manager.count() as u64;
6203            }
6204        }
6205        entries
6206    }
6207
6208    /// Drive scheduled refreshes for materialized views with a
6209    /// `REFRESH EVERY <duration>` clause. Called from the background
6210    /// scheduler thread (and from unit tests with a fake clock via
6211    /// `claim_due_at`). Each invocation atomically claims the set of
6212    /// due views (so two concurrent ticks never double-fire the same
6213    /// view) and runs each refresh through the standard execution
6214    /// path — failures are captured in `last_error` and the prior
6215    /// content stays intact. Issue #583 slice 10.
6216    /// Snapshot of every tracked retention sweeper state — feeds the
6217    /// three extra columns on `red.retention`. Issue #584 slice 12.
6218    pub(crate) fn retention_sweeper_snapshot(
6219        &self,
6220    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6221        self.inner.retention_sweeper.read().snapshot()
6222    }
6223
6224    /// Drive one tick of the retention sweeper. Iterates collections
6225    /// with a retention policy set, physically deletes at most
6226    /// `batch_size` expired rows per collection, and records the
6227    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6228    /// `red.retention` exposes. Called from the background sweeper
6229    /// thread; safe to invoke directly from tests with a small batch
6230    /// size to drain rows deterministically. Issue #584 slice 12.
6231    ///
6232    /// Deletes are issued as `DELETE FROM <collection> WHERE
6233    /// <ts_column> < <cutoff>` through the standard `execute_query`
6234    /// chokepoint so WAL participation and snapshot guards apply
6235    /// exactly as for a user-issued DELETE — replicas replay the
6236    /// sweeper's deletes via the same WAL stream with no special
6237    /// handling on the replication side.
6238    ///
6239    /// Batching is enforced by tightening the cutoff: if more than
6240    /// `batch_size` rows are expired, the cutoff is dropped to the
6241    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6242    /// matches roughly `batch_size` rows; the remainder is reported
6243    /// as `current_rows_pending_sweep_estimate` and drained on the
6244    /// next tick.
6245    pub fn sweep_retention_tick(&self, batch_size: usize) {
6246        if batch_size == 0 {
6247            return;
6248        }
6249        let now_ms = std::time::SystemTime::now()
6250            .duration_since(std::time::UNIX_EPOCH)
6251            .map(|d| d.as_millis() as u64)
6252            .unwrap_or(0);
6253
6254        let store = self.inner.db.store();
6255        let collections = store.list_collections();
6256        for name in collections {
6257            let Some(contract) = self.inner.db.collection_contract(&name) else {
6258                continue;
6259            };
6260            let Some(retention_ms) = contract.retention_duration_ms else {
6261                continue;
6262            };
6263            let Some(ts_column) =
6264                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6265            else {
6266                continue;
6267            };
6268            let Some(manager) = store.get_collection(&name) else {
6269                continue;
6270            };
6271            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6272
6273            // Single pass: collect expired timestamps. We keep the
6274            // full Vec rather than a bounded heap because the partial
6275            // sort below is the simplest correct way to find the
6276            // batch-th oldest; for the slice's "1000-row default
6277            // batch" target this is bounded enough for production
6278            // operation, and the alternative (in-place heap of size
6279            // batch+1) is a follow-up optimisation.
6280            let mut expired_ts: Vec<i64> = Vec::new();
6281            manager.for_each_entity(|entity| {
6282                let ts = match ts_column.as_str() {
6283                    "created_at" => Some(entity.created_at as i64),
6284                    "updated_at" => Some(entity.updated_at as i64),
6285                    other => entity
6286                        .data
6287                        .as_row()
6288                        .and_then(|row| row.get_field(other))
6289                        .and_then(|v| match v {
6290                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6291                            crate::storage::schema::Value::Timestamp(t) => {
6292                                Some(t.saturating_mul(1_000))
6293                            }
6294                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6295                            crate::storage::schema::Value::UnsignedInteger(t) => {
6296                                i64::try_from(*t).ok()
6297                            }
6298                            crate::storage::schema::Value::Integer(t) => Some(*t),
6299                            _ => None,
6300                        }),
6301                };
6302                if let Some(t) = ts {
6303                    if t < cutoff {
6304                        expired_ts.push(t);
6305                    }
6306                }
6307                true
6308            });
6309
6310            let total_expired = expired_ts.len() as u64;
6311            if total_expired == 0 {
6312                self.inner
6313                    .retention_sweeper
6314                    .write()
6315                    .record_tick(&name, 0, 0, now_ms);
6316                continue;
6317            }
6318
6319            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6320                (cutoff, 0u64)
6321            } else {
6322                // Tighten the cutoff to the (batch_size)-th oldest
6323                // expired timestamp + 1 so DELETE matches roughly
6324                // `batch_size` rows.
6325                expired_ts.sort_unstable();
6326                let nth = expired_ts[batch_size - 1];
6327                (
6328                    nth.saturating_add(1),
6329                    total_expired.saturating_sub(batch_size as u64),
6330                )
6331            };
6332
6333            let stmt = format!(
6334                "DELETE FROM {} WHERE {} < {}",
6335                name, ts_column, effective_cutoff
6336            );
6337            let deleted = match self.execute_query(&stmt) {
6338                Ok(r) => r.affected_rows,
6339                Err(_) => 0,
6340            };
6341
6342            self.inner
6343                .retention_sweeper
6344                .write()
6345                .record_tick(&name, deleted, pending, now_ms);
6346        }
6347    }
6348
6349    pub fn refresh_due_materialized_views(&self) {
6350        let due = {
6351            let mut cache = self.inner.materialized_views.write();
6352            cache.claim_due_at(std::time::Instant::now())
6353        };
6354        for name in due {
6355            // Round-trip through `execute_query` (rather than the
6356            // prepared-statement `execute_query_expr` fast path, which
6357            // explicitly rejects DDL/maintenance statements). Failures
6358            // are captured inside the RefreshMaterializedView handler
6359            // via `record_refresh_failure`; the scheduler ignores the
6360            // Result so one bad view doesn't halt the loop.
6361            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6362            let _ = self.execute_query(&stmt);
6363        }
6364    }
6365
6366    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6367    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6368    /// calls pay zero parse + cache overhead.
6369    ///
6370    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6371    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6372        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6373        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6374        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6375        // whose `tq.table` matches a registered view with the view's
6376        // underlying query. Safe to call even when no views are registered.
6377        let expr = self.rewrite_view_refs(expr);
6378
6379        self.validate_model_operations_before_auth(&expr)?;
6380        // Granular RBAC privilege check. Runs before dispatch so a
6381        // denied caller never reaches storage. Fail-closed: any error
6382        // resolving the action / resource produces PermissionDenied.
6383        if let Err(err) = self.check_query_privilege(&expr) {
6384            return Err(RedDBError::Query(format!("permission denied: {err}")));
6385        }
6386
6387        let statement = query_expr_name(&expr);
6388        let mode = detect_mode(statement);
6389        let query_str = statement;
6390
6391        let result = self.dispatch_expr(expr, query_str, mode)?;
6392        let mut r = result;
6393        if r.statement_type == "select" {
6394            self.apply_secret_decryption(&mut r);
6395        }
6396        Ok(r)
6397    }
6398
6399    pub(super) fn validate_model_operations_before_auth(
6400        &self,
6401        expr: &QueryExpr,
6402    ) -> RedDBResult<()> {
6403        use crate::catalog::CollectionModel;
6404        use crate::runtime::ddl::polymorphic_resolver;
6405        use crate::storage::query::ast::KvCommand;
6406
6407        let system_schema_target = match expr {
6408            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6409            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6410            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6411            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6412            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6413            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6414            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6415            _ => None,
6416        };
6417        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6418            return Err(RedDBError::Query("system schema is read-only".to_string()));
6419        }
6420
6421        let expected = match expr {
6422            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6423            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6424            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6425            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6426            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6427            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6428            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6429            QueryExpr::KvCommand(cmd) => {
6430                let (collection, model) = match cmd {
6431                    KvCommand::Put {
6432                        collection, model, ..
6433                    }
6434                    | KvCommand::Get {
6435                        collection, model, ..
6436                    }
6437                    | KvCommand::Incr {
6438                        collection, model, ..
6439                    }
6440                    | KvCommand::Cas {
6441                        collection, model, ..
6442                    }
6443                    | KvCommand::List {
6444                        collection, model, ..
6445                    }
6446                    | KvCommand::Delete {
6447                        collection, model, ..
6448                    } => (collection.as_str(), *model),
6449                    KvCommand::Rotate { collection, .. }
6450                    | KvCommand::History { collection, .. }
6451                    | KvCommand::Purge { collection, .. } => {
6452                        (collection.as_str(), CollectionModel::Vault)
6453                    }
6454                    KvCommand::InvalidateTags { collection, .. } => {
6455                        (collection.as_str(), CollectionModel::Kv)
6456                    }
6457                    KvCommand::Watch {
6458                        collection, model, ..
6459                    } => (collection.as_str(), *model),
6460                    KvCommand::Unseal { collection, .. } => {
6461                        (collection.as_str(), CollectionModel::Vault)
6462                    }
6463                };
6464                Some((collection, model))
6465            }
6466            QueryExpr::ConfigCommand(cmd) => {
6467                self.validate_config_command_before_auth(cmd)?;
6468                None
6469            }
6470            _ => None,
6471        };
6472
6473        let Some((name, expected_model)) = expected else {
6474            return Ok(());
6475        };
6476        let snapshot = self.inner.db.catalog_model_snapshot();
6477        let Some(actual_model) = snapshot
6478            .collections
6479            .iter()
6480            .find(|collection| collection.name == name)
6481            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6482        else {
6483            return Ok(());
6484        };
6485        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6486    }
6487
6488    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6489    /// `tq.table` matches a registered view name with the view's stored
6490    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6491    /// resolves correctly. Pure operation — no side effects.
6492    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6493        // Fast path: no views registered → return original expression.
6494        if self.inner.views.read().is_empty() {
6495            return expr;
6496        }
6497        self.rewrite_view_refs_inner(expr)
6498    }
6499
6500    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6501        use crate::storage::query::ast::{Filter, TableSource};
6502        match expr {
6503            QueryExpr::Table(mut tq) => {
6504                // 1. If the TableSource is a subquery, recurse into it so
6505                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6506                //    The legacy `table` field (set to a synthetic
6507                //    "__subq_NNNN" sentinel) stays as-is so callers that
6508                //    read it keep compiling.
6509                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6510                    tq.source = Some(TableSource::Subquery(Box::new(
6511                        self.rewrite_view_refs_inner(*body),
6512                    )));
6513                    return QueryExpr::Table(tq);
6514                }
6515
6516                // 2. Restore the source field (took it above for match).
6517                // When the source was `None` or `TableSource::Name(_)`, the
6518                // real lookup key is `tq.table` — check the view registry.
6519                let maybe_view = {
6520                    let views = self.inner.views.read();
6521                    views.get(&tq.table).cloned()
6522                };
6523                let Some(view) = maybe_view else {
6524                    return QueryExpr::Table(tq);
6525                };
6526
6527                // Issue #594 slice 9b — materialized views are read
6528                // from their backing collection, not by substituting
6529                // the body. Returning the TableQuery as-is lets the
6530                // normal table-read path resolve `SELECT FROM v`
6531                // against the collection provisioned at CREATE time.
6532                if view.materialized {
6533                    return QueryExpr::Table(tq);
6534                }
6535
6536                // Recurse into the view body — views may reference other
6537                // views. The recursion yields the final QueryExpr we need
6538                // to merge the outer's filter / limit / offset into.
6539                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6540
6541                // Phase 5: when the body is a Table we merge the outer
6542                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6543                // views filter recursively. Non-table bodies (Search,
6544                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6545                // with an outer Table query today — return the body
6546                // verbatim; outer predicates are lost. Full projection
6547                // merge lands in Phase 5.2.
6548                match inner_expr {
6549                    QueryExpr::Table(mut inner_tq) => {
6550                        if let Some(outer_filter) = tq.filter.take() {
6551                            inner_tq.filter = Some(match inner_tq.filter.take() {
6552                                Some(existing) => {
6553                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6554                                }
6555                                None => outer_filter,
6556                            });
6557                            // Keep the `Expr` form in lock-step with the
6558                            // merged `Filter`. The executor prefers
6559                            // `where_expr` and nulls `filter` when it is
6560                            // present (see `execute_query_inner`), so a
6561                            // stacked view whose outer predicate was only
6562                            // merged into `filter` would silently drop that
6563                            // predicate at eval time (#635).
6564                            inner_tq.where_expr = inner_tq
6565                                .filter
6566                                .as_ref()
6567                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6568                        }
6569                        if let Some(outer_limit) = tq.limit {
6570                            inner_tq.limit = Some(match inner_tq.limit {
6571                                Some(existing) => existing.min(outer_limit),
6572                                None => outer_limit,
6573                            });
6574                        }
6575                        if let Some(outer_offset) = tq.offset {
6576                            inner_tq.offset = Some(match inner_tq.offset {
6577                                Some(existing) => existing + outer_offset,
6578                                None => outer_offset,
6579                            });
6580                        }
6581                        QueryExpr::Table(inner_tq)
6582                    }
6583                    other => other,
6584                }
6585            }
6586            QueryExpr::Join(mut jq) => {
6587                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6588                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6589                QueryExpr::Join(jq)
6590            }
6591            // Other variants don't carry nested QueryExpr that can reference
6592            // a view by table name. Return as-is.
6593            other => other,
6594        }
6595    }
6596
6597    /// Apply table-level read authorization and RLS rewriting for a
6598    /// relational SELECT leaf.
6599    fn authorize_relational_table_select(
6600        &self,
6601        mut table: TableQuery,
6602        frame: &dyn super::statement_frame::ReadFrame,
6603    ) -> RedDBResult<Option<TableQuery>> {
6604        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6605            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6606            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6607            return Ok(Some(table));
6608        }
6609
6610        self.check_table_column_projection_authz(&table, frame)?;
6611
6612        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6613            return Ok(inject_rls_filters(self, frame, table));
6614        }
6615
6616        Ok(Some(table))
6617    }
6618
6619    fn authorize_relational_join_select(
6620        &self,
6621        mut join: JoinQuery,
6622        frame: &dyn super::statement_frame::ReadFrame,
6623    ) -> RedDBResult<Option<JoinQuery>> {
6624        self.check_join_column_projection_authz(&join, frame)?;
6625        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6626        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6627        Ok(inject_rls_into_join(self, frame, join))
6628    }
6629
6630    fn authorize_relational_join_child(
6631        &self,
6632        expr: QueryExpr,
6633        frame: &dyn super::statement_frame::ReadFrame,
6634    ) -> RedDBResult<QueryExpr> {
6635        match expr {
6636            QueryExpr::Table(mut table) => {
6637                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6638                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6639                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6640                }
6641                Ok(QueryExpr::Table(table))
6642            }
6643            QueryExpr::Join(join) => self
6644                .authorize_relational_join_select(join, frame)?
6645                .map(QueryExpr::Join)
6646                .ok_or_else(|| {
6647                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6648                }),
6649            other => Ok(other),
6650        }
6651    }
6652
6653    fn authorize_relational_select_expr(
6654        &self,
6655        expr: QueryExpr,
6656        frame: &dyn super::statement_frame::ReadFrame,
6657    ) -> RedDBResult<QueryExpr> {
6658        match expr {
6659            QueryExpr::Table(table) => self
6660                .authorize_relational_table_select(table, frame)?
6661                .map(QueryExpr::Table)
6662                .ok_or_else(|| {
6663                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6664                }),
6665            QueryExpr::Join(join) => self
6666                .authorize_relational_join_select(join, frame)?
6667                .map(QueryExpr::Join)
6668                .ok_or_else(|| {
6669                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6670                }),
6671            other => Ok(other),
6672        }
6673    }
6674
6675    fn check_table_column_projection_authz(
6676        &self,
6677        table: &TableQuery,
6678        frame: &dyn super::statement_frame::ReadFrame,
6679    ) -> RedDBResult<()> {
6680        let Some((username, role)) = frame.identity() else {
6681            return Ok(());
6682        };
6683        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6684            return Ok(());
6685        };
6686
6687        let columns = self.resolved_table_projection_columns(table)?;
6688        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6689        let principal = UserId::from_parts(frame.effective_scope(), username);
6690        let ctx = runtime_iam_context(
6691            role,
6692            frame.effective_scope(),
6693            auth_store.principal_is_system_owned(&principal),
6694        );
6695        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6696        if outcome.allowed() {
6697            return Ok(());
6698        }
6699
6700        if let Some(denied) = outcome.first_denied_column() {
6701            return Err(RedDBError::Query(format!(
6702                "permission denied: principal=`{username}` cannot select column `{}`",
6703                denied.resource.name
6704            )));
6705        }
6706        Err(RedDBError::Query(format!(
6707            "permission denied: principal=`{username}` cannot select table `{}`",
6708            table.table
6709        )))
6710    }
6711
6712    fn check_join_column_projection_authz(
6713        &self,
6714        join: &JoinQuery,
6715        frame: &dyn super::statement_frame::ReadFrame,
6716    ) -> RedDBResult<()> {
6717        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6718        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6719        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6720
6721        for (table, columns) in by_table {
6722            let query = TableQuery {
6723                table,
6724                source: None,
6725                alias: None,
6726                select_items: Vec::new(),
6727                columns: columns.into_iter().map(Projection::Column).collect(),
6728                where_expr: None,
6729                filter: None,
6730                group_by_exprs: Vec::new(),
6731                group_by: Vec::new(),
6732                having_expr: None,
6733                having: None,
6734                order_by: Vec::new(),
6735                limit: None,
6736                limit_param: None,
6737                offset: None,
6738                offset_param: None,
6739                expand: None,
6740                as_of: None,
6741                sessionize: None,
6742                distinct: false,
6743            };
6744            self.check_table_column_projection_authz(&query, frame)?;
6745        }
6746        Ok(())
6747    }
6748
6749    fn collect_join_projection_columns(
6750        &self,
6751        join: &JoinQuery,
6752        projections: &[Projection],
6753        out: &mut HashMap<String, BTreeSet<String>>,
6754    ) -> RedDBResult<()> {
6755        let left = table_side_context(join.left.as_ref());
6756        let right = table_side_context(join.right.as_ref());
6757
6758        if projections
6759            .iter()
6760            .any(|projection| matches!(projection, Projection::All))
6761        {
6762            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6763                out.entry(side.table.clone())
6764                    .or_default()
6765                    .extend(self.table_all_projection_columns(&side.table)?);
6766            }
6767            return Ok(());
6768        }
6769
6770        for projection in projections {
6771            collect_projection_columns_for_join_side(
6772                projection,
6773                left.as_ref(),
6774                right.as_ref(),
6775                out,
6776            )?;
6777        }
6778        Ok(())
6779    }
6780
6781    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6782        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6783        if projections
6784            .iter()
6785            .any(|projection| matches!(projection, Projection::All))
6786        {
6787            return self.table_all_projection_columns(&table.table);
6788        }
6789
6790        let mut columns = BTreeSet::new();
6791        for projection in &projections {
6792            collect_projection_columns_for_table(
6793                projection,
6794                &table.table,
6795                table.alias.as_deref(),
6796                &mut columns,
6797            );
6798        }
6799        Ok(columns.into_iter().collect())
6800    }
6801
6802    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6803        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6804            let columns: Vec<String> = contract
6805                .declared_columns
6806                .iter()
6807                .map(|column| column.name.clone())
6808                .collect();
6809            if !columns.is_empty() {
6810                return Ok(columns);
6811            }
6812        }
6813
6814        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6815        Ok(records
6816            .first()
6817            .map(|record| {
6818                record
6819                    .column_names()
6820                    .into_iter()
6821                    .map(|column| column.to_string())
6822                    .collect()
6823            })
6824            .unwrap_or_default())
6825    }
6826
6827    fn resolve_table_expr_subqueries(
6828        &self,
6829        mut table: TableQuery,
6830        frame: &dyn super::statement_frame::ReadFrame,
6831    ) -> RedDBResult<TableQuery> {
6832        // Only a `Subquery` source needs recursive resolution. `.take()`
6833        // would otherwise drop a `Name` / `Function` source on the floor
6834        // (the `if let` skips the body but the take already cleared it),
6835        // which silently broke `SELECT * FROM components(g)` — the TVF
6836        // dispatch downstream keys off `TableSource::Function` and never
6837        // fired. Restore any non-subquery source unchanged (issue #795).
6838        match table.source.take() {
6839            Some(TableSource::Subquery(inner)) => {
6840                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6841                table.source = Some(TableSource::Subquery(Box::new(inner)));
6842            }
6843            other => table.source = other,
6844        }
6845
6846        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6847        for item in &mut table.select_items {
6848            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6849                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6850            }
6851        }
6852        if let Some(where_expr) = table.where_expr.take() {
6853            table.where_expr =
6854                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6855            table.filter = None;
6856        }
6857        if let Some(having_expr) = table.having_expr.take() {
6858            table.having_expr =
6859                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6860            table.having = None;
6861        }
6862        for expr in &mut table.group_by_exprs {
6863            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6864        }
6865        for clause in &mut table.order_by {
6866            if let Some(expr) = clause.expr.take() {
6867                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6868            }
6869        }
6870        Ok(table)
6871    }
6872
6873    fn resolve_select_expr_subqueries(
6874        &self,
6875        expr: QueryExpr,
6876        frame: &dyn super::statement_frame::ReadFrame,
6877    ) -> RedDBResult<QueryExpr> {
6878        match expr {
6879            QueryExpr::Table(table) => self
6880                .resolve_table_expr_subqueries(table, frame)
6881                .map(QueryExpr::Table),
6882            QueryExpr::Join(mut join) => {
6883                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6884                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6885                Ok(QueryExpr::Join(join))
6886            }
6887            other => Ok(other),
6888        }
6889    }
6890
6891    fn resolve_expr_subqueries(
6892        &self,
6893        expr: crate::storage::query::ast::Expr,
6894        outer_scopes: &[String],
6895        frame: &dyn super::statement_frame::ReadFrame,
6896    ) -> RedDBResult<crate::storage::query::ast::Expr> {
6897        use crate::storage::query::ast::Expr;
6898
6899        match expr {
6900            Expr::Subquery { query, span } => {
6901                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
6902                if values.len() > 1 {
6903                    return Err(RedDBError::Query(
6904                        "scalar subquery returned more than one row".to_string(),
6905                    ));
6906                }
6907                Ok(Expr::Literal {
6908                    value: values.into_iter().next().unwrap_or(Value::Null),
6909                    span,
6910                })
6911            }
6912            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
6913                op,
6914                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
6915                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
6916                span,
6917            }),
6918            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
6919                op,
6920                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6921                span,
6922            }),
6923            Expr::Cast {
6924                inner,
6925                target,
6926                span,
6927            } => Ok(Expr::Cast {
6928                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
6929                target,
6930                span,
6931            }),
6932            Expr::FunctionCall { name, args, span } => {
6933                let args = args
6934                    .into_iter()
6935                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
6936                    .collect::<RedDBResult<Vec<_>>>()?;
6937                Ok(Expr::FunctionCall { name, args, span })
6938            }
6939            Expr::Case {
6940                branches,
6941                else_,
6942                span,
6943            } => {
6944                let branches = branches
6945                    .into_iter()
6946                    .map(|(cond, value)| {
6947                        Ok((
6948                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
6949                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
6950                        ))
6951                    })
6952                    .collect::<RedDBResult<Vec<_>>>()?;
6953                let else_ = else_
6954                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
6955                    .transpose()?
6956                    .map(Box::new);
6957                Ok(Expr::Case {
6958                    branches,
6959                    else_,
6960                    span,
6961                })
6962            }
6963            Expr::IsNull {
6964                operand,
6965                negated,
6966                span,
6967            } => Ok(Expr::IsNull {
6968                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6969                negated,
6970                span,
6971            }),
6972            Expr::InList {
6973                target,
6974                values,
6975                negated,
6976                span,
6977            } => {
6978                let target =
6979                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
6980                let mut resolved = Vec::new();
6981                for value in values {
6982                    if let Expr::Subquery { query, .. } = value {
6983                        resolved.extend(
6984                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
6985                                .into_iter()
6986                                .map(Expr::lit),
6987                        );
6988                    } else {
6989                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
6990                    }
6991                }
6992                Ok(Expr::InList {
6993                    target,
6994                    values: resolved,
6995                    negated,
6996                    span,
6997                })
6998            }
6999            Expr::Between {
7000                target,
7001                low,
7002                high,
7003                negated,
7004                span,
7005            } => Ok(Expr::Between {
7006                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7007                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7008                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7009                negated,
7010                span,
7011            }),
7012            other => Ok(other),
7013        }
7014    }
7015
7016    fn execute_expr_subquery_values(
7017        &self,
7018        subquery: crate::storage::query::ast::ExprSubquery,
7019        outer_scopes: &[String],
7020        frame: &dyn super::statement_frame::ReadFrame,
7021    ) -> RedDBResult<Vec<Value>> {
7022        let query = *subquery.query;
7023        if query_references_outer_scope(&query, outer_scopes) {
7024            return Err(RedDBError::Query(
7025                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7026            ));
7027        }
7028        let query = self.rewrite_view_refs(query);
7029        let query = self.resolve_select_expr_subqueries(query, frame)?;
7030        let query = self.authorize_relational_select_expr(query, frame)?;
7031        let result = match query {
7032            QueryExpr::Table(table) => {
7033                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7034            }
7035            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7036            other => {
7037                return Err(RedDBError::Query(format!(
7038                    "expression subquery must be a SELECT query, got {}",
7039                    query_expr_name(&other)
7040                )))
7041            }
7042        };
7043        first_column_values(result)
7044    }
7045
7046    fn dispatch_expr(
7047        &self,
7048        expr: QueryExpr,
7049        query_str: &str,
7050        mode: QueryMode,
7051    ) -> RedDBResult<RuntimeQueryResult> {
7052        let statement = query_expr_name(&expr);
7053        match expr {
7054            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
7055                // Graph queries are not cacheable as prepared statements.
7056                Err(RedDBError::Query(
7057                    "graph queries cannot be used as prepared statements".to_string(),
7058                ))
7059            }
7060            QueryExpr::Table(table) => {
7061                let scope = self.ai_scope();
7062                let table = self.resolve_table_expr_subqueries(
7063                    table,
7064                    &scope as &dyn super::statement_frame::ReadFrame,
7065                )?;
7066                // Table-valued functions (e.g. components(g)) dispatch to a
7067                // read-only executor before any catalog/virtual-table routing
7068                // (issue #795).
7069                if let Some(TableSource::Function {
7070                    name,
7071                    args,
7072                    named_args,
7073                }) = table.source.clone()
7074                {
7075                    return Ok(RuntimeQueryResult {
7076                        query: query_str.to_string(),
7077                        mode,
7078                        statement,
7079                        engine: "runtime-graph-tvf",
7080                        result: self.execute_table_function(&name, &args, &named_args)?,
7081                        affected_rows: 0,
7082                        statement_type: "select",
7083                        bookmark: None,
7084                    });
7085                }
7086                // Inline-graph TVF (issue #799) on the prepared-statement /
7087                // direct-expr path. Result caching is wired on the
7088                // `execute_query_inner` path; here we just compute and return.
7089                if let Some(TableSource::InlineGraphFunction {
7090                    name,
7091                    nodes,
7092                    edges,
7093                    named_args,
7094                }) = table.source.clone()
7095                {
7096                    return Ok(RuntimeQueryResult {
7097                        query: query_str.to_string(),
7098                        mode,
7099                        statement,
7100                        engine: "runtime-graph-tvf-inline",
7101                        result: self.execute_inline_graph_function(
7102                            &name,
7103                            &nodes,
7104                            &edges,
7105                            &named_args,
7106                        )?,
7107                        affected_rows: 0,
7108                        statement_type: "select",
7109                        bookmark: None,
7110                    });
7111                }
7112                if super::red_schema::is_virtual_table(&table.table) {
7113                    return Ok(RuntimeQueryResult {
7114                        query: query_str.to_string(),
7115                        mode,
7116                        statement,
7117                        engine: "runtime-red-schema",
7118                        result: super::red_schema::red_query(
7119                            self,
7120                            &table.table,
7121                            &table,
7122                            &scope as &dyn super::statement_frame::ReadFrame,
7123                        )?,
7124                        affected_rows: 0,
7125                        statement_type: "select",
7126                        bookmark: None,
7127                    });
7128                }
7129                // `<graph>.<output>` analytics virtual view (issue #800).
7130                if let Some(view_result) = self.try_resolve_analytics_view(
7131                    &table,
7132                    &scope as &dyn super::statement_frame::ReadFrame,
7133                )? {
7134                    return Ok(RuntimeQueryResult {
7135                        query: query_str.to_string(),
7136                        mode,
7137                        statement,
7138                        engine: "runtime-graph-analytics-view",
7139                        result: view_result,
7140                        affected_rows: 0,
7141                        statement_type: "select",
7142                        bookmark: None,
7143                    });
7144                }
7145                let Some(table_with_rls) = self.authorize_relational_table_select(
7146                    table,
7147                    &scope as &dyn super::statement_frame::ReadFrame,
7148                )?
7149                else {
7150                    return Ok(RuntimeQueryResult {
7151                        query: query_str.to_string(),
7152                        mode,
7153                        statement,
7154                        engine: "runtime-table-rls",
7155                        result: crate::storage::query::unified::UnifiedResult::empty(),
7156                        affected_rows: 0,
7157                        statement_type: "select",
7158                        bookmark: None,
7159                    });
7160                };
7161                Ok(RuntimeQueryResult {
7162                    query: query_str.to_string(),
7163                    mode,
7164                    statement,
7165                    engine: "runtime-table",
7166                    result: execute_runtime_table_query(
7167                        &self.inner.db,
7168                        &table_with_rls,
7169                        Some(&self.inner.index_store),
7170                    )?,
7171                    affected_rows: 0,
7172                    statement_type: "select",
7173                    bookmark: None,
7174                })
7175            }
7176            QueryExpr::Join(join) => {
7177                let scope = self.ai_scope();
7178                let Some(join_with_rls) = self.authorize_relational_join_select(
7179                    join,
7180                    &scope as &dyn super::statement_frame::ReadFrame,
7181                )?
7182                else {
7183                    return Ok(RuntimeQueryResult {
7184                        query: query_str.to_string(),
7185                        mode,
7186                        statement,
7187                        engine: "runtime-join-rls",
7188                        result: crate::storage::query::unified::UnifiedResult::empty(),
7189                        affected_rows: 0,
7190                        statement_type: "select",
7191                        bookmark: None,
7192                    });
7193                };
7194                Ok(RuntimeQueryResult {
7195                    query: query_str.to_string(),
7196                    mode,
7197                    statement,
7198                    engine: "runtime-join",
7199                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7200                    affected_rows: 0,
7201                    statement_type: "select",
7202                    bookmark: None,
7203                })
7204            }
7205            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7206                query: query_str.to_string(),
7207                mode,
7208                statement,
7209                engine: "runtime-vector",
7210                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7211                affected_rows: 0,
7212                statement_type: "select",
7213                bookmark: None,
7214            }),
7215            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7216                query: query_str.to_string(),
7217                mode,
7218                statement,
7219                engine: "runtime-hybrid",
7220                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7221                affected_rows: 0,
7222                statement_type: "select",
7223                bookmark: None,
7224            }),
7225            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7226                Err(RedDBError::Query(
7227                    super::red_schema::READ_ONLY_ERROR.to_string(),
7228                ))
7229            }
7230            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7231                Err(RedDBError::Query(
7232                    super::red_schema::READ_ONLY_ERROR.to_string(),
7233                ))
7234            }
7235            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7236                Err(RedDBError::Query(
7237                    super::red_schema::READ_ONLY_ERROR.to_string(),
7238                ))
7239            }
7240            QueryExpr::Insert(ref insert) => self
7241                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7242                    self.execute_insert(query_str, insert)
7243                }),
7244            QueryExpr::Update(ref update) => self
7245                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7246                    self.execute_update(query_str, update)
7247                }),
7248            QueryExpr::Delete(ref delete) => self
7249                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7250                    self.execute_delete(query_str, delete)
7251                }),
7252            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7253            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7254            _ => Err(RedDBError::Query(format!(
7255                "prepared-statement execution does not support {statement} statements"
7256            ))),
7257        }
7258    }
7259
7260    /// Dispatch a graph-collection table-valued function call in FROM
7261    /// position (e.g. `SELECT * FROM components(g)`).
7262    ///
7263    /// Validates the function name and arity here, materializes the whole
7264    /// active graph read-only, then runs the algorithm via the shared
7265    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
7266    fn execute_table_function(
7267        &self,
7268        name: &str,
7269        args: &[String],
7270        named_args: &[(String, f64)],
7271    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7272        if !is_graph_tvf_name(name) {
7273            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7274        }
7275        // Every graph-collection TVF takes exactly one graph argument.
7276        if args.len() != 1 {
7277            return Err(RedDBError::Query(format!(
7278                "table function '{name}' takes exactly 1 graph argument, got {}",
7279                args.len()
7280            )));
7281        }
7282
7283        // Read-only materialization of the full active graph. Passing `None`
7284        // for the projection uses the full graph store. Like #795/#796, the
7285        // v0 form runs over the whole graph store regardless of the collection
7286        // argument value. Materialization never mutates any store.
7287        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7288        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7289    }
7290
7291    /// Dispatch an inline-graph table-valued function call in FROM position
7292    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
7293    /// #799).
7294    ///
7295    /// Materializes the two subqueries through the normal read path (so RLS,
7296    /// column authz, and MVCC visibility all apply), constructs the abstract
7297    /// graph — the first column of `nodes` is the node id; the first two-or-
7298    /// three columns of `edges` are `(source, target [, weight])` — then runs
7299    /// the same algorithm path used by the graph-collection form. Read-only.
7300    fn execute_inline_graph_function(
7301        &self,
7302        name: &str,
7303        nodes_query: &QueryExpr,
7304        edges_query: &QueryExpr,
7305        named_args: &[(String, f64)],
7306    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7307        if !is_graph_tvf_name(name) {
7308            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7309        }
7310
7311        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
7312        let nodes = inline_node_ids(name, &node_result)?;
7313
7314        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
7315        let edges = inline_edges(name, &edge_result)?;
7316
7317        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7318    }
7319
7320    /// Materialize the whole active graph read-only into the abstract
7321    /// `(nodes, edges)` inputs the pure graph algorithms consume.
7322    fn materialize_whole_graph_abstract(
7323        &self,
7324    ) -> RedDBResult<(
7325        Vec<String>,
7326        Vec<(
7327            String,
7328            String,
7329            crate::storage::engine::graph_algorithms::Weight,
7330        )>,
7331    )> {
7332        use crate::storage::engine::graph_algorithms;
7333
7334        let graph = super::graph_dsl::materialize_graph_with_projection(
7335            self.inner.db.store().as_ref(),
7336            None,
7337        )?;
7338        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7339        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7340            .iter_all_edges()
7341            .into_iter()
7342            .map(|e| (e.source_id, e.target_id, e.weight))
7343            .collect();
7344        Ok((nodes, edges))
7345    }
7346
7347    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
7348    ///
7349    /// Returns `Ok(None)` when `table` is not an analytics view — either the
7350    /// name is not dotted, a real collection of that exact name exists (a real
7351    /// collection always wins; no shadowing), the suffix is not a recognised
7352    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
7353    /// with the freshly computed result when it does resolve, and an error when
7354    /// the parent graph exists but the output is not enabled, a declared
7355    /// algorithm is unsupported, or the parent collection's policy denies the
7356    /// read.
7357    ///
7358    /// The view is recomputed on every call (no result-cache write) so it
7359    /// always reflects the current graph data, satisfying the on-demand
7360    /// recompute contract for this slice.
7361    fn try_resolve_analytics_view(
7362        &self,
7363        table: &TableQuery,
7364        frame: &dyn super::statement_frame::ReadFrame,
7365    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
7366        let full = table.table.as_str();
7367        let Some(dot) = full.rfind('.') else {
7368            return Ok(None);
7369        };
7370        // A real collection literally named `g.communities` always wins.
7371        if self.inner.db.store().get_collection(full).is_some() {
7372            return Ok(None);
7373        }
7374        let graph_name = &full[..dot];
7375        let output_name = &full[dot + 1..];
7376        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
7377            return Ok(None);
7378        };
7379
7380        let contracts = self.inner.db.collection_contracts();
7381        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
7382            return Ok(None);
7383        };
7384        if contract.declared_model != crate::catalog::CollectionModel::Graph {
7385            return Ok(None);
7386        }
7387        let Some(view) = contract
7388            .analytics_config
7389            .iter()
7390            .find(|view| view.output == output)
7391        else {
7392            // The parent graph exists but this output was not declared — a
7393            // clear error beats the misleading "collection not found".
7394            return Err(RedDBError::Query(format!(
7395                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
7396            )));
7397        };
7398
7399        // Policy inheritance (AC5): route through the parent graph collection's
7400        // read authorization. A policy or RLS rule that denies the parent
7401        // denies its analytics views transitively.
7402        let parent_query = TableQuery::new(graph_name);
7403        if self
7404            .authorize_relational_table_select(parent_query, frame)?
7405            .is_none()
7406        {
7407            return Err(RedDBError::Query(format!(
7408                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
7409            )));
7410        }
7411
7412        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
7413        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7414        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
7415        Ok(Some(result))
7416    }
7417
7418    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
7419    ///
7420    /// Both the graph-collection form and the inline-graph form route here so
7421    /// named-argument validation and the projected row shape stay identical
7422    /// across the two signatures (issue #799). Projects each algorithm's
7423    /// native output shape.
7424    fn dispatch_graph_algorithm(
7425        &self,
7426        name: &str,
7427        nodes: Vec<String>,
7428        edges: Vec<(
7429            String,
7430            String,
7431            crate::storage::engine::graph_algorithms::Weight,
7432        )>,
7433        named_args: &[(String, f64)],
7434    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7435        use crate::storage::engine::graph_algorithms;
7436        use crate::storage::query::unified::UnifiedResult;
7437        use crate::storage::schema::Value;
7438
7439        if name.eq_ignore_ascii_case("components") {
7440            reject_named_args(name, named_args)?;
7441            let assignment = graph_algorithms::connected_components(&nodes, &edges);
7442            let mut result =
7443                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7444            for (node_id, island_id) in assignment {
7445                let mut record = UnifiedRecord::new();
7446                record.set("node_id", Value::text(node_id));
7447                record.set("island_id", Value::Integer(island_id as i64));
7448                result.push(record);
7449            }
7450            return Ok(result);
7451        }
7452
7453        if name.eq_ignore_ascii_case("louvain") {
7454            // The only supported named argument is `resolution` (γ). It
7455            // defaults to 1.0 (classic modularity) and must be a finite,
7456            // strictly positive number — a non-positive (or NaN/inf)
7457            // resolution has no sensible meaning.
7458            let resolution = louvain_resolution(named_args)?;
7459            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7460            let mut result =
7461                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7462            for (node_id, community_id) in assignment {
7463                let mut record = UnifiedRecord::new();
7464                record.set("node_id", Value::text(node_id));
7465                record.set("community_id", Value::Integer(community_id as i64));
7466                result.push(record);
7467            }
7468            return Ok(result);
7469        }
7470
7471        if name.eq_ignore_ascii_case("degree_centrality") {
7472            reject_named_args(name, named_args)?;
7473            let assignment = abstract_degree_centrality(&nodes, &edges);
7474            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
7475            for (node_id, degree) in assignment {
7476                let mut record = UnifiedRecord::new();
7477                record.set("node_id", Value::text(node_id));
7478                record.set("degree", Value::Integer(degree as i64));
7479                result.push(record);
7480            }
7481            return Ok(result);
7482        }
7483
7484        if name.eq_ignore_ascii_case("shortest_path") {
7485            // Scalar named arguments: `src` and `dst` are required node ids,
7486            // `max_hops` is an optional non-negative edge-count cap. Node ids
7487            // in the graph store are integer entity ids rendered as strings, so
7488            // each id arg must be a non-negative whole number; reject anything
7489            // else (fractional, negative, NaN/inf) with a clear message.
7490            let mut src: Option<String> = None;
7491            let mut dst: Option<String> = None;
7492            let mut max_hops: Option<usize> = None;
7493            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
7494                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
7495                    return Err(RedDBError::Query(format!(
7496                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
7497                    )));
7498                }
7499                Ok((value as i64).to_string())
7500            };
7501            for (key, value) in named_args {
7502                if key.eq_ignore_ascii_case("src") {
7503                    src = Some(as_node_id("src", *value)?);
7504                } else if key.eq_ignore_ascii_case("dst") {
7505                    dst = Some(as_node_id("dst", *value)?);
7506                } else if key.eq_ignore_ascii_case("max_hops") {
7507                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
7508                        return Err(RedDBError::Query(format!(
7509                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
7510                        )));
7511                    }
7512                    max_hops = Some(*value as usize);
7513                } else {
7514                    return Err(RedDBError::Query(format!(
7515                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
7516                    )));
7517                }
7518            }
7519            let src = src.ok_or_else(|| {
7520                RedDBError::Query(
7521                    "table function 'shortest_path' requires named argument 'src'".to_string(),
7522                )
7523            })?;
7524            let dst = dst.ok_or_else(|| {
7525                RedDBError::Query(
7526                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
7527                )
7528            })?;
7529
7530            // Columns are always present; an unreachable pair (within the
7531            // optional `max_hops` budget) simply yields zero rows — never an
7532            // error. `hop` is the 0-based index from the source;
7533            // `cumulative_weight` is the running path weight (0 at the source,
7534            // the total at the destination). Edges are treated as undirected,
7535            // consistent with `components` / `louvain`.
7536            let mut result = UnifiedResult::with_columns(vec![
7537                "hop".into(),
7538                "node_id".into(),
7539                "cumulative_weight".into(),
7540            ]);
7541            if let Some(path) =
7542                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
7543            {
7544                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
7545                    let mut record = UnifiedRecord::new();
7546                    record.set("hop", Value::Integer(hop as i64));
7547                    record.set("node_id", Value::text(node_id));
7548                    record.set("cumulative_weight", Value::Float(cumulative_weight));
7549                    result.push(record);
7550                }
7551            }
7552            return Ok(result);
7553        }
7554        // ── Centrality family (issue #797): each returns rows `(node_id,
7555        // score)` over the abstract `(nodes, edges)` graph. Like the other
7556        // graph TVFs the graph is treated as undirected and scores are
7557        // deterministic; the inline-graph form shares this dispatch. ──
7558        if name.eq_ignore_ascii_case("betweenness") {
7559            reject_named_args(name, named_args)?;
7560            return Ok(Self::centrality_result(graph_algorithms::betweenness(
7561                &nodes, &edges,
7562            )));
7563        }
7564        if name.eq_ignore_ascii_case("eigenvector") {
7565            // Optional `max_iterations` (positive integer, default 100) and
7566            // `tolerance` (finite, strictly positive, default 1e-6).
7567            let mut max_iterations = 100_usize;
7568            let mut tolerance = 1e-6_f64;
7569            for (key, value) in named_args {
7570                if key.eq_ignore_ascii_case("max_iterations") {
7571                    max_iterations = parse_positive_iterations("eigenvector", value)?;
7572                } else if key.eq_ignore_ascii_case("tolerance") {
7573                    if !value.is_finite() || *value <= 0.0 {
7574                        return Err(RedDBError::Query(format!(
7575                            "table function 'eigenvector' tolerance must be > 0, got {value}"
7576                        )));
7577                    }
7578                    tolerance = *value;
7579                } else {
7580                    return Err(RedDBError::Query(format!(
7581                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
7582                    )));
7583                }
7584            }
7585            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
7586                &nodes,
7587                &edges,
7588                max_iterations,
7589                tolerance,
7590            )));
7591        }
7592        if name.eq_ignore_ascii_case("pagerank") {
7593            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
7594            // (positive integer, default 100).
7595            let mut damping = 0.85_f64;
7596            let mut max_iterations = 100_usize;
7597            for (key, value) in named_args {
7598                if key.eq_ignore_ascii_case("damping") {
7599                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
7600                        return Err(RedDBError::Query(format!(
7601                            "table function 'pagerank' damping must be in (0, 1), got {value}"
7602                        )));
7603                    }
7604                    damping = *value;
7605                } else if key.eq_ignore_ascii_case("max_iterations") {
7606                    max_iterations = parse_positive_iterations("pagerank", value)?;
7607                } else {
7608                    return Err(RedDBError::Query(format!(
7609                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
7610                    )));
7611                }
7612            }
7613            return Ok(Self::centrality_result(graph_algorithms::pagerank(
7614                &nodes,
7615                &edges,
7616                damping,
7617                max_iterations,
7618            )));
7619        }
7620        Err(RedDBError::Query(format!("unknown table function: {name}")))
7621    }
7622
7623    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
7624    ///
7625    /// Materializes the active graph (nodes + weighted edges) read-only and
7626    /// runs the pure `graph_algorithms::connected_components`. Edges are
7627    /// treated as undirected; island ids are deterministic (ascending order of
7628    /// each component's smallest node).
7629    fn execute_components_tvf(
7630        &self,
7631        _collection: &str,
7632    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7633        use crate::storage::engine::graph_algorithms;
7634        use crate::storage::query::unified::UnifiedResult;
7635        use crate::storage::schema::Value;
7636
7637        // Read-only materialization of the full active graph. The named
7638        // collection identifies the active graph scope; passing `None` for the
7639        // projection uses the full graph store (the same result
7640        // `active_graph_projection` yields when no projection is registered).
7641        // Materialization never mutates any store.
7642        let graph = super::graph_dsl::materialize_graph_with_projection(
7643            self.inner.db.store().as_ref(),
7644            None,
7645        )?;
7646
7647        // Materialize abstract inputs for the pure algorithm.
7648        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7649        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7650            .iter_all_edges()
7651            .into_iter()
7652            .map(|e| (e.source_id, e.target_id, e.weight))
7653            .collect();
7654
7655        let assignment = graph_algorithms::connected_components(&nodes, &edges);
7656
7657        // Project into a UnifiedResult with columns ["node_id", "island_id"].
7658        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7659        for (node_id, island_id) in assignment {
7660            let mut record = UnifiedRecord::new();
7661            record.set("node_id", Value::text(node_id));
7662            record.set("island_id", Value::Integer(island_id as i64));
7663            result.push(record);
7664        }
7665        Ok(result)
7666    }
7667
7668    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
7669    /// `(node_id, community_id)` (issue #796).
7670    ///
7671    /// Materializes the active graph (nodes + weighted edges) read-only and
7672    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
7673    /// treated as undirected; community ids are assigned in ascending order of
7674    /// each community's smallest node, so identical input + resolution always
7675    /// yields identical rows. Like `components`, the v0 form runs over the
7676    /// whole graph store regardless of the collection argument value.
7677    fn execute_louvain_tvf(
7678        &self,
7679        _collection: &str,
7680        resolution: f64,
7681    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7682        use crate::storage::engine::graph_algorithms;
7683        use crate::storage::query::unified::UnifiedResult;
7684        use crate::storage::schema::Value;
7685
7686        let graph = super::graph_dsl::materialize_graph_with_projection(
7687            self.inner.db.store().as_ref(),
7688            None,
7689        )?;
7690
7691        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7692        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7693            .iter_all_edges()
7694            .into_iter()
7695            .map(|e| (e.source_id, e.target_id, e.weight))
7696            .collect();
7697
7698        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7699
7700        // Project into a UnifiedResult with columns ["node_id", "community_id"].
7701        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7702        for (node_id, community_id) in assignment {
7703            let mut record = UnifiedRecord::new();
7704            record.set("node_id", Value::text(node_id));
7705            record.set("community_id", Value::Integer(community_id as i64));
7706            result.push(record);
7707        }
7708        Ok(result)
7709    }
7710
7711    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
7712    /// columns `["node_id", "score"]`; scores are `Value::Float`.
7713    fn centrality_result(
7714        rows: Vec<(String, f64)>,
7715    ) -> crate::storage::query::unified::UnifiedResult {
7716        use crate::storage::query::unified::UnifiedResult;
7717        use crate::storage::schema::Value;
7718        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
7719        for (node_id, score) in rows {
7720            let mut record = UnifiedRecord::new();
7721            record.set("node_id", Value::text(node_id));
7722            record.set("score", Value::Float(score));
7723            result.push(record);
7724        }
7725        result
7726    }
7727
7728    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7729    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7730    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7731        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7732        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7733        let q = query.trim();
7734        if !q.starts_with("SELECT") && !q.starts_with("select") {
7735            return None;
7736        }
7737
7738        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7739        let where_pos = q
7740            .find("WHERE _entity_id")
7741            .or_else(|| q.find("where _entity_id"))?;
7742        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7743        let after_eq = after_field.strip_prefix('=')?.trim_start();
7744
7745        // Parse the entity ID number
7746        let id_str = after_eq.trim();
7747        let entity_id: u64 = id_str.parse().ok()?;
7748
7749        // Extract table name: between "FROM " and " WHERE"
7750        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7751        let table = q[from_pos..where_pos].trim();
7752        if table.is_empty()
7753            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7754        {
7755            return None; // complex query, fall through
7756        }
7757        let table_name = table.split_whitespace().next()?;
7758
7759        // Direct entity lookup — skips SQL parse, plan cache, result
7760        // cache, view rewriter, RLS gate. Safe because the gating in
7761        // `execute_query` guarantees no scope override / no
7762        // transaction context is active. MVCC visibility is still
7763        // honoured against the current snapshot.
7764        let store = self.inner.db.store();
7765        let entity = store
7766            .get(
7767                table_name,
7768                crate::storage::unified::EntityId::new(entity_id),
7769            )
7770            .filter(entity_visible_under_current_snapshot)
7771            .filter(|entity| {
7772                self.inner
7773                    .db
7774                    .replica_allows_entity_at_read(table_name, entity)
7775            });
7776
7777        let count = if entity.is_some() { 1u64 } else { 0 };
7778
7779        // Materialize a record so downstream consumers that walk
7780        // `result.records` (embedded runtime API, decrypt pass, CLI)
7781        // see the row. Previously only `pre_serialized_json` was
7782        // filled, which caused those consumers to see zero rows and
7783        // skewed benchmarks.
7784        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7785            .as_ref()
7786            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7787            .into_iter()
7788            .collect();
7789
7790        let json = match entity {
7791            Some(ref e) => execute_runtime_serialize_single_entity(e),
7792            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7793                .to_string(),
7794        };
7795
7796        Some(Ok(RuntimeQueryResult {
7797            query: query.to_string(),
7798            mode: crate::storage::query::modes::QueryMode::Sql,
7799            statement: "select",
7800            engine: "fast-entity-lookup",
7801            result: crate::storage::query::unified::UnifiedResult {
7802                columns: Vec::new(),
7803                records,
7804                stats: crate::storage::query::unified::QueryStats {
7805                    rows_scanned: count,
7806                    ..Default::default()
7807                },
7808                pre_serialized_json: Some(json),
7809            },
7810            affected_rows: 0,
7811            statement_type: "select",
7812            bookmark: None,
7813        }))
7814    }
7815
7816    pub(crate) fn invalidate_plan_cache(&self) {
7817        self.inner.query_cache.write().clear();
7818        self.inner
7819            .ddl_epoch
7820            .fetch_add(1, std::sync::atomic::Ordering::Release);
7821    }
7822
7823    /// Read the monotonic DDL epoch counter. Bumped by every
7824    /// `invalidate_plan_cache` call so prepared-statement holders can
7825    /// detect schema drift between PREPARE and EXECUTE.
7826    pub fn ddl_epoch(&self) -> u64 {
7827        self.inner
7828            .ddl_epoch
7829            .load(std::sync::atomic::Ordering::Acquire)
7830    }
7831
7832    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7833        let store = self.inner.db.store();
7834        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7835        self.invalidate_plan_cache();
7836    }
7837
7838    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7839    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7840    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7841    /// collection, picks the keys matching the tenant-marker shape,
7842    /// and calls `register_tenant_table` for each.
7843    ///
7844    /// Safe no-op when `red_config` doesn't exist (first boot on a
7845    /// fresh datadir).
7846    pub(crate) fn rehydrate_tenant_tables(&self) {
7847        let store = self.inner.db.store();
7848        let Some(manager) = store.get_collection("red_config") else {
7849            return;
7850        };
7851        // Replay in insertion order (SegmentManager iteration). Multiple
7852        // toggles on the same table leave several rows behind — the
7853        // last one processed wins because each register/unregister
7854        // call overwrites the in-memory state.
7855        for entity in manager.query_all(|_| true) {
7856            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7857                continue;
7858            };
7859            let Some(named) = &row.named else { continue };
7860            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7861                continue;
7862            };
7863            // Shape: tenant_tables.{table}.column
7864            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7865                continue;
7866            };
7867            let Some((table, suffix)) = rest.rsplit_once('.') else {
7868                // Issue #205 — a `tenant_tables.*` row that doesn't
7869                // split cleanly is a schema-shape regression: the
7870                // metadata writer must always emit the `.column`
7871                // suffix, so reaching this branch means an upgrade
7872                // with incompatible state or external tampering.
7873                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7874                    collection: "red_config".to_string(),
7875                    detail: format!("malformed tenant_tables key: {key}"),
7876                }
7877                .emit_global();
7878                continue;
7879            };
7880            if suffix != "column" {
7881                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7882                    collection: "red_config".to_string(),
7883                    detail: format!("unexpected tenant_tables suffix: {key}"),
7884                }
7885                .emit_global();
7886                continue;
7887            }
7888            match named.get("value") {
7889                Some(crate::storage::schema::Value::Text(column)) => {
7890                    self.register_tenant_table(table, column);
7891                }
7892                // Null / missing value = DISABLE TENANCY marker.
7893                Some(crate::storage::schema::Value::Null) | None => {
7894                    self.unregister_tenant_table(table);
7895                }
7896                _ => {}
7897            }
7898        }
7899    }
7900
7901    /// Replay every persisted `MaterializedViewDescriptor` from the
7902    /// `red_materialized_view_defs` system collection (issue #593
7903    /// slice 9a). For each descriptor, re-parse the original SQL,
7904    /// extract the `QueryExpr::CreateView` it produced, and populate
7905    /// the in-memory registries (`inner.views` and
7906    /// `inner.materialized_views`) directly — no write paths run, so
7907    /// rehydrate does not re-persist what it just read.
7908    ///
7909    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
7910    /// skipped with a `SchemaCorruption` operator event so a single
7911    /// bad entry does not block startup.
7912    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
7913        let store = self.inner.db.store();
7914        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
7915        for descriptor in descriptors {
7916            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
7917                Ok(qc) => qc,
7918                Err(err) => {
7919                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7920                        collection:
7921                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7922                                .to_string(),
7923                        detail: format!(
7924                            "failed to re-parse materialized-view source for {}: {err}",
7925                            descriptor.name
7926                        ),
7927                    }
7928                    .emit_global();
7929                    continue;
7930                }
7931            };
7932            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
7933                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7934                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7935                        .to_string(),
7936                    detail: format!(
7937                        "materialized-view source for {} did not re-parse as CREATE VIEW",
7938                        descriptor.name
7939                    ),
7940                }
7941                .emit_global();
7942                continue;
7943            };
7944            // Populate in-memory view registry.
7945            let view_name = create.name.clone();
7946            self.inner
7947                .views
7948                .write()
7949                .insert(view_name.clone(), Arc::new(create));
7950            // Materialized cache slot (data empty until next REFRESH).
7951            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7952            let refresh = match descriptor.refresh_every_ms {
7953                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7954                None => RefreshPolicy::Manual,
7955            };
7956            let def = MaterializedViewDef {
7957                name: view_name.clone(),
7958                query: format!("<parsed view {}>", view_name),
7959                dependencies: descriptor.source_collections.clone(),
7960                refresh,
7961                retention_duration_ms: descriptor.retention_duration_ms,
7962            };
7963            self.inner.materialized_views.write().register(def);
7964            if let Err(err) = self.ensure_materialized_view_backing(&view_name) {
7965                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7966                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7967                        .to_string(),
7968                    detail: format!(
7969                        "failed to rehydrate backing collection for materialized view {view_name}: {err}"
7970                    ),
7971                }
7972                .emit_global();
7973            }
7974        }
7975        // A rehydrated view shape may differ from any plans the cache
7976        // bootstrapped before this method ran — flush to be safe.
7977        self.invalidate_plan_cache();
7978    }
7979
7980    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7981        let store = self.inner.db.store();
7982        for contract in self.inner.db.collection_contracts() {
7983            let columns: Vec<String> = contract
7984                .declared_columns
7985                .iter()
7986                .map(|column| column.name.clone())
7987                .collect();
7988            let Some(manager) = store.get_collection(&contract.name) else {
7989                continue;
7990            };
7991            manager.set_column_schema_if_empty(columns);
7992        }
7993    }
7994
7995    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7996    /// in-memory column mapping, the implicit RLS policy, and enables
7997    /// row-level security on the table. Idempotent — re-registering
7998    /// the same `(table, column)` replaces the prior auto-policy.
7999    pub fn register_tenant_table(&self, table: &str, column: &str) {
8000        use crate::storage::query::ast::{
8001            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
8002        };
8003        self.inner
8004            .tenant_tables
8005            .write()
8006            .insert(table.to_string(), column.to_string());
8007
8008        // Build the policy: col = CURRENT_TENANT()
8009        // Uses CompareExpr so the comparison happens at runtime against
8010        // the thread-local tenant value read by the CURRENT_TENANT
8011        // scalar. Spans are synthetic — there's no source location for
8012        // an auto-generated policy.
8013        let lhs = Expr::Column {
8014            field: FieldRef::TableColumn {
8015                table: table.to_string(),
8016                column: column.to_string(),
8017            },
8018            span: Span::synthetic(),
8019        };
8020        let rhs = Expr::FunctionCall {
8021            name: "CURRENT_TENANT".to_string(),
8022            args: Vec::new(),
8023            span: Span::synthetic(),
8024        };
8025        let policy_filter = Filter::CompareExpr {
8026            lhs,
8027            op: CompareOp::Eq,
8028            rhs,
8029        };
8030
8031        let policy = CreatePolicyQuery {
8032            name: "__tenant_iso".to_string(),
8033            table: table.to_string(),
8034            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
8035            role: None,   // None = every role
8036            using: Box::new(policy_filter),
8037            // Auto-tenancy defaults to Table targets. Collections of
8038            // other kinds (graph / vector / queue / timeseries) that
8039            // opt in via `ALTER ... ENABLE TENANCY` should use the
8040            // matching kind — but for now we keep the auto-policy
8041            // kind-agnostic so the evaluator can apply it to any
8042            // entity living in the collection.
8043            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
8044        };
8045
8046        // Replace any prior auto-policy for this table (column rename).
8047        self.inner.rls_policies.write().insert(
8048            (table.to_string(), "__tenant_iso".to_string()),
8049            Arc::new(policy),
8050        );
8051        self.inner
8052            .rls_enabled_tables
8053            .write()
8054            .insert(table.to_string());
8055
8056        // Auto-build a hash index on the tenant column. Every read/write
8057        // against a tenant-scoped table carries an implicit
8058        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
8059        // index on that column is on the hot path of every query. Without
8060        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
8061        self.ensure_tenant_index(table, column);
8062    }
8063
8064    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
8065    /// Skipped when:
8066    ///   * the column is dotted (nested path — flat secondary indices
8067    ///     don't cover those today; RLS still works via the policy)
8068    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
8069    ///   * the user already registered an index whose first column matches
8070    ///     (avoids redundant duplicates of a user-defined composite)
8071    fn ensure_tenant_index(&self, table: &str, column: &str) {
8072        if column.contains('.') {
8073            return;
8074        }
8075        let index_name = format!("__tenant_idx_{table}");
8076        let registry = self.inner.index_store.list_indices(table);
8077        if registry.iter().any(|idx| idx.name == index_name) {
8078            return;
8079        }
8080        if registry
8081            .iter()
8082            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
8083        {
8084            return;
8085        }
8086
8087        let store = self.inner.db.store();
8088        let Some(manager) = store.get_collection(table) else {
8089            return;
8090        };
8091        let entities = manager.query_all(|_| true);
8092        let entity_fields: Vec<(
8093            crate::storage::unified::EntityId,
8094            Vec<(String, crate::storage::schema::Value)>,
8095        )> = entities
8096            .iter()
8097            .map(|e| {
8098                let fields = match &e.data {
8099                    crate::storage::EntityData::Row(row) => {
8100                        if let Some(ref named) = row.named {
8101                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
8102                        } else if let Some(ref schema) = row.schema {
8103                            schema
8104                                .iter()
8105                                .zip(row.columns.iter())
8106                                .map(|(k, v)| (k.clone(), v.clone()))
8107                                .collect()
8108                        } else {
8109                            Vec::new()
8110                        }
8111                    }
8112                    crate::storage::EntityData::Node(node) => node
8113                        .properties
8114                        .iter()
8115                        .map(|(k, v)| (k.clone(), v.clone()))
8116                        .collect(),
8117                    _ => Vec::new(),
8118                };
8119                (e.id, fields)
8120            })
8121            .collect();
8122
8123        let columns = vec![column.to_string()];
8124        if self
8125            .inner
8126            .index_store
8127            .create_index(
8128                &index_name,
8129                table,
8130                &columns,
8131                super::index_store::IndexMethodKind::Hash,
8132                false,
8133                &entity_fields,
8134            )
8135            .is_err()
8136        {
8137            return;
8138        }
8139        self.inner
8140            .index_store
8141            .register(super::index_store::RegisteredIndex {
8142                name: index_name,
8143                collection: table.to_string(),
8144                columns,
8145                method: super::index_store::IndexMethodKind::Hash,
8146                unique: false,
8147            });
8148        self.invalidate_plan_cache();
8149    }
8150
8151    /// Drop the auto-generated tenant index, if one exists. Called from
8152    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
8153    fn drop_tenant_index(&self, table: &str) {
8154        let index_name = format!("__tenant_idx_{table}");
8155        self.inner.index_store.drop_index(&index_name, table);
8156    }
8157
8158    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
8159    /// Used by the INSERT auto-fill path to know which column to
8160    /// populate with `current_tenant()` when the user didn't name it.
8161    pub fn tenant_column(&self, table: &str) -> Option<String> {
8162        self.inner.tenant_tables.read().get(table).cloned()
8163    }
8164
8165    /// Remove a table's tenant registration (Phase 2.5.4). Called by
8166    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
8167    /// but leaves any user-installed explicit policies intact.
8168    pub fn unregister_tenant_table(&self, table: &str) {
8169        self.inner.tenant_tables.write().remove(table);
8170        self.inner
8171            .rls_policies
8172            .write()
8173            .remove(&(table.to_string(), "__tenant_iso".to_string()));
8174        self.drop_tenant_index(table);
8175        // Only clear RLS enablement if no other policies remain.
8176        let has_other_policies = self
8177            .inner
8178            .rls_policies
8179            .read()
8180            .keys()
8181            .any(|(t, _)| t == table);
8182        if !has_other_policies {
8183            self.inner.rls_enabled_tables.write().remove(table);
8184        }
8185    }
8186
8187    /// Record that the running transaction has marked `id` in `collection`
8188    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8189    /// xid that was written into `xmax` — either the parent txn xid or
8190    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8191    /// this xid to revive only its own tombstones.
8192    pub(crate) fn record_pending_tombstone(
8193        &self,
8194        conn_id: u64,
8195        collection: &str,
8196        id: crate::storage::unified::entity::EntityId,
8197        stamper_xid: crate::storage::transaction::snapshot::Xid,
8198        previous_xmax: crate::storage::transaction::snapshot::Xid,
8199    ) {
8200        self.inner
8201            .pending_tombstones
8202            .write()
8203            .entry(conn_id)
8204            .or_default()
8205            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8206    }
8207
8208    pub(crate) fn record_pending_versioned_update(
8209        &self,
8210        conn_id: u64,
8211        collection: &str,
8212        old_id: crate::storage::unified::entity::EntityId,
8213        new_id: crate::storage::unified::entity::EntityId,
8214        stamper_xid: crate::storage::transaction::snapshot::Xid,
8215        previous_xmax: crate::storage::transaction::snapshot::Xid,
8216    ) {
8217        self.inner
8218            .pending_versioned_updates
8219            .write()
8220            .entry(conn_id)
8221            .or_default()
8222            .push((
8223                collection.to_string(),
8224                old_id,
8225                new_id,
8226                stamper_xid,
8227                previous_xmax,
8228            ));
8229    }
8230
8231    fn with_deferred_store_wal_if_transaction<T>(
8232        &self,
8233        f: impl FnOnce() -> RedDBResult<T>,
8234    ) -> RedDBResult<T> {
8235        let conn_id = current_connection_id();
8236        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8237            return f();
8238        }
8239
8240        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8241        let result = f();
8242        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8243        match result {
8244            Ok(value) => {
8245                self.record_pending_store_wal_actions(conn_id, captured);
8246                Ok(value)
8247            }
8248            Err(err) => Err(err),
8249        }
8250    }
8251
8252    fn with_deferred_store_wal_for_dml<T>(
8253        &self,
8254        capture_autocommit_events: bool,
8255        f: impl FnOnce() -> RedDBResult<T>,
8256    ) -> RedDBResult<T> {
8257        let conn_id = current_connection_id();
8258        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8259            return self.with_deferred_store_wal_if_transaction(f);
8260        }
8261        if !capture_autocommit_events {
8262            return f();
8263        }
8264
8265        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8266        let result = f();
8267        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8268        self.inner
8269            .db
8270            .store()
8271            .append_deferred_store_wal_actions(captured)
8272            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8273        result
8274    }
8275
8276    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8277        !query.suppress_events
8278            && self.collection_has_event_subscriptions_for_operation(
8279                &query.table,
8280                crate::catalog::SubscriptionOperation::Insert,
8281            )
8282    }
8283
8284    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8285        !query.suppress_events
8286            && self.collection_has_event_subscriptions_for_operation(
8287                &query.table,
8288                crate::catalog::SubscriptionOperation::Update,
8289            )
8290    }
8291
8292    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8293        !query.suppress_events
8294            && self.collection_has_event_subscriptions_for_operation(
8295                &query.table,
8296                crate::catalog::SubscriptionOperation::Delete,
8297            )
8298    }
8299
8300    fn collection_has_event_subscriptions_for_operation(
8301        &self,
8302        collection: &str,
8303        operation: crate::catalog::SubscriptionOperation,
8304    ) -> bool {
8305        let Some(contract) = self.db().collection_contract_arc(collection) else {
8306            return false;
8307        };
8308        contract.subscriptions.iter().any(|subscription| {
8309            subscription.enabled
8310                && (subscription.ops_filter.is_empty()
8311                    || subscription.ops_filter.contains(&operation))
8312        })
8313    }
8314
8315    fn record_pending_store_wal_actions(
8316        &self,
8317        conn_id: u64,
8318        actions: crate::storage::unified::DeferredStoreWalActions,
8319    ) {
8320        if actions.is_empty() {
8321            return;
8322        }
8323        let mut guard = self.inner.pending_store_wal_actions.write();
8324        guard.entry(conn_id).or_default().extend(actions);
8325    }
8326
8327    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8328        let Some(actions) = self
8329            .inner
8330            .pending_store_wal_actions
8331            .write()
8332            .remove(&conn_id)
8333        else {
8334            return Ok(());
8335        };
8336        self.inner
8337            .db
8338            .store()
8339            .append_deferred_store_wal_actions(actions)
8340            .map_err(|err| RedDBError::Internal(err.to_string()))
8341    }
8342
8343    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8344        self.inner
8345            .pending_store_wal_actions
8346            .write()
8347            .remove(&conn_id);
8348    }
8349
8350    fn xid_conflicts_with_snapshot(
8351        &self,
8352        xid: crate::storage::transaction::snapshot::Xid,
8353        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8354        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8355    ) -> bool {
8356        xid != 0
8357            && !own_xids.contains(&xid)
8358            && !self.inner.snapshot_manager.is_aborted(xid)
8359            && !self.inner.snapshot_manager.is_active(xid)
8360            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8361    }
8362
8363    fn conflict_error(
8364        collection: &str,
8365        logical_id: crate::storage::unified::entity::EntityId,
8366        xid: crate::storage::transaction::snapshot::Xid,
8367    ) -> RedDBError {
8368        RedDBError::Query(format!(
8369            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8370            logical_id.raw()
8371        ))
8372    }
8373
8374    fn check_logical_row_conflict(
8375        &self,
8376        collection: &str,
8377        logical_id: crate::storage::unified::entity::EntityId,
8378        excluded_ids: &[crate::storage::unified::entity::EntityId],
8379        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8380        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8381    ) -> RedDBResult<()> {
8382        let store = self.inner.db.store();
8383        let Some(manager) = store.get_collection(collection) else {
8384            return Ok(());
8385        };
8386
8387        for candidate in manager.query_all(|_| true) {
8388            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8389                continue;
8390            }
8391            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8392                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8393            }
8394            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8395                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8396            }
8397        }
8398        Ok(())
8399    }
8400
8401    pub(crate) fn check_table_row_write_conflicts(
8402        &self,
8403        conn_id: u64,
8404        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8405        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8406    ) -> RedDBResult<()> {
8407        let versioned_updates = self
8408            .inner
8409            .pending_versioned_updates
8410            .read()
8411            .get(&conn_id)
8412            .cloned()
8413            .unwrap_or_default();
8414        let tombstones = self
8415            .inner
8416            .pending_tombstones
8417            .read()
8418            .get(&conn_id)
8419            .cloned()
8420            .unwrap_or_default();
8421
8422        let store = self.inner.db.store();
8423        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8424            let Some(manager) = store.get_collection(&collection) else {
8425                continue;
8426            };
8427            let Some(old) = manager.get(old_id) else {
8428                continue;
8429            };
8430            let logical_id = old.logical_id();
8431            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8432                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8433            }
8434            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8435                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8436            }
8437            self.check_logical_row_conflict(
8438                &collection,
8439                logical_id,
8440                &[old_id, new_id],
8441                snapshot,
8442                own_xids,
8443            )?;
8444        }
8445
8446        for (collection, id, xid, previous_xmax) in tombstones {
8447            let Some(manager) = store.get_collection(&collection) else {
8448                continue;
8449            };
8450            let Some(entity) = manager.get(id) else {
8451                continue;
8452            };
8453            let logical_id = entity.logical_id();
8454            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8455                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8456            }
8457            if entity.xmax != xid
8458                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8459            {
8460                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8461            }
8462            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8463        }
8464
8465        Ok(())
8466    }
8467
8468    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8469        let versioned_updates = self
8470            .inner
8471            .pending_versioned_updates
8472            .read()
8473            .get(&conn_id)
8474            .cloned()
8475            .unwrap_or_default();
8476        let tombstones = self
8477            .inner
8478            .pending_tombstones
8479            .read()
8480            .get(&conn_id)
8481            .cloned()
8482            .unwrap_or_default();
8483
8484        let store = self.inner.db.store();
8485        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8486            if let Some(manager) = store.get_collection(&collection) {
8487                if let Some(mut entity) = manager.get(old_id) {
8488                    entity.set_xmax(xid);
8489                    let _ = manager.update(entity);
8490                }
8491            }
8492        }
8493        for (collection, id, xid, _previous_xmax) in tombstones {
8494            if let Some(manager) = store.get_collection(&collection) {
8495                if let Some(mut entity) = manager.get(id) {
8496                    entity.set_xmax(xid);
8497                    let _ = manager.update(entity);
8498                }
8499            }
8500        }
8501    }
8502
8503    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8504        self.inner
8505            .pending_versioned_updates
8506            .write()
8507            .remove(&conn_id);
8508    }
8509
8510    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8511        let Some(pending) = self
8512            .inner
8513            .pending_versioned_updates
8514            .write()
8515            .remove(&conn_id)
8516        else {
8517            return;
8518        };
8519
8520        let store = self.inner.db.store();
8521        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8522            if let Some(manager) = store.get_collection(&collection) {
8523                if let Some(mut old) = manager.get(old_id) {
8524                    if old.xmax == xid {
8525                        old.set_xmax(previous_xmax);
8526                        let _ = manager.update(old);
8527                    }
8528                }
8529            }
8530            let _ = store.delete_batch(&collection, &[new_id]);
8531        }
8532    }
8533
8534    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8535        let mut guard = self.inner.pending_versioned_updates.write();
8536        let Some(pending) = guard.get_mut(&conn_id) else {
8537            return 0;
8538        };
8539
8540        let store = self.inner.db.store();
8541        let mut reverted = 0usize;
8542        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8543            if *xid < stamper_xid {
8544                return true;
8545            }
8546            if let Some(manager) = store.get_collection(collection) {
8547                if let Some(mut old) = manager.get(*old_id) {
8548                    if old.xmax == *xid {
8549                        old.set_xmax(*previous_xmax);
8550                        let _ = manager.update(old);
8551                    }
8552                }
8553            }
8554            let _ = store.delete_batch(collection, &[*new_id]);
8555            reverted += 1;
8556            false
8557        });
8558        if pending.is_empty() {
8559            guard.remove(&conn_id);
8560        }
8561        reverted
8562    }
8563
8564    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8565    /// delete marker; commit only drops the rollback journal and emits
8566    /// side effects. Physical reclamation is left for VACUUM so old
8567    /// snapshots can still resolve the pre-delete row version.
8568    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8569        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8570            return;
8571        };
8572        if pending.is_empty() {
8573            return;
8574        }
8575
8576        let store = self.inner.db.store();
8577        for (collection, id, _xid, _previous_xmax) in pending {
8578            store.context_index().remove_entity(id);
8579            self.cdc_emit(
8580                crate::replication::cdc::ChangeOperation::Delete,
8581                &collection,
8582                id.raw(),
8583                "entity",
8584            );
8585        }
8586    }
8587
8588    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8589    /// become visible again to future snapshots. Best-effort: a row
8590    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8591    /// never reclaims tuples whose xmax is still referenced by any
8592    /// active snapshot, so this case is only reachable via external
8593    /// storage corruption.
8594    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8595        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8596            return;
8597        };
8598
8599        let store = self.inner.db.store();
8600        for (collection, id, xid, previous_xmax) in pending {
8601            let Some(manager) = store.get_collection(&collection) else {
8602                continue;
8603            };
8604            if let Some(mut entity) = manager.get(id) {
8605                if entity.xmax == xid {
8606                    entity.set_xmax(previous_xmax);
8607                    let _ = manager.update(entity);
8608                }
8609            }
8610        }
8611    }
8612
8613    /// Slice C of PRD #718 — accessor for the local wait registry.
8614    pub fn queue_wait_registry(
8615        &self,
8616    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
8617        self.inner.queue_wait_registry.clone()
8618    }
8619
8620    /// Buffer a `(scope, queue)` wake on the current connection so it
8621    /// fires post-COMMIT, or notify immediately if no transaction is
8622    /// open (autocommit path). The wait registry only ever observes
8623    /// notifies for committed work — rollback drops the buffer.
8624    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
8625        if self.current_xid().is_some() {
8626            let conn_id = current_connection_id();
8627            self.inner
8628                .pending_queue_wakes
8629                .write()
8630                .entry(conn_id)
8631                .or_default()
8632                .push((scope.to_string(), queue.to_string()));
8633            return;
8634        }
8635        self.inner.queue_wait_registry.notify(scope, queue);
8636    }
8637
8638    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
8639        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
8640            return;
8641        };
8642        for (scope, queue) in pending {
8643            self.inner.queue_wait_registry.notify(&scope, &queue);
8644        }
8645    }
8646
8647    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
8648        self.inner.pending_queue_wakes.write().remove(&conn_id);
8649    }
8650
8651    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8652        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8653            return;
8654        };
8655        for event in pending {
8656            self.cdc_emit_kv(
8657                event.op,
8658                &event.collection,
8659                &event.key,
8660                0,
8661                event.before,
8662                event.after,
8663            );
8664        }
8665    }
8666
8667    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8668        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8669    }
8670
8671    /// Materialise the entire graph store while applying MVCC visibility
8672    /// AND per-collection RLS to each candidate node and edge. Mirrors
8673    /// `materialize_graph` but routes every entity through the same
8674    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8675    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8676    /// edges). Returns the filtered `GraphStore` plus the
8677    /// `node_id → properties` map the executor needs for `RETURN n.*`
8678    /// projections.
8679    fn materialize_graph_with_rls(
8680        &self,
8681    ) -> RedDBResult<(
8682        crate::storage::engine::GraphStore,
8683        std::collections::HashMap<
8684            String,
8685            std::collections::HashMap<String, crate::storage::schema::Value>,
8686        >,
8687        crate::storage::query::unified::EdgeProperties,
8688    )> {
8689        use crate::storage::engine::GraphStore;
8690        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8691        use crate::storage::unified::entity::{EntityData, EntityKind};
8692        use std::collections::{HashMap, HashSet};
8693
8694        let store = self.inner.db.store();
8695        let snap_ctx = capture_current_snapshot();
8696        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8697
8698        let graph = GraphStore::new();
8699        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8700            HashMap::new();
8701        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8702        let mut allowed_nodes: HashSet<String> = HashSet::new();
8703
8704        // Per-collection cached compiled filters — Nodes-kind for
8705        // first pass, Edges-kind for the second. None entries mean
8706        // "RLS enabled, zero matching policy → deny all of this kind".
8707        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8708            HashMap::new();
8709        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8710            HashMap::new();
8711
8712        let collections = store.list_collections();
8713
8714        // First pass — gather nodes.
8715        for collection in &collections {
8716            let Some(manager) = store.get_collection(collection) else {
8717                continue;
8718            };
8719            let entities = manager.query_all(|_| true);
8720            for entity in entities {
8721                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8722                    continue;
8723                }
8724                let EntityKind::GraphNode(ref node) = entity.kind else {
8725                    continue;
8726                };
8727                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8728                    continue;
8729                }
8730                let id_str = entity.id.raw().to_string();
8731                graph
8732                    .add_node_with_label(
8733                        &id_str,
8734                        &node.label,
8735                        &super::graph_node_label(&node.node_type),
8736                    )
8737                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8738                allowed_nodes.insert(id_str.clone());
8739                if let EntityData::Node(node_data) = &entity.data {
8740                    node_properties.insert(id_str, node_data.properties.clone());
8741                }
8742            }
8743        }
8744
8745        // Second pass — gather edges. An edge appears only when both
8746        // endpoint nodes survived the RLS pass AND the edge itself
8747        // passes its own RLS gate.
8748        for collection in &collections {
8749            let Some(manager) = store.get_collection(collection) else {
8750                continue;
8751            };
8752            let entities = manager.query_all(|_| true);
8753            for entity in entities {
8754                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8755                    continue;
8756                }
8757                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8758                    continue;
8759                };
8760                if !allowed_nodes.contains(&edge.from_node)
8761                    || !allowed_nodes.contains(&edge.to_node)
8762                {
8763                    continue;
8764                }
8765                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8766                    continue;
8767                }
8768                let weight = match &entity.data {
8769                    EntityData::Edge(e) => e.weight,
8770                    _ => edge.weight as f32 / 1000.0,
8771                };
8772                let edge_label = super::graph_edge_label(&edge.label);
8773                graph
8774                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8775                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8776                if let EntityData::Edge(edge_data) = &entity.data {
8777                    edge_properties.insert(
8778                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8779                        edge_data.properties.clone(),
8780                    );
8781                }
8782            }
8783        }
8784
8785        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8786        // are used inside the helper closures via the per-kind helpers
8787        // declared at the bottom of this file.
8788        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8789
8790        Ok((graph, node_properties, edge_properties))
8791    }
8792
8793    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8794    /// freshly-inserted entity when the current connection holds an
8795    /// open transaction. Used by graph / vector / queue / timeseries
8796    /// write paths that go through the DevX builder API (`db.node(...)
8797    /// .save()` and friends) — those live in the storage crate and
8798    /// can't reach `current_xid()` without crossing layers, so the
8799    /// application layer calls this helper right after `save()` to
8800    /// finalise the MVCC stamp.
8801    ///
8802    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8803    /// write, so the non-transactional hot path stays untouched.
8804    ///
8805    /// Best-effort: if the collection or entity disappears between
8806    /// the save and the stamp (concurrent DROP), we silently skip.
8807    pub(crate) fn stamp_xmin_if_in_txn(
8808        &self,
8809        collection: &str,
8810        id: crate::storage::unified::entity::EntityId,
8811    ) {
8812        let Some(xid) = self.current_xid() else {
8813            return;
8814        };
8815        let store = self.inner.db.store();
8816        let Some(manager) = store.get_collection(collection) else {
8817            return;
8818        };
8819        if let Some(mut entity) = manager.get(id) {
8820            entity.set_xmin(xid);
8821            let _ = manager.update(entity);
8822        }
8823    }
8824
8825    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8826    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8827    /// pending entries with `xid < stamper_xid` stay queued because
8828    /// they belong to the enclosing scope — they'll either flush on
8829    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8830    ///
8831    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8832    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8833        let mut guard = self.inner.pending_tombstones.write();
8834        let Some(pending) = guard.get_mut(&conn_id) else {
8835            return 0;
8836        };
8837
8838        let store = self.inner.db.store();
8839        let mut revived = 0usize;
8840        pending.retain(|(collection, id, xid, previous_xmax)| {
8841            if *xid < stamper_xid {
8842                // Stamped before the savepoint — keep in queue.
8843                return true;
8844            }
8845            if let Some(manager) = store.get_collection(collection) {
8846                if let Some(mut entity) = manager.get(*id) {
8847                    if entity.xmax == *xid {
8848                        entity.set_xmax(*previous_xmax);
8849                        let _ = manager.update(entity);
8850                        revived += 1;
8851                    }
8852                }
8853            }
8854            false
8855        });
8856        if pending.is_empty() {
8857            guard.remove(&conn_id);
8858        }
8859        revived
8860    }
8861
8862    /// Return the snapshot the current connection should use for visibility
8863    /// checks (Phase 2.3 PG parity).
8864    ///
8865    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8866    ///   the snapshot stored in its `TxnContext`.
8867    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8868    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8869    ///   visible so this degrades to "see everything committed".
8870    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8871        let conn_id = current_connection_id();
8872        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8873            return ctx.snapshot;
8874        }
8875        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8876        // every already-committed xid (which is strictly less) passes the
8877        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8878        // the `in_progress` set and stay hidden until they commit. Using
8879        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8880        let high_water = self.inner.snapshot_manager.peek_next_xid();
8881        self.inner.snapshot_manager.snapshot(high_water)
8882    }
8883
8884    /// Xid of the current connection's active transaction, or `None` when
8885    /// running outside a BEGIN/COMMIT block. Write paths call this to
8886    /// decide whether to stamp `xmin`/`xmax` on tuples.
8887    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8888    /// sub-xid so new writes can be selectively rolled back. Otherwise
8889    /// the parent txn's xid is returned, matching pre-savepoint
8890    /// behaviour. Callers that need the enclosing *transaction* xid
8891    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8892    /// directly.
8893    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8894        let conn_id = current_connection_id();
8895        self.inner
8896            .tx_contexts
8897            .read()
8898            .get(&conn_id)
8899            .map(|ctx| ctx.writer_xid())
8900    }
8901
8902    /// `true` when the given connection id has an open `BEGIN`. Issue
8903    /// #760 — `OpenStream` consults this to refuse output streams that
8904    /// would otherwise collide with an interactive transaction (see
8905    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
8906    /// connection-id plumbing run with id `0`, which never carries a
8907    /// transaction context, so this returns `false` on those paths.
8908    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
8909        self.inner.tx_contexts.read().contains_key(&conn_id)
8910    }
8911
8912    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8913    /// the oldest-active xid when reclaiming dead tuples.
8914    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8915        Arc::clone(&self.inner.snapshot_manager)
8916    }
8917
8918    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8919        let manager = &self.inner.snapshot_manager;
8920        let next_xid = manager.peek_next_xid();
8921        let mut cutoff = next_xid;
8922        if let Some(oldest_active) = manager.oldest_active_xid() {
8923            cutoff = cutoff.min(oldest_active);
8924        }
8925        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8926            cutoff = cutoff.min(oldest_pinned);
8927        }
8928        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8929        if retention_xids > 0 {
8930            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8931        }
8932        cutoff
8933    }
8934
8935    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8936        let registered = self.inner.index_store.list_indices(table);
8937        if registered.is_empty() {
8938            return Ok(());
8939        }
8940        let store = self.inner.db.store();
8941        let Some(manager) = store.get_collection(table) else {
8942            return Ok(());
8943        };
8944        let entity_fields = manager
8945            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8946            .into_iter()
8947            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8948            .collect::<Vec<_>>();
8949
8950        for index in registered {
8951            self.inner.index_store.drop_index(&index.name, table);
8952            self.inner
8953                .index_store
8954                .create_index(
8955                    &index.name,
8956                    table,
8957                    &index.columns,
8958                    index.method,
8959                    index.unique,
8960                    &entity_fields,
8961                )
8962                .map_err(RedDBError::Internal)?;
8963            self.inner.index_store.register(index);
8964        }
8965        self.invalidate_plan_cache();
8966        Ok(())
8967    }
8968
8969    pub(crate) fn persist_runtime_index_descriptor(
8970        &self,
8971        index: super::index_store::RegisteredIndex,
8972    ) -> RedDBResult<()> {
8973        let store = self.inner.db.store();
8974        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
8975        let entity = crate::storage::UnifiedEntity::new(
8976            crate::storage::EntityId::new(0),
8977            crate::storage::EntityKind::TableRow {
8978                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
8979                row_id: 0,
8980            },
8981            crate::storage::EntityData::Row(crate::storage::RowData {
8982                columns: Vec::new(),
8983                named: Some(
8984                    [
8985                        (
8986                            "collection".to_string(),
8987                            crate::storage::schema::Value::text(index.collection.clone()),
8988                        ),
8989                        (
8990                            "name".to_string(),
8991                            crate::storage::schema::Value::text(index.name.clone()),
8992                        ),
8993                        (
8994                            "columns".to_string(),
8995                            crate::storage::schema::Value::text(index.columns.join("\u{1f}")),
8996                        ),
8997                        (
8998                            "method".to_string(),
8999                            crate::storage::schema::Value::text(index_method_kind_as_str(
9000                                index.method,
9001                            )),
9002                        ),
9003                        (
9004                            "unique".to_string(),
9005                            crate::storage::schema::Value::Boolean(index.unique),
9006                        ),
9007                        (
9008                            "dropped".to_string(),
9009                            crate::storage::schema::Value::Boolean(false),
9010                        ),
9011                    ]
9012                    .into_iter()
9013                    .collect(),
9014                ),
9015                schema: None,
9016            }),
9017        );
9018        store
9019            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9020            .map(|_| ())
9021            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9022    }
9023
9024    pub(crate) fn persist_runtime_index_drop(
9025        &self,
9026        collection: &str,
9027        name: &str,
9028    ) -> RedDBResult<()> {
9029        let store = self.inner.db.store();
9030        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
9031        let entity = crate::storage::UnifiedEntity::new(
9032            crate::storage::EntityId::new(0),
9033            crate::storage::EntityKind::TableRow {
9034                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
9035                row_id: 0,
9036            },
9037            crate::storage::EntityData::Row(crate::storage::RowData {
9038                columns: Vec::new(),
9039                named: Some(
9040                    [
9041                        (
9042                            "collection".to_string(),
9043                            crate::storage::schema::Value::text(collection.to_string()),
9044                        ),
9045                        (
9046                            "name".to_string(),
9047                            crate::storage::schema::Value::text(name.to_string()),
9048                        ),
9049                        (
9050                            "dropped".to_string(),
9051                            crate::storage::schema::Value::Boolean(true),
9052                        ),
9053                    ]
9054                    .into_iter()
9055                    .collect(),
9056                ),
9057                schema: None,
9058            }),
9059        );
9060        store
9061            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9062            .map(|_| ())
9063            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9064    }
9065
9066    fn rehydrate_runtime_index_registry(&self) -> RedDBResult<()> {
9067        let store = self.inner.db.store();
9068        let Some(manager) = store.get_collection(RUNTIME_INDEX_REGISTRY_COLLECTION) else {
9069            return Ok(());
9070        };
9071        let mut rows = manager.query_all(|_| true);
9072        rows.sort_by_key(|entity| entity.id.raw());
9073
9074        let mut latest = std::collections::HashMap::<
9075            (String, String),
9076            Option<super::index_store::RegisteredIndex>,
9077        >::new();
9078        for entity in rows {
9079            let crate::storage::EntityData::Row(row) = &entity.data else {
9080                continue;
9081            };
9082            let Some(named) = &row.named else {
9083                continue;
9084            };
9085            let Some(collection) = named_text(named, "collection") else {
9086                continue;
9087            };
9088            let Some(name) = named_text(named, "name") else {
9089                continue;
9090            };
9091            let dropped = named_bool(named, "dropped").unwrap_or(false);
9092            let key = (collection.clone(), name.clone());
9093            if dropped {
9094                latest.insert(key, None);
9095                continue;
9096            }
9097            let columns = named_text(named, "columns")
9098                .map(|raw| {
9099                    raw.split('\u{1f}')
9100                        .filter(|part| !part.is_empty())
9101                        .map(str::to_string)
9102                        .collect::<Vec<_>>()
9103                })
9104                .unwrap_or_default();
9105            let Some(method) =
9106                named_text(named, "method").and_then(|raw| index_method_kind_from_str(&raw))
9107            else {
9108                continue;
9109            };
9110            latest.insert(
9111                key,
9112                Some(super::index_store::RegisteredIndex {
9113                    name,
9114                    collection,
9115                    columns,
9116                    method,
9117                    unique: named_bool(named, "unique").unwrap_or(false),
9118                }),
9119            );
9120        }
9121
9122        for index in latest.into_values().flatten() {
9123            let Some(manager) = store.get_collection(&index.collection) else {
9124                continue;
9125            };
9126            let entity_fields = manager
9127                .query_all(|entity| {
9128                    matches!(entity.kind, crate::storage::EntityKind::TableRow { .. })
9129                })
9130                .into_iter()
9131                .map(|entity| (entity.id, table_row_index_fields(&entity)))
9132                .collect::<Vec<_>>();
9133            self.inner
9134                .index_store
9135                .create_index(
9136                    &index.name,
9137                    &index.collection,
9138                    &index.columns,
9139                    index.method,
9140                    index.unique,
9141                    &entity_fields,
9142                )
9143                .map_err(RedDBError::Internal)?;
9144            self.inner.index_store.register(index);
9145        }
9146        self.invalidate_plan_cache();
9147        Ok(())
9148    }
9149
9150    /// Own-tx xids (parent + open/released savepoints) for the current
9151    /// connection. Transports + tests that build a `SnapshotContext`
9152    /// manually (outside the `execute_query` scope) need this set so
9153    /// the writer's own uncommitted tuples stay visible to self.
9154    pub fn current_txn_own_xids(
9155        &self,
9156    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
9157        let mut set = std::collections::HashSet::new();
9158        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
9159            set.insert(ctx.xid);
9160            for (_, sub) in &ctx.savepoints {
9161                set.insert(*sub);
9162            }
9163            for sub in &ctx.released_sub_xids {
9164                set.insert(*sub);
9165            }
9166        }
9167        set
9168    }
9169
9170    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
9171    ///
9172    /// Callers use this to check whether a table name is a registered
9173    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
9174    /// scan it (`registry.scan(name)`). The read-path rewriter consults
9175    /// this before dispatching into native-collection lookup.
9176    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
9177        Arc::clone(&self.inner.foreign_tables)
9178    }
9179
9180    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
9181    pub fn is_rls_enabled(&self, table: &str) -> bool {
9182        self.inner.rls_enabled_tables.read().contains(table)
9183    }
9184
9185    /// Collect the USING predicates that apply to this `(table, role, action)`.
9186    ///
9187    /// Returned filters should be OR-combined (a row passes RLS when *any*
9188    /// matching policy accepts it) and then AND-ed into the query's WHERE.
9189    /// When the table has RLS disabled this returns an empty Vec — callers
9190    /// can fast-path back to the unfiltered read.
9191    pub fn matching_rls_policies(
9192        &self,
9193        table: &str,
9194        role: Option<&str>,
9195        action: crate::storage::query::ast::PolicyAction,
9196    ) -> Vec<crate::storage::query::ast::Filter> {
9197        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
9198        // callers that don't name a kind only see Table-scoped
9199        // policies (which is what execute SELECT / UPDATE / DELETE
9200        // expect).
9201        self.matching_rls_policies_for_kind(
9202            table,
9203            role,
9204            action,
9205            crate::storage::query::ast::PolicyTargetKind::Table,
9206        )
9207    }
9208
9209    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
9210    ///
9211    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
9212    /// `Vectors`, queue consumers request `Messages`, and timeseries
9213    /// range scans request `Points`. Policies tagged with a
9214    /// different kind are skipped so a graph-scoped policy doesn't
9215    /// accidentally gate a table SELECT on the same collection.
9216    pub fn matching_rls_policies_for_kind(
9217        &self,
9218        table: &str,
9219        role: Option<&str>,
9220        action: crate::storage::query::ast::PolicyAction,
9221        kind: crate::storage::query::ast::PolicyTargetKind,
9222    ) -> Vec<crate::storage::query::ast::Filter> {
9223        if !self.is_rls_enabled(table) {
9224            return Vec::new();
9225        }
9226        let policies = self.inner.rls_policies.read();
9227        policies
9228            .iter()
9229            .filter_map(|((t, _), p)| {
9230                if t != table {
9231                    return None;
9232                }
9233                // Kind gate — Table policies also apply to every
9234                // other kind *iff* the policy predicate evaluates
9235                // against entity fields that exist uniformly; the
9236                // caller's kind filter is the stricter check, so
9237                // match literally. Auto-tenancy policies stamp
9238                // Table and the caller passes the concrete kind —
9239                // we allow Table policies to apply cross-kind for
9240                // backwards compat.
9241                if p.target_kind != kind
9242                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
9243                {
9244                    return None;
9245                }
9246                // Action gate — `None` means "ALL" actions.
9247                if let Some(a) = p.action {
9248                    if a != action {
9249                        return None;
9250                    }
9251                }
9252                // Role gate — `None` means "any role".
9253                if let Some(p_role) = p.role.as_deref() {
9254                    match role {
9255                        Some(r) if r == p_role => {}
9256                        _ => return None,
9257                    }
9258                }
9259                Some((*p.using).clone())
9260            })
9261            .collect()
9262    }
9263
9264    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
9265        let store = self.inner.db.store();
9266        if let Some(stats) =
9267            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
9268        {
9269            crate::storage::query::planner::stats_catalog::persist_table_stats(
9270                store.as_ref(),
9271                &stats,
9272            );
9273        } else {
9274            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9275        }
9276        self.invalidate_plan_cache();
9277    }
9278
9279    pub(crate) fn note_table_write(&self, table: &str) {
9280        // Skip the write lock when the table is already marked
9281        // dirty. With single-row UPDATEs in a loop this used to
9282        // grab the planner_dirty_tables write lock N times even
9283        // though the first call already flipped the flag.
9284        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
9285        if !already_dirty {
9286            self.inner
9287                .planner_dirty_tables
9288                .write()
9289                .insert(table.to_string());
9290        }
9291        self.invalidate_result_cache_for_table(table);
9292    }
9293
9294    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
9295    /// `RuntimeQueryResult` so callers over the SQL interface see the
9296    /// plan tree in the same shape a SELECT produces.
9297    ///
9298    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
9299    /// Nodes are walked depth-first; `depth` counts from 0 at the
9300    /// root so a text renderer can indent without re-walking.
9301    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
9302        let explain = self.explain_query(inner_sql)?;
9303
9304        let columns = vec![
9305            "op".to_string(),
9306            "source".to_string(),
9307            "est_rows".to_string(),
9308            "est_cost".to_string(),
9309            "depth".to_string(),
9310        ];
9311
9312        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
9313
9314        // Prepend `CteScan` markers when the query carried a leading
9315        // WITH clause. The CTE bodies are already inlined into the
9316        // main plan tree, but operators reading EXPLAIN need to see
9317        // which named CTEs were resolved — without this row the plan
9318        // would look indistinguishable from a hand-inlined query.
9319        for name in &explain.cte_materializations {
9320            use std::sync::Arc;
9321            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9322            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
9323            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
9324            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
9325            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
9326            rec.set_arc(Arc::from("depth"), Value::Integer(0));
9327            records.push(rec);
9328        }
9329
9330        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
9331
9332        let result = crate::storage::query::unified::UnifiedResult {
9333            columns,
9334            records,
9335            stats: Default::default(),
9336            pre_serialized_json: None,
9337        };
9338
9339        Ok(RuntimeQueryResult {
9340            query: raw_query.to_string(),
9341            mode: explain.mode,
9342            statement: "explain",
9343            engine: "runtime-explain",
9344            result,
9345            affected_rows: 0,
9346            statement_type: "select",
9347            bookmark: None,
9348        })
9349    }
9350
9351    // -----------------------------------------------------------------
9352    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
9353    // -----------------------------------------------------------------
9354
9355    /// Project a `QueryExpr` to the (action, resource) pair the
9356    /// privilege engine cares about. Returns `Ok(())` for statements
9357    /// that don't touch user data (transaction control, SHOW, SET, etc.).
9358    pub(crate) fn check_query_privilege(
9359        &self,
9360        expr: &crate::storage::query::ast::QueryExpr,
9361    ) -> Result<(), String> {
9362        use crate::auth::privileges::{Action, AuthzContext, Resource};
9363        use crate::auth::UserId;
9364        use crate::storage::query::ast::QueryExpr;
9365
9366        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
9367        // The bootstrap path itself goes through `execute_query` so this
9368        // is the only sensible default; once auth is wired, the gate
9369        // becomes active.
9370        let auth_store = match self.inner.auth_store.read().clone() {
9371            Some(s) => s,
9372            None => return Ok(()),
9373        };
9374
9375        // Resolve principal + role from the thread-local identity.
9376        // Anonymous (no identity) is allowed to read the bootstrap path
9377        // only when auth_store says so; we treat missing identity as
9378        // platform-admin-equivalent here so embedded test harnesses
9379        // continue to work without setting an identity.
9380        let (username, role) = match current_auth_identity() {
9381            Some(p) => p,
9382            None => return Ok(()),
9383        };
9384        let tenant = current_tenant();
9385
9386        let ctx = AuthzContext {
9387            principal: &username,
9388            effective_role: role,
9389            tenant: tenant.as_deref(),
9390        };
9391        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
9392
9393        // Map QueryExpr → (Action, Resource).
9394        let (action, resource) = match expr {
9395            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9396            QueryExpr::RankOf(_) | QueryExpr::ApproxRankOf(_) | QueryExpr::RankRange(_) => {
9397                (Action::Select, Resource::Database)
9398            }
9399            QueryExpr::QueueSelect(q) => {
9400                return self.check_queue_op_privilege(
9401                    &auth_store,
9402                    &principal_id,
9403                    role,
9404                    tenant.as_deref(),
9405                    "queue:peek",
9406                    &q.queue,
9407                );
9408            }
9409            QueryExpr::QueueCommand(cmd) => {
9410                use crate::storage::query::ast::QueueCommand;
9411                let (queue, action_verb) = match cmd {
9412                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
9413                    QueueCommand::Pop { queue, .. }
9414                    | QueueCommand::GroupRead { queue, .. }
9415                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
9416                    QueueCommand::Peek { queue, .. }
9417                    | QueueCommand::Len { queue }
9418                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
9419                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
9420                    QueueCommand::Nack {
9421                        queue, delay_ms, ..
9422                    } => {
9423                        // Per-failure retry overrides re-shape retry
9424                        // behaviour for everyone draining the queue and
9425                        // gate on the dedicated `queue:retry` verb so
9426                        // operators can grant base NACK without granting
9427                        // the override capability.
9428                        let verb = if delay_ms.is_some() {
9429                            "queue:retry"
9430                        } else {
9431                            "queue:nack"
9432                        };
9433                        (queue.as_str(), verb)
9434                    }
9435                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
9436                    // `GroupCreate` is part of the consumer-setup
9437                    // surface — read-side, never destructive.
9438                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
9439                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
9440                };
9441                return self.check_queue_op_privilege(
9442                    &auth_store,
9443                    &principal_id,
9444                    role,
9445                    tenant.as_deref(),
9446                    action_verb,
9447                    queue,
9448                );
9449            }
9450            QueryExpr::Graph(g) => {
9451                // MATCH … RETURN is the explorer's pattern-traversal
9452                // surface — gate on `graph:traverse` (#757).
9453                self.check_graph_op_privilege(
9454                    &auth_store,
9455                    &principal_id,
9456                    role,
9457                    tenant.as_deref(),
9458                    "graph:traverse",
9459                )?;
9460                if auth_store.iam_authorization_enabled() {
9461                    self.check_graph_property_projection_privilege(
9462                        &auth_store,
9463                        &principal_id,
9464                        role,
9465                        tenant.as_deref(),
9466                        g,
9467                    )?;
9468                    return Ok(());
9469                }
9470                return Ok(());
9471            }
9472            QueryExpr::Path(_) => {
9473                // PATH FROM … TO … is a path-traversal query — gates
9474                // on `graph:traverse` like neighborhood/shortest-path
9475                // (#757).
9476                return self.check_graph_op_privilege(
9477                    &auth_store,
9478                    &principal_id,
9479                    role,
9480                    tenant.as_deref(),
9481                    "graph:traverse",
9482                );
9483            }
9484            QueryExpr::GraphCommand(cmd) => {
9485                use crate::storage::query::ast::GraphCommand;
9486                let action_verb = match cmd {
9487                    // Metadata / property reads.
9488                    GraphCommand::Properties { .. } => "graph:read",
9489                    // Traversal / pattern-walk surface.
9490                    GraphCommand::Neighborhood { .. }
9491                    | GraphCommand::Traverse { .. }
9492                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
9493                    // Analytics algorithms — expensive enough that Red
9494                    // UI needs to gate the runner independently of
9495                    // ordinary traversal.
9496                    GraphCommand::Centrality { .. }
9497                    | GraphCommand::Community { .. }
9498                    | GraphCommand::Components { .. }
9499                    | GraphCommand::Cycles { .. }
9500                    | GraphCommand::Clustering
9501                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
9502                };
9503                return self.check_graph_op_privilege(
9504                    &auth_store,
9505                    &principal_id,
9506                    role,
9507                    tenant.as_deref(),
9508                    action_verb,
9509                );
9510            }
9511            QueryExpr::Vector(v) => {
9512                if auth_store.iam_authorization_enabled() {
9513                    self.check_vector_op_privilege(
9514                        &auth_store,
9515                        &principal_id,
9516                        role,
9517                        tenant.as_deref(),
9518                        "vector:search",
9519                        &v.collection,
9520                    )?;
9521                    self.check_table_like_column_projection_privilege(
9522                        &auth_store,
9523                        &principal_id,
9524                        role,
9525                        tenant.as_deref(),
9526                        &v.collection,
9527                        &["content".to_string()],
9528                    )?;
9529                    return Ok(());
9530                }
9531                return Ok(());
9532            }
9533            QueryExpr::SearchCommand(cmd) => {
9534                use crate::storage::query::ast::SearchCommand;
9535                if auth_store.iam_authorization_enabled() {
9536                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
9537                    // HYBRID ... COLLECTION <c>` are the same UI
9538                    // affordances as `VECTOR SEARCH` / hybrid joins —
9539                    // Red UI must see the same `vector:search` envelope
9540                    // so a single toolbar grant is sufficient.
9541                    let collection = match cmd {
9542                        SearchCommand::Similar { collection, .. }
9543                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
9544                        _ => None,
9545                    };
9546                    if let Some(c) = collection {
9547                        self.check_vector_op_privilege(
9548                            &auth_store,
9549                            &principal_id,
9550                            role,
9551                            tenant.as_deref(),
9552                            "vector:search",
9553                            c,
9554                        )?;
9555                        return Ok(());
9556                    }
9557                }
9558                return Ok(());
9559            }
9560            QueryExpr::Hybrid(h) => {
9561                if auth_store.iam_authorization_enabled() {
9562                    // The vector half of a hybrid search is gated under
9563                    // the same `vector:search` verb as a standalone
9564                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
9565                    // must surface the same UI-safe denial envelope
9566                    // when the principal lacks the grant. The
9567                    // structured half is dispatched to its own gate via
9568                    // the inner query during execution.
9569                    self.check_vector_op_privilege(
9570                        &auth_store,
9571                        &principal_id,
9572                        role,
9573                        tenant.as_deref(),
9574                        "vector:search",
9575                        &h.vector.collection,
9576                    )?;
9577                    return Ok(());
9578                }
9579                return Ok(());
9580            }
9581            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9582            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9583            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9584            // Joins inherit the read privilege from any constituent
9585            // table — for now we emit a single Select on the database
9586            // (admins bypass; non-admins need a Database/Schema grant).
9587            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9588            // GRANT / REVOKE / USER DDL are authority statements;
9589            // require Admin (the helper methods enforce).
9590            QueryExpr::Grant(_)
9591            | QueryExpr::Revoke(_)
9592            | QueryExpr::AlterUser(_)
9593            | QueryExpr::CreateUser(_) => {
9594                return if role == crate::auth::Role::Admin {
9595                    Ok(())
9596                } else {
9597                    Err(format!(
9598                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9599                        username, role
9600                    ))
9601                };
9602            }
9603            QueryExpr::CreateIamPolicy { id, .. } => {
9604                return self.check_policy_management_privilege(
9605                    &auth_store,
9606                    &principal_id,
9607                    role,
9608                    tenant.as_deref(),
9609                    "policy:put",
9610                    "policy",
9611                    id,
9612                );
9613            }
9614            QueryExpr::DropIamPolicy { id } => {
9615                return self.check_policy_management_privilege(
9616                    &auth_store,
9617                    &principal_id,
9618                    role,
9619                    tenant.as_deref(),
9620                    "policy:drop",
9621                    "policy",
9622                    id,
9623                );
9624            }
9625            QueryExpr::AttachPolicy { policy_id, .. } => {
9626                return self.check_policy_management_privilege(
9627                    &auth_store,
9628                    &principal_id,
9629                    role,
9630                    tenant.as_deref(),
9631                    "policy:attach",
9632                    "policy",
9633                    policy_id,
9634                );
9635            }
9636            QueryExpr::DetachPolicy { policy_id, .. } => {
9637                return self.check_policy_management_privilege(
9638                    &auth_store,
9639                    &principal_id,
9640                    role,
9641                    tenant.as_deref(),
9642                    "policy:detach",
9643                    "policy",
9644                    policy_id,
9645                );
9646            }
9647            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9648                return Ok(());
9649            }
9650            QueryExpr::SimulatePolicy { .. } => {
9651                return self.check_policy_management_privilege(
9652                    &auth_store,
9653                    &principal_id,
9654                    role,
9655                    tenant.as_deref(),
9656                    "policy:simulate",
9657                    "policy",
9658                    "*",
9659                );
9660            }
9661            QueryExpr::LintPolicy { .. } => {
9662                // Linting is a read-only inspection — gate it like
9663                // simulate (policy management role).
9664                return self.check_policy_management_privilege(
9665                    &auth_store,
9666                    &principal_id,
9667                    role,
9668                    tenant.as_deref(),
9669                    "policy:simulate",
9670                    "policy",
9671                    "*",
9672                );
9673            }
9674            QueryExpr::MigratePolicyMode { dry_run, .. } => {
9675                // DRY RUN is a pre-flight inspection (policy:simulate).
9676                // The actual mode flip is a privileged mutation under
9677                // the policy:put action (it persists a new enforcement
9678                // mode to the vault KV through `set_enforcement_mode`).
9679                let action = if *dry_run {
9680                    "policy:simulate"
9681                } else {
9682                    "policy:put"
9683                };
9684                return self.check_policy_management_privilege(
9685                    &auth_store,
9686                    &principal_id,
9687                    role,
9688                    tenant.as_deref(),
9689                    action,
9690                    "policy",
9691                    "*",
9692                );
9693            }
9694            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9695            // when IAM mode is active. Other DDL is gated by the arms further down.
9696            QueryExpr::DropTable(q) => {
9697                return self.check_ddl_collection_privilege(
9698                    &auth_store,
9699                    &principal_id,
9700                    role,
9701                    tenant.as_deref(),
9702                    &username,
9703                    "drop",
9704                    &q.name,
9705                );
9706            }
9707            QueryExpr::DropGraph(q) => {
9708                return self.check_ddl_collection_privilege(
9709                    &auth_store,
9710                    &principal_id,
9711                    role,
9712                    tenant.as_deref(),
9713                    &username,
9714                    "drop",
9715                    &q.name,
9716                );
9717            }
9718            QueryExpr::DropVector(q) => {
9719                return self.check_ddl_collection_privilege(
9720                    &auth_store,
9721                    &principal_id,
9722                    role,
9723                    tenant.as_deref(),
9724                    &username,
9725                    "drop",
9726                    &q.name,
9727                );
9728            }
9729            QueryExpr::DropDocument(q) => {
9730                return self.check_ddl_collection_privilege(
9731                    &auth_store,
9732                    &principal_id,
9733                    role,
9734                    tenant.as_deref(),
9735                    &username,
9736                    "drop",
9737                    &q.name,
9738                );
9739            }
9740            QueryExpr::DropKv(q) => {
9741                return self.check_ddl_collection_privilege(
9742                    &auth_store,
9743                    &principal_id,
9744                    role,
9745                    tenant.as_deref(),
9746                    &username,
9747                    "drop",
9748                    &q.name,
9749                );
9750            }
9751            QueryExpr::DropCollection(q) => {
9752                return self.check_ddl_collection_privilege(
9753                    &auth_store,
9754                    &principal_id,
9755                    role,
9756                    tenant.as_deref(),
9757                    &username,
9758                    "drop",
9759                    &q.name,
9760                );
9761            }
9762            QueryExpr::Truncate(q) => {
9763                return self.check_ddl_collection_privilege(
9764                    &auth_store,
9765                    &principal_id,
9766                    role,
9767                    tenant.as_deref(),
9768                    &username,
9769                    "truncate",
9770                    &q.name,
9771                );
9772            }
9773            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
9774            // create/alter/drop verbs gate operations with a clear
9775            // per-collection target so Red UI can author fine-grained
9776            // policies (`create on collection:users`). Namespace-level
9777            // and grouped DDL fall back to broader `schema:admin` /
9778            // `schema:write` verbs against a `schema:<name>` resource.
9779            // All branches share the [`check_ddl_object_privilege`]
9780            // helper so allows / denies produce the same structured
9781            // "principal=… action=… resource=<kind>:<name> denied by
9782            // IAM policy" reason the Red UI security read contracts
9783            // (#740) already render.
9784            QueryExpr::CreateTable(q) => {
9785                return self.check_ddl_object_privilege(
9786                    &auth_store,
9787                    &principal_id,
9788                    role,
9789                    tenant.as_deref(),
9790                    &username,
9791                    "create",
9792                    "collection",
9793                    &q.name,
9794                    crate::auth::Role::Write,
9795                );
9796            }
9797            QueryExpr::CreateCollection(q) => {
9798                return self.check_ddl_object_privilege(
9799                    &auth_store,
9800                    &principal_id,
9801                    role,
9802                    tenant.as_deref(),
9803                    &username,
9804                    "create",
9805                    "collection",
9806                    &q.name,
9807                    crate::auth::Role::Write,
9808                );
9809            }
9810            QueryExpr::CreateVector(q) => {
9811                return self.check_ddl_object_privilege(
9812                    &auth_store,
9813                    &principal_id,
9814                    role,
9815                    tenant.as_deref(),
9816                    &username,
9817                    "create",
9818                    "collection",
9819                    &q.name,
9820                    crate::auth::Role::Write,
9821                );
9822            }
9823            QueryExpr::AlterTable(q) => {
9824                return self.check_ddl_object_privilege(
9825                    &auth_store,
9826                    &principal_id,
9827                    role,
9828                    tenant.as_deref(),
9829                    &username,
9830                    "alter",
9831                    "collection",
9832                    &q.name,
9833                    crate::auth::Role::Write,
9834                );
9835            }
9836            QueryExpr::CreateIndex(q) => {
9837                return self.check_ddl_object_privilege(
9838                    &auth_store,
9839                    &principal_id,
9840                    role,
9841                    tenant.as_deref(),
9842                    &username,
9843                    "create",
9844                    "collection",
9845                    &q.table,
9846                    crate::auth::Role::Write,
9847                );
9848            }
9849            QueryExpr::DropIndex(q) => {
9850                return self.check_ddl_object_privilege(
9851                    &auth_store,
9852                    &principal_id,
9853                    role,
9854                    tenant.as_deref(),
9855                    &username,
9856                    "drop",
9857                    "collection",
9858                    &q.table,
9859                    crate::auth::Role::Write,
9860                );
9861            }
9862            QueryExpr::CreateSchema(q) => {
9863                return self.check_ddl_object_privilege(
9864                    &auth_store,
9865                    &principal_id,
9866                    role,
9867                    tenant.as_deref(),
9868                    &username,
9869                    "schema:admin",
9870                    "schema",
9871                    &q.name,
9872                    crate::auth::Role::Admin,
9873                );
9874            }
9875            QueryExpr::DropSchema(q) => {
9876                return self.check_ddl_object_privilege(
9877                    &auth_store,
9878                    &principal_id,
9879                    role,
9880                    tenant.as_deref(),
9881                    &username,
9882                    "schema:admin",
9883                    "schema",
9884                    &q.name,
9885                    crate::auth::Role::Admin,
9886                );
9887            }
9888            QueryExpr::CreateSequence(q) => {
9889                return self.check_ddl_object_privilege(
9890                    &auth_store,
9891                    &principal_id,
9892                    role,
9893                    tenant.as_deref(),
9894                    &username,
9895                    "create",
9896                    "collection",
9897                    &q.name,
9898                    crate::auth::Role::Write,
9899                );
9900            }
9901            QueryExpr::DropSequence(q) => {
9902                return self.check_ddl_object_privilege(
9903                    &auth_store,
9904                    &principal_id,
9905                    role,
9906                    tenant.as_deref(),
9907                    &username,
9908                    "drop",
9909                    "collection",
9910                    &q.name,
9911                    crate::auth::Role::Write,
9912                );
9913            }
9914            QueryExpr::CreateView(q) => {
9915                return self.check_ddl_object_privilege(
9916                    &auth_store,
9917                    &principal_id,
9918                    role,
9919                    tenant.as_deref(),
9920                    &username,
9921                    "create",
9922                    "collection",
9923                    &q.name,
9924                    crate::auth::Role::Write,
9925                );
9926            }
9927            QueryExpr::DropView(q) => {
9928                return self.check_ddl_object_privilege(
9929                    &auth_store,
9930                    &principal_id,
9931                    role,
9932                    tenant.as_deref(),
9933                    &username,
9934                    "drop",
9935                    "collection",
9936                    &q.name,
9937                    crate::auth::Role::Write,
9938                );
9939            }
9940            QueryExpr::RefreshMaterializedView(q) => {
9941                return self.check_ddl_object_privilege(
9942                    &auth_store,
9943                    &principal_id,
9944                    role,
9945                    tenant.as_deref(),
9946                    &username,
9947                    "alter",
9948                    "collection",
9949                    &q.name,
9950                    crate::auth::Role::Write,
9951                );
9952            }
9953            QueryExpr::CreatePolicy(q) => {
9954                return self.check_ddl_object_privilege(
9955                    &auth_store,
9956                    &principal_id,
9957                    role,
9958                    tenant.as_deref(),
9959                    &username,
9960                    "create",
9961                    "collection",
9962                    &q.table,
9963                    crate::auth::Role::Write,
9964                );
9965            }
9966            QueryExpr::DropPolicy(q) => {
9967                return self.check_ddl_object_privilege(
9968                    &auth_store,
9969                    &principal_id,
9970                    role,
9971                    tenant.as_deref(),
9972                    &username,
9973                    "drop",
9974                    "collection",
9975                    &q.table,
9976                    crate::auth::Role::Write,
9977                );
9978            }
9979            QueryExpr::CreateServer(q) => {
9980                return self.check_ddl_object_privilege(
9981                    &auth_store,
9982                    &principal_id,
9983                    role,
9984                    tenant.as_deref(),
9985                    &username,
9986                    "schema:admin",
9987                    "schema",
9988                    &q.name,
9989                    crate::auth::Role::Admin,
9990                );
9991            }
9992            QueryExpr::DropServer(q) => {
9993                return self.check_ddl_object_privilege(
9994                    &auth_store,
9995                    &principal_id,
9996                    role,
9997                    tenant.as_deref(),
9998                    &username,
9999                    "schema:admin",
10000                    "schema",
10001                    &q.name,
10002                    crate::auth::Role::Admin,
10003                );
10004            }
10005            QueryExpr::CreateForeignTable(q) => {
10006                return self.check_ddl_object_privilege(
10007                    &auth_store,
10008                    &principal_id,
10009                    role,
10010                    tenant.as_deref(),
10011                    &username,
10012                    "schema:write",
10013                    "schema",
10014                    &q.name,
10015                    crate::auth::Role::Write,
10016                );
10017            }
10018            QueryExpr::DropForeignTable(q) => {
10019                return self.check_ddl_object_privilege(
10020                    &auth_store,
10021                    &principal_id,
10022                    role,
10023                    tenant.as_deref(),
10024                    &username,
10025                    "schema:write",
10026                    "schema",
10027                    &q.name,
10028                    crate::auth::Role::Write,
10029                );
10030            }
10031            QueryExpr::CreateTimeSeries(q) => {
10032                return self.check_ddl_object_privilege(
10033                    &auth_store,
10034                    &principal_id,
10035                    role,
10036                    tenant.as_deref(),
10037                    &username,
10038                    "create",
10039                    "collection",
10040                    &q.name,
10041                    crate::auth::Role::Write,
10042                );
10043            }
10044            QueryExpr::CreateMetric(q) => {
10045                return self.check_ddl_object_privilege(
10046                    &auth_store,
10047                    &principal_id,
10048                    role,
10049                    tenant.as_deref(),
10050                    &username,
10051                    "create",
10052                    "collection",
10053                    &q.path,
10054                    crate::auth::Role::Write,
10055                );
10056            }
10057            QueryExpr::AlterMetric(q) => {
10058                return self.check_ddl_object_privilege(
10059                    &auth_store,
10060                    &principal_id,
10061                    role,
10062                    tenant.as_deref(),
10063                    &username,
10064                    "alter",
10065                    "collection",
10066                    &q.path,
10067                    crate::auth::Role::Write,
10068                );
10069            }
10070            QueryExpr::CreateSlo(q) => {
10071                return self.check_ddl_object_privilege(
10072                    &auth_store,
10073                    &principal_id,
10074                    role,
10075                    tenant.as_deref(),
10076                    &username,
10077                    "create",
10078                    "collection",
10079                    &q.path,
10080                    crate::auth::Role::Write,
10081                );
10082            }
10083            QueryExpr::DropTimeSeries(q) => {
10084                return self.check_ddl_object_privilege(
10085                    &auth_store,
10086                    &principal_id,
10087                    role,
10088                    tenant.as_deref(),
10089                    &username,
10090                    "drop",
10091                    "collection",
10092                    &q.name,
10093                    crate::auth::Role::Write,
10094                );
10095            }
10096            QueryExpr::CreateQueue(q) => {
10097                return self.check_ddl_object_privilege(
10098                    &auth_store,
10099                    &principal_id,
10100                    role,
10101                    tenant.as_deref(),
10102                    &username,
10103                    "create",
10104                    "collection",
10105                    &q.name,
10106                    crate::auth::Role::Write,
10107                );
10108            }
10109            QueryExpr::AlterQueue(q) => {
10110                return self.check_ddl_object_privilege(
10111                    &auth_store,
10112                    &principal_id,
10113                    role,
10114                    tenant.as_deref(),
10115                    &username,
10116                    "alter",
10117                    "collection",
10118                    &q.name,
10119                    crate::auth::Role::Write,
10120                );
10121            }
10122            QueryExpr::DropQueue(q) => {
10123                return self.check_ddl_object_privilege(
10124                    &auth_store,
10125                    &principal_id,
10126                    role,
10127                    tenant.as_deref(),
10128                    &username,
10129                    "drop",
10130                    "collection",
10131                    &q.name,
10132                    crate::auth::Role::Write,
10133                );
10134            }
10135            QueryExpr::CreateTree(q) => {
10136                return self.check_ddl_object_privilege(
10137                    &auth_store,
10138                    &principal_id,
10139                    role,
10140                    tenant.as_deref(),
10141                    &username,
10142                    "create",
10143                    "collection",
10144                    &q.collection,
10145                    crate::auth::Role::Write,
10146                );
10147            }
10148            QueryExpr::DropTree(q) => {
10149                return self.check_ddl_object_privilege(
10150                    &auth_store,
10151                    &principal_id,
10152                    role,
10153                    tenant.as_deref(),
10154                    &username,
10155                    "drop",
10156                    "collection",
10157                    &q.collection,
10158                    crate::auth::Role::Write,
10159                );
10160            }
10161            // Migration DDL — CREATE MIGRATION is grouped DDL on the
10162            // schema namespace; uses the `schema:write` fallback verb
10163            // (no obvious per-collection target).
10164            QueryExpr::CreateMigration(q) => {
10165                return self.check_ddl_object_privilege(
10166                    &auth_store,
10167                    &principal_id,
10168                    role,
10169                    tenant.as_deref(),
10170                    &username,
10171                    "schema:write",
10172                    "schema",
10173                    &q.name,
10174                    crate::auth::Role::Write,
10175                );
10176            }
10177            // APPLY / ROLLBACK change data and schema — require Admin.
10178            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
10179                return if role == crate::auth::Role::Admin {
10180                    Ok(())
10181                } else {
10182                    Err(format!(
10183                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
10184                        username, role
10185                    ))
10186                };
10187            }
10188            // EXPLAIN MIGRATION is read-only — any authenticated principal.
10189            QueryExpr::ExplainMigration(_) => return Ok(()),
10190            // Everything else (SET, SHOW, transaction control, tree
10191            // commands, MaintenanceCommand …) is allowed for any
10192            // authenticated principal; graph and queue commands are gated above.
10193            _ => return Ok(()),
10194        };
10195
10196        if auth_store.iam_authorization_enabled() {
10197            let iam_action = legacy_action_to_iam(action);
10198            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
10199            let iam_ctx = runtime_iam_context(
10200                role,
10201                tenant.as_deref(),
10202                auth_store.principal_is_system_owned(&principal_id),
10203            );
10204            if !auth_store.check_policy_authz_with_role(
10205                &principal_id,
10206                iam_action,
10207                &iam_resource,
10208                &iam_ctx,
10209                role,
10210            ) {
10211                return Err(format!(
10212                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10213                    username, iam_action, iam_resource.kind, iam_resource.name
10214                ));
10215            }
10216
10217            if let QueryExpr::Table(table) = expr {
10218                self.check_table_column_projection_privilege(
10219                    &auth_store,
10220                    &principal_id,
10221                    &iam_ctx,
10222                    table,
10223                )?;
10224            }
10225
10226            if let QueryExpr::Update(update) = expr {
10227                let columns = update_set_target_columns(update);
10228                if !columns.is_empty() {
10229                    let request = column_access_request_for_table_update(&update.table, columns);
10230                    let outcome =
10231                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10232                    if let Some(denied) = outcome.first_denied_column() {
10233                        return Err(format!(
10234                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
10235                            username, iam_action, denied.resource.kind, denied.resource.name
10236                        ));
10237                    }
10238                    if !outcome.allowed() {
10239                        return Err(format!(
10240                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10241                            username,
10242                            iam_action,
10243                            outcome.table_resource.kind,
10244                            outcome.table_resource.name
10245                        ));
10246                    }
10247                }
10248
10249                if let Some(columns) = update_returning_columns_for_policy(self, update) {
10250                    let request = column_access_request_for_table_select(&update.table, columns);
10251                    let outcome =
10252                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10253                    if let Some(denied) = outcome.first_denied_column() {
10254                        return Err(format!(
10255                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
10256                            username, denied.resource.kind, denied.resource.name
10257                        ));
10258                    }
10259                    if !outcome.allowed() {
10260                        return Err(format!(
10261                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10262                            username, outcome.table_resource.kind, outcome.table_resource.name
10263                        ));
10264                    }
10265                }
10266            }
10267
10268            Ok(())
10269        } else {
10270            auth_store
10271                .check_grant(&ctx, action, &resource)
10272                .map_err(|e| e.to_string())
10273        }
10274    }
10275
10276    fn check_table_column_projection_privilege(
10277        &self,
10278        auth_store: &Arc<crate::auth::store::AuthStore>,
10279        principal: &crate::auth::UserId,
10280        ctx: &crate::auth::policies::EvalContext,
10281        table: &crate::storage::query::ast::TableQuery,
10282    ) -> Result<(), String> {
10283        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
10284
10285        let columns = requested_table_columns_for_policy(table);
10286        if columns.is_empty() {
10287            return Ok(());
10288        }
10289
10290        let request = ColumnAccessRequest::select(table.table.clone(), columns);
10291        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
10292        if outcome.allowed() {
10293            return Ok(());
10294        }
10295
10296        if !matches!(
10297            outcome.table_decision,
10298            crate::auth::policies::Decision::Allow { .. }
10299                | crate::auth::policies::Decision::AdminBypass
10300        ) {
10301            return Err(format!(
10302                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10303                principal, outcome.table_resource.kind, outcome.table_resource.name
10304            ));
10305        }
10306
10307        let denied = outcome
10308            .first_denied_column()
10309            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
10310        match denied {
10311            Some(decision) => Err(format!(
10312                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10313                principal, decision.resource.kind, decision.resource.name
10314            )),
10315            None => Ok(()),
10316        }
10317    }
10318
10319    fn check_graph_property_projection_privilege(
10320        &self,
10321        auth_store: &Arc<crate::auth::store::AuthStore>,
10322        principal: &crate::auth::UserId,
10323        role: crate::auth::Role,
10324        tenant: Option<&str>,
10325        query: &crate::storage::query::ast::GraphQuery,
10326    ) -> Result<(), String> {
10327        let columns = explicit_graph_projection_properties(query);
10328        if columns.is_empty() {
10329            return Ok(());
10330        }
10331        self.check_table_like_column_projection_privilege(
10332            auth_store, principal, role, tenant, "graph", &columns,
10333        )
10334    }
10335
10336    fn check_table_like_column_projection_privilege(
10337        &self,
10338        auth_store: &Arc<crate::auth::store::AuthStore>,
10339        principal: &crate::auth::UserId,
10340        role: crate::auth::Role,
10341        tenant: Option<&str>,
10342        table: &str,
10343        columns: &[String],
10344    ) -> Result<(), String> {
10345        let iam_ctx = runtime_iam_context(
10346            role,
10347            tenant,
10348            auth_store.principal_is_system_owned(principal),
10349        );
10350        let request =
10351            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
10352        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
10353        if outcome.allowed() {
10354            return Ok(());
10355        }
10356        let denied = outcome
10357            .first_denied_column()
10358            .map(|d| d.resource.name.clone())
10359            .unwrap_or_else(|| format!("{table}.<unknown>"));
10360        Err(format!(
10361            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
10362            principal, denied
10363        ))
10364    }
10365
10366    fn check_policy_management_privilege(
10367        &self,
10368        auth_store: &Arc<crate::auth::store::AuthStore>,
10369        principal: &crate::auth::UserId,
10370        role: crate::auth::Role,
10371        tenant: Option<&str>,
10372        action: &str,
10373        resource_kind: &str,
10374        resource_name: &str,
10375    ) -> Result<(), String> {
10376        let ctx = runtime_iam_context(
10377            role,
10378            tenant,
10379            auth_store.principal_is_system_owned(principal),
10380        );
10381
10382        if !auth_store.iam_authorization_enabled() {
10383            return if role == crate::auth::Role::Admin {
10384                Ok(())
10385            } else {
10386                Err(format!(
10387                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
10388                    principal, role
10389                ))
10390            };
10391        }
10392
10393        let mut resource = crate::auth::policies::ResourceRef::new(
10394            resource_kind.to_string(),
10395            resource_name.to_string(),
10396        );
10397        if let Some(t) = tenant {
10398            resource = resource.with_tenant(t.to_string());
10399        }
10400        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10401            Ok(())
10402        } else {
10403            Err(format!(
10404                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10405                principal, action, resource.kind, resource.name
10406            ))
10407        }
10408    }
10409
10410    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
10411        let Some(auth_store) = self.inner.auth_store.read().clone() else {
10412            return Ok(());
10413        };
10414        let (username, role) = current_auth_identity()
10415            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10416        let tenant = current_tenant();
10417        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
10418        let ctx = runtime_iam_context(
10419            role,
10420            tenant.as_deref(),
10421            auth_store.principal_is_system_owned(&principal),
10422        );
10423        let gate = crate::auth::managed_config::ManagedConfigGate::new(
10424            self.inner.config_registry.as_ref(),
10425        );
10426        match gate.check_write(&auth_store, &principal, &ctx, key) {
10427            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
10428            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
10429            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
10430                Err(RedDBError::Query(format!(
10431                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
10432                )))
10433            }
10434        }
10435    }
10436
10437    /// IAM privilege check for a granular queue operation (issue #755 /
10438    /// PRD #735).
10439    ///
10440    /// Each queue operation maps to a stable verb in
10441    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
10442    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
10443    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
10444    /// resource is `queue:<name>` scoped to the current tenant. In
10445    /// legacy mode (no IAM authorization configured) the check is a
10446    /// no-op — the role gates in `execute_queue_command` still apply
10447    /// and the legacy `select` / `write` grant table continues to
10448    /// govern queue access. In IAM-enabled mode a missing granular
10449    /// grant yields a structured, UI-safe error of the form
10450    /// `principal=… action=queue:… resource=queue:… denied by IAM
10451    /// policy` so Red UI can surface the failing toolbar action.
10452    fn check_queue_op_privilege(
10453        &self,
10454        auth_store: &Arc<crate::auth::store::AuthStore>,
10455        principal: &crate::auth::UserId,
10456        role: crate::auth::Role,
10457        tenant: Option<&str>,
10458        action: &str,
10459        queue: &str,
10460    ) -> Result<(), String> {
10461        if !auth_store.iam_authorization_enabled() {
10462            return Ok(());
10463        }
10464        let mut resource =
10465            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
10466        if let Some(t) = tenant {
10467            resource = resource.with_tenant(t.to_string());
10468        }
10469        let ctx = runtime_iam_context(
10470            role,
10471            tenant,
10472            auth_store.principal_is_system_owned(principal),
10473        );
10474        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10475            Ok(())
10476        } else {
10477            Err(format!(
10478                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
10479                principal, action, queue
10480            ))
10481        }
10482    }
10483
10484    /// IAM privilege check for a graph operation (issue #757 / PRD
10485    /// #735).
10486    ///
10487    /// Each graph operation maps to a stable verb in
10488    /// [`crate::auth::action_catalog`] — `graph:read` for
10489    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
10490    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
10491    /// `graph:algorithm:run` for analytics algorithms (centrality,
10492    /// community, components, cycles, clustering, topological sort).
10493    /// The resource is `graph:*` scoped to the current tenant — the
10494    /// runtime today operates on a singleton graph store so the name
10495    /// has no concrete identifier; policies grant the explorer
10496    /// surface by writing `graph:*` as the resource pattern.
10497    ///
10498    /// In legacy mode (no IAM authorization configured) the check is
10499    /// a no-op so the existing role-based defaults continue to
10500    /// govern. In IAM-enabled mode a missing grant produces the
10501    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
10502    /// denied by IAM policy` Red UI keys on.
10503    fn check_graph_op_privilege(
10504        &self,
10505        auth_store: &Arc<crate::auth::store::AuthStore>,
10506        principal: &crate::auth::UserId,
10507        role: crate::auth::Role,
10508        tenant: Option<&str>,
10509        action: &str,
10510    ) -> Result<(), String> {
10511        if !auth_store.iam_authorization_enabled() {
10512            return Ok(());
10513        }
10514        let mut resource =
10515            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
10516        if let Some(t) = tenant {
10517            resource = resource.with_tenant(t.to_string());
10518        }
10519        let ctx = runtime_iam_context(
10520            role,
10521            tenant,
10522            auth_store.principal_is_system_owned(principal),
10523        );
10524        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10525            Ok(())
10526        } else {
10527            Err(format!(
10528                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
10529                principal, action
10530            ))
10531        }
10532    }
10533
10534    /// IAM privilege check for a granular vector operation (issue #756
10535    /// / PRD #735).
10536    ///
10537    /// Each vector operation maps to a stable verb in
10538    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
10539    /// `vector:artifact:read`, `vector:artifact:rebuild`,
10540    /// `vector:admin`). The resource is `vector:<collection>` scoped to
10541    /// the current tenant. In legacy mode (no IAM authorization
10542    /// configured) the check is a no-op — the role gates and existing
10543    /// `select` / column-projection grants continue to govern access.
10544    /// In IAM-enabled mode a missing granular grant yields a
10545    /// structured, UI-safe error of the form `principal=…
10546    /// action=vector:… resource=vector:… denied by IAM policy` so Red
10547    /// UI can surface the failing toolbar action.
10548    fn check_vector_op_privilege(
10549        &self,
10550        auth_store: &Arc<crate::auth::store::AuthStore>,
10551        principal: &crate::auth::UserId,
10552        role: crate::auth::Role,
10553        tenant: Option<&str>,
10554        action: &str,
10555        collection: &str,
10556    ) -> Result<(), String> {
10557        if !auth_store.iam_authorization_enabled() {
10558            return Ok(());
10559        }
10560        let mut resource =
10561            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
10562        if let Some(t) = tenant {
10563            resource = resource.with_tenant(t.to_string());
10564        }
10565        let ctx = runtime_iam_context(
10566            role,
10567            tenant,
10568            auth_store.principal_is_system_owned(principal),
10569        );
10570        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10571            Ok(())
10572        } else {
10573            Err(format!(
10574                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
10575                principal, action, collection
10576            ))
10577        }
10578    }
10579
10580    /// IAM privilege check for DROP / TRUNCATE on a named collection.
10581    ///
10582    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
10583    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
10584    /// callsites stay readable.
10585    fn check_ddl_collection_privilege(
10586        &self,
10587        auth_store: &Arc<crate::auth::store::AuthStore>,
10588        principal: &crate::auth::UserId,
10589        role: crate::auth::Role,
10590        tenant: Option<&str>,
10591        username: &str,
10592        action: &str,
10593        collection: &str,
10594    ) -> Result<(), String> {
10595        self.check_ddl_object_privilege(
10596            auth_store,
10597            principal,
10598            role,
10599            tenant,
10600            username,
10601            action,
10602            "collection",
10603            collection,
10604            crate::auth::Role::Write,
10605        )
10606    }
10607
10608    /// Generalised IAM privilege check for DDL on a named object.
10609    ///
10610    /// `action` is the stable verb advertised through the action catalog
10611    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
10612    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
10613    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
10614    /// legacy gate when IAM is not yet enabled.
10615    ///
10616    /// Behaviour:
10617    /// * Role below `min_role` → structured "principal=… role=… cannot
10618    ///   issue DDL" denial, audit recorded.
10619    /// * IAM disabled → audit-record success and allow (legacy path).
10620    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
10621    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
10622    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
10623    ///   policy" string. Explicit Allow and the LegacyRbac fallback
10624    ///   allow the action.
10625    #[allow(clippy::too_many_arguments)]
10626    fn check_ddl_object_privilege(
10627        &self,
10628        auth_store: &Arc<crate::auth::store::AuthStore>,
10629        principal: &crate::auth::UserId,
10630        role: crate::auth::Role,
10631        tenant: Option<&str>,
10632        username: &str,
10633        action: &str,
10634        resource_kind: &str,
10635        resource_name: &str,
10636        min_role: crate::auth::Role,
10637    ) -> Result<(), String> {
10638        if role < min_role {
10639            let msg = format!(
10640                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
10641                username, role, action, resource_kind, resource_name
10642            );
10643            self.inner.audit_log.record(
10644                action,
10645                username,
10646                resource_name,
10647                "denied",
10648                crate::json::Value::Null,
10649            );
10650            return Err(msg);
10651        }
10652
10653        if !auth_store.iam_authorization_enabled() {
10654            self.inner.audit_log.record(
10655                action,
10656                username,
10657                resource_name,
10658                "ok",
10659                crate::json::Value::Null,
10660            );
10661            return Ok(());
10662        }
10663
10664        let mut resource = crate::auth::policies::ResourceRef::new(
10665            resource_kind.to_string(),
10666            resource_name.to_string(),
10667        );
10668        if let Some(t) = tenant {
10669            resource = resource.with_tenant(t.to_string());
10670        }
10671        let ctx = runtime_iam_context(
10672            role,
10673            tenant,
10674            auth_store.principal_is_system_owned(principal),
10675        );
10676        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10677            self.inner.audit_log.record(
10678                action,
10679                username,
10680                resource_name,
10681                "ok",
10682                crate::json::Value::Null,
10683            );
10684            Ok(())
10685        } else {
10686            self.inner.audit_log.record(
10687                action,
10688                username,
10689                resource_name,
10690                "denied",
10691                crate::json::Value::Null,
10692            );
10693            Err(format!(
10694                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10695                username, action, resource_kind, resource_name
10696            ))
10697        }
10698    }
10699
10700    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
10701    fn execute_grant_statement(
10702        &self,
10703        query: &str,
10704        stmt: &crate::storage::query::ast::GrantStmt,
10705    ) -> RedDBResult<RuntimeQueryResult> {
10706        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10707        use crate::auth::UserId;
10708        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10709
10710        let auth_store = self
10711            .inner
10712            .auth_store
10713            .read()
10714            .clone()
10715            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10716
10717        // Granter identity + role.
10718        let (gname, grole) = current_auth_identity().ok_or_else(|| {
10719            RedDBError::Query("GRANT requires an authenticated principal".to_string())
10720        })?;
10721        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
10722        let granter_role = grole;
10723
10724        // Build the action set.
10725        let mut actions: Vec<Action> = Vec::new();
10726        if stmt.all {
10727            actions.push(Action::All);
10728        } else {
10729            for kw in &stmt.actions {
10730                let a = Action::from_keyword(kw).ok_or_else(|| {
10731                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
10732                })?;
10733                actions.push(a);
10734            }
10735        }
10736
10737        // Audit emit (printed; structured emission is Agent #4's lane).
10738        let mut applied = 0usize;
10739        for obj in &stmt.objects {
10740            let resource = match stmt.object_kind {
10741                GrantObjectKind::Table => Resource::Table {
10742                    schema: obj.schema.clone(),
10743                    table: obj.name.clone(),
10744                },
10745                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10746                GrantObjectKind::Database => Resource::Database,
10747                GrantObjectKind::Function => Resource::Function {
10748                    schema: obj.schema.clone(),
10749                    name: obj.name.clone(),
10750                },
10751            };
10752            for principal in &stmt.principals {
10753                let p = match principal {
10754                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10755                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10756                    GrantPrincipalRef::User { tenant, name } => {
10757                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10758                    }
10759                };
10760                // Tenant of the grant follows the granter's tenant
10761                // (cross-tenant guard inside `AuthStore::grant`).
10762                let tenant = granter.tenant.clone();
10763                auth_store
10764                    .grant(
10765                        &granter,
10766                        granter_role,
10767                        p.clone(),
10768                        resource.clone(),
10769                        actions.clone(),
10770                        stmt.with_grant_option,
10771                        tenant.clone(),
10772                    )
10773                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10774
10775                // IAM policy translation: every GRANT also lands as a
10776                // synthetic `_grant_<id>` policy attached to the
10777                // principal so the new evaluator sees it.
10778                if let Some(policy) =
10779                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
10780                {
10781                    let pid = policy.id.clone();
10782                    auth_store
10783                        .put_policy_internal(policy)
10784                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10785                    let attachment = match &p {
10786                        GrantPrincipal::User(uid) => {
10787                            crate::auth::store::PrincipalRef::User(uid.clone())
10788                        }
10789                        GrantPrincipal::Group(group) => {
10790                            crate::auth::store::PrincipalRef::Group(group.clone())
10791                        }
10792                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
10793                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
10794                        ),
10795                    };
10796                    auth_store
10797                        .attach_policy(attachment, &pid)
10798                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10799                }
10800                applied += 1;
10801                tracing::info!(
10802                    target: "audit",
10803                    principal = %granter,
10804                    action = "grant",
10805                    "GRANT applied"
10806                );
10807            }
10808        }
10809
10810        self.invalidate_result_cache();
10811        Ok(RuntimeQueryResult::ok_message(
10812            query.to_string(),
10813            &format!("GRANT applied to {} target(s)", applied),
10814            "grant",
10815        ))
10816    }
10817
10818    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
10819    fn execute_revoke_statement(
10820        &self,
10821        query: &str,
10822        stmt: &crate::storage::query::ast::RevokeStmt,
10823    ) -> RedDBResult<RuntimeQueryResult> {
10824        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10825        use crate::auth::UserId;
10826        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10827
10828        let auth_store = self
10829            .inner
10830            .auth_store
10831            .read()
10832            .clone()
10833            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10834
10835        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10836            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
10837        })?;
10838        let granter_role = grole;
10839
10840        let actions: Vec<Action> = if stmt.all {
10841            vec![Action::All]
10842        } else {
10843            stmt.actions
10844                .iter()
10845                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
10846                .collect()
10847        };
10848
10849        let mut total_removed = 0usize;
10850        for obj in &stmt.objects {
10851            let resource = match stmt.object_kind {
10852                GrantObjectKind::Table => Resource::Table {
10853                    schema: obj.schema.clone(),
10854                    table: obj.name.clone(),
10855                },
10856                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10857                GrantObjectKind::Database => Resource::Database,
10858                GrantObjectKind::Function => Resource::Function {
10859                    schema: obj.schema.clone(),
10860                    name: obj.name.clone(),
10861                },
10862            };
10863            for principal in &stmt.principals {
10864                let p = match principal {
10865                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10866                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10867                    GrantPrincipalRef::User { tenant, name } => {
10868                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10869                    }
10870                };
10871                let removed = auth_store
10872                    .revoke(granter_role, &p, &resource, &actions)
10873                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10874                let _removed_policies =
10875                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
10876                total_removed += removed;
10877            }
10878        }
10879
10880        self.invalidate_result_cache();
10881        Ok(RuntimeQueryResult::ok_message(
10882            query.to_string(),
10883            &format!("REVOKE removed {} grant(s)", total_removed),
10884            "revoke",
10885        ))
10886    }
10887
10888    /// Translate the parsed [`CreateUserStmt`] into an AuthStore user.
10889    fn execute_create_user_statement(
10890        &self,
10891        query: &str,
10892        stmt: &crate::storage::query::ast::CreateUserStmt,
10893    ) -> RedDBResult<RuntimeQueryResult> {
10894        let auth_store = self
10895            .inner
10896            .auth_store
10897            .read()
10898            .clone()
10899            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10900
10901        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10902            RedDBError::Query("CREATE USER requires an authenticated principal".to_string())
10903        })?;
10904        if grole != crate::auth::Role::Admin {
10905            return Err(RedDBError::Query(
10906                "CREATE USER requires Admin role".to_string(),
10907            ));
10908        }
10909
10910        let role = crate::auth::Role::from_str(&stmt.role)
10911            .ok_or_else(|| RedDBError::Query(format!("invalid role `{}`", stmt.role)))?;
10912        let user = auth_store
10913            .create_user_in_tenant(stmt.tenant.as_deref(), &stmt.username, &stmt.password, role)
10914            .map_err(|e| RedDBError::Query(e.to_string()))?;
10915
10916        self.invalidate_result_cache();
10917        let target = crate::auth::UserId::from_parts(user.tenant_id.as_deref(), &user.username);
10918        tracing::info!(
10919            target: "audit",
10920            principal = %target,
10921            role = %role,
10922            action = "create_user",
10923            "CREATE USER applied"
10924        );
10925
10926        Ok(RuntimeQueryResult::ok_message(
10927            query.to_string(),
10928            &format!("CREATE USER {} applied", target),
10929            "create_user",
10930        ))
10931    }
10932
10933    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
10934    fn execute_alter_user_statement(
10935        &self,
10936        query: &str,
10937        stmt: &crate::storage::query::ast::AlterUserStmt,
10938    ) -> RedDBResult<RuntimeQueryResult> {
10939        use crate::auth::privileges::UserAttributes;
10940        use crate::auth::UserId;
10941        use crate::storage::query::ast::AlterUserAttribute;
10942
10943        let auth_store = self
10944            .inner
10945            .auth_store
10946            .read()
10947            .clone()
10948            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10949
10950        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10951            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
10952        })?;
10953        if grole != crate::auth::Role::Admin {
10954            return Err(RedDBError::Query(
10955                "ALTER USER requires Admin role".to_string(),
10956            ));
10957        }
10958
10959        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
10960
10961        // Apply attributes incrementally — each one reads the current
10962        // record, mutates the relevant field, writes back.
10963        let mut attrs = auth_store.user_attributes(&target);
10964        let mut enable_change: Option<bool> = None;
10965
10966        for a in &stmt.attributes {
10967            match a {
10968                AlterUserAttribute::ValidUntil(ts) => {
10969                    // Parse ISO-ish timestamp → ms since epoch. Fall
10970                    // back to integer-ms parsing for callers that pass
10971                    // `'1234567890123'`.
10972                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
10973                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
10974                    })?;
10975                    attrs.valid_until = Some(ms);
10976                }
10977                AlterUserAttribute::ConnectionLimit(n) => {
10978                    if *n < 0 {
10979                        return Err(RedDBError::Query(
10980                            "CONNECTION LIMIT must be non-negative".to_string(),
10981                        ));
10982                    }
10983                    attrs.connection_limit = Some(*n as u32);
10984                }
10985                AlterUserAttribute::SetSearchPath(p) => {
10986                    attrs.search_path = Some(p.clone());
10987                }
10988                AlterUserAttribute::AddGroup(g) => {
10989                    if !attrs.groups.iter().any(|existing| existing == g) {
10990                        attrs.groups.push(g.clone());
10991                        attrs.groups.sort();
10992                    }
10993                }
10994                AlterUserAttribute::DropGroup(g) => {
10995                    attrs.groups.retain(|existing| existing != g);
10996                }
10997                AlterUserAttribute::Enable => enable_change = Some(true),
10998                AlterUserAttribute::Disable => enable_change = Some(false),
10999                AlterUserAttribute::Password(_) => {
11000                    // Out of scope — accept the AST but no-op so the
11001                    // parser stays compatible with future password
11002                    // rotation work.
11003                }
11004            }
11005        }
11006
11007        auth_store
11008            .set_user_attributes(&target, attrs)
11009            .map_err(|e| RedDBError::Query(e.to_string()))?;
11010        if let Some(en) = enable_change {
11011            auth_store
11012                .set_user_enabled(&target, en)
11013                .map_err(|e| RedDBError::Query(e.to_string()))?;
11014        }
11015        self.invalidate_result_cache();
11016        tracing::info!(
11017            target: "audit",
11018            principal = %target,
11019            action = "alter_user",
11020            "ALTER USER applied"
11021        );
11022
11023        Ok(RuntimeQueryResult::ok_message(
11024            query.to_string(),
11025            &format!("ALTER USER {} applied", target),
11026            "alter_user",
11027        ))
11028    }
11029
11030    // -----------------------------------------------------------------
11031    // IAM policy executors
11032    // -----------------------------------------------------------------
11033
11034    fn execute_create_iam_policy(
11035        &self,
11036        query: &str,
11037        id: &str,
11038        json: &str,
11039    ) -> RedDBResult<RuntimeQueryResult> {
11040        use crate::auth::policies::Policy;
11041
11042        let auth_store = self
11043            .inner
11044            .auth_store
11045            .read()
11046            .clone()
11047            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11048
11049        // Parse + validate. The kernel rejects oversize / bad shape /
11050        // bad action keywords. If the supplied id differs from the JSON
11051        // id, override it with the SQL-provided id (the JSON id is
11052        // optional context — the SQL DDL form is authoritative).
11053        let mut policy = Policy::from_json_str(json)
11054            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
11055        if policy.id != id {
11056            policy.id = id.to_string();
11057        }
11058        let pid = policy.id.clone();
11059        let tenant = current_tenant();
11060        let (actor_name, actor_role) = current_auth_identity()
11061            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11062        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11063        let eval_ctx = runtime_iam_context(
11064            actor_role,
11065            tenant.as_deref(),
11066            auth_store.principal_is_system_owned(&actor),
11067        );
11068        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11069        let ledger = self.inner.control_event_ledger.read();
11070        let control = crate::auth::store::PolicyMutationControl {
11071            ctx: &event_ctx,
11072            ledger: ledger.as_ref(),
11073            config: self.inner.control_event_config,
11074            registry: Some(self.inner.config_registry.as_ref()),
11075            actor: &actor,
11076            eval_ctx: &eval_ctx,
11077        };
11078        auth_store
11079            .put_policy_with_control_events(policy, &control)
11080            .map_err(|e| RedDBError::Query(e.to_string()))?;
11081
11082        let principal = actor_name;
11083        tracing::info!(
11084            target: "audit",
11085            principal = %principal,
11086            action = "iam:policy.put",
11087            matched_policy_id = %pid,
11088            "CREATE POLICY applied"
11089        );
11090        self.inner.audit_log.record(
11091            "iam/policy.put",
11092            &principal,
11093            &pid,
11094            "ok",
11095            crate::json::Value::Null,
11096        );
11097
11098        self.invalidate_result_cache();
11099        Ok(RuntimeQueryResult::ok_message(
11100            query.to_string(),
11101            &format!("policy `{pid}` stored"),
11102            "create_iam_policy",
11103        ))
11104    }
11105
11106    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
11107        let auth_store = self
11108            .inner
11109            .auth_store
11110            .read()
11111            .clone()
11112            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11113        let tenant = current_tenant();
11114        let (actor_name, actor_role) = current_auth_identity()
11115            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11116        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11117        let eval_ctx = runtime_iam_context(
11118            actor_role,
11119            tenant.as_deref(),
11120            auth_store.principal_is_system_owned(&actor),
11121        );
11122        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11123        let ledger = self.inner.control_event_ledger.read();
11124        let control = crate::auth::store::PolicyMutationControl {
11125            ctx: &event_ctx,
11126            ledger: ledger.as_ref(),
11127            config: self.inner.control_event_config,
11128            registry: Some(self.inner.config_registry.as_ref()),
11129            actor: &actor,
11130            eval_ctx: &eval_ctx,
11131        };
11132        auth_store
11133            .delete_policy_with_control_events(id, &control)
11134            .map_err(|e| RedDBError::Query(e.to_string()))?;
11135
11136        let principal = actor_name;
11137        tracing::info!(
11138            target: "audit",
11139            principal = %principal,
11140            action = "iam:policy.drop",
11141            matched_policy_id = %id,
11142            "DROP POLICY applied"
11143        );
11144        self.inner.audit_log.record(
11145            "iam/policy.drop",
11146            &principal,
11147            id,
11148            "ok",
11149            crate::json::Value::Null,
11150        );
11151
11152        self.invalidate_result_cache();
11153        Ok(RuntimeQueryResult::ok_message(
11154            query.to_string(),
11155            &format!("policy `{id}` dropped"),
11156            "drop_iam_policy",
11157        ))
11158    }
11159
11160    fn execute_attach_policy(
11161        &self,
11162        query: &str,
11163        policy_id: &str,
11164        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11165    ) -> RedDBResult<RuntimeQueryResult> {
11166        use crate::auth::store::PrincipalRef;
11167        use crate::auth::UserId;
11168        use crate::storage::query::ast::PolicyPrincipalRef;
11169
11170        let auth_store = self
11171            .inner
11172            .auth_store
11173            .read()
11174            .clone()
11175            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11176        let p = match principal {
11177            PolicyPrincipalRef::User(u) => {
11178                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11179            }
11180            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11181        };
11182        let pretty_target = principal_label(principal);
11183        let tenant = current_tenant();
11184        let (actor_name, actor_role) = current_auth_identity()
11185            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11186        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11187        let eval_ctx = runtime_iam_context(
11188            actor_role,
11189            tenant.as_deref(),
11190            auth_store.principal_is_system_owned(&actor),
11191        );
11192        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11193        let ledger = self.inner.control_event_ledger.read();
11194        let control = crate::auth::store::PolicyMutationControl {
11195            ctx: &event_ctx,
11196            ledger: ledger.as_ref(),
11197            config: self.inner.control_event_config,
11198            registry: Some(self.inner.config_registry.as_ref()),
11199            actor: &actor,
11200            eval_ctx: &eval_ctx,
11201        };
11202        auth_store
11203            .attach_policy_with_control_events(p, policy_id, &control)
11204            .map_err(|e| RedDBError::Query(e.to_string()))?;
11205
11206        let principal_str = actor_name;
11207        tracing::info!(
11208            target: "audit",
11209            principal = %principal_str,
11210            action = "iam:policy.attach",
11211            matched_policy_id = %policy_id,
11212            target = %pretty_target,
11213            "ATTACH POLICY applied"
11214        );
11215        self.inner.audit_log.record(
11216            "iam/policy.attach",
11217            &principal_str,
11218            &pretty_target,
11219            "ok",
11220            crate::json::Value::Null,
11221        );
11222
11223        self.invalidate_result_cache();
11224        Ok(RuntimeQueryResult::ok_message(
11225            query.to_string(),
11226            &format!("policy `{policy_id}` attached to {pretty_target}"),
11227            "attach_policy",
11228        ))
11229    }
11230
11231    fn execute_detach_policy(
11232        &self,
11233        query: &str,
11234        policy_id: &str,
11235        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11236    ) -> RedDBResult<RuntimeQueryResult> {
11237        use crate::auth::store::PrincipalRef;
11238        use crate::auth::UserId;
11239        use crate::storage::query::ast::PolicyPrincipalRef;
11240
11241        let auth_store = self
11242            .inner
11243            .auth_store
11244            .read()
11245            .clone()
11246            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11247        let p = match principal {
11248            PolicyPrincipalRef::User(u) => {
11249                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11250            }
11251            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11252        };
11253        let pretty_target = principal_label(principal);
11254        let tenant = current_tenant();
11255        let (actor_name, actor_role) = current_auth_identity()
11256            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11257        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11258        let eval_ctx = runtime_iam_context(
11259            actor_role,
11260            tenant.as_deref(),
11261            auth_store.principal_is_system_owned(&actor),
11262        );
11263        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11264        let ledger = self.inner.control_event_ledger.read();
11265        let control = crate::auth::store::PolicyMutationControl {
11266            ctx: &event_ctx,
11267            ledger: ledger.as_ref(),
11268            config: self.inner.control_event_config,
11269            registry: Some(self.inner.config_registry.as_ref()),
11270            actor: &actor,
11271            eval_ctx: &eval_ctx,
11272        };
11273        auth_store
11274            .detach_policy_with_control_events(p, policy_id, &control)
11275            .map_err(|e| RedDBError::Query(e.to_string()))?;
11276
11277        let principal_str = actor_name;
11278        tracing::info!(
11279            target: "audit",
11280            principal = %principal_str,
11281            action = "iam:policy.detach",
11282            matched_policy_id = %policy_id,
11283            target = %pretty_target,
11284            "DETACH POLICY applied"
11285        );
11286        self.inner.audit_log.record(
11287            "iam/policy.detach",
11288            &principal_str,
11289            &pretty_target,
11290            "ok",
11291            crate::json::Value::Null,
11292        );
11293
11294        self.invalidate_result_cache();
11295        Ok(RuntimeQueryResult::ok_message(
11296            query.to_string(),
11297            &format!("policy `{policy_id}` detached from {pretty_target}"),
11298            "detach_policy",
11299        ))
11300    }
11301
11302    fn execute_show_policies(
11303        &self,
11304        query: &str,
11305        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
11306    ) -> RedDBResult<RuntimeQueryResult> {
11307        use crate::auth::UserId;
11308        use crate::storage::query::ast::PolicyPrincipalRef;
11309        use crate::storage::query::unified::UnifiedRecord;
11310        use crate::storage::schema::Value as SchemaValue;
11311        use std::sync::Arc;
11312
11313        let auth_store = self
11314            .inner
11315            .auth_store
11316            .read()
11317            .clone()
11318            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11319
11320        let pols = match filter {
11321            None => auth_store.list_policies(),
11322            Some(PolicyPrincipalRef::User(u)) => {
11323                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
11324                auth_store.effective_policies(&id)
11325            }
11326            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
11327        };
11328
11329        let mut records = Vec::with_capacity(pols.len() + 1);
11330
11331        // Header row (#712 / S5A): synthetic record at index 0 that
11332        // reports the active PolicyEnforcementMode and the hard-cutover
11333        // version, so an operator running SHOW POLICIES can see the
11334        // current posture without a separate command.
11335        let mode = auth_store.enforcement_mode();
11336        let mut header = UnifiedRecord::default();
11337        header.set_arc(
11338            Arc::from("id"),
11339            SchemaValue::text("<enforcement_mode>".to_string()),
11340        );
11341        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
11342        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
11343        let header_json = format!(
11344            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
11345            mode.as_str(),
11346            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
11347        );
11348        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
11349        records.push(header);
11350
11351        for p in pols.iter() {
11352            let mut rec = UnifiedRecord::default();
11353            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
11354            rec.set_arc(
11355                Arc::from("statements"),
11356                SchemaValue::Integer(p.statements.len() as i64),
11357            );
11358            rec.set_arc(
11359                Arc::from("tenant"),
11360                p.tenant
11361                    .as_deref()
11362                    .map(|t| SchemaValue::text(t.to_string()))
11363                    .unwrap_or(SchemaValue::Null),
11364            );
11365            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
11366            records.push(rec);
11367        }
11368        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11369        result.records = records;
11370        Ok(RuntimeQueryResult {
11371            query: query.to_string(),
11372            mode: crate::storage::query::modes::QueryMode::Sql,
11373            statement: "show_policies",
11374            engine: "iam-policies",
11375            result,
11376            affected_rows: 0,
11377            statement_type: "select",
11378            bookmark: None,
11379        })
11380    }
11381
11382    fn execute_show_effective_permissions(
11383        &self,
11384        query: &str,
11385        user: &crate::storage::query::ast::PolicyUserRef,
11386        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
11387    ) -> RedDBResult<RuntimeQueryResult> {
11388        use crate::auth::UserId;
11389        use crate::storage::query::unified::UnifiedRecord;
11390        use crate::storage::schema::Value as SchemaValue;
11391        use std::sync::Arc;
11392
11393        let auth_store = self
11394            .inner
11395            .auth_store
11396            .read()
11397            .clone()
11398            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11399        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11400        let pols = auth_store.effective_policies(&id);
11401
11402        // Show one row per (policy, statement) tuple, plus any
11403        // resource-level filter passed by the caller.
11404        let mut records = Vec::new();
11405        for p in pols.iter() {
11406            for (idx, st) in p.statements.iter().enumerate() {
11407                if let Some(_r) = resource {
11408                    // Naive filter: render statement targets to strings
11409                    // and skip if no match. Conservative default = include
11410                    // (the simulator handles fine-grained matching).
11411                }
11412                let mut rec = UnifiedRecord::default();
11413                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
11414                rec.set_arc(
11415                    Arc::from("statement_index"),
11416                    SchemaValue::Integer(idx as i64),
11417                );
11418                rec.set_arc(
11419                    Arc::from("sid"),
11420                    st.sid
11421                        .as_deref()
11422                        .map(|s| SchemaValue::text(s.to_string()))
11423                        .unwrap_or(SchemaValue::Null),
11424                );
11425                rec.set_arc(
11426                    Arc::from("effect"),
11427                    SchemaValue::text(match st.effect {
11428                        crate::auth::policies::Effect::Allow => "allow",
11429                        crate::auth::policies::Effect::Deny => "deny",
11430                    }),
11431                );
11432                rec.set_arc(
11433                    Arc::from("actions"),
11434                    SchemaValue::Integer(st.actions.len() as i64),
11435                );
11436                rec.set_arc(
11437                    Arc::from("resources"),
11438                    SchemaValue::Integer(st.resources.len() as i64),
11439                );
11440                records.push(rec);
11441            }
11442        }
11443        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11444        result.records = records;
11445        Ok(RuntimeQueryResult {
11446            query: query.to_string(),
11447            mode: crate::storage::query::modes::QueryMode::Sql,
11448            statement: "show_effective_permissions",
11449            engine: "iam-policies",
11450            result,
11451            affected_rows: 0,
11452            statement_type: "select",
11453            bookmark: None,
11454        })
11455    }
11456
11457    fn execute_lint_policy(
11458        &self,
11459        query: &str,
11460        source: &crate::storage::query::ast::LintPolicySource,
11461    ) -> RedDBResult<RuntimeQueryResult> {
11462        use crate::auth::policy_linter::lint;
11463        use crate::storage::query::ast::LintPolicySource;
11464        use crate::storage::query::unified::UnifiedRecord;
11465        use crate::storage::schema::Value as SchemaValue;
11466        use std::sync::Arc;
11467
11468        // Resolve the policy text. `JSON` source lints the literal
11469        // verbatim; `Id` source fetches the stored document so
11470        // operators can lint a policy by name without rebuilding the
11471        // JSON from `SHOW POLICY`.
11472        let policy_text = match source {
11473            LintPolicySource::Json(text) => text.clone(),
11474            LintPolicySource::Id(id) => {
11475                let auth_store =
11476                    self.inner.auth_store.read().clone().ok_or_else(|| {
11477                        RedDBError::Query("auth store not configured".to_string())
11478                    })?;
11479                let policy = auth_store
11480                    .get_policy(id)
11481                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
11482                policy.to_json_string()
11483            }
11484        };
11485        let diagnostics = lint(&policy_text);
11486
11487        let principal_str = current_auth_identity()
11488            .map(|(u, _)| u)
11489            .unwrap_or_else(|| "anonymous".into());
11490        tracing::info!(
11491            target: "audit",
11492            principal = %principal_str,
11493            action = "iam:policy.lint",
11494            diagnostic_count = diagnostics.len(),
11495            "LINT POLICY issued"
11496        );
11497        self.inner.audit_log.record(
11498            "iam/policy.lint",
11499            &principal_str,
11500            match source {
11501                LintPolicySource::Id(id) => id.as_str(),
11502                LintPolicySource::Json(_) => "<json>",
11503            },
11504            "ok",
11505            crate::json::Value::Null,
11506        );
11507
11508        // One row per diagnostic. Column order matches the HTTP
11509        // surface's JSON keys so the two contracts line up.
11510        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
11511        let schema = Arc::new(
11512            COLUMNS
11513                .iter()
11514                .map(|name| Arc::<str>::from(*name))
11515                .collect::<Vec<_>>(),
11516        );
11517        let records: Vec<UnifiedRecord> = diagnostics
11518            .iter()
11519            .map(|d| {
11520                UnifiedRecord::with_schema(
11521                    Arc::clone(&schema),
11522                    vec![
11523                        SchemaValue::text(d.severity.as_str()),
11524                        SchemaValue::text(d.code.as_str()),
11525                        SchemaValue::text(d.message.clone()),
11526                        d.suggested_fix
11527                            .as_deref()
11528                            .map(SchemaValue::text)
11529                            .unwrap_or(SchemaValue::Null),
11530                        d.location
11531                            .as_deref()
11532                            .map(SchemaValue::text)
11533                            .unwrap_or(SchemaValue::Null),
11534                    ],
11535                )
11536            })
11537            .collect();
11538        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11539            COLUMNS.iter().map(|c| c.to_string()).collect(),
11540        );
11541        result.records = records;
11542        Ok(RuntimeQueryResult {
11543            query: query.to_string(),
11544            mode: crate::storage::query::modes::QueryMode::Sql,
11545            statement: "lint_policy",
11546            engine: "iam-policies",
11547            result,
11548            affected_rows: 0,
11549            statement_type: "select",
11550            bookmark: None,
11551        })
11552    }
11553
11554    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
11555    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
11556    /// simulator confirms no non-admin principal would lose access.
11557    /// Issue #714.
11558    fn execute_migrate_policy_mode(
11559        &self,
11560        query: &str,
11561        target: &str,
11562        dry_run: bool,
11563    ) -> RedDBResult<RuntimeQueryResult> {
11564        use crate::auth::enforcement_mode::PolicyEnforcementMode;
11565        use crate::auth::migrate_policy_mode::{
11566            principal_label, simulate_migration_delta, MigratePolicyDelta,
11567        };
11568        use crate::auth::policies::ResourceRef;
11569        use crate::storage::query::unified::UnifiedRecord;
11570        use crate::storage::schema::Value as SchemaValue;
11571        use std::sync::Arc;
11572
11573        // Only `policy_only` is a meaningful destination for this
11574        // command — flipping back to `legacy_rbac` is supported via
11575        // direct config writes (it doesn't need a pre-flight). We
11576        // reject everything else with the same allowlist `parse` uses.
11577        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
11578            RedDBError::Query(format!(
11579                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
11580            ))
11581        })?;
11582        if parsed != PolicyEnforcementMode::PolicyOnly {
11583            return Err(RedDBError::Query(format!(
11584                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
11585            )));
11586        }
11587
11588        let auth_store = self
11589            .inner
11590            .auth_store
11591            .read()
11592            .clone()
11593            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11594
11595        // Resource enumeration: every existing collection probed as
11596        // `table:<name>`. This is the realistic resource surface for
11597        // the legacy_rbac fallback (the role floors gate per-table
11598        // actions). Wildcard / column-scoped resources are still
11599        // covered by the policy evaluator because evaluate() resolves
11600        // resource patterns relative to the concrete resources we
11601        // probe here.
11602        let snapshot = self.inner.db.catalog_model_snapshot();
11603        let resources: Vec<ResourceRef> = snapshot
11604            .collections
11605            .iter()
11606            .map(|c| ResourceRef::new("table", c.name.clone()))
11607            .collect();
11608
11609        let now_ms = crate::utils::now_unix_millis() as u128;
11610        let deltas: Vec<MigratePolicyDelta> =
11611            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
11612
11613        let principal_str = current_auth_identity()
11614            .map(|(u, _)| u)
11615            .unwrap_or_else(|| "anonymous".into());
11616
11617        // Audit every issuance. The outcome line differentiates
11618        // dry-run, refused, and applied — operators can grep for these
11619        // strings in the audit log.
11620        let outcome_str = if dry_run {
11621            "dry_run"
11622        } else if deltas.is_empty() {
11623            "applied"
11624        } else {
11625            "refused"
11626        };
11627        tracing::info!(
11628            target: "audit",
11629            principal = %principal_str,
11630            action = "iam:policy.migrate_mode",
11631            target = %target,
11632            dry_run,
11633            delta_count = deltas.len(),
11634            outcome = outcome_str,
11635            "MIGRATE POLICY MODE issued"
11636        );
11637        self.inner.audit_log.record(
11638            "iam/policy.migrate_mode",
11639            &principal_str,
11640            target,
11641            outcome_str,
11642            crate::json::Value::Null,
11643        );
11644
11645        // Refuse the non-dry-run path when any principal would lose
11646        // access. The error string carries a compact summary plus the
11647        // delta count so operators can re-run with DRY RUN to inspect.
11648        if !dry_run && !deltas.is_empty() {
11649            let summary = deltas
11650                .iter()
11651                .take(5)
11652                .map(|d| {
11653                    format!(
11654                        "{}:{}/{}:{}",
11655                        principal_label(&d.principal),
11656                        d.action,
11657                        d.resource_kind,
11658                        d.resource_name
11659                    )
11660                })
11661                .collect::<Vec<_>>()
11662                .join(", ");
11663            let more = if deltas.len() > 5 {
11664                format!(" (and {} more)", deltas.len() - 5)
11665            } else {
11666                String::new()
11667            };
11668            return Err(RedDBError::Query(format!(
11669                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
11670                n = deltas.len(),
11671            )));
11672        }
11673
11674        // Mutate the live enforcement mode only on the non-dry-run
11675        // path with an empty delta. `set_enforcement_mode` also
11676        // persists to vault_kv so the new mode survives restart.
11677        if !dry_run {
11678            auth_store.set_enforcement_mode(parsed);
11679        }
11680
11681        const COLUMNS: [&str; 5] = [
11682            "principal",
11683            "role",
11684            "action",
11685            "resource_kind",
11686            "resource_name",
11687        ];
11688        let schema = Arc::new(
11689            COLUMNS
11690                .iter()
11691                .map(|name| Arc::<str>::from(*name))
11692                .collect::<Vec<_>>(),
11693        );
11694        let records: Vec<UnifiedRecord> = deltas
11695            .iter()
11696            .map(|d| {
11697                UnifiedRecord::with_schema(
11698                    Arc::clone(&schema),
11699                    vec![
11700                        SchemaValue::text(principal_label(&d.principal)),
11701                        SchemaValue::text(d.role.as_str()),
11702                        SchemaValue::text(d.action.clone()),
11703                        SchemaValue::text(d.resource_kind.clone()),
11704                        SchemaValue::text(d.resource_name.clone()),
11705                    ],
11706                )
11707            })
11708            .collect();
11709        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11710            COLUMNS.iter().map(|c| c.to_string()).collect(),
11711        );
11712        result.records = records;
11713        Ok(RuntimeQueryResult {
11714            query: query.to_string(),
11715            mode: crate::storage::query::modes::QueryMode::Sql,
11716            statement: "migrate_policy_mode",
11717            engine: "iam-policies",
11718            result,
11719            affected_rows: 0,
11720            statement_type: "select",
11721            bookmark: None,
11722        })
11723    }
11724
11725    fn execute_simulate_policy(
11726        &self,
11727        query: &str,
11728        user: &crate::storage::query::ast::PolicyUserRef,
11729        action: &str,
11730        resource: &crate::storage::query::ast::PolicyResourceRef,
11731    ) -> RedDBResult<RuntimeQueryResult> {
11732        use crate::auth::policies::ResourceRef;
11733        use crate::auth::store::SimCtx;
11734        use crate::auth::UserId;
11735        use crate::storage::query::unified::UnifiedRecord;
11736        use crate::storage::schema::Value as SchemaValue;
11737        use std::sync::Arc;
11738
11739        let auth_store = self
11740            .inner
11741            .auth_store
11742            .read()
11743            .clone()
11744            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11745        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11746        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
11747        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
11748
11749        let principal_str = current_auth_identity()
11750            .map(|(u, _)| u)
11751            .unwrap_or_else(|| "anonymous".into());
11752        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
11753        tracing::info!(
11754            target: "audit",
11755            principal = %principal_str,
11756            action = "iam:policy.simulate",
11757            decision = %decision_str,
11758            matched_policy_id = ?matched_pid,
11759            matched_sid = ?matched_sid,
11760            "SIMULATE issued"
11761        );
11762        self.inner.audit_log.record(
11763            "iam/policy.simulate",
11764            &principal_str,
11765            &id.to_string(),
11766            "ok",
11767            crate::json::Value::Null,
11768        );
11769
11770        let mut rec = UnifiedRecord::default();
11771        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
11772        rec.set_arc(
11773            Arc::from("matched_policy_id"),
11774            matched_pid
11775                .map(SchemaValue::text)
11776                .unwrap_or(SchemaValue::Null),
11777        );
11778        rec.set_arc(
11779            Arc::from("matched_sid"),
11780            matched_sid
11781                .map(SchemaValue::text)
11782                .unwrap_or(SchemaValue::Null),
11783        );
11784        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
11785        rec.set_arc(
11786            Arc::from("trail_len"),
11787            SchemaValue::Integer(outcome.trail.len() as i64),
11788        );
11789        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11790        result.records = vec![rec];
11791        Ok(RuntimeQueryResult {
11792            query: query.to_string(),
11793            mode: crate::storage::query::modes::QueryMode::Sql,
11794            statement: "simulate_policy",
11795            engine: "iam-policies",
11796            result,
11797            affected_rows: 0,
11798            statement_type: "select",
11799            bookmark: None,
11800        })
11801    }
11802}
11803
11804/// Translate a parsed GRANT into a synthetic IAM policy whose id
11805/// starts with `_grant_<unique>`. PUBLIC is represented as an
11806/// implicit IAM group; legacy GROUP grants are still rejected by the
11807/// grant store and are not translated here.
11808fn grant_to_iam_policy(
11809    principal: &crate::auth::privileges::GrantPrincipal,
11810    resource: &crate::auth::privileges::Resource,
11811    actions: &[crate::auth::privileges::Action],
11812    tenant: Option<&str>,
11813) -> Option<crate::auth::policies::Policy> {
11814    use crate::auth::policies::{
11815        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
11816    };
11817    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
11818
11819    if matches!(principal, GrantPrincipal::Group(_)) {
11820        return None;
11821    }
11822
11823    let now = crate::auth::now_ms();
11824    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
11825
11826    let resource_str = match resource {
11827        Resource::Database => "table:*".to_string(),
11828        Resource::Schema(s) => format!("table:{s}.*"),
11829        Resource::Table { schema, table } => match schema {
11830            Some(s) => format!("table:{s}.{table}"),
11831            None => format!("table:{table}"),
11832        },
11833        Resource::Function { schema, name } => match schema {
11834            Some(s) => format!("function:{s}.{name}"),
11835            None => format!("function:{name}"),
11836        },
11837    };
11838
11839    // Compile actions — fall back to `*` only when the grant included
11840    // `Action::All`. Map every other action keyword to its lowercase
11841    // form so it lines up with the kernel's allowlist.
11842    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
11843        vec![ActionPattern::Wildcard]
11844    } else {
11845        actions
11846            .iter()
11847            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
11848            .collect()
11849    };
11850    if action_patterns.is_empty() {
11851        return None;
11852    }
11853
11854    // Inline resource compilation matching the kernel's `compile_resource`:
11855    //   * `*` → wildcard
11856    //   * contains `*` → glob
11857    //   * `kind:name` → exact
11858    let resource_patterns = if resource_str == "*" {
11859        vec![ResourcePattern::Wildcard]
11860    } else if resource_str.contains('*') {
11861        vec![ResourcePattern::Glob(resource_str.clone())]
11862    } else if let Some((kind, name)) = resource_str.split_once(':') {
11863        vec![ResourcePattern::Exact {
11864            kind: kind.to_string(),
11865            name: name.to_string(),
11866        }]
11867    } else {
11868        vec![ResourcePattern::Wildcard]
11869    };
11870
11871    let policy = Policy {
11872        id,
11873        version: 1,
11874        tenant: tenant.map(|t| t.to_string()),
11875        created_at: now,
11876        updated_at: now,
11877        statements: vec![Statement {
11878            sid: None,
11879            effect: Effect::Allow,
11880            actions: action_patterns,
11881            resources: resource_patterns,
11882            condition: None,
11883        }],
11884    };
11885    if policy.validate().is_err() {
11886        return None;
11887    }
11888    Some(policy)
11889}
11890
11891/// Coerce a `key => <number>` table-function named argument into a positive
11892/// iteration count for the centrality TVFs (issue #797). The parser lexes all
11893/// named values as `f64`, so an integral, finite, strictly-positive value is
11894/// required here; anything else (fractional, zero, negative, NaN/inf) is a
11895/// clear query error. `func` names the function for the message.
11896fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
11897    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
11898        return Err(RedDBError::Query(format!(
11899            "table function '{func}' max_iterations must be a positive integer, got {value}"
11900        )));
11901    }
11902    Ok(*value as usize)
11903}
11904
11905fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
11906    use crate::auth::privileges::Action;
11907    match action {
11908        Action::Select => "select",
11909        Action::Insert => "insert",
11910        Action::Update => "update",
11911        Action::Delete => "delete",
11912        Action::Truncate => "truncate",
11913        Action::References => "references",
11914        Action::Execute => "execute",
11915        Action::Usage => "usage",
11916        Action::All => "*",
11917    }
11918}
11919
11920fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
11921    let mut columns = Vec::new();
11922    for (column, _) in &query.assignment_exprs {
11923        if !columns.iter().any(|seen| seen == column) {
11924            columns.push(column.clone());
11925        }
11926    }
11927    columns
11928}
11929
11930fn column_access_request_for_table_update(
11931    table_name: &str,
11932    columns: Vec<String>,
11933) -> crate::auth::ColumnAccessRequest {
11934    match table_name.split_once('.') {
11935        Some((schema, table)) => {
11936            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
11937                .with_schema(schema.to_string())
11938        }
11939        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
11940    }
11941}
11942
11943fn column_access_request_for_table_select(
11944    table_name: &str,
11945    columns: Vec<String>,
11946) -> crate::auth::ColumnAccessRequest {
11947    match table_name.split_once('.') {
11948        Some((schema, table)) => {
11949            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
11950                .with_schema(schema.to_string())
11951        }
11952        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
11953    }
11954}
11955
11956fn update_returning_columns_for_policy(
11957    runtime: &RedDBRuntime,
11958    query: &crate::storage::query::ast::UpdateQuery,
11959) -> Option<Vec<String>> {
11960    let items = query.returning.as_ref()?;
11961    let mut columns = Vec::new();
11962    let project_all = items
11963        .iter()
11964        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
11965    if project_all {
11966        collect_returning_star_columns(runtime, query, &mut columns);
11967    } else {
11968        for item in items {
11969            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
11970                continue;
11971            };
11972            push_returning_policy_column(&mut columns, column);
11973        }
11974    }
11975    (!columns.is_empty()).then_some(columns)
11976}
11977
11978fn collect_returning_star_columns(
11979    runtime: &RedDBRuntime,
11980    query: &crate::storage::query::ast::UpdateQuery,
11981    columns: &mut Vec<String>,
11982) {
11983    let store = runtime.db().store();
11984    let Some(manager) = store.get_collection(&query.table) else {
11985        return;
11986    };
11987    if let Some(schema) = manager.column_schema() {
11988        for column in schema.iter() {
11989            push_returning_policy_column(columns, column);
11990        }
11991    }
11992    for entity in manager.query_all(|_| true) {
11993        if !returning_entity_matches_update_target(&entity, query.target) {
11994            continue;
11995        }
11996        match &entity.data {
11997            crate::storage::EntityData::Row(row) => {
11998                for (column, _) in row.iter_fields() {
11999                    push_returning_policy_column(columns, column);
12000                }
12001            }
12002            crate::storage::EntityData::Node(node) => {
12003                push_returning_policy_column(columns, "label");
12004                push_returning_policy_column(columns, "node_type");
12005                for column in node.properties.keys() {
12006                    push_returning_policy_column(columns, column);
12007                }
12008            }
12009            crate::storage::EntityData::Edge(edge) => {
12010                push_returning_policy_column(columns, "label");
12011                push_returning_policy_column(columns, "from_rid");
12012                push_returning_policy_column(columns, "to_rid");
12013                push_returning_policy_column(columns, "weight");
12014                for column in edge.properties.keys() {
12015                    push_returning_policy_column(columns, column);
12016                }
12017            }
12018            _ => {}
12019        }
12020    }
12021}
12022
12023fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
12024    if returning_public_envelope_column(column) {
12025        return;
12026    }
12027    if !columns.iter().any(|seen| seen == column) {
12028        columns.push(column.to_string());
12029    }
12030}
12031
/// Reports whether `column` is one of the public envelope columns
/// (`rid`, `collection`, `kind`, `tenant`, `created_at`, `updated_at`,
/// `red_entity_id`), compared ASCII case-insensitively.
///
/// The comparison is done in place, so no lowercased copy of the name is
/// allocated per call.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|reserved| column.eq_ignore_ascii_case(reserved))
}
12038
12039fn returning_entity_matches_update_target(
12040    entity: &crate::storage::UnifiedEntity,
12041    target: crate::storage::query::ast::UpdateTarget,
12042) -> bool {
12043    use crate::storage::query::ast::UpdateTarget;
12044    match target {
12045        UpdateTarget::Rows => {
12046            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
12047        }
12048        UpdateTarget::Documents => {
12049            matches!(
12050                returning_row_item_kind(entity),
12051                Some(ReturningRowKind::Document)
12052            )
12053        }
12054        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
12055        UpdateTarget::Nodes => matches!(
12056            (&entity.kind, &entity.data),
12057            (
12058                crate::storage::EntityKind::GraphNode(_),
12059                crate::storage::EntityData::Node(_)
12060            )
12061        ),
12062        UpdateTarget::Edges => matches!(
12063            (&entity.kind, &entity.data),
12064            (
12065                crate::storage::EntityKind::GraphEdge(_),
12066                crate::storage::EntityData::Edge(_)
12067            )
12068        ),
12069    }
12070}
12071
/// Classification of a row-shaped entity, as produced by
/// `returning_row_item_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReturningRowKind {
    /// A row that is neither KV-shaped nor carries a JSON value.
    Row,
    /// A row with at least one JSON-valued field.
    Document,
    /// A row whose fields are all named `key` or `value`.
    Kv,
}
12078
12079fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
12080    let row = entity.data.as_row()?;
12081    let is_kv = row.iter_fields().all(|(column, _)| {
12082        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
12083    });
12084    if is_kv {
12085        return Some(ReturningRowKind::Kv);
12086    }
12087    let is_document = row
12088        .iter_fields()
12089        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
12090    if is_document {
12091        Some(ReturningRowKind::Document)
12092    } else {
12093        Some(ReturningRowKind::Row)
12094    }
12095}
12096
12097fn requested_table_columns_for_policy(
12098    table: &crate::storage::query::ast::TableQuery,
12099) -> Vec<String> {
12100    use crate::storage::query::sql_lowering::{
12101        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
12102        effective_table_projections,
12103    };
12104
12105    let table_name = table.table.as_str();
12106    let table_alias = table.alias.as_deref();
12107    let mut columns = std::collections::BTreeSet::new();
12108
12109    for projection in effective_table_projections(table) {
12110        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
12111    }
12112    if let Some(filter) = effective_table_filter(table) {
12113        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12114    }
12115    for expr in effective_table_group_by_exprs(table) {
12116        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
12117    }
12118    if let Some(filter) = effective_table_having_filter(table) {
12119        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12120    }
12121    for order in &table.order_by {
12122        if let Some(expr) = order.expr.as_ref() {
12123            collect_expr_columns(expr, table_name, table_alias, &mut columns);
12124        } else {
12125            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
12126        }
12127    }
12128
12129    columns.into_iter().collect()
12130}
12131
12132fn collect_projection_columns(
12133    projection: &crate::storage::query::ast::Projection,
12134    table_name: &str,
12135    table_alias: Option<&str>,
12136    columns: &mut std::collections::BTreeSet<String>,
12137) {
12138    use crate::storage::query::ast::Projection;
12139    match projection {
12140        Projection::All => {
12141            columns.insert("*".to_string());
12142        }
12143        Projection::Column(column) | Projection::Alias(column, _) => {
12144            if column != "*" {
12145                columns.insert(column.clone());
12146            }
12147        }
12148        Projection::Function(_, args) => {
12149            for arg in args {
12150                collect_projection_columns(arg, table_name, table_alias, columns);
12151            }
12152        }
12153        Projection::Expression(filter, _) => {
12154            collect_filter_columns(filter, table_name, table_alias, columns);
12155        }
12156        Projection::Field(field, _) => {
12157            collect_field_ref_column(field, table_name, table_alias, columns);
12158        }
12159        // Slice 7a (#589): no runtime support yet; recurse into args so
12160        // any column references are still tracked in case a future
12161        // executor needs the column set.
12162        Projection::Window { args, .. } => {
12163            for arg in args {
12164                collect_projection_columns(arg, table_name, table_alias, columns);
12165            }
12166        }
12167    }
12168}
12169
12170fn collect_filter_columns(
12171    filter: &crate::storage::query::ast::Filter,
12172    table_name: &str,
12173    table_alias: Option<&str>,
12174    columns: &mut std::collections::BTreeSet<String>,
12175) {
12176    use crate::storage::query::ast::Filter;
12177    match filter {
12178        Filter::Compare { field, .. }
12179        | Filter::IsNull(field)
12180        | Filter::IsNotNull(field)
12181        | Filter::In { field, .. }
12182        | Filter::Between { field, .. }
12183        | Filter::Like { field, .. }
12184        | Filter::StartsWith { field, .. }
12185        | Filter::EndsWith { field, .. }
12186        | Filter::Contains { field, .. } => {
12187            collect_field_ref_column(field, table_name, table_alias, columns);
12188        }
12189        Filter::CompareFields { left, right, .. } => {
12190            collect_field_ref_column(left, table_name, table_alias, columns);
12191            collect_field_ref_column(right, table_name, table_alias, columns);
12192        }
12193        Filter::CompareExpr { lhs, rhs, .. } => {
12194            collect_expr_columns(lhs, table_name, table_alias, columns);
12195            collect_expr_columns(rhs, table_name, table_alias, columns);
12196        }
12197        Filter::And(left, right) | Filter::Or(left, right) => {
12198            collect_filter_columns(left, table_name, table_alias, columns);
12199            collect_filter_columns(right, table_name, table_alias, columns);
12200        }
12201        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
12202    }
12203}
12204
12205fn collect_expr_columns(
12206    expr: &crate::storage::query::ast::Expr,
12207    table_name: &str,
12208    table_alias: Option<&str>,
12209    columns: &mut std::collections::BTreeSet<String>,
12210) {
12211    use crate::storage::query::ast::Expr;
12212    match expr {
12213        Expr::Column { field, .. } => {
12214            collect_field_ref_column(field, table_name, table_alias, columns);
12215        }
12216        Expr::Literal { .. } | Expr::Parameter { .. } => {}
12217        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
12218            collect_expr_columns(operand, table_name, table_alias, columns);
12219        }
12220        Expr::BinaryOp { lhs, rhs, .. } => {
12221            collect_expr_columns(lhs, table_name, table_alias, columns);
12222            collect_expr_columns(rhs, table_name, table_alias, columns);
12223        }
12224        Expr::FunctionCall { args, .. } => {
12225            for arg in args {
12226                collect_expr_columns(arg, table_name, table_alias, columns);
12227            }
12228        }
12229        Expr::Case {
12230            branches, else_, ..
12231        } => {
12232            for (condition, value) in branches {
12233                collect_expr_columns(condition, table_name, table_alias, columns);
12234                collect_expr_columns(value, table_name, table_alias, columns);
12235            }
12236            if let Some(value) = else_ {
12237                collect_expr_columns(value, table_name, table_alias, columns);
12238            }
12239        }
12240        Expr::IsNull { operand, .. } => {
12241            collect_expr_columns(operand, table_name, table_alias, columns);
12242        }
12243        Expr::InList { target, values, .. } => {
12244            collect_expr_columns(target, table_name, table_alias, columns);
12245            for value in values {
12246                collect_expr_columns(value, table_name, table_alias, columns);
12247            }
12248        }
12249        Expr::Between {
12250            target, low, high, ..
12251        } => {
12252            collect_expr_columns(target, table_name, table_alias, columns);
12253            collect_expr_columns(low, table_name, table_alias, columns);
12254            collect_expr_columns(high, table_name, table_alias, columns);
12255        }
12256        Expr::Subquery { .. } => {}
12257        Expr::WindowFunctionCall { args, window, .. } => {
12258            for arg in args {
12259                collect_expr_columns(arg, table_name, table_alias, columns);
12260            }
12261            for e in &window.partition_by {
12262                collect_expr_columns(e, table_name, table_alias, columns);
12263            }
12264            for o in &window.order_by {
12265                collect_expr_columns(&o.expr, table_name, table_alias, columns);
12266            }
12267        }
12268    }
12269}
12270
12271fn collect_field_ref_column(
12272    field: &crate::storage::query::ast::FieldRef,
12273    table_name: &str,
12274    table_alias: Option<&str>,
12275    columns: &mut std::collections::BTreeSet<String>,
12276) {
12277    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
12278        if column != "*" {
12279            columns.insert(column);
12280        }
12281    }
12282}
12283
12284fn policy_column_name_from_field_ref(
12285    field: &crate::storage::query::ast::FieldRef,
12286    table_name: &str,
12287    table_alias: Option<&str>,
12288) -> Option<String> {
12289    match field {
12290        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
12291            if column == "*" {
12292                return Some("*".to_string());
12293            }
12294            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
12295                Some(column.clone())
12296            } else {
12297                Some(format!("{table}.{column}"))
12298            }
12299        }
12300        _ => None,
12301    }
12302}
12303
12304fn legacy_resource_to_iam(
12305    resource: &crate::auth::privileges::Resource,
12306    tenant: Option<&str>,
12307) -> crate::auth::policies::ResourceRef {
12308    use crate::auth::privileges::Resource;
12309
12310    let (kind, name) = match resource {
12311        Resource::Database => ("database".to_string(), "*".to_string()),
12312        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
12313        Resource::Table { schema, table } => (
12314            "table".to_string(),
12315            match schema {
12316                Some(s) => format!("{s}.{table}"),
12317                None => table.clone(),
12318            },
12319        ),
12320        Resource::Function { schema, name } => (
12321            "function".to_string(),
12322            match schema {
12323                Some(s) => format!("{s}.{name}"),
12324                None => name.clone(),
12325            },
12326        ),
12327    };
12328
12329    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
12330    if let Some(t) = tenant {
12331        out = out.with_tenant(t.to_string());
12332    }
12333    out
12334}
12335
/// One table-backed side of a join, as produced by `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// Name of the underlying table.
    table: String,
    /// Alias the query uses for the table; equals `table` when the query
    /// declares no alias.
    alias: String,
}
12341
12342fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
12343    match expr {
12344        QueryExpr::Table(table) => Some(JoinTableSide {
12345            table: table.table.clone(),
12346            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
12347        }),
12348        _ => None,
12349    }
12350}
12351
12352fn collect_projection_columns_for_table(
12353    projection: &Projection,
12354    table: &str,
12355    alias: Option<&str>,
12356    out: &mut BTreeSet<String>,
12357) {
12358    match projection {
12359        Projection::Column(column) | Projection::Alias(column, _) => {
12360            match split_qualified_column(column) {
12361                Some((qualifier, column))
12362                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
12363                {
12364                    push_policy_column(column, out);
12365                }
12366                Some(_) => {}
12367                None => push_policy_column(column, out),
12368            }
12369        }
12370        Projection::Field(
12371            FieldRef::TableColumn {
12372                table: qualifier,
12373                column,
12374            },
12375            _,
12376        ) => {
12377            if qualifier.is_empty()
12378                || qualifier == table
12379                || alias.is_some_and(|alias| qualifier == alias)
12380            {
12381                push_policy_column(column, out);
12382            }
12383        }
12384        Projection::Field(
12385            FieldRef::NodeProperty {
12386                alias: qualifier,
12387                property,
12388            },
12389            _,
12390        )
12391        | Projection::Field(
12392            FieldRef::EdgeProperty {
12393                alias: qualifier,
12394                property,
12395            },
12396            _,
12397        ) => {
12398            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
12399                push_policy_column(property, out);
12400            }
12401        }
12402        Projection::Function(_, args) => {
12403            for arg in args {
12404                collect_projection_columns_for_table(arg, table, alias, out);
12405            }
12406        }
12407        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12408        Projection::Window { args, .. } => {
12409            for arg in args {
12410                collect_projection_columns_for_table(arg, table, alias, out);
12411            }
12412        }
12413    }
12414}
12415
12416fn collect_projection_columns_for_join_side(
12417    projection: &Projection,
12418    left: Option<&JoinTableSide>,
12419    right: Option<&JoinTableSide>,
12420    out: &mut HashMap<String, BTreeSet<String>>,
12421) -> RedDBResult<()> {
12422    match projection {
12423        Projection::Column(column) | Projection::Alias(column, _) => {
12424            if let Some((qualifier, column)) = split_qualified_column(column) {
12425                push_qualified_join_column(qualifier, column, left, right, out);
12426            } else {
12427                push_unqualified_join_column(column, left, right, out);
12428            }
12429        }
12430        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
12431            if table.is_empty() {
12432                push_unqualified_join_column(column, left, right, out);
12433            } else if let Some(side) = [left, right]
12434                .into_iter()
12435                .flatten()
12436                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
12437            {
12438                push_join_column(&side.table, column, out);
12439            }
12440        }
12441        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
12442        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
12443            push_qualified_join_column(alias, property, left, right, out);
12444        }
12445        Projection::Function(_, args) => {
12446            for arg in args {
12447                collect_projection_columns_for_join_side(arg, left, right, out)?;
12448            }
12449        }
12450        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12451        Projection::Window { args, .. } => {
12452            for arg in args {
12453                collect_projection_columns_for_join_side(arg, left, right, out)?;
12454            }
12455        }
12456    }
12457    Ok(())
12458}
12459
/// Splits a `qualifier.column` name into its two parts.
///
/// Returns `None` unless the input consists of exactly two non-empty
/// segments separated by a single `.`.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    let mut segments = column.split('.');
    let qualifier = segments.next()?;
    let name = segments.next()?;
    // A third segment means the name held more than one dot.
    if segments.next().is_some() {
        return None;
    }
    if qualifier.is_empty() || name.is_empty() {
        None
    } else {
        Some((qualifier, name))
    }
}
12467
12468fn push_qualified_join_column(
12469    qualifier: &str,
12470    column: &str,
12471    left: Option<&JoinTableSide>,
12472    right: Option<&JoinTableSide>,
12473    out: &mut HashMap<String, BTreeSet<String>>,
12474) {
12475    if let Some(side) = [left, right]
12476        .into_iter()
12477        .flatten()
12478        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
12479    {
12480        push_join_column(&side.table, column, out);
12481    }
12482}
12483
12484fn push_unqualified_join_column(
12485    column: &str,
12486    left: Option<&JoinTableSide>,
12487    right: Option<&JoinTableSide>,
12488    out: &mut HashMap<String, BTreeSet<String>>,
12489) {
12490    for side in [left, right].into_iter().flatten() {
12491        push_join_column(&side.table, column, out);
12492    }
12493}
12494
12495fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
12496    if is_policy_column_name(column) {
12497        out.entry(table.to_string())
12498            .or_default()
12499            .insert(column.to_string());
12500    }
12501}
12502
12503fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
12504    if is_policy_column_name(column) {
12505        out.insert(column.to_string());
12506    }
12507}
12508
/// Reports whether `column` is a name that column policies apply to.
///
/// The empty string, the `*` wildcard and names starting with `LIT:` or
/// `TYPE:` are excluded.
fn is_policy_column_name(column: &str) -> bool {
    if column.is_empty() || column == "*" {
        return false;
    }
    let has_marker_prefix = ["LIT:", "TYPE:"]
        .iter()
        .any(|prefix| column.starts_with(*prefix));
    !has_marker_prefix
}
12515
12516fn runtime_iam_context(
12517    role: crate::auth::Role,
12518    tenant: Option<&str>,
12519    principal_is_system_owned: bool,
12520) -> crate::auth::policies::EvalContext {
12521    crate::auth::policies::EvalContext {
12522        principal_tenant: tenant.map(|t| t.to_string()),
12523        current_tenant: tenant.map(|t| t.to_string()),
12524        peer_ip: None,
12525        mfa_present: false,
12526        now_ms: crate::auth::now_ms(),
12527        principal_is_admin_role: role == crate::auth::Role::Admin,
12528        principal_is_system_owned,
12529        principal_is_platform_scoped: tenant.is_none(),
12530    }
12531}
12532
12533fn explicit_table_projection_columns(
12534    query: &crate::storage::query::ast::TableQuery,
12535) -> Vec<String> {
12536    use crate::storage::query::ast::{FieldRef, Projection};
12537
12538    let mut columns = Vec::new();
12539    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
12540        match projection {
12541            Projection::Column(column) | Projection::Alias(column, _) => {
12542                push_unique(&mut columns, column)
12543            }
12544            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
12545                push_unique(&mut columns, column)
12546            }
12547            // SELECT * and expression/function projections need the
12548            // executor-wide column-policy context mapped in
12549            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
12550            _ => {}
12551        }
12552    }
12553    columns
12554}
12555
12556fn explicit_graph_projection_properties(
12557    query: &crate::storage::query::ast::GraphQuery,
12558) -> Vec<String> {
12559    use crate::storage::query::ast::{FieldRef, Projection};
12560
12561    let mut columns = Vec::new();
12562    for projection in &query.return_ {
12563        match projection {
12564            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
12565            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
12566                push_unique(&mut columns, property.clone())
12567            }
12568            _ => {}
12569        }
12570    }
12571    columns
12572}
12573
/// Append `column` to `columns` unless an equal entry is already present,
/// so the vector keeps first-seen order with no duplicates.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
12579
12580fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
12581    use crate::storage::query::ast::PolicyPrincipalRef;
12582    match p {
12583        PolicyPrincipalRef::User(u) => match &u.tenant {
12584            Some(t) => format!("user:{t}/{}", u.username),
12585            None => format!("user:{}", u.username),
12586        },
12587        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
12588    }
12589}
12590
12591/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
12592/// shape used by every audit emit + the simulator response.
12593pub(crate) fn decision_to_strings(
12594    d: &crate::auth::policies::Decision,
12595) -> (String, Option<String>, Option<String>) {
12596    use crate::auth::policies::Decision;
12597    match d {
12598        Decision::Allow {
12599            matched_policy_id,
12600            matched_sid,
12601        } => (
12602            "allow".into(),
12603            Some(matched_policy_id.clone()),
12604            matched_sid.clone(),
12605        ),
12606        Decision::Deny {
12607            matched_policy_id,
12608            matched_sid,
12609        } => (
12610            "deny".into(),
12611            Some(matched_policy_id.clone()),
12612            matched_sid.clone(),
12613        ),
12614        Decision::DefaultDeny => ("default_deny".into(), None, None),
12615        Decision::AdminBypass => ("admin_bypass".into(), None, None),
12616    }
12617}
12618
12619fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
12620    let mut scopes = Vec::new();
12621    collect_relation_scopes(query, &mut scopes);
12622    scopes.sort();
12623    scopes.dedup();
12624    scopes
12625}
12626
12627fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
12628    match query {
12629        QueryExpr::Table(table) => {
12630            if !table.table.is_empty() {
12631                scopes.push(table.table.clone());
12632            }
12633            if let Some(alias) = &table.alias {
12634                scopes.push(alias.clone());
12635            }
12636        }
12637        QueryExpr::Join(join) => {
12638            collect_relation_scopes(&join.left, scopes);
12639            collect_relation_scopes(&join.right, scopes);
12640        }
12641        _ => {}
12642    }
12643}
12644
12645fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
12646    let inner_scopes = relation_scopes_for_query(query);
12647    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
12648}
12649
12650fn query_expr_references_outer_scope(
12651    query: &QueryExpr,
12652    outer_scopes: &[String],
12653    inner_scopes: &[String],
12654) -> bool {
12655    match query {
12656        QueryExpr::Table(table) => {
12657            table.select_items.iter().any(|item| match item {
12658                crate::storage::query::ast::SelectItem::Wildcard => false,
12659                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12660                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12661                }
12662            }) || table
12663                .where_expr
12664                .as_ref()
12665                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12666                || table.filter.as_ref().is_some_and(|filter| {
12667                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12668                })
12669                || table.having_expr.as_ref().is_some_and(|expr| {
12670                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12671                })
12672                || table.having.as_ref().is_some_and(|filter| {
12673                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12674                })
12675                || table
12676                    .group_by_exprs
12677                    .iter()
12678                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12679                || table.order_by.iter().any(|clause| {
12680                    clause.expr.as_ref().is_some_and(|expr| {
12681                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12682                    })
12683                })
12684        }
12685        QueryExpr::Join(join) => {
12686            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
12687                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
12688                || join.filter.as_ref().is_some_and(|filter| {
12689                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12690                })
12691                || join.return_items.iter().any(|item| match item {
12692                    crate::storage::query::ast::SelectItem::Wildcard => false,
12693                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12694                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12695                    }
12696                })
12697        }
12698        _ => false,
12699    }
12700}
12701
12702fn filter_references_outer_scope(
12703    filter: &crate::storage::query::ast::Filter,
12704    outer_scopes: &[String],
12705    inner_scopes: &[String],
12706) -> bool {
12707    use crate::storage::query::ast::Filter;
12708    match filter {
12709        Filter::Compare { field, .. }
12710        | Filter::IsNull(field)
12711        | Filter::IsNotNull(field)
12712        | Filter::In { field, .. }
12713        | Filter::Between { field, .. }
12714        | Filter::Like { field, .. }
12715        | Filter::StartsWith { field, .. }
12716        | Filter::EndsWith { field, .. }
12717        | Filter::Contains { field, .. } => {
12718            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12719        }
12720        Filter::CompareFields { left, right, .. } => {
12721            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
12722                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
12723        }
12724        Filter::CompareExpr { lhs, rhs, .. } => {
12725            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12726                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12727        }
12728        Filter::And(left, right) | Filter::Or(left, right) => {
12729            filter_references_outer_scope(left, outer_scopes, inner_scopes)
12730                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
12731        }
12732        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
12733    }
12734}
12735
12736fn expr_references_outer_scope(
12737    expr: &crate::storage::query::ast::Expr,
12738    outer_scopes: &[String],
12739    inner_scopes: &[String],
12740) -> bool {
12741    use crate::storage::query::ast::Expr;
12742    match expr {
12743        Expr::Column { field, .. } => {
12744            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12745        }
12746        Expr::BinaryOp { lhs, rhs, .. } => {
12747            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12748                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12749        }
12750        Expr::UnaryOp { operand, .. }
12751        | Expr::Cast { inner: operand, .. }
12752        | Expr::IsNull { operand, .. } => {
12753            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
12754        }
12755        Expr::FunctionCall { args, .. } => args
12756            .iter()
12757            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
12758        Expr::Case {
12759            branches, else_, ..
12760        } => {
12761            branches.iter().any(|(cond, value)| {
12762                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
12763                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
12764            }) || else_
12765                .as_ref()
12766                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12767        }
12768        Expr::InList { target, values, .. } => {
12769            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12770                || values
12771                    .iter()
12772                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
12773        }
12774        Expr::Between {
12775            target, low, high, ..
12776        } => {
12777            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12778                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
12779                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
12780        }
12781        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
12782        Expr::Literal { .. } | Expr::Parameter { .. } => false,
12783        Expr::WindowFunctionCall { args, window, .. } => {
12784            args.iter()
12785                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
12786                || window
12787                    .partition_by
12788                    .iter()
12789                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
12790                || window
12791                    .order_by
12792                    .iter()
12793                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
12794        }
12795    }
12796}
12797
12798fn field_ref_references_outer_scope(
12799    field: &crate::storage::query::ast::FieldRef,
12800    outer_scopes: &[String],
12801    inner_scopes: &[String],
12802) -> bool {
12803    match field {
12804        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
12805            outer_scopes.iter().any(|scope| scope == table)
12806                && !inner_scopes.iter().any(|scope| scope == table)
12807        }
12808        _ => false,
12809    }
12810}
12811
12812fn first_column_values(
12813    result: crate::storage::query::unified::UnifiedResult,
12814) -> RedDBResult<Vec<Value>> {
12815    if result.columns.len() > 1 {
12816        return Err(RedDBError::Query(
12817            "expression subquery must return exactly one column".to_string(),
12818        ));
12819    }
12820    let fallback_column = result
12821        .records
12822        .first()
12823        .and_then(|record| record.column_names().into_iter().next())
12824        .map(|name| name.to_string());
12825    let column = result.columns.first().cloned().or(fallback_column);
12826    let Some(column) = column else {
12827        return Ok(Vec::new());
12828    };
12829    Ok(result
12830        .records
12831        .iter()
12832        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
12833        .collect())
12834}
12835
/// Parse a timestamp literal into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
/// * a bare non-negative integer, taken verbatim as milliseconds;
/// * a `YYYY-MM-DD` date, optionally followed by whitespace and further
///   text (such as a time of day) which is ignored — the result is
///   midnight at the start of that date. No time-zone handling is done;
///   full RFC 3339 parsing is still a stretch goal.
///
/// Returns `None` when the input matches neither form, when the month or
/// day does not exist in the calendar (`2030-02-30`, month `13`, day `0`),
/// when the year does not fit in an `i32`, or when the date lies before
/// 1970-01-01 and therefore has no unsigned millisecond representation.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date form: only the first whitespace-separated token is considered.
    let date = s.split_whitespace().next()?;
    let mut fields = date.split('-');
    let (year, month, day) = (fields.next()?, fields.next()?, fields.next()?);
    if fields.next().is_some() {
        return None;
    }

    // Parsing the year as `i32` bounds it so the day arithmetic below
    // cannot overflow `i64`.
    let year = i64::from(year.parse::<i32>().ok()?);
    let month = month.parse::<u32>().ok()?;
    let day = day.parse::<u32>().ok()?;

    // Reject dates that do not exist instead of letting them roll over
    // into a neighbouring month.
    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return None,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    let days = days_from_civil(year, month, day);
    if days < 0 {
        // Before the epoch: not representable as unsigned milliseconds.
        return None;
    }
    // Lossless: `days` was just checked to be non-negative.
    Some(days as u128 * MS_PER_DAY)
}

/// Days from the Unix epoch (1970-01-01) to the given proleptic Gregorian
/// date, using H. Hinnant's `days_from_civil` algorithm. Dates before the
/// epoch yield negative values.
///
/// `m` and `d` are not validated: the arithmetic is signed, so an
/// out-of-range day (for example `0`) extrapolates from the start of the
/// month instead of underflowing. Callers that need strict dates must
/// validate first, as `parse_timestamp_to_ms` does. Extremely large `y`
/// magnitudes can still overflow `i64`.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    let (m, d) = (i64::from(m), i64::from(d));
    // Treat January and February as months 11 and 12 of the previous
    // year so the leap day falls at the end of the computational year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400);
    let yoe = y.rem_euclid(400); // [0, 399]
    let mp = if m > 2 { m - 3 } else { m + 9 }; // March = 0
    let doy = (153 * mp + 2) / 5 + d - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146_097 + doe - 719_468
}
12870
12871fn walk_plan_node(
12872    node: &crate::storage::query::planner::CanonicalLogicalNode,
12873    depth: usize,
12874    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
12875) {
12876    use std::sync::Arc;
12877    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
12878    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
12879    rec.set_arc(
12880        Arc::from("source"),
12881        node.source.clone().map(Value::text).unwrap_or(Value::Null),
12882    );
12883    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
12884    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
12885    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
12886    out.push(rec);
12887    for child in &node.children {
12888        walk_plan_node(child, depth + 1, out);
12889    }
12890}
12891
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parse `sql` and return the result-cache scopes derived from its
    /// query expression.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let parsed = crate::storage::query::parser::parse(sql).expect("parse");
        query_expr_result_cache_scopes(&parsed.query)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // Issue #799: the inline form's result-cache key has to be derived
        // from the `nodes`/`edges` source collections, so that a write to
        // either one invalidates the cached result.
        let sql = "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_cache_scope_is_graph_argument() {
        // The graph-collection form still materializes the active graph,
        // but invalidation is scoped to the named graph argument so that
        // INSERT INTO g NODE/EDGE invalidates cached TVF rows.
        let scopes = scopes_for("SELECT * FROM components(g)");
        assert!(scopes.contains("g"), "collection form scoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches both edges.
        let nodes = vec![String::from("a"), String::from("b"), String::from("c")];
        let edges = vec![
            (String::from("a"), String::from("b"), 1.0_f32),
            (String::from("b"), String::from("c"), 1.0_f32),
        ];
        let expected = vec![
            (String::from("a"), 1),
            (String::from("b"), 2),
            (String::from("c"), 1),
        ];
        let degrees = abstract_degree_centrality(&nodes, &edges);
        assert_eq!(degrees, expected);
    }
}