// reddb_server/runtime/impl_core.rs

1use super::*;
2use crate::auth::column_policy_gate::ColumnAccessRequest;
3use crate::auth::UserId;
4use crate::replication::cdc::ChangeRecord;
5use crate::storage::query::ast::TableSource;
6
7/// Read a numeric score column out of a result record as `f64`, matching
8/// the column name case-insensitively. Used by the leaderboard-rank head
9/// walk (#918) to compare scores; non-numeric / missing columns yield
10/// `None` so a row with no comparable score never shifts a rank.
11fn record_column_f64(
12    rec: &crate::storage::query::unified::UnifiedRecord,
13    column: &str,
14) -> Option<f64> {
15    let value = rec
16        .get(column)
17        .or_else(|| rec.get(&column.to_lowercase()))?;
18    match value {
19        Value::Integer(n) => Some(*n as f64),
20        Value::UnsignedInteger(n) => Some(*n as f64),
21        Value::Float(n) => Some(*n),
22        Value::Timestamp(n) | Value::Duration(n) => Some(*n as f64),
23        _ => None,
24    }
25}
26
27fn record_rid_u64(rec: &crate::storage::query::unified::UnifiedRecord) -> Option<u64> {
28    match rec.get("rid") {
29        Some(Value::UnsignedInteger(n)) => Some(*n),
30        Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
31        _ => None,
32    }
33}
34
35fn seed_storage_deploy_config(
36    store: &crate::storage::UnifiedStore,
37    selection: crate::storage::StorageProfileSelection,
38) {
39    store.set_config_tree(
40        "storage.deploy",
41        &crate::json!({
42            "profile": selection.deploy_profile.as_str(),
43            "packaging": selection.packaging.as_str(),
44            "preset": selection.preset_name(),
45            "replica_count": selection.replica_count,
46            "managed_backup": selection.managed_backup,
47            "wal_retention": selection.wal_retention,
48        }),
49    );
50}
51
52struct RankedHeadEntry {
53    rank: u64,
54    record: crate::storage::query::unified::UnifiedRecord,
55}
56
57fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
58    match value {
59        Value::Text(s) => Ok(s.to_string()),
60        Value::Integer(n) => Ok(n.to_string()),
61        Value::UnsignedInteger(n) => Ok(n.to_string()),
62        Value::Float(n) => Ok(n.to_string()),
63        Value::Boolean(b) => Ok(b.to_string()),
64        Value::Null => Err(RedDBError::Query(
65            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
66                .to_string(),
67        )),
68        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
69            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
70                .to_string(),
71        )),
72        _ => Err(RedDBError::Query(format!(
73            "SET SECRET does not support value type {:?} yet",
74            value.data_type()
75        ))),
76    }
77}
78
79fn insert_config_json_path(
80    root: &mut crate::serde_json::Value,
81    path: &str,
82    value: crate::serde_json::Value,
83) {
84    let segments: Vec<&str> = path
85        .split('.')
86        .filter(|segment| !segment.is_empty())
87        .collect();
88    insert_config_json_segments(root, &segments, value);
89}
90
91fn insert_config_json_segments(
92    root: &mut crate::serde_json::Value,
93    segments: &[&str],
94    value: crate::serde_json::Value,
95) {
96    if segments.is_empty() {
97        *root = value;
98        return;
99    }
100
101    if !matches!(root, crate::serde_json::Value::Object(_)) {
102        *root = crate::serde_json::Value::Object(crate::serde_json::Map::new());
103    }
104
105    let crate::serde_json::Value::Object(map) = root else {
106        return;
107    };
108    if segments.len() == 1 {
109        map.insert(segments[0].to_string(), value);
110        return;
111    }
112    let entry = map
113        .entry(segments[0].to_string())
114        .or_insert_with(|| crate::serde_json::Value::Object(crate::serde_json::Map::new()));
115    insert_config_json_segments(entry, &segments[1..], value);
116}
117
118fn show_config_json_result(
119    query: &str,
120    mode: crate::storage::query::modes::QueryMode,
121    prefix: &Option<String>,
122    value: crate::serde_json::Value,
123) -> RuntimeQueryResult {
124    let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
125    let mut record = UnifiedRecord::new();
126    record.set(
127        "key",
128        prefix
129            .as_ref()
130            .map(|key| Value::text(key.clone()))
131            .unwrap_or(Value::Null),
132    );
133    record.set("value", Value::Json(value.to_string_compact().into_bytes()));
134    result.push(record);
135    RuntimeQueryResult {
136        query: query.to_string(),
137        mode,
138        statement: "show_config_json",
139        engine: "runtime-config",
140        result,
141        affected_rows: 0,
142        statement_type: "select",
143        bookmark: None,
144    }
145}
146
147#[derive(Clone)]
148struct QueryControlEventSpec {
149    kind: crate::runtime::control_events::EventKind,
150    action: &'static str,
151    resource: Option<String>,
152    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
153}
154
/// What to audit for a data statement: its verb and the collections touched.
#[derive(Clone)]
struct QueryAuditPlan {
    /// Statement verb: `select`, `insert`, `update` or `delete`.
    statement_kind: &'static str,
    /// Collection names referenced by the statement.
    collections: Vec<String>,
}
160
161fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
162    let mut collections = Vec::new();
163    let statement_kind = match expr {
164        QueryExpr::Table(table) => {
165            push_query_audit_collection(&mut collections, &table.table);
166            "select"
167        }
168        QueryExpr::Join(join) => {
169            collect_query_audit_collections(&join.left, &mut collections);
170            collect_query_audit_collections(&join.right, &mut collections);
171            "select"
172        }
173        QueryExpr::Insert(insert) => {
174            push_query_audit_collection(&mut collections, &insert.table);
175            "insert"
176        }
177        QueryExpr::Update(update) => {
178            push_query_audit_collection(&mut collections, &update.table);
179            "update"
180        }
181        QueryExpr::Delete(delete) => {
182            push_query_audit_collection(&mut collections, &delete.table);
183            "delete"
184        }
185        _ => return None,
186    };
187    if collections.is_empty() {
188        None
189    } else {
190        Some(QueryAuditPlan {
191            statement_kind,
192            collections,
193        })
194    }
195}
196
197fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
198    match expr {
199        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
200        QueryExpr::Join(join) => {
201            collect_query_audit_collections(&join.left, collections);
202            collect_query_audit_collections(&join.right, collections);
203        }
204        _ => {}
205    }
206}
207
/// Append `name` to `collections` unless it is internal or already listed.
///
/// Names equal to `red`, or starting with `red.` or `__red_schema_`, are
/// never recorded. Existing entries keep their position, so the list stays in
/// first-seen order without duplicates.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_internal =
        name == "red" || name.starts_with("red.") || name.starts_with("__red_schema_");
    let already_listed = collections
        .iter()
        .any(|existing| existing.as_str() == name);
    if !(is_internal || already_listed) {
        collections.push(name.to_string());
    }
}
216
/// Collection name `red_index_registry`.
///
/// NOTE(review): presumably the collection backing the runtime index
/// registry — no user of this constant is visible here, confirm at call sites.
const RUNTIME_INDEX_REGISTRY_COLLECTION: &str = "red_index_registry";
218
219impl RedDBRuntime {
220    fn execute_create_metric(
221        &self,
222        raw_query: &str,
223        query: &crate::storage::query::ast::CreateMetricQuery,
224    ) -> RedDBResult<RuntimeQueryResult> {
225        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
226        let store = self.inner.db.store();
227        super::metric_descriptor_catalog::create(
228            store.as_ref(),
229            &query.path,
230            &query.kind,
231            &query.role,
232            super::metric_descriptor_catalog::DerivedSpec {
233                source: query.source.clone(),
234                query: query.query.clone(),
235                window_ms: query.window_ms,
236                time_field: query.time_field.clone(),
237            },
238        )?;
239        self.invalidate_result_cache();
240        Ok(RuntimeQueryResult::ok_message(
241            raw_query.to_string(),
242            &format!("metric descriptor '{}' created", query.path),
243            "create",
244        ))
245    }
246
247    /// `CREATE RANKING <name> ON <table> (<column> [ASC|DESC]) [TOP <k>]`
248    /// — declare a Ranking capability over an ordinary table's score
249    /// column (issue #918 / ADR 0035). Persists a WAL-backed catalog
250    /// record; no new Collection model is introduced. Authorized through
251    /// the same DDL write gate as `CREATE METRIC`/`CREATE INDEX`.
252    fn execute_create_ranking(
253        &self,
254        raw_query: &str,
255        req: super::ranking_descriptor_catalog::CreateRankingRequest,
256    ) -> RedDBResult<RuntimeQueryResult> {
257        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
258        let store = self.inner.db.store();
259        let descriptor = super::ranking_descriptor_catalog::create(store.as_ref(), &req)?;
260        self.invalidate_result_cache();
261        Ok(RuntimeQueryResult::ok_message(
262            raw_query.to_string(),
263            &format!(
264                "ranking '{}' created on {}({})",
265                descriptor.name, descriptor.table, descriptor.column
266            ),
267            "create",
268        ))
269    }
270
271    /// `SHOW RANKINGS` — project the declared Ranking capabilities back as
272    /// rows, so a declared capability is observable (the Analytics
273    /// "prefer SELECT over admin verbs" rule).
274    fn execute_show_rankings(&self, raw_query: &str) -> RedDBResult<RuntimeQueryResult> {
275        let store = self.inner.db.store();
276        let entries = super::ranking_descriptor_catalog::list(store.as_ref());
277        let columns = vec![
278            "name".to_string(),
279            "table".to_string(),
280            "column".to_string(),
281            "direction".to_string(),
282            "top_k".to_string(),
283        ];
284        let rows = entries
285            .into_iter()
286            .map(|e| {
287                vec![
288                    ("name".to_string(), Value::text(e.name)),
289                    ("table".to_string(), Value::text(e.table)),
290                    ("column".to_string(), Value::text(e.column)),
291                    (
292                        "direction".to_string(),
293                        Value::text(if e.descending { "DESC" } else { "ASC" }.to_string()),
294                    ),
295                    ("top_k".to_string(), Value::UnsignedInteger(e.top_k)),
296                ]
297            })
298            .collect();
299        let mut result =
300            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
301        result.statement = "rank_of";
302        result.engine = "runtime-rank";
303        Ok(result)
304    }
305
306    /// `RANK OF <id> IN <name>` — exact, MVCC-correct rank of a specific
307    /// row within the capability's bounded top-K head (issue #918).
308    ///
309    /// Returns a single `rank` row when the row is visible *and* falls
310    /// inside the exact head; an empty result otherwise (not visible, or
311    /// in the approximate tail — a separate slice). The computation runs
312    /// entirely over the regular read pipeline so it inherits MVCC
313    /// visibility, RLS/policy, and tenant scope from ordinary reads.
314    fn execute_rank_of(
315        &self,
316        raw_query: &str,
317        req: &crate::storage::query::ast::RankOfQuery,
318    ) -> RedDBResult<RuntimeQueryResult> {
319        let store = self.inner.db.store();
320        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
321            .ok_or_else(|| {
322                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
323            })?;
324        let rank = self.compute_exact_head_rank(&descriptor, req.entity_id)?;
325        let columns = vec!["rank".to_string()];
326        let rows = match rank {
327            Some(rank) => vec![vec![("rank".to_string(), Value::UnsignedInteger(rank))]],
328            None => Vec::new(),
329        };
330        let mut result =
331            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
332        result.statement = "rank_range";
333        result.engine = "runtime-rank";
334        Ok(result)
335    }
336
337    /// `RANK RANGE <lo> TO <hi> IN <name>` — exact, MVCC-correct entries
338    /// occupying a contiguous rank range within the bounded top-K head.
339    ///
340    /// The output is in leaderboard order and includes `rank` plus the
341    /// row columns returned by the canonical exact-head SQL read.
342    fn execute_rank_range(
343        &self,
344        raw_query: &str,
345        req: &crate::storage::query::ast::RankRangeQuery,
346    ) -> RedDBResult<RuntimeQueryResult> {
347        let store = self.inner.db.store();
348        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
349            .ok_or_else(|| {
350                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
351            })?;
352        let (head_columns, entries) = self.compute_ranked_head_entries(&descriptor)?;
353
354        let mut columns = Vec::with_capacity(head_columns.len() + 1);
355        columns.push("rank".to_string());
356        for column in &head_columns {
357            if column != "rank" {
358                columns.push(column.clone());
359            }
360        }
361
362        let rows = entries
363            .into_iter()
364            .filter(|entry| entry.rank >= req.lo && entry.rank <= req.hi)
365            .map(|entry| {
366                let mut row = Vec::with_capacity(columns.len());
367                row.push(("rank".to_string(), Value::UnsignedInteger(entry.rank)));
368                for column in &head_columns {
369                    if column == "rank" {
370                        continue;
371                    }
372                    if let Some(value) = entry.record.get(column) {
373                        row.push((column.clone(), value.clone()));
374                    }
375                }
376                row
377            })
378            .collect();
379        let mut result =
380            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
381        result.statement = "approx_rank_of";
382        result.engine = "runtime-rank";
383        Ok(result)
384    }
385
386    /// Compute the exact rank of `target_id` within the descriptor's
387    /// bounded top-K head, or `None` if the row is invisible to the
388    /// querying snapshot or beyond the exact head.
389    ///
390    /// Faithful to ADR 0035: it walks the sorted index head
391    /// (`ORDER BY <col> {DESC|ASC} LIMIT k`, served by
392    /// `try_sorted_index_lookup` + the per-row MVCC visibility re-check)
393    /// and counts only rows visible to the current snapshot. Running the
394    /// head scan through `execute_query_inner` keeps it on the same
395    /// snapshot/tenant/policy frame as ordinary reads, so the rank agrees
396    /// with `ORDER BY <col> {DESC|ASC} LIMIT` under that snapshot by
397    /// construction. RANK semantics: tied scores share a rank, so the
398    /// rank is `1 + (number of strictly-better visible rows)`.
399    fn compute_exact_head_rank(
400        &self,
401        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
402        target_id: u64,
403    ) -> RedDBResult<Option<u64>> {
404        let (_columns, entries) = self.compute_ranked_head_entries(descriptor)?;
405        Ok(entries
406            .into_iter()
407            .find(|entry| record_rid_u64(&entry.record) == Some(target_id))
408            .map(|entry| entry.rank))
409    }
410
411    /// Return the exact head rows in deterministic rank order.
412    fn compute_ranked_head_entries(
413        &self,
414        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
415    ) -> RedDBResult<(Vec<String>, Vec<RankedHeadEntry>)> {
416        let table = &descriptor.table;
417        let column = &descriptor.column;
418
419        // The exact head: top-K rows in rank order. Each row here already
420        // passed MVCC visibility *and* RLS/tenant filtering during the
421        // scan, so identifying the target *within* this result (rather
422        // than via a separate `red_entity_id` lookup, which takes the
423        // direct entity-fetch path that bypasses the RLS gate) is what
424        // makes the rank honor policy/tenant scope (criterion 5).
425        let dir = if descriptor.descending { "DESC" } else { "ASC" };
426        let head_sql = format!(
427            "SELECT * FROM {table} ORDER BY {column} {dir}, rid ASC LIMIT {}",
428            descriptor.top_k
429        );
430        let head_result = self.execute_query_inner(&head_sql)?;
431
432        let mut entries = Vec::with_capacity(head_result.result.records.len());
433        let mut row_position = 0u64;
434        let mut current_rank = 0u64;
435        let mut previous_score: Option<f64> = None;
436        for rec in &head_result.result.records {
437            let Some(score) = record_column_f64(rec, column) else {
438                continue;
439            };
440            row_position += 1;
441            current_rank = if previous_score == Some(score) {
442                current_rank
443            } else {
444                row_position
445            };
446            previous_score = Some(score);
447            entries.push(RankedHeadEntry {
448                rank: current_rank,
449                record: rec.clone(),
450            });
451        }
452        Ok((head_result.result.columns, entries))
453    }
454
455    /// `APPROX RANK OF <id> IN <name>` — the *approximate tail* read
456    /// (issue #923 / ADR 0035). Serves an explicitly-approximate
457    /// percentile / rank for an entry below the exact top-K head from a
458    /// per-`(table, column)` score sketch.
459    ///
460    /// The result is always labeled approximate (`approximate = true`,
461    /// distinct from the exact `RANK OF` surface which returns only a bare
462    /// `rank`) so a caller never reads a tail estimate as an exact head
463    /// position. An invisible / non-existent row yields no row, exactly
464    /// like the exact surface.
465    fn execute_approx_rank_of(
466        &self,
467        raw_query: &str,
468        req: &crate::storage::query::ast::RankOfQuery,
469    ) -> RedDBResult<RuntimeQueryResult> {
470        let store = self.inner.db.store();
471        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
472            .ok_or_else(|| {
473                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
474            })?;
475
476        let approx = self.compute_approx_rank(&descriptor, req.entity_id)?;
477        let columns = vec![
478            "rank".to_string(),
479            "percentile".to_string(),
480            "approximate".to_string(),
481        ];
482        let rows = match approx {
483            Some(approx) => vec![vec![
484                ("rank".to_string(), Value::UnsignedInteger(approx.rank)),
485                ("percentile".to_string(), Value::Float(approx.percentile)),
486                ("approximate".to_string(), Value::Boolean(true)),
487            ]],
488            None => Vec::new(),
489        };
490        Ok(RuntimeQueryResult::ok_records(
491            raw_query.to_string(),
492            columns,
493            rows,
494            "select",
495        ))
496    }
497
498    /// Refresh the per-`(table, column)` score sketch from the rows visible
499    /// to the current snapshot and return the target's approximate rank, or
500    /// `None` if the target row is invisible to this snapshot / tenant.
501    ///
502    /// The sketch is rebuilt from the live column on each read and persisted
503    /// back to `red_config` keyed by `(table, column)` — so it is maintained
504    /// per `(collection, score column)` and stays current as scores change
505    /// (criterion 4). The scan runs through `execute_query_inner`, inheriting
506    /// the same MVCC snapshot, RLS/tenant scope, and policy as ordinary
507    /// reads. The *approximation* is the histogram bucketing in
508    /// [`super::score_sketch::ScoreSketch`], not the data freshness, so the
509    /// estimate carries the documented error band even though it is built
510    /// from a full scan in this v0 (incremental maintenance is an ADR-0035
511    /// implementation detail, left open and reversible).
512    fn compute_approx_rank(
513        &self,
514        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
515        target_id: u64,
516    ) -> RedDBResult<Option<super::score_sketch::ApproxRank>> {
517        let table = &descriptor.table;
518        let column = &descriptor.column;
519
520        // Scan the visible rows once: it both feeds the sketch and locates
521        // the target's score under the same snapshot/tenant/policy frame.
522        let scan_sql = format!("SELECT * FROM {table}");
523        let scan = self.execute_query_inner(&scan_sql)?;
524        let records = &scan.result.records;
525
526        let mut scores: Vec<f64> = Vec::with_capacity(records.len());
527        let mut target_score: Option<f64> = None;
528        for rec in records {
529            let Some(score) = record_column_f64(rec, column) else {
530                continue;
531            };
532            scores.push(score);
533            let rid = match rec.get("rid") {
534                Some(Value::UnsignedInteger(n)) => Some(*n),
535                Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
536                _ => None,
537            };
538            if rid == Some(target_id) {
539                target_score = Some(score);
540            }
541        }
542
543        let sketch = super::score_sketch::ScoreSketch::from_scores(&scores);
544        // Persist the refreshed sketch per (table, column).
545        super::ranking_descriptor_catalog::save_sketch(
546            self.inner.db.store().as_ref(),
547            table,
548            column,
549            &sketch,
550        );
551
552        let Some(target_score) = target_score else {
553            // Not visible to this snapshot/tenant ⇒ no rank (matches exact).
554            return Ok(None);
555        };
556        Ok(sketch.approx_rank(target_score, descriptor.descending))
557    }
558
559    fn execute_alter_metric(
560        &self,
561        raw_query: &str,
562        query: &crate::storage::query::ast::AlterMetricQuery,
563    ) -> RedDBResult<RuntimeQueryResult> {
564        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
565        let store = self.inner.db.store();
566        super::metric_descriptor_catalog::update(
567            store.as_ref(),
568            &query.path,
569            query.set_role.as_deref(),
570            query.attempted_kind.as_deref(),
571            query.attempted_path.as_deref(),
572        )?;
573        self.invalidate_result_cache();
574        Ok(RuntimeQueryResult::ok_message(
575            raw_query.to_string(),
576            &format!("metric descriptor '{}' updated", query.path),
577            "alter",
578        ))
579    }
580
581    fn execute_create_slo(
582        &self,
583        raw_query: &str,
584        query: &crate::storage::query::ast::CreateSloQuery,
585    ) -> RedDBResult<RuntimeQueryResult> {
586        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
587        let store = self.inner.db.store();
588        super::slo_descriptor_catalog::create(
589            store.as_ref(),
590            &query.path,
591            &query.metric_path,
592            query.target,
593            query.window_ms,
594        )?;
595        self.invalidate_result_cache();
596        Ok(RuntimeQueryResult::ok_message(
597            raw_query.to_string(),
598            &format!("SLO descriptor '{}' created", query.path),
599            "create",
600        ))
601    }
602
603    fn execute_create_analytics_source(
604        &self,
605        raw_query: &str,
606        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
607    ) -> RedDBResult<RuntimeQueryResult> {
608        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
609        let store = self.inner.db.store();
610        let profile = super::analytics_source_catalog::create(
611            store.as_ref(),
612            &self.inner.db.collection_contracts(),
613            query,
614        )?;
615        self.invalidate_result_cache();
616        Ok(RuntimeQueryResult::ok_message(
617            raw_query.to_string(),
618            &format!("analytics source '{}' created", profile.name),
619            "create",
620        ))
621    }
622}
623
624fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
625    use crate::runtime::control_events::{EventKind, Sensitivity};
626
627    let mut specs = Vec::new();
628    let mut schema = |action: &'static str, resource: Option<String>| {
629        specs.push(QueryControlEventSpec {
630            kind: EventKind::SchemaDdl,
631            action,
632            resource,
633            fields: Vec::new(),
634        });
635    };
636    match expr {
637        QueryExpr::CreateTable(q) => {
638            schema("create_table", Some(format!("table:{}", q.name)));
639            if let Some(column) = &q.tenant_by {
640                specs.push(QueryControlEventSpec {
641                    kind: EventKind::TenantGovernance,
642                    action: "create_table_tenant_by",
643                    resource: Some(format!("table:{}", q.name)),
644                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
645                });
646            }
647        }
648        QueryExpr::CreateCollection(q) => {
649            schema("create_collection", Some(format!("collection:{}", q.name)));
650        }
651        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
652        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
653        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
654        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
655        QueryExpr::DropDocument(q) => {
656            schema("drop_document", Some(format!("document:{}", q.name)));
657        }
658        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
659        QueryExpr::DropCollection(q) => {
660            schema("drop_collection", Some(format!("collection:{}", q.name)));
661        }
662        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
663        QueryExpr::AlterTable(q) => {
664            schema("alter_table", Some(format!("table:{}", q.name)));
665            for op in &q.operations {
666                match op {
667                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
668                        specs.push(QueryControlEventSpec {
669                            kind: EventKind::RlsGovernance,
670                            action: "enable_rls",
671                            resource: Some(format!("table:{}", q.name)),
672                            fields: Vec::new(),
673                        });
674                    }
675                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
676                        specs.push(QueryControlEventSpec {
677                            kind: EventKind::RlsGovernance,
678                            action: "disable_rls",
679                            resource: Some(format!("table:{}", q.name)),
680                            fields: Vec::new(),
681                        });
682                    }
683                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
684                        specs.push(QueryControlEventSpec {
685                            kind: EventKind::TenantGovernance,
686                            action: "enable_tenancy",
687                            resource: Some(format!("table:{}", q.name)),
688                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
689                        });
690                    }
691                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
692                        specs.push(QueryControlEventSpec {
693                            kind: EventKind::TenantGovernance,
694                            action: "disable_tenancy",
695                            resource: Some(format!("table:{}", q.name)),
696                            fields: Vec::new(),
697                        });
698                    }
699                    _ => {}
700                }
701            }
702        }
703        QueryExpr::CreateIndex(q) => {
704            schema(
705                "create_index",
706                Some(format!("index:{}:{}", q.table, q.name)),
707            );
708        }
709        QueryExpr::DropIndex(q) => {
710            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
711        }
712        QueryExpr::CreateTimeSeries(q) => {
713            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
714        }
715        QueryExpr::CreateMetric(q) => {
716            schema("create_metric", Some(format!("metric:{}", q.path)));
717        }
718        QueryExpr::AlterMetric(q) => {
719            schema("alter_metric", Some(format!("metric:{}", q.path)));
720        }
721        QueryExpr::CreateSlo(q) => {
722            schema("create_slo", Some(format!("slo:{}", q.path)));
723        }
724        QueryExpr::DropTimeSeries(q) => {
725            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
726        }
727        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
728        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
729        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
730        QueryExpr::CreateTree(q) => {
731            schema(
732                "create_tree",
733                Some(format!("tree:{}:{}", q.collection, q.name)),
734            );
735        }
736        QueryExpr::DropTree(q) => {
737            schema(
738                "drop_tree",
739                Some(format!("tree:{}:{}", q.collection, q.name)),
740            );
741        }
742        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
743        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
744        QueryExpr::CreateSequence(q) => {
745            schema("create_sequence", Some(format!("sequence:{}", q.name)));
746        }
747        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
748        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
749        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
750        QueryExpr::RefreshMaterializedView(q) => {
751            schema(
752                "refresh_materialized_view",
753                Some(format!("view:{}", q.name)),
754            );
755        }
756        QueryExpr::CreatePolicy(q) => {
757            specs.push(QueryControlEventSpec {
758                kind: EventKind::RlsGovernance,
759                action: "create_policy",
760                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
761                fields: vec![(
762                    "target_kind".to_string(),
763                    Sensitivity::raw(q.target_kind.as_ident()),
764                )],
765            });
766        }
767        QueryExpr::DropPolicy(q) => {
768            specs.push(QueryControlEventSpec {
769                kind: EventKind::RlsGovernance,
770                action: "drop_policy",
771                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
772                fields: Vec::new(),
773            });
774        }
775        QueryExpr::SetTenant(value) => {
776            let mut fields = Vec::new();
777            if let Some(value) = value {
778                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
779            }
780            specs.push(QueryControlEventSpec {
781                kind: EventKind::TenantGovernance,
782                action: "set_tenant",
783                resource: Some("tenant:session".to_string()),
784                fields,
785            });
786        }
787        QueryExpr::SetConfig { key, .. } => {
788            specs.push(QueryControlEventSpec {
789                kind: EventKind::ConfigWrite,
790                action: "config:write",
791                resource: Some(format!("config:{key}")),
792                fields: vec![("key".to_string(), Sensitivity::raw(key))],
793            });
794        }
795        QueryExpr::ConfigCommand(cmd) => match cmd {
796            crate::storage::query::ast::ConfigCommand::Put {
797                collection, key, ..
798            }
799            | crate::storage::query::ast::ConfigCommand::Rotate {
800                collection, key, ..
801            } => {
802                let target = format!("{collection}/{key}");
803                specs.push(QueryControlEventSpec {
804                    kind: EventKind::ConfigWrite,
805                    action: "config:write",
806                    resource: Some(format!("config:{target}")),
807                    fields: vec![
808                        ("collection".to_string(), Sensitivity::raw(collection)),
809                        ("key".to_string(), Sensitivity::raw(key)),
810                    ],
811                });
812            }
813            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
814                let target = format!("{collection}/{key}");
815                specs.push(QueryControlEventSpec {
816                    kind: EventKind::ConfigDelete,
817                    action: "config:write",
818                    resource: Some(format!("config:{target}")),
819                    fields: vec![
820                        ("collection".to_string(), Sensitivity::raw(collection)),
821                        ("key".to_string(), Sensitivity::raw(key)),
822                    ],
823                });
824            }
825            _ => {}
826        },
827        QueryExpr::AlterUser(stmt) => {
828            let disables = stmt.attributes.iter().any(|attr| {
829                matches!(
830                    attr,
831                    crate::storage::query::ast::AlterUserAttribute::Disable
832                )
833            });
834            specs.push(QueryControlEventSpec {
835                kind: if disables {
836                    EventKind::UserDisable
837                } else {
838                    EventKind::UserUpdate
839                },
840                action: "alter_user",
841                resource: Some(format!("user:{}", stmt.username)),
842                fields: Vec::new(),
843            });
844        }
845        QueryExpr::CreateUser(stmt) => {
846            specs.push(QueryControlEventSpec {
847                kind: EventKind::UserCreate,
848                action: "create_user",
849                resource: Some(format!("user:{}", stmt.username)),
850                fields: Vec::new(),
851            });
852        }
853        _ => {}
854    }
855    specs
856}
857
858pub(crate) fn control_event_outcome_for_error(
859    err: &RedDBError,
860) -> crate::runtime::control_events::Outcome {
861    match err {
862        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
863        RedDBError::Query(msg)
864            if msg.contains("permission denied")
865                || msg.contains("cannot issue")
866                || msg.contains("lacks") =>
867        {
868            crate::runtime::control_events::Outcome::Denied
869        }
870        _ => crate::runtime::control_events::Outcome::Error,
871    }
872}
873
874/// Convert the rows produced by a materialized-view body into
875/// `UnifiedEntity` table rows targeting the backing collection.
876/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
877///
878/// Graph fragments and vector hits are ignored: a materialized view
879/// is a relational result set (SELECT-shaped); slices 11+ may extend
880/// this once we have a richer view body shape. Each row materialises
881/// the union of its schema-bound columns + overflow.
882fn view_records_to_entities(
883    table: &str,
884    records: &[crate::storage::query::unified::UnifiedRecord],
885) -> Vec<crate::storage::UnifiedEntity> {
886    use std::collections::HashMap;
887    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
888    let mut out = Vec::with_capacity(records.len());
889    for record in records {
890        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
891        for (name, value) in record.iter_fields() {
892            named.insert(name.to_string(), value.clone());
893        }
894        let entity = crate::storage::UnifiedEntity::new(
895            crate::storage::EntityId::new(0),
896            crate::storage::EntityKind::TableRow {
897                table: std::sync::Arc::clone(&table_arc),
898                row_id: 0,
899            },
900            crate::storage::EntityData::Row(crate::storage::RowData {
901                columns: Vec::new(),
902                named: Some(named),
903                schema: None,
904            }),
905        );
906        out.push(entity);
907    }
908    out
909}
910
911fn system_keyed_collection_contract(
912    name: &str,
913    model: crate::catalog::CollectionModel,
914) -> crate::physical::CollectionContract {
915    let now = crate::utils::now_unix_millis() as u128;
916    crate::physical::CollectionContract {
917        name: name.to_string(),
918        declared_model: model,
919        schema_mode: crate::catalog::SchemaMode::Dynamic,
920        origin: crate::physical::ContractOrigin::Implicit,
921        version: 1,
922        created_at_unix_ms: now,
923        updated_at_unix_ms: now,
924        default_ttl_ms: None,
925        vector_dimension: None,
926        vector_metric: None,
927        context_index_fields: Vec::new(),
928        declared_columns: Vec::new(),
929        table_def: None,
930        timestamps_enabled: false,
931        context_index_enabled: false,
932        metrics_raw_retention_ms: None,
933        metrics_rollup_policies: Vec::new(),
934        metrics_tenant_identity: None,
935        metrics_namespace: None,
936        append_only: false,
937        subscriptions: Vec::new(),
938        analytics_config: Vec::new(),
939        session_key: None,
940        session_gap_ms: None,
941        retention_duration_ms: None,
942        analytical_storage: None,
943
944        ai_policy: None,
945    }
946}
947
948pub use super::execution_context::{
949    capture_current_snapshot, clear_current_auth_identity, clear_current_connection_id,
950    clear_current_snapshot, clear_current_tenant, current_auth_identity_for_audit,
951    current_connection_id, current_tenant, entity_visible_under_current_snapshot,
952    entity_visible_with_context, set_current_auth_identity, set_current_connection_id,
953    set_current_snapshot, set_current_tenant, snapshot_bundle, with_snapshot_bundle,
954    SnapshotBundle, SnapshotContext,
955};
956pub(crate) use super::execution_context::{
957    current_auth_identity, current_config_value, current_role_projected, current_scope_override,
958    current_secret_value, current_snapshot_requires_index_fallback, current_user_projected,
959    has_scope_override_active, parse_set_local_tenant, update_current_config_value,
960    update_current_secret_value, xids_visible_under_current_snapshot, ConfigSnapshotGuard,
961    CurrentSnapshotGuard, ScopeOverrideGuard, SecretStoreGuard, TxLocalTenantGuard,
962};
963
964fn table_row_index_fields(
965    entity: &crate::storage::unified::entity::UnifiedEntity,
966) -> Vec<(String, crate::storage::schema::Value)> {
967    let crate::storage::EntityData::Row(row) = &entity.data else {
968        return Vec::new();
969    };
970    if let Some(named) = &row.named {
971        return named
972            .iter()
973            .map(|(name, value)| (name.clone(), value.clone()))
974            .collect();
975    }
976    if let Some(schema) = &row.schema {
977        return schema
978            .iter()
979            .zip(row.columns.iter())
980            .map(|(name, value)| (name.clone(), value.clone()))
981            .collect();
982    }
983    Vec::new()
984}
985
986fn named_text(
987    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
988    key: &str,
989) -> Option<String> {
990    match named.get(key) {
991        Some(crate::storage::schema::Value::Text(value)) => Some(value.to_string()),
992        _ => None,
993    }
994}
995
996fn named_bool(
997    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
998    key: &str,
999) -> Option<bool> {
1000    match named.get(key) {
1001        Some(crate::storage::schema::Value::Boolean(value)) => Some(*value),
1002        _ => None,
1003    }
1004}
1005
1006fn index_method_kind_as_str(method: super::index_store::IndexMethodKind) -> &'static str {
1007    match method {
1008        super::index_store::IndexMethodKind::Hash => "hash",
1009        super::index_store::IndexMethodKind::Bitmap => "bitmap",
1010        super::index_store::IndexMethodKind::Spatial => "spatial",
1011        super::index_store::IndexMethodKind::BTree => "btree",
1012    }
1013}
1014
1015fn index_method_kind_from_str(raw: &str) -> Option<super::index_store::IndexMethodKind> {
1016    match raw {
1017        "hash" => Some(super::index_store::IndexMethodKind::Hash),
1018        "bitmap" => Some(super::index_store::IndexMethodKind::Bitmap),
1019        "spatial" | "rtree" => Some(super::index_store::IndexMethodKind::Spatial),
1020        "btree" => Some(super::index_store::IndexMethodKind::BTree),
1021        _ => None,
1022    }
1023}
1024
1025fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
1026    runtime
1027        .inner
1028        .pool
1029        .lock()
1030        .unwrap_or_else(|poisoned| poisoned.into_inner())
1031}
1032
/// Whether `name` is one of the graph-analytics table-valued functions
/// recognized in FROM position. The comparison ignores ASCII case. Both the
/// graph-collection form and the inline `nodes => / edges =>` form
/// (issue #799) accept these names.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];

    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1045
1046/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
1047/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
1048/// consumes (issue #800). The `using` option selects the algorithm inside the
1049/// output family; unsupported algorithms and the options that do not apply to
1050/// the chosen algorithm are rejected so a view never silently ignores a
1051/// declared parameter.
1052fn analytics_view_algorithm(
1053    graph: &str,
1054    view: &crate::catalog::AnalyticsViewDescriptor,
1055) -> RedDBResult<(String, Vec<(String, f64)>)> {
1056    use crate::catalog::AnalyticsOutput;
1057
1058    let mut named_args: Vec<(String, f64)> = Vec::new();
1059    let algorithm = match view.output {
1060        AnalyticsOutput::Communities => {
1061            let algo = view.algorithm.as_deref().unwrap_or("louvain");
1062            if !algo.eq_ignore_ascii_case("louvain") {
1063                return Err(RedDBError::Query(format!(
1064                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
1065                )));
1066            }
1067            if let Some(resolution) = view.resolution {
1068                named_args.push(("resolution".to_string(), resolution));
1069            }
1070            "louvain".to_string()
1071        }
1072        AnalyticsOutput::Components => {
1073            if let Some(algo) = view.algorithm.as_deref() {
1074                if !algo.eq_ignore_ascii_case("components")
1075                    && !algo.eq_ignore_ascii_case("connected_components")
1076                {
1077                    return Err(RedDBError::Query(format!(
1078                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1079                    )));
1080                }
1081            }
1082            "components".to_string()
1083        }
1084        AnalyticsOutput::Centrality => {
1085            let algo = view
1086                .algorithm
1087                .as_deref()
1088                .unwrap_or("pagerank")
1089                .to_ascii_lowercase();
1090            match algo.as_str() {
1091                "pagerank" => {
1092                    if let Some(max_iterations) = view.max_iterations {
1093                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1094                    }
1095                }
1096                "eigenvector" => {
1097                    if let Some(max_iterations) = view.max_iterations {
1098                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1099                    }
1100                    if let Some(tolerance) = view.tolerance {
1101                        named_args.push(("tolerance".to_string(), tolerance));
1102                    }
1103                }
1104                "betweenness" => {}
1105                other => {
1106                    return Err(RedDBError::Query(format!(
1107                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1108                    )));
1109                }
1110            }
1111            algo
1112        }
1113    };
1114    Ok((algorithm, named_args))
1115}
1116
1117/// Reject any named arguments for a TVF that accepts none.
1118fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1119    if let Some((key, _)) = named_args.first() {
1120        return Err(RedDBError::Query(format!(
1121            "table function '{name}' has no named argument '{key}'"
1122        )));
1123    }
1124    Ok(())
1125}
1126
1127/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1128/// other named key, or a non-finite / non-positive resolution, is rejected.
1129fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1130    let mut resolution = 1.0_f64;
1131    for (key, value) in named_args {
1132        if key.eq_ignore_ascii_case("resolution") {
1133            if !value.is_finite() || *value <= 0.0 {
1134                return Err(RedDBError::Query(format!(
1135                    "table function 'louvain' resolution must be > 0, got {value}"
1136                )));
1137            }
1138            resolution = *value;
1139        } else {
1140            return Err(RedDBError::Query(format!(
1141                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1142            )));
1143        }
1144    }
1145    Ok(resolution)
1146}
1147
1148/// Undirected degree centrality over abstract inputs: each edge contributes
1149/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1150/// in ascending node-id order, so identical input always yields identical
1151/// rows.
1152fn abstract_degree_centrality(
1153    nodes: &[String],
1154    edges: &[(
1155        String,
1156        String,
1157        crate::storage::engine::graph_algorithms::Weight,
1158    )],
1159) -> Vec<(String, usize)> {
1160    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1161    for n in nodes {
1162        degree.entry(n.clone()).or_insert(0);
1163    }
1164    for (a, b, _w) in edges {
1165        *degree.entry(a.clone()).or_insert(0) += 1;
1166        *degree.entry(b.clone()).or_insert(0) += 1;
1167    }
1168    degree.into_iter().collect()
1169}
1170
1171/// Ordered column names for a materialized subquery result: the projection
1172/// columns when present, else the first record's field order.
1173fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1174    if !result.columns.is_empty() {
1175        return result.columns.clone();
1176    }
1177    result
1178        .records
1179        .first()
1180        .map(|record| {
1181            record
1182                .column_names()
1183                .iter()
1184                .map(|column| column.to_string())
1185                .collect()
1186        })
1187        .unwrap_or_default()
1188}
1189
1190/// Canonical node-id string for a cell value, so the node universe (from the
1191/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1192/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1193fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1194    use crate::storage::schema::Value;
1195    match value {
1196        Value::Null => None,
1197        Value::Text(s) => Some(s.to_string()),
1198        Value::Integer(n) => Some(n.to_string()),
1199        Value::UnsignedInteger(n) => Some(n.to_string()),
1200        Value::NodeRef(s) => Some(s.clone()),
1201        other => Some(other.to_string()),
1202    }
1203}
1204
1205/// Numeric edge weight from a cell value (the optional third `edges` column).
1206fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1207    use crate::storage::schema::Value;
1208    match value {
1209        Value::Float(f) => Some(*f as f32),
1210        Value::Integer(n) => Some(*n as f32),
1211        Value::UnsignedInteger(n) => Some(*n as f32),
1212        _ => None,
1213    }
1214}
1215
1216/// Build the node universe from a materialized `nodes` subquery result: the
1217/// first projected column of each row is the node id (issue #799). Zero rows
1218/// is a valid empty node set; a row set with no columns is a shape error.
1219fn inline_node_ids(
1220    name: &str,
1221    result: &crate::storage::query::unified::UnifiedResult,
1222) -> RedDBResult<Vec<String>> {
1223    if result.records.is_empty() {
1224        return Ok(Vec::new());
1225    }
1226    let columns = ordered_result_columns(result);
1227    let Some(first_col) = columns.first() else {
1228        return Err(RedDBError::Query(format!(
1229            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1230        )));
1231    };
1232    let mut ids = Vec::with_capacity(result.records.len());
1233    for record in &result.records {
1234        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1235            ids.push(id);
1236        }
1237    }
1238    Ok(ids)
1239}
1240
1241/// Build the edge list from a materialized `edges` subquery result: the first
1242/// two projected columns are `(source, target)` and an optional third column
1243/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1244/// shape error (issue #799).
1245fn inline_edges(
1246    name: &str,
1247    result: &crate::storage::query::unified::UnifiedResult,
1248) -> RedDBResult<
1249    Vec<(
1250        String,
1251        String,
1252        crate::storage::engine::graph_algorithms::Weight,
1253    )>,
1254> {
1255    if result.records.is_empty() {
1256        return Ok(Vec::new());
1257    }
1258    let columns = ordered_result_columns(result);
1259    if columns.len() < 2 {
1260        return Err(RedDBError::Query(format!(
1261            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1262            columns.len()
1263        )));
1264    }
1265    let src_col = &columns[0];
1266    let dst_col = &columns[1];
1267    let weight_col = columns.get(2);
1268    let mut edges = Vec::with_capacity(result.records.len());
1269    for record in &result.records {
1270        let (Some(src), Some(dst)) = (
1271            record.get(src_col).and_then(value_to_node_id),
1272            record.get(dst_col).and_then(value_to_node_id),
1273        ) else {
1274            // A null/absent endpoint is not a valid edge; skip it.
1275            continue;
1276        };
1277        let weight = match weight_col {
1278            Some(col) => match record.get(col) {
1279                None | Some(crate::storage::schema::Value::Null) => 1.0,
1280                Some(value) => value_to_weight(value).ok_or_else(|| {
1281                    RedDBError::Query(format!(
1282                        "table function '{name}' inline form: `edges` weight column must be numeric"
1283                    ))
1284                })?,
1285            },
1286            None => 1.0,
1287        };
1288        edges.push((src, dst, weight));
1289    }
1290    Ok(edges)
1291}
1292
1293fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1294    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1295        return;
1296    }
1297    scopes.insert(name.to_string());
1298}
1299
1300fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1301    match query.source.as_ref() {
1302        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1303            cache_scope_insert(scopes, name)
1304        }
1305        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1306            collect_query_expr_result_cache_scopes(scopes, subquery);
1307        }
1308        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1309        // read-only. The result is now cached (issue #802) and scoped to the
1310        // graph collection named in the first argument, so any mutation on
1311        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1312        // entry via `invalidate_result_cache_for_table`. Non-graph or
1313        // zero-arg functions contribute no scope.
1314        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1315            if is_graph_tvf_name(name) {
1316                if let Some(graph) = args.first() {
1317                    cache_scope_insert(scopes, graph);
1318                }
1319            }
1320        }
1321        // The inline-graph form reads ordinary tables/docs through its
1322        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1323        // those source collections — mutating any of them invalidates the
1324        // cached result (issue #799).
1325        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1326            nodes, edges, ..
1327        }) => {
1328            collect_query_expr_result_cache_scopes(scopes, nodes);
1329            collect_query_expr_result_cache_scopes(scopes, edges);
1330        }
1331        None => cache_scope_insert(scopes, &query.table),
1332    }
1333}
1334
1335fn collect_vector_source_scopes(
1336    scopes: &mut HashSet<String>,
1337    source: &crate::storage::query::ast::VectorSource,
1338) {
1339    match source {
1340        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1341            cache_scope_insert(scopes, collection);
1342        }
1343        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1344            collect_query_expr_result_cache_scopes(scopes, subquery);
1345        }
1346        crate::storage::query::ast::VectorSource::Literal(_)
1347        | crate::storage::query::ast::VectorSource::Text(_) => {}
1348    }
1349}
1350
1351fn collect_path_selector_scopes(
1352    scopes: &mut HashSet<String>,
1353    selector: &crate::storage::query::ast::NodeSelector,
1354) {
1355    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1356        cache_scope_insert(scopes, table);
1357    }
1358}
1359
1360fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1361    match expr {
1362        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1363        QueryExpr::Join(query) => {
1364            collect_query_expr_result_cache_scopes(scopes, &query.left);
1365            collect_query_expr_result_cache_scopes(scopes, &query.right);
1366        }
1367        QueryExpr::Path(query) => {
1368            collect_path_selector_scopes(scopes, &query.from);
1369            collect_path_selector_scopes(scopes, &query.to);
1370        }
1371        QueryExpr::Vector(query) => {
1372            cache_scope_insert(scopes, &query.collection);
1373            collect_vector_source_scopes(scopes, &query.query_vector);
1374        }
1375        QueryExpr::Hybrid(query) => {
1376            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1377            cache_scope_insert(scopes, &query.vector.collection);
1378            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1379        }
1380        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1381        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1382        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1383        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1384        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1385        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1386        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1387        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1388        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1389        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1390        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1391        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1392        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1393        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1394        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1395        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1396        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1397        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
1398        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
1399        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
1400        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1401        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1402        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1403        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1404        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1405        QueryExpr::QueueCommand(query) => match query {
1406            QueueCommand::Push { queue, .. }
1407            | QueueCommand::Pop { queue, .. }
1408            | QueueCommand::Peek { queue, .. }
1409            | QueueCommand::Len { queue }
1410            | QueueCommand::Purge { queue }
1411            | QueueCommand::GroupCreate { queue, .. }
1412            | QueueCommand::GroupRead { queue, .. }
1413            | QueueCommand::Pending { queue, .. }
1414            | QueueCommand::Claim { queue, .. }
1415            | QueueCommand::Ack { queue, .. }
1416            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1417            QueueCommand::Move {
1418                source,
1419                destination,
1420                ..
1421            } => {
1422                cache_scope_insert(scopes, source);
1423                cache_scope_insert(scopes, destination);
1424            }
1425        },
1426        QueryExpr::EventsBackfill(query) => {
1427            cache_scope_insert(scopes, &query.collection);
1428            cache_scope_insert(scopes, &query.target_queue);
1429        }
1430        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1431        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1432        QueryExpr::TreeCommand(query) => match query {
1433            TreeCommand::Insert { collection, .. }
1434            | TreeCommand::Move { collection, .. }
1435            | TreeCommand::Delete { collection, .. }
1436            | TreeCommand::Validate { collection, .. }
1437            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1438        },
1439        QueryExpr::SearchCommand(query) => match query {
1440            SearchCommand::Similar { collection, .. }
1441            | SearchCommand::Hybrid { collection, .. }
1442            | SearchCommand::SpatialRadius { collection, .. }
1443            | SearchCommand::SpatialBbox { collection, .. }
1444            | SearchCommand::SpatialNearest { collection, .. } => {
1445                cache_scope_insert(scopes, collection);
1446            }
1447            SearchCommand::Text { collection, .. }
1448            | SearchCommand::Multimodal { collection, .. }
1449            | SearchCommand::Index { collection, .. }
1450            | SearchCommand::Context { collection, .. } => {
1451                if let Some(collection) = collection.as_deref() {
1452                    cache_scope_insert(scopes, collection);
1453                }
1454            }
1455        },
1456        QueryExpr::Ask(query) => {
1457            if let Some(collection) = query.collection.as_deref() {
1458                cache_scope_insert(scopes, collection);
1459            }
1460        }
1461        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1462        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1463            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1464            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1465                if let Some(t) = target {
1466                    cache_scope_insert(scopes, t);
1467                }
1468            }
1469        },
1470        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1471        QueryExpr::CreateView(cmd) => {
1472            cache_scope_insert(scopes, &cmd.name);
1473            // Invalidating the view should also invalidate its dependencies.
1474            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1475        }
1476        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1477        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1478        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1479        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1480        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1481        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1482        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1483        QueryExpr::Graph(_)
1484        | QueryExpr::GraphCommand(_)
1485        | QueryExpr::ProbabilisticCommand(_)
1486        | QueryExpr::SetConfig { .. }
1487        | QueryExpr::ShowConfig { .. }
1488        | QueryExpr::SetSecret { .. }
1489        | QueryExpr::DeleteSecret { .. }
1490        | QueryExpr::ShowSecrets { .. }
1491        | QueryExpr::SetTenant(_)
1492        | QueryExpr::ShowTenant
1493        | QueryExpr::TransactionControl(_)
1494        | QueryExpr::CreateSchema(_)
1495        | QueryExpr::DropSchema(_)
1496        | QueryExpr::CreateSequence(_)
1497        | QueryExpr::DropSequence(_)
1498        | QueryExpr::Grant(_)
1499        | QueryExpr::Revoke(_)
1500        | QueryExpr::AlterUser(_)
1501        | QueryExpr::CreateUser(_)
1502        | QueryExpr::CreateIamPolicy { .. }
1503        | QueryExpr::DropIamPolicy { .. }
1504        | QueryExpr::AttachPolicy { .. }
1505        | QueryExpr::DetachPolicy { .. }
1506        | QueryExpr::ShowPolicies { .. }
1507        | QueryExpr::ShowEffectivePermissions { .. }
1508        | QueryExpr::RankOf(_)
1509        | QueryExpr::ApproxRankOf(_)
1510        | QueryExpr::RankRange(_)
1511        | QueryExpr::SimulatePolicy { .. }
1512        | QueryExpr::LintPolicy { .. }
1513        | QueryExpr::MigratePolicyMode { .. }
1514        | QueryExpr::CreateMigration(_)
1515        | QueryExpr::ApplyMigration(_)
1516        | QueryExpr::RollbackMigration(_)
1517        | QueryExpr::ExplainMigration(_)
1518        | QueryExpr::EventsBackfillStatus { .. } => {}
1519        QueryExpr::KvCommand(cmd) => {
1520            use crate::storage::query::ast::KvCommand;
1521            match cmd {
1522                KvCommand::Put { collection, .. }
1523                | KvCommand::InvalidateTags { collection, .. }
1524                | KvCommand::Get { collection, .. }
1525                | KvCommand::Unseal { collection, .. }
1526                | KvCommand::Rotate { collection, .. }
1527                | KvCommand::History { collection, .. }
1528                | KvCommand::List { collection, .. }
1529                | KvCommand::Purge { collection, .. }
1530                | KvCommand::Watch { collection, .. }
1531                | KvCommand::Delete { collection, .. }
1532                | KvCommand::Incr { collection, .. }
1533                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1534            }
1535        }
1536        QueryExpr::ConfigCommand(cmd) => {
1537            use crate::storage::query::ast::ConfigCommand;
1538            match cmd {
1539                ConfigCommand::Put { collection, .. }
1540                | ConfigCommand::Get { collection, .. }
1541                | ConfigCommand::Resolve { collection, .. }
1542                | ConfigCommand::Rotate { collection, .. }
1543                | ConfigCommand::Delete { collection, .. }
1544                | ConfigCommand::History { collection, .. }
1545                | ConfigCommand::List { collection, .. }
1546                | ConfigCommand::Watch { collection, .. }
1547                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1548                    cache_scope_insert(scopes, collection)
1549                }
1550            }
1551        }
1552    }
1553}
1554
1555/// Combine matching RLS policies for a table + action into a single
1556/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1557///
1558/// Returns `None` when RLS is disabled or no policy admits the caller's
1559/// role — callers use that to short-circuit the mutation (for DELETE /
1560/// UPDATE we simply skip the operation, which PG expresses as "no rows
1561/// match the policy + predicate combination").
1562pub(crate) fn rls_policy_filter(
1563    runtime: &RedDBRuntime,
1564    table: &str,
1565    action: crate::storage::query::ast::PolicyAction,
1566) -> Option<crate::storage::query::ast::Filter> {
1567    rls_policy_filter_for_kind(
1568        runtime,
1569        table,
1570        action,
1571        crate::storage::query::ast::PolicyTargetKind::Table,
1572    )
1573}
1574
1575/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1576/// Graph / vector / queue / timeseries scans pass the concrete kind;
1577/// policies targeting other kinds are ignored. Legacy Table-scoped
1578/// policies still apply cross-kind — callers register auto-tenancy
1579/// policies as Table today.
1580pub(crate) fn rls_policy_filter_for_kind(
1581    runtime: &RedDBRuntime,
1582    table: &str,
1583    action: crate::storage::query::ast::PolicyAction,
1584    kind: crate::storage::query::ast::PolicyTargetKind,
1585) -> Option<crate::storage::query::ast::Filter> {
1586    use crate::storage::query::ast::Filter;
1587
1588    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1589        return None;
1590    }
1591    let role = current_auth_identity().map(|(_, role)| role);
1592    let role_str = role.map(|r| r.as_str().to_string());
1593    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1594    if policies.is_empty() {
1595        return None;
1596    }
1597    policies
1598        .into_iter()
1599        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1600}
1601
1602/// Returns true when the table has RLS enforcement enabled. Convenience
1603/// shortcut so DML paths can gate the AND-combine work without reaching
1604/// into `runtime.inner.rls_enabled_tables` directly.
1605pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1606    runtime.inner.rls_enabled_tables.read().contains(table)
1607}
1608
1609/// Per-entity gate used by the graph materialiser for `GraphNode`
1610/// entities. RLS is checked against the source collection with
1611/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1612/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1613/// (for back-compat with auto-tenancy declarations). Cached per
1614/// collection so big graphs only resolve the policy chain once.
1615fn node_passes_rls(
1616    runtime: &RedDBRuntime,
1617    collection: &str,
1618    role: Option<&str>,
1619    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1620    entity: &crate::storage::unified::entity::UnifiedEntity,
1621) -> bool {
1622    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1623
1624    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1625        return true;
1626    }
1627    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1628        let policies = runtime.matching_rls_policies_for_kind(
1629            collection,
1630            role,
1631            PolicyAction::Select,
1632            PolicyTargetKind::Nodes,
1633        );
1634        if policies.is_empty() {
1635            None
1636        } else {
1637            policies
1638                .into_iter()
1639                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1640        }
1641    });
1642    let Some(filter) = filter else {
1643        return false;
1644    };
1645    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1646        Some(&runtime.inner.db),
1647        entity,
1648        filter,
1649        collection,
1650        collection,
1651    )
1652}
1653
1654/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1655/// `kind = Edges`.
1656fn edge_passes_rls(
1657    runtime: &RedDBRuntime,
1658    collection: &str,
1659    role: Option<&str>,
1660    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1661    entity: &crate::storage::unified::entity::UnifiedEntity,
1662) -> bool {
1663    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1664
1665    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1666        return true;
1667    }
1668    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1669        let policies = runtime.matching_rls_policies_for_kind(
1670            collection,
1671            role,
1672            PolicyAction::Select,
1673            PolicyTargetKind::Edges,
1674        );
1675        if policies.is_empty() {
1676            None
1677        } else {
1678            policies
1679                .into_iter()
1680                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1681        }
1682    });
1683    let Some(filter) = filter else {
1684        return false;
1685    };
1686    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1687        Some(&runtime.inner.db),
1688        entity,
1689        filter,
1690        collection,
1691        collection,
1692    )
1693}
1694
1695/// RLS policy injection (Phase 2.5.2 PG parity).
1696///
1697/// Fetch every matching policy for the current thread-local role and
1698/// fold them into the query's filter. Semantics mirror PostgreSQL:
1699///
1700/// * Multiple policies on the same table combine with **OR** — a row is
1701///   visible if *any* policy admits it.
1702/// * The combined policy predicate is **AND**-ed into the caller's
1703///   existing `WHERE` clause so explicit predicates continue to trim
1704///   the policy-allowed set.
1705/// * No matching policies + RLS enabled = zero rows (PG's
1706///   restrictive-default). Callers get `None` and return an empty
1707///   `UnifiedResult` without ever dispatching the scan.
1708///
1709/// This runs only when `RuntimeInner::rls_enabled_tables` already
1710/// contains the table name — callers gate the hot path upfront to
1711/// avoid the lock acquisition on tables without RLS.
1712///
1713/// Returns `None` when no policy admits the current role; returns
1714/// `Some(mutated_table)` with policy filters folded in otherwise.
1715fn inject_rls_filters(
1716    runtime: &RedDBRuntime,
1717    frame: &dyn super::statement_frame::ReadFrame,
1718    mut table: crate::storage::query::ast::TableQuery,
1719) -> Option<crate::storage::query::ast::TableQuery> {
1720    use crate::storage::query::ast::{Filter, PolicyAction};
1721
1722    // `None` role falls through to policies with no `TO role` clause.
1723    let role = frame.identity().map(|(_, role)| role);
1724    let role_str = role.map(|r| r.as_str().to_string());
1725    let policies =
1726        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1727
1728    if policies.is_empty() {
1729        // RLS enabled + no policy match = deny everything. Signal the
1730        // caller to short-circuit with an empty result set.
1731        return None;
1732    }
1733
1734    // Combine policy predicates with OR (PG's permissive default).
1735    let combined = policies
1736        .into_iter()
1737        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1738        .expect("policies non-empty");
1739
1740    // AND into the caller's existing predicate. The predicate may live
1741    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1742    // nulls `filter` whenever `where_expr` is present (the case for a
1743    // view body rewritten into `SELECT … WHERE …`). Folding only into
1744    // `filter` here would silently drop that `where_expr` predicate at
1745    // eval time because `effective_table_filter` prefers `filter` —
1746    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1747    // tenant policy but lose the view's own WHERE (#635).
1748    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1749    let had_where_expr = table.where_expr.is_some();
1750    let existing = table
1751        .filter
1752        .take()
1753        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1754    let new_filter = match existing {
1755        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1756        None => combined,
1757    };
1758    // Keep `where_expr` in lock-step with the merged `filter` so
1759    // whichever the executor consults sees the full predicate.
1760    if had_where_expr {
1761        table.where_expr = Some(filter_to_expr(&new_filter));
1762    }
1763    table.filter = Some(new_filter);
1764    Some(table)
1765}
1766
1767/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1768/// predicate into the join's outer filter. Walking the merged record
1769/// at the join layer (rather than mutating the per-side scan filter)
1770/// keeps the planner's strategy choice and per-side index selection
1771/// undisturbed — the policy predicate uses the qualified `t.col` form
1772/// that resolves cleanly against the merged record's keys.
1773///
1774/// Returns `None` when any leaf has RLS enabled and no policy admits
1775/// the caller — the join short-circuits to an empty result.
1776fn inject_rls_into_join(
1777    runtime: &RedDBRuntime,
1778    frame: &dyn super::statement_frame::ReadFrame,
1779    mut join: crate::storage::query::ast::JoinQuery,
1780) -> Option<crate::storage::query::ast::JoinQuery> {
1781    use crate::storage::query::ast::Filter;
1782
1783    let mut policy_filters: Vec<Filter> = Vec::new();
1784    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1785        return None;
1786    }
1787    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1788        return None;
1789    }
1790
1791    if policy_filters.is_empty() {
1792        return Some(join);
1793    }
1794
1795    let combined = policy_filters
1796        .into_iter()
1797        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1798        .expect("policy_filters non-empty");
1799
1800    join.filter = Some(match join.filter.take() {
1801        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1802        None => combined,
1803    });
1804
1805    Some(join)
1806}
1807
1808/// For each `Table` leaf reachable through nested joins, append the
1809/// RLS-policy filter (combined with OR across that side's matching
1810/// policies) into `out`. Returns `false` when a side has RLS enabled
1811/// but no policy admits the caller — the join must short-circuit.
1812fn collect_join_side_policy(
1813    runtime: &RedDBRuntime,
1814    frame: &dyn super::statement_frame::ReadFrame,
1815    expr: &crate::storage::query::ast::QueryExpr,
1816    out: &mut Vec<crate::storage::query::ast::Filter>,
1817) -> bool {
1818    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1819    match expr {
1820        QueryExpr::Table(t) => {
1821            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1822                return true;
1823            }
1824            let role = frame.identity().map(|(_, role)| role);
1825            let role_str = role.map(|r| r.as_str().to_string());
1826            let policies =
1827                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1828            if policies.is_empty() {
1829                return false;
1830            }
1831            let combined = policies
1832                .into_iter()
1833                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1834                .expect("policies non-empty");
1835            out.push(combined);
1836            true
1837        }
1838        QueryExpr::Join(inner) => {
1839            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1840                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1841        }
1842        _ => true,
1843    }
1844}
1845
1846/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1847///
1848/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1849/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1850/// materialises all rows. Projections are best-effort — when the query
1851/// lists explicit columns we keep only those; a `SELECT *` keeps every
1852/// wrapper-emitted field verbatim.
1853///
1854/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1855/// the runtime will pass the compiled filter down instead of post-filtering.
1856fn apply_foreign_table_filters(
1857    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1858    query: &crate::storage::query::ast::TableQuery,
1859) -> crate::storage::query::unified::UnifiedResult {
1860    use crate::storage::query::sql_lowering::{
1861        effective_table_filter, effective_table_projections,
1862    };
1863    use crate::storage::query::unified::UnifiedResult;
1864
1865    let filter = effective_table_filter(query);
1866    let projections = effective_table_projections(query);
1867
1868    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1869    // match native-collection queries (same operators, same NULL handling).
1870    let mut filtered: Vec<_> = records
1871        .into_iter()
1872        .filter(|record| match &filter {
1873            Some(f) => {
1874                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1875            }
1876            None => true,
1877        })
1878        .collect();
1879
1880    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1881    if let Some(offset) = query.offset {
1882        let offset = offset as usize;
1883        if offset >= filtered.len() {
1884            filtered.clear();
1885        } else {
1886            filtered.drain(0..offset);
1887        }
1888    }
1889    if let Some(limit) = query.limit {
1890        filtered.truncate(limit as usize);
1891    }
1892
1893    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1894    // the wrapper's column set; an explicit list trims to those names.
1895    let columns: Vec<String> = if projections.is_empty() {
1896        filtered
1897            .first()
1898            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1899            .unwrap_or_default()
1900    } else {
1901        projections
1902            .iter()
1903            .map(super::join_filter::projection_name)
1904            .collect()
1905    };
1906
1907    let mut result = UnifiedResult::empty();
1908    result.columns = columns;
1909    result.records = filtered;
1910    result
1911}
1912
1913/// Collect every concrete table reference inside a `QueryExpr`.
1914///
1915/// Used by view bookkeeping (dependency tracking for materialised
1916/// invalidation) and any other rewriter that needs to know the base
1917/// tables a query pulls from. Does not descend into projections/filters;
1918/// only the `FROM` side.
1919pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1920    let mut scopes: HashSet<String> = HashSet::new();
1921    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1922    scopes.into_iter().collect()
1923}
1924
1925fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1926    let mut scopes = HashSet::new();
1927    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1928    scopes
1929}
1930
// NOTE: this paragraph describes `query_has_volatile_builtin` (defined
// further down), not `strip_explain_prefix`. It is kept as a plain
// comment so rustdoc does not attach it to the wrong function.
//
// Heuristic: does the raw SQL reference a built-in whose output
// varies by connection, clock, or randomness? Such queries must
// skip the 30s result cache — see the call site for rationale.
//
// ASCII case-insensitive substring match. False positives (the
// token appears in a quoted string) only skip caching, which is
// the conservative direction.
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive)
/// followed by a token other than `ALTER` or `ASK`, return the inner
/// statement with leading whitespace removed; otherwise `None`.
///
/// `EXPLAIN ALTER FOR CREATE TABLE ...` is a separate schema-diff
/// command handled inside the normal SQL parser, so it is left alone
/// here. `EXPLAIN ASK` is likewise deferred to the normal parser: it is
/// an executable read path that runs retrieval and provider selection,
/// then short-circuits before the LLM call.
///
/// A bare `EXPLAIN` with nothing after it also yields `None`.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    // No whitespace at all means there is no inner statement to return.
    let (keyword, tail) = sql.trim_start().split_once(char::is_whitespace)?;
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = tail.trim_start();
    // Peek the first token of the inner statement; `None` when empty.
    let next_token = inner.split_whitespace().next()?;

    let deferred_to_parser =
        next_token.eq_ignore_ascii_case("ALTER") || next_token.eq_ignore_ascii_case("ASK");
    if deferred_to_parser {
        None
    } else {
        Some(inner)
    }
}
1969
1970/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1971/// CTE-aware parse in `execute_query` without paying for a full
1972/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1973/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1974pub(super) fn has_with_prefix(sql: &str) -> bool {
1975    let trimmed = sql.trim_start();
1976    let head_end = trimmed
1977        .find(|c: char| c.is_whitespace() || c == '(')
1978        .unwrap_or(trimmed.len());
1979    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1980}
1981
1982/// If the query is a plain SELECT whose top-level `TableQuery`
1983/// carries an `AS OF` clause, return a typed spec that the runtime
1984/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1985/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1986/// back to the connection's regular MVCC snapshot. A cheap textual
1987/// prefilter skips the parse entirely when the source doesn't
1988/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1989fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1990    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1991}
1992
1993/// Same as `peek_top_level_as_of` but also returns the table name
1994/// targeted by the AS OF clause (when the FROM clause names a
1995/// concrete table). `None` for the table slot means scalar SELECT
1996/// or a subquery source — callers treat those as "no enforcement".
1997pub(super) fn peek_top_level_as_of_with_table(
1998    sql: &str,
1999) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
2000    if !sql
2001        .as_bytes()
2002        .windows(5)
2003        .any(|w| w.eq_ignore_ascii_case(b"as of"))
2004    {
2005        return None;
2006    }
2007    let parsed = crate::storage::query::parser::parse(sql).ok()?;
2008    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
2009        return None;
2010    };
2011    let clause = table.as_of?;
2012    let table_name = if table.table.is_empty() || table.table == "any" {
2013        None
2014    } else {
2015        Some(table.table.clone())
2016    };
2017    let spec = match clause {
2018        crate::storage::query::ast::AsOfClause::Commit(h) => {
2019            crate::application::vcs::AsOfSpec::Commit(h)
2020        }
2021        crate::storage::query::ast::AsOfClause::Branch(b) => {
2022            crate::application::vcs::AsOfSpec::Branch(b)
2023        }
2024        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
2025        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
2026            crate::application::vcs::AsOfSpec::TimestampMs(ts)
2027        }
2028        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
2029            crate::application::vcs::AsOfSpec::Snapshot(x)
2030        }
2031    };
2032    Some((spec, table_name))
2033}
2034
2035pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
2036    // Lowercase the bytes up to the first null/newline into a small
2037    // stack buffer for cheap contains() checks. Most SQL fits in the
2038    // buffer; longer queries fall back to owned lowercase.
2039    const VOLATILE_TOKENS: &[&str] = &[
2040        "pg_advisory_lock",
2041        "pg_try_advisory_lock",
2042        "pg_advisory_unlock",
2043        "random()",
2044        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
2045        // omitted for now — they ARE volatile but today's tests rely
2046        // on caching them. Revisit once a tighter volatility story
2047        // lands.
2048    ];
2049    let lowered = sql.to_ascii_lowercase();
2050    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
2051}
2052
2053pub(super) fn query_is_ask_statement(sql: &str) -> bool {
2054    let trimmed = sql.trim_start();
2055    let head_end = trimmed
2056        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
2057        .unwrap_or(trimmed.len());
2058    trimmed[..head_end].eq_ignore_ascii_case("ASK")
2059}
2060
2061/// Pick the `(global_mode, collection_mode)` pair for an expression,
2062/// or `None` for variants that opt out of intent-locking entirely
2063/// (admin statements like `SHOW CONFIG`, transaction control, tenant
2064/// toggles).
2065///
2066/// Phase-1 contract:
2067/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
2068/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
2069/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
2070pub(super) fn intent_lock_modes_for(
2071    expr: &QueryExpr,
2072) -> Option<(
2073    crate::storage::transaction::lock::LockMode,
2074    crate::storage::transaction::lock::LockMode,
2075)> {
2076    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
2077
2078    match expr {
2079        // Reads — IS / IS.
2080        QueryExpr::Table(_)
2081        | QueryExpr::Join(_)
2082        | QueryExpr::Vector(_)
2083        | QueryExpr::Hybrid(_)
2084        | QueryExpr::Graph(_)
2085        | QueryExpr::Path(_)
2086        | QueryExpr::Ask(_)
2087        | QueryExpr::SearchCommand(_)
2088        | QueryExpr::GraphCommand(_)
2089        | QueryExpr::RankOf(_)
2090        | QueryExpr::ApproxRankOf(_)
2091        | QueryExpr::RankRange(_)
2092        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2093
2094        // Writes — IX / IX. Non-tabular mutations (vector insert,
2095        // graph node insert, queue push, timeseries point insert)
2096        // don't carry their own dispatch arm here; they ride through
2097        // the Insert variant or a command variant covered by the
2098        // read-side arm above. P1.T4 expands only the TableQuery-ish
2099        // writes; non-tabular kinds inherit when their DML variants
2100        // land in later phases.
2101        QueryExpr::Insert(_)
2102        | QueryExpr::Update(_)
2103        | QueryExpr::Delete(_)
2104        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2105            Some((IntentExclusive, IntentExclusive))
2106        }
2107        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2108
2109        // DDL — IX / X. A DDL against collection `c` blocks all
2110        // other writers + readers on `c` but leaves other collections
2111        // running (because Global stays IX, not X).
2112        QueryExpr::CreateTable(_)
2113        | QueryExpr::CreateCollection(_)
2114        | QueryExpr::CreateVector(_)
2115        | QueryExpr::DropTable(_)
2116        | QueryExpr::DropGraph(_)
2117        | QueryExpr::DropVector(_)
2118        | QueryExpr::DropDocument(_)
2119        | QueryExpr::DropKv(_)
2120        | QueryExpr::DropCollection(_)
2121        | QueryExpr::Truncate(_)
2122        | QueryExpr::AlterTable(_)
2123        | QueryExpr::CreateIndex(_)
2124        | QueryExpr::DropIndex(_)
2125        | QueryExpr::CreateTimeSeries(_)
2126        | QueryExpr::CreateMetric(_)
2127        | QueryExpr::AlterMetric(_)
2128        | QueryExpr::CreateSlo(_)
2129        | QueryExpr::DropTimeSeries(_)
2130        | QueryExpr::CreateQueue(_)
2131        | QueryExpr::AlterQueue(_)
2132        | QueryExpr::DropQueue(_)
2133        | QueryExpr::CreateTree(_)
2134        | QueryExpr::DropTree(_)
2135        | QueryExpr::CreatePolicy(_)
2136        | QueryExpr::DropPolicy(_)
2137        | QueryExpr::CreateView(_)
2138        | QueryExpr::DropView(_)
2139        | QueryExpr::RefreshMaterializedView(_)
2140        | QueryExpr::CreateSchema(_)
2141        | QueryExpr::DropSchema(_)
2142        | QueryExpr::CreateSequence(_)
2143        | QueryExpr::DropSequence(_)
2144        | QueryExpr::CreateServer(_)
2145        | QueryExpr::DropServer(_)
2146        | QueryExpr::CreateForeignTable(_)
2147        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2148
2149        // Admin / control — skip intent locks. `SET TENANT`,
2150        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2151        // `VACUUM`, etc. don't touch collection data the same way
2152        // and the existing transaction layer already serialises the
2153        // pieces that matter.
2154        _ => None,
2155    }
2156}
2157
2158/// Best-effort collection inventory for an expression. Used to pick
2159/// `Collection(...)` resources for the intent-lock guard. Overshoots
2160/// are fine (take an extra IS, benign); undershoots leak writes past
2161/// DDL X locks, so err on the side of listing more names.
2162pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2163    let mut out = Vec::new();
2164    walk_collections(expr, &mut out);
2165    out.sort();
2166    out.dedup();
2167    out
2168}
2169
2170fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2171    match expr {
2172        QueryExpr::Table(t) => out.push(t.table.clone()),
2173        QueryExpr::Join(j) => {
2174            walk_collections(&j.left, out);
2175            walk_collections(&j.right, out);
2176        }
2177        QueryExpr::Insert(i) => out.push(i.table.clone()),
2178        QueryExpr::Update(u) => out.push(u.table.clone()),
2179        QueryExpr::Delete(d) => out.push(d.table.clone()),
2180        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2181
2182        // DDL — include the target collection so DDL takes
2183        // `(Collection, X)` and blocks concurrent readers / writers
2184        // on the same collection. Other collections stay live
2185        // because Global is still IX.
2186        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2187        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2188        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2189        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2190        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2191        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2192        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2193        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2194        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2195        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2196        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2197        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2198        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2199        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2200        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
2201        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
2202        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
2203        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2204        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2205        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2206        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2207        QueryExpr::QueueCommand(QueueCommand::Move {
2208            source,
2209            destination,
2210            ..
2211        }) => {
2212            out.push(source.clone());
2213            out.push(destination.clone());
2214        }
2215        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2216        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2217        QueryExpr::DropView(q) => out.push(q.name.clone()),
2218        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2219
2220        // Vector / Hybrid / Graph / Path / commands reference
2221        // collections through fields whose shape varies; without a
2222        // uniform accessor we fall back to the global lock only —
2223        // benign because every runtime path still holds the global
2224        // mode.
2225        _ => {}
2226    }
2227}
2228
2229impl RedDBRuntime {
2230    pub fn in_memory() -> RedDBResult<Self> {
2231        Self::with_options(RedDBOptions::in_memory())
2232    }
2233
2234    pub fn flush(&self) -> RedDBResult<()> {
2235        self.inner
2236            .db
2237            .flush()
2238            .map_err(|err| RedDBError::Internal(err.to_string()))
2239    }
2240
2241    /// Handle to the intent-lock manager for tests + introspection.
2242    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2243    /// rather than touching the manager directly.
2244    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2245        self.inner.lock_manager.clone()
2246    }
2247
2248    /// Process-local governance registry for managed policy/config guardrails.
2249    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2250        self.inner.config_registry.clone()
2251    }
2252
2253    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2254        self.inner.query_audit.clone()
2255    }
2256
2257    pub fn control_events_require_persistence(&self) -> bool {
2258        self.inner.control_event_config.require_persistence()
2259    }
2260
2261    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
2262        self.inner.control_event_config
2263    }
2264
2265    pub fn control_event_ledger(
2266        &self,
2267    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2268        self.inner.control_event_ledger.read().clone()
2269    }
2270
2271    #[doc(hidden)]
2272    pub fn replace_control_event_ledger_for_tests(
2273        &self,
2274        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2275    ) {
2276        *self.inner.control_event_ledger.write() = ledger;
2277    }
2278
2279    #[inline(never)]
2280    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2281        Self::with_pool(options, ConnectionPoolConfig::default())
2282    }
2283
2284    pub fn with_pool(
2285        options: RedDBOptions,
2286        pool_config: ConnectionPoolConfig,
2287    ) -> RedDBResult<Self> {
2288        // PLAN.md Phase 9.1 — capture wall-clock before storage
2289        // open so the cold-start phase markers can be backfilled
2290        // once Lifecycle is constructed below. Storage open
2291        // encapsulates auto-restore + WAL replay; we treat the
2292        // whole window as one combined "restore" + "wal_replay"
2293        // phase split at the same boundary because the storage
2294        // layer doesn't yet emit a finer signal.
2295        let boot_open_start_ms = std::time::SystemTime::now()
2296            .duration_since(std::time::UNIX_EPOCH)
2297            .map(|d| d.as_millis() as u64)
2298            .unwrap_or(0);
2299        let embedded_single_file = options.storage_profile.deploy_profile
2300            == crate::storage::DeployProfile::Embedded
2301            && options.storage_profile.packaging == crate::storage::StoragePackaging::SingleFile;
2302        let db = Arc::new(
2303            RedDB::open_with_options(&options)
2304                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2305        );
2306        let result_blob_cache_config = if embedded_single_file {
2307            crate::storage::cache::BlobCacheConfig::default()
2308        } else {
2309            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2310                reddb_file::layout::result_cache_l2_path(
2311                    &options.resolved_path(reddb_file::default_database_path()),
2312                ),
2313            )
2314        };
2315        let result_blob_cache =
2316            crate::storage::cache::BlobCache::open_with_l2(result_blob_cache_config).map_err(
2317                |err| RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}")),
2318            )?;
2319        let storage_ready_ms = std::time::SystemTime::now()
2320            .duration_since(std::time::UNIX_EPOCH)
2321            .map(|d| d.as_millis() as u64)
2322            .unwrap_or(0);
2323
2324        let runtime = Self {
2325            inner: Arc::new(RuntimeInner {
2326                db: db.clone(),
2327                layout: PhysicalLayout::from_options(&options),
2328                embedded_single_file,
2329                indices: IndexCatalog::register_default_vector_graph(
2330                    options.has_capability(crate::api::Capability::Table),
2331                    options.has_capability(crate::api::Capability::Graph),
2332                ),
2333                pool_config,
2334                pool: Mutex::new(PoolState::default()),
2335                started_at_unix_ms: SystemTime::now()
2336                    .duration_since(UNIX_EPOCH)
2337                    .unwrap_or_default()
2338                    .as_millis(),
2339                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2340                index_store: super::index_store::IndexStore::new(),
2341                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2342                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2343                query_cache: parking_lot::RwLock::new(
2344                    crate::storage::query::planner::cache::PlanCache::new(1000),
2345                ),
2346                result_cache: parking_lot::RwLock::new((
2347                    HashMap::new(),
2348                    std::collections::VecDeque::new(),
2349                )),
2350                result_blob_cache,
2351                result_blob_entries: parking_lot::RwLock::new((
2352                    HashMap::new(),
2353                    std::collections::VecDeque::new(),
2354                )),
2355                ask_answer_cache_entries: parking_lot::RwLock::new((
2356                    HashSet::new(),
2357                    std::collections::VecDeque::new(),
2358                )),
2359                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2360                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
2361                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
2362                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
2363                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2364                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2365                rmw_locks: RmwLockTable::new(),
2366                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2367                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2368                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2369                ec_worker: crate::ec::worker::EcWorker::new(),
2370                auth_store: parking_lot::RwLock::new(None),
2371                oauth_validator: parking_lot::RwLock::new(None),
2372                browser_token_authority: parking_lot::RwLock::new(None),
2373                views: parking_lot::RwLock::new(HashMap::new()),
2374                materialized_views: parking_lot::RwLock::new(
2375                    crate::storage::cache::result::MaterializedViewCache::new(),
2376                ),
2377                retention_sweeper: parking_lot::RwLock::new(
2378                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2379                ),
2380                snapshot_manager: Arc::new(
2381                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2382                ),
2383                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2384                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2385                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2386                lock_manager: Arc::new({
2387                    // Sourced from the matrix: Tier B key
2388                    // `concurrency.locking.deadlock_timeout_ms`
2389                    // (default 5000). Env var wins at boot so
2390                    // operators can tune without touching red_config.
2391                    let env = crate::runtime::config_overlay::collect_env_overrides();
2392                    let timeout_ms = env
2393                        .get("concurrency.locking.deadlock_timeout_ms")
2394                        .and_then(|raw| raw.parse::<u64>().ok())
2395                        .unwrap_or_else(|| {
2396                            match crate::runtime::config_matrix::default_for(
2397                                "concurrency.locking.deadlock_timeout_ms",
2398                            ) {
2399                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2400                                _ => 5000,
2401                            }
2402                        });
2403                    let cfg = crate::storage::transaction::lock::LockConfig {
2404                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2405                        ..Default::default()
2406                    };
2407                    crate::storage::transaction::lock::LockManager::new(cfg)
2408                }),
2409                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2410                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2411                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2412                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2413                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2414                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2415                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2416                queue_wait_registry: std::sync::Arc::new(
2417                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
2418                ),
2419                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
2420                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2421                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2422                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2423                    &options,
2424                )),
2425                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2426                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2427                audit_log: {
2428                    // Default audit-log path for the in-memory case
2429                    // sits in the system temp dir; persistent runs
2430                    // place it next to the resolved data file.
2431                    //
2432                    // gh-471 iter 2: route through the resolved
2433                    // `LogDestination`. Performance/Max tiers emit a
2434                    // file-backed log destination under the file-owned
2435                    // support-directory logs tier;
2436                    // lower tiers / ephemeral runs report `Stderr`
2437                    // and we keep the legacy file-next-to-data sink.
2438                    let data_path = options.data_path.clone().unwrap_or_else(|| {
2439                        if embedded_single_file {
2440                            std::env::temp_dir()
2441                                .join("reddb-embedded-runtime")
2442                                .join(format!("audit-{}", std::process::id()))
2443                        } else {
2444                            std::env::temp_dir().join("reddb")
2445                        }
2446                    });
2447                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2448                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2449                        &audit_dest,
2450                        &data_path,
2451                    ))
2452                },
2453                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2454                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2455                )),
2456                control_event_config: options.control_events,
2457                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2458                    db.store(),
2459                    options.query_audit.clone(),
2460                )),
2461                lease_lifecycle: std::sync::OnceLock::new(),
2462                replica_apply_metrics: std::sync::Arc::new(
2463                    crate::replication::logical::ReplicaApplyMetrics::default(),
2464                ),
2465                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2466                schema_vocabulary: parking_lot::RwLock::new(
2467                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2468                ),
2469                slow_query_logger: {
2470                    // Issue #205 — slow-query sink lives in the same
2471                    // directory the audit log uses, so backup/restore
2472                    // ships them together. Threshold + sample-pct
2473                    // default conservatively (1 s, 100% sampling) so
2474                    // emitted lines are rare and complete. Operators
2475                    // tune via env / config matrix in a follow-up.
2476                    //
2477                    // gh-471 iter 2: same routing as the audit log —
2478                    // `LogDestination::File(...)` for Performance/Max
2479                    // lands under the file-owned support-directory logs tier;
2480                    // lower tiers fall back to `red-slow.log` in the
2481                    // data directory.
2482                    let fallback_dir = options
2483                        .data_path
2484                        .as_ref()
2485                        .and_then(|p| p.parent().map(std::path::PathBuf::from))
2486                        .unwrap_or_else(|| {
2487                            if embedded_single_file {
2488                                std::env::temp_dir()
2489                                    .join("reddb-embedded-runtime")
2490                                    .join(format!("slow-{}", std::process::id()))
2491                            } else {
2492                                std::env::temp_dir().join("reddb")
2493                            }
2494                        });
2495                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2496                        .ok()
2497                        .and_then(|s| s.parse::<u64>().ok())
2498                        .unwrap_or(1000);
2499                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2500                        .ok()
2501                        .and_then(|s| s.parse::<u8>().ok())
2502                        .unwrap_or(100);
2503                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2504                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2505                        &slow_dest,
2506                        &fallback_dir,
2507                        threshold_ms,
2508                        sample_pct,
2509                    )
2510                },
2511                kv_stats: crate::runtime::KvStatsCounters::default(),
2512                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2513                metrics_tenant_activity_stats:
2514                    crate::runtime::MetricsTenantActivityCounters::default(),
2515                queue_telemetry: Arc::new(
2516                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2517                ),
2518                queue_presence: Arc::new(
2519                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
2520                ),
2521                vector_introspection: Arc::new(
2522                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
2523                ),
2524                kv_tag_index: crate::runtime::KvTagIndex::default(),
2525                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2526                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2527                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
2528                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
2529            }),
2530        };
2531
2532        // Issue #205 — install the process-wide OperatorEvent sink so
2533        // emit sites buried in storage / replication / signal handlers
2534        // can record without threading an `&AuditLogger` through every
2535        // call stack. First registration wins; subsequent in-memory
2536        // runtimes (test harnesses) fall through to tracing+eprintln.
2537        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2538            &runtime.inner.audit_log,
2539        ));
2540
2541        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2542        // from the wall-clock captured before storage open. The
2543        // entire `RedDB::open_with_options` call covers both
2544        // auto-restore (when configured) and WAL replay. We
2545        // record both phases against the same boundary today;
2546        // a follow-up will split them once the storage layer
2547        // surfaces a finer-grained event.
2548        runtime
2549            .inner
2550            .lifecycle
2551            .set_restore_started_at_ms(boot_open_start_ms);
2552        runtime
2553            .inner
2554            .lifecycle
2555            .set_restore_ready_at_ms(storage_ready_ms);
2556        runtime
2557            .inner
2558            .lifecycle
2559            .set_wal_replay_started_at_ms(boot_open_start_ms);
2560        runtime
2561            .inner
2562            .lifecycle
2563            .set_wal_replay_ready_at_ms(storage_ready_ms);
2564
2565        let restored_cdc_lsn = runtime
2566            .inner
2567            .db
2568            .replication
2569            .as_ref()
2570            .map(|repl| {
2571                repl.logical_wal_spool
2572                    .as_ref()
2573                    .map(|spool| spool.current_lsn())
2574                    .unwrap_or(0)
2575            })
2576            .unwrap_or(0)
2577            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2578        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2579        runtime.rehydrate_snapshot_xid_floor();
2580        runtime
2581            .bootstrap_system_keyed_collections()
2582            .map_err(|err| RedDBError::Internal(format!("bootstrap system collections: {err}")))?;
2583        runtime.rehydrate_declared_column_schemas();
2584        runtime.rehydrate_runtime_index_registry()?;
2585        runtime
2586            .load_probabilistic_state()
2587            .map_err(|err| RedDBError::Internal(format!("load probabilistic state: {err}")))?;
2588
2589        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2590        // tables declared via `TENANT BY (col)` survive restart. Each
2591        // entry re-registers the auto-policy and flips RLS on again.
2592        runtime.rehydrate_tenant_tables();
2593        // Issue #593 slice 9a — replay persisted materialized-view
2594        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2595        // restart. Runs after the system-keyed collections bootstrap
2596        // and before the API opens.
2597        runtime.rehydrate_materialized_view_descriptors();
2598        if let Some(repl) = &runtime.inner.db.replication {
2599            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2600        }
2601
2602        // Save system info to red_config on boot
2603        {
2604            let sys = SystemInfo::collect();
2605            runtime.inner.db.store().set_config_tree(
2606                "red.system",
2607                &crate::serde_json::json!({
2608                    "pid": sys.pid,
2609                    "cpu_cores": sys.cpu_cores,
2610                    "total_memory_bytes": sys.total_memory_bytes,
2611                    "available_memory_bytes": sys.available_memory_bytes,
2612                    "os": sys.os,
2613                    "arch": sys.arch,
2614                    "hostname": sys.hostname,
2615                    "started_at": SystemTime::now()
2616                        .duration_since(UNIX_EPOCH)
2617                        .unwrap_or_default()
2618                        .as_millis() as u64
2619                }),
2620            );
2621
2622            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2623            let store = runtime.inner.db.store();
2624            if store
2625                .get_collection("red_config")
2626                .map(|m| m.query_all(|_| true).len())
2627                .unwrap_or(0)
2628                <= 10
2629            {
2630                store.set_config_tree("red.ai", &crate::json!({
2631                    "default": crate::json!({
2632                        "provider": "openai",
2633                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2634                    }),
2635                    "max_embedding_inputs": 256,
2636                    "max_prompt_batch": 256,
2637                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2638                }));
2639                store.set_config_tree(
2640                    "red.server",
2641                    &crate::json!({
2642                        "max_scan_limit": 1000,
2643                        "max_body_size": 1048576,
2644                        "read_timeout_ms": 5000,
2645                        "write_timeout_ms": 5000
2646                    }),
2647                );
2648                store.set_config_tree(
2649                    "red.storage",
2650                    &crate::json!({
2651                        "page_size": 4096,
2652                        "page_cache_capacity": 100000,
2653                        "auto_checkpoint_pages": 1000,
2654                        "snapshot_retention": 16,
2655                        "verify_checksums": true,
2656                        "segment": crate::json!({
2657                            "max_entities": 100000,
2658                            "max_bytes": 268435456_u64,
2659                            "compression_level": 6
2660                        }),
2661                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2662                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2663                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2664                    }),
2665                );
2666                store.set_config_tree(
2667                    "red.search",
2668                    &crate::json!({
2669                        "rag": crate::json!({
2670                            "max_chunks_per_source": 10,
2671                            "max_total_chunks": 25,
2672                            "similarity_threshold": 0.8,
2673                            "graph_depth": 2,
2674                            "min_relevance": 0.3
2675                        }),
2676                        "fusion": crate::json!({
2677                            "vector_weight": 0.5,
2678                            "graph_weight": 0.3,
2679                            "table_weight": 0.2,
2680                            "dedup_threshold": 0.85
2681                        })
2682                    }),
2683                );
2684                store.set_config_tree(
2685                    "red.auth",
2686                    &crate::json!({
2687                        "enabled": false,
2688                        "session_ttl_secs": 3600,
2689                        "require_auth": false
2690                    }),
2691                );
2692                store.set_config_tree(
2693                    "red.query",
2694                    &crate::json!({
2695                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2696                        "max_recursion_depth": 1000
2697                    }),
2698                );
2699                store.set_config_tree(
2700                    "red.indexes",
2701                    &crate::json!({
2702                        "auto_select": true,
2703                        "bloom_filter": crate::json!({
2704                            "enabled": true,
2705                            "false_positive_rate": 0.01,
2706                            "prune_on_scan": true
2707                        }),
2708                        "hash": crate::json!({ "enabled": true }),
2709                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2710                        "spatial": crate::json!({ "enabled": true })
2711                    }),
2712                );
2713                store.set_config_tree(
2714                    "red.memtable",
2715                    &crate::json!({
2716                        "enabled": true,
2717                        "max_bytes": 67108864_u64,
2718                        "flush_threshold": 0.75
2719                    }),
2720                );
2721                store.set_config_tree(
2722                    "red.probabilistic",
2723                    &crate::json!({
2724                        "hll_registers": 16384,
2725                        "sketch_default_width": 1000,
2726                        "sketch_default_depth": 5,
2727                        "filter_default_capacity": 100000
2728                    }),
2729                );
2730                store.set_config_tree(
2731                    "red.timeseries",
2732                    &crate::json!({
2733                        "default_chunk_size": 1024,
2734                        "compression": crate::json!({
2735                            "timestamps": "delta_of_delta",
2736                            "values": "gorilla_xor"
2737                        }),
2738                        "default_retention_days": 0
2739                    }),
2740                );
2741                store.set_config_tree(
2742                    "red.queue",
2743                    &crate::json!({
2744                        "default_max_size": 0,
2745                        "default_max_attempts": 3,
2746                        "visibility_timeout_ms": 30000,
2747                        "consumer_idle_timeout_ms": 60000
2748                    }),
2749                );
2750                store.set_config_tree(
2751                    "red.backup",
2752                    &crate::json!({
2753                        "enabled": false,
2754                        "interval_secs": 3600,
2755                        "retention_count": 24,
2756                        "upload": false,
2757                        "backend": "local"
2758                    }),
2759                );
2760                store.set_config_tree(
2761                    "red.wal",
2762                    &crate::json!({
2763                        "archive": crate::json!({
2764                            "enabled": false,
2765                            "retention_hours": 168,
2766                            "prefix": reddb_file::backup_wal_prefix("")
2767                        })
2768                    }),
2769                );
2770                store.set_config_tree(
2771                    "red.cdc",
2772                    &crate::json!({
2773                        "enabled": true,
2774                        "buffer_size": 100000
2775                    }),
2776                );
2777                store.set_config_tree(
2778                    "red.config.secret",
2779                    &crate::json!({
2780                        "auto_encrypt": true,
2781                        "auto_decrypt": true
2782                    }),
2783                );
2784            }
2785
2786            // Perf-parity config matrix: heal the Tier A (critical)
2787            // keys unconditionally on every boot. Idempotent — only
2788            // writes the default when the key is missing. Keeps
2789            // `SHOW CONFIG` showing every guarantee the operator has
2790            // (durability.mode, concurrency.locking.enabled, …) even
2791            // on long-running datadirs that predate the matrix.
2792            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2793            seed_storage_deploy_config(store.as_ref(), options.storage_profile);
2794
2795            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2796            // `storage.btree.lehman_yao` value from the matrix (env
2797            // > file > red_config > default) and publish it to the
2798            // storage layer's atomic so the B-tree read / split
2799            // paths can branch without re-reading the config on
2800            // every hot-path call.
2801            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2802            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2803            if lehman_yao {
2804                tracing::info!(
2805                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2806                );
2807            }
2808
2809            // Config file overlay — mounted `/etc/reddb/config.json`
2810            // (override path via REDDB_CONFIG_FILE). Writes keys with
2811            // write-if-absent semantics so a later user `SET CONFIG`
2812            // always wins. Missing file = silent no-op.
2813            let overlay_path = crate::runtime::config_overlay::config_file_path();
2814            let _ =
2815                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2816        }
2817
2818        // VCS ("Git for Data") — create the `red_*` metadata
2819        // collections on first boot. Idempotent: `get_or_create_collection`
2820        // is a no-op if the collection already exists.
2821        {
2822            let store = runtime.inner.db.store();
2823            for name in crate::application::vcs_collections::ALL {
2824                let _ = store.get_or_create_collection(*name);
2825            }
2826            // Seed VCS config namespace with sensible defaults on first
2827            // boot, matching the pattern used by red.ai / red.storage.
2828            store.set_config_tree(
2829                crate::application::vcs_collections::CONFIG_NAMESPACE,
2830                &crate::json!({
2831                    "default_branch": "main",
2832                    "author": crate::json!({
2833                        "name": "reddb",
2834                        "email": "reddb@localhost"
2835                    }),
2836                    "protected_branches": crate::json!(["main"]),
2837                    "closure": crate::json!({
2838                        "enabled": true,
2839                        "lazy": true
2840                    }),
2841                    "merge": crate::json!({
2842                        "default_strategy": "auto",
2843                        "fast_forward": true
2844                    })
2845                }),
2846            );
2847        }
2848
2849        // Migrations — create the `red_migrations` / `red_migration_deps`
2850        // system collections on first boot. Idempotent.
2851        {
2852            let store = runtime.inner.db.store();
2853            for name in crate::application::migration_collections::ALL {
2854                let _ = store.get_or_create_collection(*name);
2855            }
2856        }
2857
2858        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
2859        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
2860        // exist. Idempotent and survives restarts via the WAL-backed contract.
2861        let _ = crate::application::topology_collections::ensure(&runtime);
2862
2863        // Start background maintenance thread (context index refresh +
2864        // session purge). Held by a WEAK reference to `RuntimeInner`
2865        // so dropping the last `RedDBRuntime` handle actually releases
2866        // the underlying Arc<Pager> (and its file lock). Polling at
2867        // 200ms means shutdown latency is bounded; the real 60-second
2868        // work cadence is tracked independently via a `last_work`
2869        // timestamp.
2870        //
2871        // The previous version captured `rt = runtime.clone()` by
2872        // strong reference and ran an unterminated `loop`, which held
2873        // Arc<RuntimeInner> forever — reopening a persistent database
2874        // in the same process failed with "Database is locked" because
2875        // the pager could never drop. See the regression test
2876        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2877        {
2878            let weak = Arc::downgrade(&runtime.inner);
2879            std::thread::Builder::new()
2880                .name("reddb-maintenance".into())
2881                .spawn(move || {
2882                    let tick = std::time::Duration::from_millis(200);
2883                    let work_interval = std::time::Duration::from_secs(60);
2884                    let mut last_work = std::time::Instant::now();
2885                    loop {
2886                        std::thread::sleep(tick);
2887                        let Some(inner) = weak.upgrade() else {
2888                            // All strong references dropped — the
2889                            // runtime is gone, exit cleanly.
2890                            break;
2891                        };
2892                        if last_work.elapsed() >= work_interval {
2893                            let _stats = inner.db.store().context_index().stats();
2894                            last_work = std::time::Instant::now();
2895                        }
2896                    }
2897                })
2898                .ok();
2899        }
2900
2901        // Start backup scheduler if enabled via red_config
2902        {
2903            let store = runtime.inner.db.store();
2904            let mut backup_enabled = false;
2905            let mut backup_interval = 3600u64;
2906
2907            if let Some(manager) = store.get_collection("red_config") {
2908                manager.for_each_entity(|entity| {
2909                    if let Some(row) = entity.data.as_row() {
2910                        let key = row.get_field("key").and_then(|v| match v {
2911                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2912                            _ => None,
2913                        });
2914                        let val = row.get_field("value");
2915                        if key == Some("red.config.backup.enabled") {
2916                            backup_enabled = match val {
2917                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2918                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2919                                _ => false,
2920                            };
2921                        } else if key == Some("red.config.backup.interval_secs") {
2922                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2923                                backup_interval = *n as u64;
2924                            }
2925                        }
2926                    }
2927                    true
2928                });
2929            }
2930
2931            if backup_enabled {
2932                runtime.inner.backup_scheduler.set_interval(backup_interval);
2933                let rt = runtime.clone();
2934                runtime
2935                    .inner
2936                    .backup_scheduler
2937                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2938            }
2939        }
2940
2941        // Load EC registry from red_config and start worker
2942        {
2943            runtime
2944                .inner
2945                .ec_registry
2946                .load_from_config_store(runtime.inner.db.store().as_ref());
2947            if !runtime.inner.ec_registry.async_configs().is_empty() {
2948                runtime.inner.ec_worker.start(
2949                    Arc::clone(&runtime.inner.ec_registry),
2950                    Arc::clone(&runtime.inner.db.store()),
2951                );
2952            }
2953        }
2954
2955        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2956            runtime.inner.db.options().replication.role.clone()
2957        {
2958            let rt = runtime.clone();
2959            std::thread::Builder::new()
2960                .name("reddb-replica".into())
2961                .spawn(move || rt.run_replica_loop(primary_addr))
2962                .ok();
2963        }
2964
2965        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2966        // boot stage above has completed (WAL replay, restore-from-
2967        // remote, replica-loop spawn). Health probes flip from 503 to
2968        // 200 here; shutdown begins from this state.
2969        runtime.inner.lifecycle.mark_ready();
2970
2971        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2972        // Low-priority background ticker that drains the cache's
2973        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2974        // so the thread exits cleanly when the runtime drops (≤50ms
2975        // latency between drop and exit). Materialized views without
2976        // a `REFRESH EVERY` clause stay on the manual-refresh path
2977        // and are skipped by `claim_due_at`, so the loop is a no-op
2978        // when no scheduled views exist.
2979        {
2980            let weak_inner = Arc::downgrade(&runtime.inner);
2981            std::thread::Builder::new()
2982                .name("reddb-mv-scheduler".into())
2983                .spawn(move || loop {
2984                    std::thread::sleep(std::time::Duration::from_millis(50));
2985                    let Some(inner) = weak_inner.upgrade() else {
2986                        break;
2987                    };
2988                    let rt = RedDBRuntime { inner };
2989                    rt.refresh_due_materialized_views();
2990                })
2991                .ok();
2992        }
2993
2994        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2995        // Low-priority ticker that physically reclaims rows whose
2996        // timestamp has fallen beyond the retention window. Holds a
2997        // `Weak<RuntimeInner>` so the thread exits within one tick of
2998        // the runtime drop (graceful shutdown leaves storage consistent
2999        // because each tick goes through the standard DELETE path —
3000        // there is no half-finished mutation state to clean up). The
3001        // tick interval is intentionally longer than the MV scheduler
3002        // (500ms) because retention is order-of-seconds at minimum.
3003        if !runtime.write_gate().is_read_only() {
3004            let weak_inner = Arc::downgrade(&runtime.inner);
3005            std::thread::Builder::new()
3006                .name("reddb-retention-sweeper".into())
3007                .spawn(move || loop {
3008                    std::thread::sleep(std::time::Duration::from_millis(500));
3009                    let Some(inner) = weak_inner.upgrade() else {
3010                        break;
3011                    };
3012                    let rt = RedDBRuntime { inner };
3013                    rt.sweep_retention_tick(
3014                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
3015                    );
3016                })
3017                .ok();
3018        }
3019
3020        Ok(runtime)
3021    }
3022
3023    fn rehydrate_snapshot_xid_floor(&self) {
3024        let store = self.inner.db.store();
3025        for collection in store.list_collections() {
3026            let Some(manager) = store.get_collection(&collection) else {
3027                continue;
3028            };
3029            for entity in manager.query_all(|_| true) {
3030                self.inner
3031                    .snapshot_manager
3032                    .observe_committed_xid(entity.xmin);
3033                self.inner
3034                    .snapshot_manager
3035                    .observe_committed_xid(entity.xmax);
3036            }
3037        }
3038    }
3039
3040    /// Provision an empty Table-shaped collection that backs a
3041    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
3042    /// `SELECT FROM v` reads this collection directly; the rewriter is
3043    /// configured to skip materialized views so the body is no longer
3044    /// substituted. REFRESH still writes to the cache slot — wiring it
3045    /// into this backing collection is the job of slice 9c.
3046    ///
3047    /// Idempotent: re-running for the same name leaves the existing
3048    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
3049    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
3050    /// cheap — the body change does not invalidate already-buffered
3051    /// rows. Until 9c lands the backing is always empty anyway.
3052    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3053        let store = self.inner.db.store();
3054        let mut changed = false;
3055        if store.get_collection(name).is_none() {
3056            store.get_or_create_collection(name);
3057            changed = true;
3058        }
3059        if self.inner.db.collection_contract(name).is_none() {
3060            self.inner
3061                .db
3062                .save_collection_contract(system_keyed_collection_contract(
3063                    name,
3064                    crate::catalog::CollectionModel::Table,
3065                ))
3066                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3067            changed = true;
3068        }
3069        if changed {
3070            self.inner
3071                .db
3072                .persist_metadata()
3073                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3074        }
3075        Ok(())
3076    }
3077
3078    /// Inverse of [`ensure_materialized_view_backing`] — drops the
3079    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3080    /// the collection was never created (e.g. a `DROP MATERIALIZED
3081    /// VIEW IF EXISTS v` against an unknown name).
3082    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3083        let store = self.inner.db.store();
3084        if store.get_collection(name).is_none() {
3085            return Ok(());
3086        }
3087        store
3088            .drop_collection(name)
3089            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3090        // The contract may have been dropped already (DROP TABLE path)
3091        // — ignore "not found" errors by checking presence first.
3092        if self.inner.db.collection_contract(name).is_some() {
3093            self.inner
3094                .db
3095                .remove_collection_contract(name)
3096                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3097        }
3098        self.invalidate_result_cache();
3099        self.inner
3100            .db
3101            .persist_metadata()
3102            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3103        Ok(())
3104    }
3105
3106    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3107        let mut changed = false;
3108        for (name, model) in [
3109            ("red.config", crate::catalog::CollectionModel::Config),
3110            ("red.vault", crate::catalog::CollectionModel::Vault),
3111            // Issue #593 — materialized-view catalog. One row per
3112            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3113            // the API opens.
3114            (
3115                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3116                crate::catalog::CollectionModel::Config,
3117            ),
3118        ] {
3119            if self.inner.db.store().get_collection(name).is_none() {
3120                self.inner.db.store().get_or_create_collection(name);
3121                changed = true;
3122            }
3123            if self.inner.db.collection_contract(name).is_none() {
3124                self.inner
3125                    .db
3126                    .save_collection_contract(system_keyed_collection_contract(name, model))
3127                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3128                changed = true;
3129            }
3130        }
3131        if changed {
3132            self.inner
3133                .db
3134                .persist_metadata()
3135                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3136        }
3137        Ok(())
3138    }
3139
3140    pub fn db(&self) -> Arc<RedDB> {
3141        Arc::clone(&self.inner.db)
3142    }
3143
3144    /// Direct access to the runtime's secondary-index store.
3145    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3146    /// wire bulk) that need to push new rows through the per-index
3147    /// maintenance hook after `store.bulk_insert` returns.
3148    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3149        &self.inner.index_store
3150    }
3151
3152    /// Apply a DDL event to the schema-vocabulary reverse index
3153    /// (issue #120). Called by DDL execution paths after the catalog
3154    /// mutation has succeeded so the index never holds entries for
3155    /// half-applied DDL.
3156    pub(crate) fn schema_vocabulary_apply(
3157        &self,
3158        event: crate::runtime::schema_vocabulary::DdlEvent,
3159    ) {
3160        self.inner.schema_vocabulary.write().on_ddl(event);
3161    }
3162
3163    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3164    /// an owned `Vec<VocabHit>` because the underlying read lock
3165    /// cannot be borrowed across the call boundary; the slice from
3166    /// `SchemaVocabulary::lookup` is cloned per hit.
3167    pub fn schema_vocabulary_lookup(
3168        &self,
3169        token: &str,
3170    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3171        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3172    }
3173
3174    /// Inject an AuthStore into the runtime. Called by server boot
3175    /// after the vault has been bootstrapped, so that `Value::Secret`
3176    /// auto-encrypt/decrypt can reach the vault AES key.
3177    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3178        *self.inner.auth_store.write() = Some(store);
3179    }
3180
3181    /// Snapshot the current AuthStore (if any). Used by the wire listener
3182    /// to validate bearer tokens issued via HTTP `/auth/login`.
3183    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3184        self.inner.auth_store.read().clone()
3185    }
3186
3187    /// Read a vault KV secret from the configured AuthStore, if present.
3188    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3189        self.inner
3190            .auth_store
3191            .read()
3192            .as_ref()
3193            .and_then(|store| store.vault_kv_get(key))
3194    }
3195
3196    /// Write a vault KV secret and fail if the encrypted vault write is
3197    /// unavailable or cannot be made durable.
3198    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3199        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3200            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3201        })?;
3202        store
3203            .vault_kv_try_set(key, value)
3204            .map_err(|err| RedDBError::Query(err.to_string()))
3205    }
3206
3207    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3208    /// wire transports try OAuth JWT validation before falling back to
3209    /// the local AuthStore lookup. Pass `None` to disable.
3210    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3211        *self.inner.oauth_validator.write() = validator;
3212    }
3213
3214    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3215    /// Hot path: called per HTTP request when an Authorization header
3216    /// is present, so we hand back a cheap Arc clone.
3217    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3218        self.inner.oauth_validator.read().clone()
3219    }
3220
3221    /// Inject the browser-token authority (issue #936). When set, the
3222    /// RedWire WS handshake accepts the short-lived access JWT it mints
3223    /// (alongside, and tried before, the federated OAuth validator), and
3224    /// the `/auth/browser/*` HTTP endpoints can issue/rotate the pair.
3225    /// `None` leaves the browser credential flow inert.
3226    pub fn set_browser_token_authority(
3227        &self,
3228        authority: Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>>,
3229    ) {
3230        *self.inner.browser_token_authority.write() = authority;
3231    }
3232
3233    /// Snapshot the browser-token authority, if wired. Read on the WS
3234    /// handshake path and by the `/auth/browser/*` handlers; a cheap Arc
3235    /// clone keeps the lock hold short.
3236    pub fn browser_token_authority(
3237        &self,
3238    ) -> Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>> {
3239        self.inner.browser_token_authority.read().clone()
3240    }
3241
3242    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3243    /// store is wired and a key has been generated. Used by the
3244    /// `Value::Secret` encrypt/decrypt pipeline.
3245    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3246        let guard = self.inner.auth_store.read();
3247        guard.as_ref().and_then(|s| s.vault_secret_key())
3248    }
3249
3250    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3251    /// when the key is missing or not coercible. If the same key has
3252    /// been written multiple times (SET CONFIG appends new rows), the
3253    /// most recent entity wins. Env-var overrides
3254    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3255    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3256        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3257            if let Some(crate::storage::schema::Value::Boolean(b)) =
3258                crate::runtime::config_overlay::coerce_env_value(key, raw)
3259            {
3260                return b;
3261            }
3262        }
3263        let store = self.inner.db.store();
3264        let Some(manager) = store.get_collection("red_config") else {
3265            return default;
3266        };
3267        let mut result = default;
3268        let mut latest_id: u64 = 0;
3269        manager.for_each_entity(|entity| {
3270            if let Some(row) = entity.data.as_row() {
3271                let entry_key = row.get_field("key").and_then(|v| match v {
3272                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3273                    _ => None,
3274                });
3275                if entry_key == Some(key) {
3276                    let id = entity.id.raw();
3277                    if id >= latest_id {
3278                        latest_id = id;
3279                        result = match row.get_field("value") {
3280                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3281                            Some(crate::storage::schema::Value::Text(s)) => {
3282                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3283                            }
3284                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3285                            _ => default,
3286                        };
3287                    }
3288                }
3289            }
3290            true
3291        });
3292        result
3293    }
3294
3295    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3296        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3297            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3298                crate::runtime::config_overlay::coerce_env_value(key, raw)
3299            {
3300                return n;
3301            }
3302        }
3303        let store = self.inner.db.store();
3304        let Some(manager) = store.get_collection("red_config") else {
3305            return default;
3306        };
3307        let mut result = default;
3308        let mut latest_id: u64 = 0;
3309        manager.for_each_entity(|entity| {
3310            if let Some(row) = entity.data.as_row() {
3311                let entry_key = row.get_field("key").and_then(|v| match v {
3312                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3313                    _ => None,
3314                });
3315                if entry_key == Some(key) {
3316                    let id = entity.id.raw();
3317                    if id >= latest_id {
3318                        latest_id = id;
3319                        result = match row.get_field("value") {
3320                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3321                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3322                            Some(crate::storage::schema::Value::Text(s)) => {
3323                                s.parse::<u64>().unwrap_or(default)
3324                            }
3325                            _ => default,
3326                        };
3327                    }
3328                }
3329            }
3330            true
3331        });
3332        result
3333    }
3334
3335    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3336        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3337            if let Ok(n) = raw.parse::<f64>() {
3338                return n;
3339            }
3340        }
3341        let store = self.inner.db.store();
3342        let Some(manager) = store.get_collection("red_config") else {
3343            return default;
3344        };
3345        let mut result = default;
3346        let mut latest_id: u64 = 0;
3347        manager.for_each_entity(|entity| {
3348            if let Some(row) = entity.data.as_row() {
3349                let entry_key = row.get_field("key").and_then(|v| match v {
3350                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3351                    _ => None,
3352                });
3353                if entry_key == Some(key) {
3354                    let id = entity.id.raw();
3355                    if id >= latest_id {
3356                        latest_id = id;
3357                        result = match row.get_field("value") {
3358                            Some(crate::storage::schema::Value::Float(n)) => *n,
3359                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3360                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3361                            Some(crate::storage::schema::Value::Text(s)) => {
3362                                s.parse::<f64>().unwrap_or(default)
3363                            }
3364                            _ => default,
3365                        };
3366                    }
3367                }
3368            }
3369            true
3370        });
3371        result
3372    }
3373
3374    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3375        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3376            return raw.clone();
3377        }
3378        let store = self.inner.db.store();
3379        let Some(manager) = store.get_collection("red_config") else {
3380            return default.to_string();
3381        };
3382        let mut result = default.to_string();
3383        let mut latest_id: u64 = 0;
3384        manager.for_each_entity(|entity| {
3385            if let Some(row) = entity.data.as_row() {
3386                let entry_key = row.get_field("key").and_then(|v| match v {
3387                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3388                    _ => None,
3389                });
3390                if entry_key == Some(key) {
3391                    let id = entity.id.raw();
3392                    if id >= latest_id {
3393                        latest_id = id;
3394                        if let Some(crate::storage::schema::Value::Text(value)) =
3395                            row.get_field("value")
3396                        {
3397                            result = value.to_string();
3398                        }
3399                    }
3400                }
3401            }
3402            true
3403        });
3404        result
3405    }
3406
3407    /// Whether `SECRET('...')` literals should be encrypted with the
3408    /// vault AES key on INSERT. Default `true`.
3409    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3410        self.config_bool("red.config.secret.auto_encrypt", true)
3411    }
3412
3413    /// Whether `Value::Secret` columns should be decrypted back to
3414    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3415    /// Turning this off keeps secrets masked as `***` even while the
3416    /// vault is open — useful for audit trails or read-only exports.
3417    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3418        self.config_bool("red.config.secret.auto_decrypt", true)
3419    }
3420
3421    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3422    /// for the decrypted plaintext when the runtime has the vault
3423    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3424    /// key is missing, the vault is sealed, or auto_decrypt is off,
3425    /// secrets are left as `Value::Secret` which every formatter
3426    /// (Display, JSON) already masks as `***`.
3427    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3428        if !self.secret_auto_decrypt() {
3429            return;
3430        }
3431        let Some(key) = self.secret_aes_key() else {
3432            return;
3433        };
3434        for record in result.result.records.iter_mut() {
3435            for value in record.values_mut() {
3436                if let Value::Secret(ref bytes) = value {
3437                    if let Some(plain) =
3438                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3439                    {
3440                        if let Ok(text) = String::from_utf8(plain) {
3441                            *value = Value::text(text);
3442                        }
3443                    }
3444                }
3445            }
3446        }
3447    }
3448
3449    /// Emit a CDC change event and replicate to WAL buffer.
3450    /// Create a `MutationEngine` bound to this runtime.
3451    ///
3452    /// The engine is cheap to construct (no allocation) and should be
3453    /// dropped after `apply` returns. Use this from application-layer
3454    /// `create_row` / `create_rows_batch` instead of calling
3455    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3456    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3457        crate::runtime::mutation::MutationEngine::new(self)
3458    }
3459
3460    /// Public-mutation gate snapshot (PLAN.md W1).
3461    ///
3462    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3463    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3464    /// maintenance, serverless lifecycle) call `check_write` before
3465    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3466    /// instance running as a replica or with `options.read_only =
3467    /// true`. The replica internal logical-WAL apply path reaches into
3468    /// the store directly and never calls this method, so legitimate
3469    /// replica catch-up still works.
3470    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3471        self.inner.write_gate.check(kind)
3472    }
3473
3474    /// Read-only handle to the gate, useful for transports that want
3475    /// to surface the policy in health/status output without taking on
3476    /// a dependency on the concrete enum.
3477    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3478        &self.inner.write_gate
3479    }
3480
3481    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3482    /// admin/shutdown, and signal handlers consult this single
3483    /// state machine.
3484    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3485        &self.inner.lifecycle
3486    }
3487
3488    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3489    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3490        &self.inner.resource_limits
3491    }
3492
3493    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3494    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3495        &self.inner.audit_log
3496    }
3497
3498    /// Shared `Arc` to the audit logger — used by collaborators (the
3499    /// lease lifecycle, future request-context plumbing) that need to
3500    /// keep the logger alive past the runtime's stack frame.
3501    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3502        Arc::clone(&self.inner.audit_log)
3503    }
3504
3505    pub(crate) fn emit_control_event(
3506        &self,
3507        kind: crate::runtime::control_events::EventKind,
3508        outcome: crate::runtime::control_events::Outcome,
3509        action: &'static str,
3510        resource: Option<String>,
3511        reason: Option<String>,
3512        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
3513    ) -> RedDBResult<()> {
3514        use crate::runtime::control_events::{
3515            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
3516        };
3517
3518        let tenant = current_tenant();
3519        let principal = current_auth_identity();
3520        let actor_user = principal
3521            .as_ref()
3522            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
3523        let actor = actor_user
3524            .as_ref()
3525            .map(ActorRef::User)
3526            .unwrap_or(ActorRef::Anonymous);
3527        let ctx = ControlEventCtx {
3528            actor,
3529            scope: tenant
3530                .as_ref()
3531                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
3532            request_id: Some(std::borrow::Cow::Owned(format!(
3533                "conn-{}",
3534                current_connection_id()
3535            ))),
3536            trace_id: None,
3537        };
3538        let mut fields = std::collections::HashMap::new();
3539        fields.insert(
3540            "connection_id".to_string(),
3541            Sensitivity::raw(current_connection_id().to_string()),
3542        );
3543        if let Some((_, role)) = principal {
3544            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
3545        }
3546        for (key, value) in extra_fields {
3547            fields.insert(key, value);
3548        }
3549        let event = ControlEvent {
3550            kind,
3551            outcome,
3552            action: std::borrow::Cow::Borrowed(action),
3553            resource,
3554            reason,
3555            matched_policy_id: None,
3556            fields,
3557        };
3558        let ledger = self.inner.control_event_ledger.read();
3559        match ledger.emit(&ctx, event) {
3560            Ok(_) => Ok(()),
3561            Err(err) if self.inner.control_event_config.require_persistence() => {
3562                Err(RedDBError::Internal(err.to_string()))
3563            }
3564            Err(_) => Ok(()),
3565        }
3566    }
3567
3568    fn policy_mutation_control_ctx<'a>(
3569        &self,
3570        actor: &'a crate::auth::UserId,
3571        tenant: Option<&'a str>,
3572    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
3573        crate::runtime::control_events::ControlEventCtx {
3574            actor: crate::runtime::control_events::ActorRef::User(actor),
3575            scope: tenant.map(std::borrow::Cow::Borrowed),
3576            request_id: Some(std::borrow::Cow::Owned(format!(
3577                "conn-{}",
3578                current_connection_id()
3579            ))),
3580            trace_id: None,
3581        }
3582    }
3583
3584    fn emit_query_audit(
3585        &self,
3586        query: &str,
3587        plan: &QueryAuditPlan,
3588        duration_ms: u64,
3589        result: &RuntimeQueryResult,
3590    ) {
3591        if !self.inner.query_audit.has_rules() {
3592            return;
3593        }
3594        let actor = current_auth_identity().map(|(principal, _)| principal);
3595        let tenant = current_tenant();
3596        let row_count = if result.statement_type == "select" {
3597            result.result.records.len() as u64
3598        } else {
3599            result.affected_rows
3600        };
3601        self.inner
3602            .query_audit
3603            .emit(crate::runtime::query_audit::QueryAuditEvent {
3604                actor,
3605                tenant,
3606                statement_kind: plan.statement_kind,
3607                touched_collections: plan.collections.clone(),
3608                duration_ms,
3609                row_count,
3610                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
3611                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
3612            });
3613    }
3614
3615    /// Shared queue telemetry counters (delivered/acked/nacked).
3616    pub(crate) fn queue_telemetry(
3617        &self,
3618    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3619        &self.inner.queue_telemetry
3620    }
3621
3622    /// Snapshots of the queue telemetry counters in label-deterministic
3623    /// order for `/metrics` rendering and the integration test.
3624    pub fn queue_telemetry_snapshot(
3625        &self,
3626    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3627        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3628            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3629            acked: self.inner.queue_telemetry.acked_snapshot(),
3630            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3631            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
3632            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
3633            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
3634            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
3635            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
3636        }
3637    }
3638
3639    /// Issue #742 — consumer presence registry. Heartbeats land here
3640    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
3641    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
3642    /// read snapshots through `queue_consumer_presence_snapshot`.
3643    pub(crate) fn queue_presence(
3644        &self,
3645    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
3646        &self.inner.queue_presence
3647    }
3648
3649    /// Issue #742 — point-in-time presence snapshot, classifying each
3650    /// `(queue, group, consumer)` as active/stale/expired against the
3651    /// supplied TTL. Wall-clock is read once here so the lifecycle
3652    /// flags inside the snapshot are internally consistent.
3653    pub fn queue_consumer_presence_snapshot(
3654        &self,
3655        ttl_ms: u64,
3656    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
3657        let now_ns = std::time::SystemTime::now()
3658            .duration_since(std::time::UNIX_EPOCH)
3659            .map(|d| d.as_nanos() as u64)
3660            .unwrap_or(0);
3661        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
3662    }
3663
3664    /// Issue #742 — active-consumer count per `(queue, group)` for the
3665    /// queue-metadata surface. Stale/expired entries are excluded by
3666    /// definition; they are still visible in the per-row snapshot.
3667    pub fn queue_active_consumer_counts(
3668        &self,
3669        ttl_ms: u64,
3670    ) -> std::collections::HashMap<(String, String), u32> {
3671        let now_ns = std::time::SystemTime::now()
3672            .duration_since(std::time::UNIX_EPOCH)
3673            .map(|d| d.as_nanos() as u64)
3674            .unwrap_or(0);
3675        self.inner
3676            .queue_presence
3677            .count_active_by_group(now_ns, ttl_ms)
3678    }
3679
3680    /// Issue #743 — vector + TurboQuant introspection registry. Engine
3681    /// publish points (collection create, artifact build start /
3682    /// finish, fallback toggle, drop) update this; Red UI and
3683    /// `red.*` vector virtual tables read snapshots through
3684    /// `vector_introspection_snapshot` / `vector_introspection_get`.
3685    pub(crate) fn vector_introspection_registry(
3686        &self,
3687    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
3688        &self.inner.vector_introspection
3689    }
3690
3691    /// Issue #743 — full snapshot of every tracked vector collection's
3692    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
3693    /// by collection name so Red UI tables and tests both see a
3694    /// stable shape.
3695    pub fn vector_introspection_snapshot(
3696        &self,
3697    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
3698        self.inner.vector_introspection.snapshot()
3699    }
3700
3701    /// Issue #743 — single-collection lookup, for the per-collection
3702    /// metadata endpoint Red UI hits when an operator opens one
3703    /// vector's toolbar.
3704    pub fn vector_introspection_get(
3705        &self,
3706        collection: &str,
3707    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
3708        self.inner.vector_introspection.get(collection)
3709    }
3710
3711    /// Slice 10 of issue #527 — render-time scan of pending entries
3712    /// per (queue, group) for the `queue_pending_gauge` exposition.
3713    /// Walks `red_queue_meta` live so the gauge cannot drift from
3714    /// the source of truth.
3715    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3716        let store = self.inner.db.store();
3717        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3718            .into_iter()
3719            .collect()
3720    }
3721
3722    /// Shared `Arc` to the write gate. Same rationale as
3723    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3724    /// thread) need a clone-cheap handle they can move into a
3725    /// background thread.
3726    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3727        Arc::clone(&self.inner.write_gate)
3728    }
3729
3730    /// Serverless writer-lease state machine. `None` when the operator
3731    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3732    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3733        self.inner.lease_lifecycle.get()
3734    }
3735
3736    /// Install the lease lifecycle. Idempotent; subsequent calls
3737    /// return the previously stored value untouched.
3738    pub fn set_lease_lifecycle(
3739        &self,
3740        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3741    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3742        self.inner.lease_lifecycle.set(lifecycle)
3743    }
3744
3745    /// Reject the call when the requested batch size exceeds
3746    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3747    /// shaped so the HTTP layer can map it to 413 Payload Too
3748    /// Large (PLAN.md Phase 4.1).
3749    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3750        if self.inner.resource_limits.batch_size_exceeded(requested) {
3751            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3752            return Err(RedDBError::QuotaExceeded(format!(
3753                "max_batch_size:{requested}:{max}"
3754            )));
3755        }
3756        Ok(())
3757    }
3758
3759    /// Reject the call when the local DB file exceeds
3760    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3761    /// the cost is a single `stat()` syscall, negligible against the
3762    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3763    /// for HTTP 507 Insufficient Storage.
3764    pub fn check_db_size(&self) -> RedDBResult<()> {
3765        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3766            return Ok(());
3767        };
3768        if limit == 0 {
3769            return Ok(());
3770        }
3771        let Some(path) = self.inner.db.path() else {
3772            return Ok(());
3773        };
3774        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3775        if current > limit {
3776            return Err(RedDBError::QuotaExceeded(format!(
3777                "max_db_size_bytes:{current}:{limit}"
3778            )));
3779        }
3780        Ok(())
3781    }
3782
3783    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3784    ///
3785    /// Steps, in order, all idempotent across re-entrant calls:
3786    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3787    ///      observe `Stopped` after first finishes).
3788    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3789    ///      every acked write is durable on disk.
3790    ///   3. If `backup_on_shutdown == true` and a remote backend is
3791    ///      configured, run a synchronous `trigger_backup()` so the
3792    ///      remote head reflects the final state.
3793    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3794    ///      return the cached report without re-running anything.
3795    ///
3796    /// On any error, the runtime is still marked `Stopped` so the
3797    /// process can exit; the caller logs the error context but does
3798    /// not retry the same shutdown — the operator can inspect the
3799    /// report fields to see which step failed.
3800    pub fn graceful_shutdown(
3801        &self,
3802        backup_on_shutdown: bool,
3803    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3804        if !self.inner.lifecycle.begin_shutdown() {
3805            // Someone else already shut down (or is in flight). Return
3806            // the cached report so the HTTP caller and SIGTERM handler
3807            // get the same idempotent answer.
3808            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3809        }
3810
3811        let started_ms = std::time::SystemTime::now()
3812            .duration_since(std::time::UNIX_EPOCH)
3813            .map(|d| d.as_millis() as u64)
3814            .unwrap_or(0);
3815        let mut report = crate::runtime::lifecycle::ShutdownReport {
3816            started_at_ms: started_ms,
3817            ..Default::default()
3818        };
3819
3820        // Flush WAL + run any pending checkpoint. Local fsync is
3821        // unconditional — even a lease-lost replica needs its WAL on
3822        // disk before exit so a future restore has the latest tail.
3823        // The remote upload is gated separately so a lost-lease writer
3824        // doesn't clobber the new holder's state on its way out.
3825        let flush_res = self.inner.db.flush_local_only();
3826        report.flushed_wal = flush_res.is_ok();
3827        report.final_checkpoint = flush_res.is_ok();
3828        if let Err(err) = &flush_res {
3829            tracing::error!(
3830                target: "reddb::lifecycle",
3831                error = %err,
3832                "graceful_shutdown: local flush failed"
3833            );
3834        } else if let Err(lease_err) =
3835            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3836        {
3837            tracing::warn!(
3838                target: "reddb::serverless::lease",
3839                error = %lease_err,
3840                "graceful_shutdown: remote upload skipped — lease not held"
3841            );
3842        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3843            tracing::error!(
3844                target: "reddb::lifecycle",
3845                error = %err,
3846                "graceful_shutdown: remote upload failed"
3847            );
3848        }
3849
3850        // Optional final backup. Skipped silently when no remote
3851        // backend is configured — `trigger_backup()` returns Err
3852        // anyway in that case, but logging it as a shutdown failure
3853        // would be misleading on a standalone (no-backend) runtime.
3854        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3855            // The trigger_backup gate now reads `WriteKind::Backup`,
3856            // which a replica/read_only instance refuses. That's
3857            // intentional — replicas don't drive backups; only the
3858            // primary does. We still want shutdown to flush its WAL
3859            // even if the backup branch is gated off.
3860            match self.trigger_backup() {
3861                Ok(result) => {
3862                    report.backup_uploaded = result.uploaded;
3863                }
3864                Err(err) => {
3865                    tracing::warn!(
3866                        target: "reddb::lifecycle",
3867                        error = %err,
3868                        "graceful_shutdown: final backup skipped"
3869                    );
3870                }
3871            }
3872        }
3873
3874        let completed_ms = std::time::SystemTime::now()
3875            .duration_since(std::time::UNIX_EPOCH)
3876            .map(|d| d.as_millis() as u64)
3877            .unwrap_or(started_ms);
3878        report.completed_at_ms = completed_ms;
3879        report.duration_ms = completed_ms.saturating_sub(started_ms);
3880
3881        self.inner.lifecycle.finish_shutdown(report.clone());
3882        Ok(report)
3883    }
3884
3885    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3886    /// returned; `is_configured()` lets callers short-circuit.
3887    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3888        &self.inner.quota_bucket
3889    }
3890
3891    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3892    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3893    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3894    /// when the operator set the env but it doesn't parse, and
3895    /// `("disabled", None)` when no key is configured. The pager
3896    /// hookup is deferred — this accessor surfaces the operator's
3897    /// intent for /admin/status without yet using the key in writes.
3898    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3899        match crate::crypto::page_encryption::key_from_env() {
3900            Ok(Some(_)) => ("enabled", None),
3901            Ok(None) => ("disabled", None),
3902            Err(err) => ("error", Some(err)),
3903        }
3904    }
3905
3906    /// PLAN.md Phase 11.5 — current replica apply health label
3907    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3908    /// `stalled_gap`). Read from the persisted `red.replication.state`
3909    /// config key updated by the replica loop. Returns `None` on
3910    /// non-replica instances or when no apply has run yet.
3911    pub fn replica_apply_health(&self) -> Option<String> {
3912        let state = self.config_string("red.replication.state", "");
3913        if state.is_empty() {
3914            None
3915        } else {
3916            Some(state)
3917        }
3918    }
3919
3920    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
3921        let mut pool = self
3922            .inner
3923            .pool
3924            .lock()
3925            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
3926        if pool.active >= self.inner.pool_config.max_connections {
3927            return Err(RedDBError::Internal(
3928                "connection pool exhausted".to_string(),
3929            ));
3930        }
3931
3932        let id = if let Some(id) = pool.idle.pop() {
3933            id
3934        } else {
3935            let id = pool.next_id;
3936            pool.next_id += 1;
3937            id
3938        };
3939        pool.active += 1;
3940        pool.total_checkouts += 1;
3941        drop(pool);
3942
3943        Ok(RuntimeConnection {
3944            id,
3945            inner: Arc::clone(&self.inner),
3946        })
3947    }
3948
3949    pub fn checkpoint(&self) -> RedDBResult<()> {
3950        // Local fsync always allowed — losing the lease shouldn't
3951        // prevent us from durably persisting what's already in memory.
3952        // The remote upload is the side-effect that risks clobbering a
3953        // peer's state, so it's behind the lease gate.
3954        self.inner.db.flush_local_only().map_err(|err| {
3955            // Issue #205 — local flush failure is a CheckpointFailed
3956            // operator-grade event. The local-flush path also covers
3957            // the WAL fsync we depend on, so a failure here doubles as
3958            // the WalFsyncFailed signal for the runtime entry point.
3959            let msg = err.to_string();
3960            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
3961                lsn: 0,
3962                error: msg.clone(),
3963            }
3964            .emit_global();
3965            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
3966                path: "<flush_local_only>".to_string(),
3967                error: msg.clone(),
3968            }
3969            .emit_global();
3970            RedDBError::Engine(msg)
3971        })?;
3972        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
3973            tracing::warn!(
3974                target: "reddb::serverless::lease",
3975                error = %err,
3976                "checkpoint: skipping remote upload — lease not held"
3977            );
3978            return Ok(());
3979        }
3980        self.inner
3981            .db
3982            .upload_to_remote_backend()
3983            .map_err(|err| RedDBError::Engine(err.to_string()))
3984    }
3985
3986    /// Guard remote-mutating operations on the writer lease.
3987    /// Returns `Ok(())` when no remote backend is configured (the
3988    /// lease is irrelevant) or the lease state is `NotRequired` /
3989    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
3990    /// `NotHeld`, with an audit-friendly action label so the caller
3991    /// can record the rejection.
3992    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
3993        if self.inner.db.remote_backend.is_none() {
3994            return Ok(());
3995        }
3996        match self.inner.write_gate.lease_state() {
3997            crate::runtime::write_gate::LeaseGateState::NotHeld => {
3998                self.inner.audit_log.record(
3999                    action,
4000                    "system",
4001                    "remote_backend",
4002                    "err: writer lease not held",
4003                    crate::json::Value::Null,
4004                );
4005                Err(RedDBError::ReadOnly(format!(
4006                    "writer lease not held — {action} blocked (serverless fence)"
4007                )))
4008            }
4009            _ => Ok(()),
4010        }
4011    }
4012
4013    pub fn run_maintenance(&self) -> RedDBResult<()> {
4014        self.inner
4015            .db
4016            .run_maintenance()
4017            .map_err(|err| RedDBError::Internal(err.to_string()))
4018    }
4019
4020    pub fn scan_collection(
4021        &self,
4022        collection: &str,
4023        cursor: Option<ScanCursor>,
4024        limit: usize,
4025    ) -> RedDBResult<ScanPage> {
4026        let store = self.inner.db.store();
4027        let manager = store
4028            .get_collection(collection)
4029            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
4030
4031        let mut entities = manager.query_all(|_| true);
4032        entities.sort_by_key(|entity| entity.id.raw());
4033
4034        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
4035        let total = entities.len();
4036        let end = total.min(offset.saturating_add(limit.max(1)));
4037        let items = if offset >= total {
4038            Vec::new()
4039        } else {
4040            entities[offset..end].to_vec()
4041        };
4042        let next = (end < total).then_some(ScanCursor { offset: end });
4043
4044        Ok(ScanPage {
4045            collection: collection.to_string(),
4046            items,
4047            next,
4048            total,
4049        })
4050    }
4051
4052    pub fn catalog(&self) -> CatalogModelSnapshot {
4053        self.inner.db.catalog_model_snapshot()
4054    }
4055
4056    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
4057        self.inner.db.catalog_consistency_report()
4058    }
4059
4060    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
4061        crate::catalog::attention_summary(&self.catalog())
4062    }
4063
4064    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
4065        crate::catalog::collection_attention(&self.catalog())
4066    }
4067
4068    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
4069        crate::catalog::index_attention(&self.catalog())
4070    }
4071
4072    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
4073        crate::catalog::graph_projection_attention(&self.catalog())
4074    }
4075
4076    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
4077        crate::catalog::analytics_job_attention(&self.catalog())
4078    }
4079
4080    pub fn stats(&self) -> RuntimeStats {
4081        let pool = runtime_pool_lock(self);
4082        RuntimeStats {
4083            active_connections: pool.active,
4084            idle_connections: pool.idle.len(),
4085            total_checkouts: pool.total_checkouts,
4086            paged_mode: self.inner.db.is_paged(),
4087            started_at_unix_ms: self.inner.started_at_unix_ms,
4088            store: self.inner.db.stats(),
4089            system: SystemInfo::collect(),
4090            result_blob_cache: self.inner.result_blob_cache.stats(),
4091            kv: self.inner.kv_stats.snapshot(),
4092            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4093        }
4094    }
4095
4096    pub(crate) fn record_metrics_ingest(
4097        &self,
4098        accepted_samples: u64,
4099        accepted_series: u64,
4100        rejected_samples: u64,
4101        rejected_series: u64,
4102    ) {
4103        self.inner.metrics_ingest_stats.record(
4104            accepted_samples,
4105            accepted_series,
4106            rejected_samples,
4107            rejected_series,
4108        );
4109    }
4110
4111    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4112        self.inner
4113            .metrics_ingest_stats
4114            .record_cardinality_budget_rejections(rejected_series);
4115    }
4116
4117    pub(crate) fn record_metrics_tenant_activity(
4118        &self,
4119        tenant: &str,
4120        namespace: &str,
4121        operation: &str,
4122    ) {
4123        self.inner
4124            .metrics_tenant_activity_stats
4125            .record(tenant, namespace, operation);
4126    }
4127
4128    pub(crate) fn metrics_tenant_activity_snapshot(
4129        &self,
4130    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4131        self.inner.metrics_tenant_activity_stats.snapshot()
4132    }
4133
4134    /// Execute a query under a typed scope override without embedding
4135    /// the tenant / user / role values into the SQL string. Use this
4136    /// from transport middleware (HTTP / gRPC / worker loops) where the
4137    /// scope is resolved from auth claims and the SQL is a parameterised
4138    /// template — avoids the string-concat injection risk of building
4139    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4140    /// prepared statements that didn't know about tenancy.
4141    ///
4142    /// Precedence matches the `WITHIN` clause: the passed `scope`
4143    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4144    /// The override is pushed on the thread-local scope stack for the
4145    /// duration of the call and popped on return — pool-shared
4146    /// connections cannot leak it across requests.
4147    pub fn execute_query_with_scope(
4148        &self,
4149        query: &str,
4150        scope: crate::runtime::within_clause::ScopeOverride,
4151    ) -> RedDBResult<RuntimeQueryResult> {
4152        if scope.is_empty() {
4153            return self.execute_query(query);
4154        }
4155        let _scope_guard = ScopeOverrideGuard::install(scope);
4156        self.execute_query(query)
4157    }
4158
4159    /// Issue #205 — single lifecycle exit for slow-query logging.
4160    ///
4161    /// `execute_query_inner` does the real work; this wrapper times it
4162    /// and, if elapsed exceeds the configured threshold, hands the
4163    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4164    /// SlowQueryLogger. The threshold + sample_pct were captured at
4165    /// SlowQueryLogger construction (runtime startup), so the per-call
4166    /// cost on below-threshold paths is one relaxed atomic load.
4167    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4168        let started = std::time::Instant::now();
4169        let result = self.execute_query_inner(query);
4170        self.finish_query_lifecycle(query, started, result)
4171    }
4172
4173    /// Execute a SQL statement with already-decoded positional bind
4174    /// parameters. Transports should call this instead of parsing +
4175    /// binding on their side and then reaching for `execute_query_expr`:
4176    /// this entry keeps parameterized statements inside the same
4177    /// statement lifecycle as textual SQL (snapshot guard, config/secret
4178    /// guards, coarse auth, intent locks, slow-query logging, integrity
4179    /// tombstone filtering, and causal bookmarks).
4180    pub fn execute_query_with_params(
4181        &self,
4182        query: &str,
4183        params: &[Value],
4184    ) -> RedDBResult<RuntimeQueryResult> {
4185        if params.is_empty() {
4186            return self.execute_query(query);
4187        }
4188        let started = std::time::Instant::now();
4189        let result = self.execute_query_with_params_inner(query, params);
4190        self.finish_query_lifecycle(query, started, result)
4191    }
4192
4193    fn finish_query_lifecycle(
4194        &self,
4195        query: &str,
4196        started: std::time::Instant,
4197        mut result: RedDBResult<RuntimeQueryResult>,
4198    ) -> RedDBResult<RuntimeQueryResult> {
4199        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
4200        // results before they reach any consumer. Fast no-op (one relaxed
4201        // atomic load) unless an input-stream digest mismatch has tombstoned
4202        // a RID range on this store.
4203        if let Ok(ref mut query_result) = result {
4204            if query_result.statement_type == "select" {
4205                self.filter_integrity_tombstoned(&mut query_result.result);
4206            }
4207        }
4208        let elapsed_ms = started.elapsed().as_millis() as u64;
4209
4210        // Build EffectiveScope from the same thread-locals frame-build
4211        // consults — keeps the slow-log row consistent with the audit /
4212        // RLS view of "this statement". `ai_scope()` is the canonical
4213        // builder.
4214        let scope = self.ai_scope();
4215        let kind = match result
4216            .as_ref()
4217            .map(|r| r.statement_type)
4218            .unwrap_or("select")
4219        {
4220            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4221            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4222            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4223            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4224            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4225        };
4226        // SQL redaction: pass the raw query through. The slow-query
4227        // logger writes structured JSON so embedded literals stay
4228        // escape-safe at the JSON boundary (proven by
4229        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4230        // PII redaction (e.g. literal masking) is a follow-up.
4231        self.inner
4232            .slow_query_logger
4233            .record(kind, elapsed_ms, query.to_string(), &scope);
4234
4235        if let Ok(ref mut query_result) = result {
4236            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
4237                let bookmark = crate::replication::CausalBookmark::new(
4238                    self.current_replication_term(),
4239                    self.cdc_current_lsn(),
4240                );
4241                query_result.bookmark = Some(bookmark.encode());
4242            }
4243        }
4244
4245        result
4246    }
4247
4248    fn execute_query_with_params_inner(
4249        &self,
4250        query: &str,
4251        params: &[Value],
4252    ) -> RedDBResult<RuntimeQueryResult> {
4253        let parsed = parse_multi(query).map_err(|err| RedDBError::Query(err.to_string()))?;
4254        let bound = crate::storage::query::user_params::bind(&parsed, params).map_err(|err| {
4255            RedDBError::Validation {
4256                message: err.to_string(),
4257                validation: crate::json!({
4258                    "code": "INVALID_PARAMS",
4259                    "surface": "query.params",
4260                }),
4261            }
4262        })?;
4263        self.execute_bound_query_expr_in_frame(query, bound)
4264    }
4265
    /// Executes an already-bound `QueryExpr` inside a freshly built
    /// `StatementExecutionFrame`.
    ///
    /// `query` is the original statement text. The frame and the execution
    /// mode are derived from the virtual-name-rewritten text when a rewrite
    /// applies, while the tracing span and `dispatch_expr` receive the
    /// original text.
    ///
    /// Returns the dispatch result; `select` results additionally go through
    /// `apply_secret_decryption`. Errors from frame build, dispatch
    /// preparation, control-event emission, or dispatch are propagated.
    fn execute_bound_query_expr_in_frame(
        &self,
        query: &str,
        expr: QueryExpr,
    ) -> RedDBResult<RuntimeQueryResult> {
        // Fall back to the caller's text when no virtual-name rewrite applies.
        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
        let execution_query = rewritten_query.as_deref().unwrap_or(query);
        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
        // Both guards are bound to names (not `_`) so they stay alive until
        // this function returns, including on the early-return paths below.
        let _frame_guards = frame.install(self);
        let _log_span = crate::telemetry::span::query_span(query).entered();

        let expr = self.rewrite_view_refs(expr);
        let mode = detect_mode(execution_query);
        // Collected up front so a dispatch-preparation failure can still be
        // reported per control event.
        let control_event_specs = query_control_event_specs(&expr);
        // NOTE(review): presumably this is where guards/auth/intent locks are
        // taken — confirm against `prepare_dispatch`. The returned guard is
        // held for the rest of the function.
        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
            Ok(guard) => guard,
            Err(err) => {
                // Emit one control event per spec, with the outcome derived
                // from the error, then surface the original error. If an
                // emit itself fails, `?` returns that failure instead.
                let outcome = control_event_outcome_for_error(&err);
                for spec in &control_event_specs {
                    self.emit_control_event(
                        spec.kind,
                        outcome,
                        spec.action,
                        spec.resource.clone(),
                        Some(err.to_string()),
                        spec.fields.clone(),
                    )?;
                }
                return Err(err);
            }
        };

        let mut result = self.dispatch_expr(expr, query, mode)?;
        // Secret decryption is applied to SELECT results only.
        if result.statement_type == "select" {
            self.apply_secret_decryption(&mut result);
        }
        Ok(result)
    }
4304
4305    pub fn causal_session(&self) -> crate::runtime::CausalSession {
4306        crate::runtime::CausalSession {
4307            runtime: self.clone(),
4308            bookmark: None,
4309            wait_timeout: std::time::Duration::from_secs(5),
4310        }
4311    }
4312
4313    pub fn wait_for_bookmark(
4314        &self,
4315        bookmark: &crate::replication::CausalBookmark,
4316        timeout: std::time::Duration,
4317    ) -> RedDBResult<()> {
4318        let deadline = std::time::Instant::now() + timeout;
4319        loop {
4320            let applied_lsn = self.local_contiguous_applied_lsn();
4321            if applied_lsn >= bookmark.commit_lsn() {
4322                return Ok(());
4323            }
4324            let now = std::time::Instant::now();
4325            if now >= deadline {
4326                return Err(RedDBError::InvalidOperation(format!(
4327                    "timed out waiting for causal bookmark lsn {}; applied={}",
4328                    bookmark.commit_lsn(),
4329                    applied_lsn
4330                )));
4331            }
4332            let remaining = deadline.saturating_duration_since(now);
4333            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
4334        }
4335    }
4336
4337    fn local_contiguous_applied_lsn(&self) -> u64 {
4338        match self.inner.db.options().replication.role {
4339            crate::replication::ReplicationRole::Replica { .. } => {
4340                self.config_u64("red.replication.last_applied_lsn", 0)
4341            }
4342            _ => self.cdc_current_lsn(),
4343        }
4344    }
4345
4346    #[inline(never)]
4347    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4348        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4349        //
4350        // Moved above every boot-cost the normal path pays (WITHIN
4351        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4352        // guard, tracing span, tx_contexts read) because the bench's
4353        // `select_point` scenario was observed at 28× vs PostgreSQL —
4354        // the dominant cost wasn't the entity fetch but the ceremony
4355        // before it. Only fires when there's no ambient transaction
4356        // context or WITHIN override, so the snapshot install we skip
4357        // truly is a no-op for this query.
4358        if !has_scope_override_active()
4359            && !query.trim_start().starts_with("WITHIN")
4360            && !query.trim_start().starts_with("within")
4361            && !self.inner.query_audit.has_rules()
4362            && !self
4363                .inner
4364                .tx_contexts
4365                .read()
4366                .contains_key(&current_connection_id())
4367        {
4368            if let Some(result) = self.try_fast_entity_lookup(query) {
4369                return result;
4370            }
4371        }
4372
4373        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4374        // strip the prefix, push a stack-scoped override, recurse on
4375        // the inner statement, pop on return. Stack lives in a
4376        // thread-local but is balanced by the RAII guard, so a
4377        // pool-shared connection cannot leak the override across
4378        // requests and an early `?` return still pops cleanly.
4379        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4380            Ok(Some((scope, inner))) => {
4381                let _scope_guard = ScopeOverrideGuard::install(scope);
4382                // Re-enter the inner path, NOT `execute_query`, so the
4383                // slow-query lifecycle hook records exactly one row per
4384                // top-level statement (the WITHIN-stripped form would
4385                // double-record).
4386                return self.execute_query_inner(inner);
4387            }
4388            Ok(None) => {}
4389            Err(msg) => return Err(RedDBError::Query(msg)),
4390        }
4391
4392        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4393        // inner statement (WITHOUT executing it) and returns the
4394        // CanonicalLogicalNode tree as rows so the caller can see the
4395        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4396        // is a distinct schema-diff command and continues down the
4397        // regular SQL path.
4398        if let Some(inner) = strip_explain_prefix(query) {
4399            return self.explain_as_rows(query, inner);
4400        }
4401
4402        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4403        // override and return. Outside a transaction the statement is
4404        // an error (matches PG semantics: SET LOCAL only takes effect
4405        // within an active transaction).
4406        if let Some(value) = parse_set_local_tenant(query)? {
4407            let conn_id = current_connection_id();
4408            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4409                return Err(RedDBError::Query(
4410                    "SET LOCAL TENANT requires an active transaction".to_string(),
4411                ));
4412            }
4413            self.inner
4414                .tx_local_tenants
4415                .write()
4416                .insert(conn_id, value.clone());
4417            return Ok(RuntimeQueryResult::ok_message(
4418                query.to_string(),
4419                &match &value {
4420                    Some(id) => format!("local tenant set: {id}"),
4421                    None => "local tenant cleared".to_string(),
4422                },
4423                "set_local_tenant",
4424            ));
4425        }
4426
4427        if super::red_schema::is_system_schema_write(query) {
4428            return Err(RedDBError::Query(
4429                super::red_schema::READ_ONLY_ERROR.to_string(),
4430            ));
4431        }
4432
4433        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
4434        {
4435            return self.execute_create_analytics_source(query, create_source);
4436        }
4437
4438        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
4439        // v0. The descriptor itself is readable through
4440        // `red.analytics.metrics`; the *output* read returns a
4441        // structured error so callers can tell "execution engine not yet
4442        // built" apart from "metric does not exist".
4443        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
4444            return Err(super::metric_descriptor_catalog::read_output_unsupported(
4445                &path,
4446            ));
4447        }
4448
4449        // Issue #918 / ADR 0035 — leaderboard rank capability catalog
4450        // declarations are still recognised before the general parser.
4451        // Rank reads themselves are parser AST nodes, including Redis-flavor
4452        // Z* sugar that desugars to the same canonical rank shapes.
4453        if let Some(parsed) = super::ranking_descriptor_catalog::parse_create_ranking(query) {
4454            return self.execute_create_ranking(query, parsed?);
4455        }
4456        if super::ranking_descriptor_catalog::parse_show_rankings(query) {
4457            return self.execute_show_rankings(query);
4458        }
4459
4460        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4461        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4462
4463        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4464        let _frame_guards = frame.install(self);
4465
4466        // Phase 6 logging: enter a span stamped with conn_id / tenant
4467        // / query_len. Every downstream tracing::info!/warn!/error!
4468        // inherits these fields — no need to thread them manually
4469        // through storage/scan layers. Entered AFTER the WITHIN /
4470        // SET LOCAL TENANT resolution above so the span reflects the
4471        // effective scope for this statement.
4472        let _log_span = crate::telemetry::span::query_span(query).entered();
4473
4474        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4475        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4476            return self.execute_query_expr(rewritten);
4477        }
4478
4479        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4480        if !self.inner.query_audit.has_rules() {
4481            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4482                return result;
4483            }
4484        }
4485
4486        // ── Result cache: return cached result if still fresh (30s TTL) ──
4487        if !self.inner.query_audit.has_rules() {
4488            if let Some(result) = frame.read_result_cache(self) {
4489                return Ok(result);
4490            }
4491        }
4492
4493        let prepared = frame.prepare_statement(self, execution_query)?;
4494        let mode = prepared.mode;
4495        let expr = prepared.expr;
4496
4497        let statement = query_expr_name(&expr);
4498        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4499        let control_event_specs = query_control_event_specs(&expr);
4500        let query_audit_plan = query_audit_plan(&expr);
4501
4502        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4503            Ok(guard) => guard,
4504            Err(err) => {
4505                let outcome = control_event_outcome_for_error(&err);
4506                for spec in &control_event_specs {
4507                    self.emit_control_event(
4508                        spec.kind,
4509                        outcome,
4510                        spec.action,
4511                        spec.resource.clone(),
4512                        Some(err.to_string()),
4513                        spec.fields.clone(),
4514                    )?;
4515                }
4516                return Err(err);
4517            }
4518        };
4519        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4520        let query_audit_started = std::time::Instant::now();
4521
4522        let query_result = match expr {
4523            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4524                // Apply MVCC visibility + RLS gate while materialising the
4525                // graph: every node entity is screened against the source
4526                // collection's policy chain (basic and `Nodes`-targeted)
4527                // and dropped when the caller's tenant / role doesn't
4528                // admit it. Edges are pruned automatically because the
4529                // graph builder skips edges whose endpoints aren't in
4530                // `allowed_nodes`.
4531                let (graph, node_properties, edge_properties) =
4532                    self.materialize_graph_with_rls()?;
4533                let result =
4534                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4535                        &graph,
4536                        &expr,
4537                        node_properties,
4538                        edge_properties,
4539                    )
4540                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4541
4542                Ok(RuntimeQueryResult {
4543                    query: query.to_string(),
4544                    mode,
4545                    statement,
4546                    engine: "materialized-graph",
4547                    result,
4548                    affected_rows: 0,
4549                    statement_type: "select",
4550                    bookmark: None,
4551                })
4552            }
4553            QueryExpr::Table(table) => {
4554                let table = self.resolve_table_expr_subqueries(
4555                    table,
4556                    &frame as &dyn super::statement_frame::ReadFrame,
4557                )?;
4558                // Table-valued functions (e.g. components(g)) dispatch to a
4559                // read-only executor before any catalog/virtual-table routing
4560                // (issue #795).
4561                if let Some(TableSource::Function {
4562                    name,
4563                    args,
4564                    named_args,
4565                }) = table.source.clone()
4566                {
4567                    // The graph-collection form is cacheable (issue #802): the
4568                    // result-cache read at the top of this function keys on the
4569                    // query string, and `result_cache_scopes` carries the graph
4570                    // collection (see `collect_table_source_scopes`) so a write
4571                    // to it invalidates the entry. Deterministic algorithm
4572                    // output is worth caching at any row count, so the write
4573                    // bypasses the generic ≤5-row payload heuristic.
4574                    let tvf_result = RuntimeQueryResult {
4575                        query: query.to_string(),
4576                        mode,
4577                        statement,
4578                        engine: "runtime-graph-tvf",
4579                        result: self.execute_table_function(&name, &args, &named_args)?,
4580                        affected_rows: 0,
4581                        statement_type: "select",
4582                        bookmark: None,
4583                    };
4584                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
4585                    return Ok(tvf_result);
4586                }
4587                // Inline-graph TVF (issue #799): the graph is supplied by two
4588                // subqueries instead of a collection reference. Unlike the
4589                // graph-collection form, the result IS cacheable — its cache
4590                // key is the query string (the result-cache read at the top of
4591                // `execute_query_inner` keys on it) and `result_cache_scopes`
4592                // already carries the `nodes`/`edges` source collections, so a
4593                // write to any of them invalidates the entry.
4594                if let Some(TableSource::InlineGraphFunction {
4595                    name,
4596                    nodes,
4597                    edges,
4598                    named_args,
4599                }) = table.source.clone()
4600                {
4601                    let inline_result = RuntimeQueryResult {
4602                        query: query.to_string(),
4603                        mode,
4604                        statement,
4605                        engine: "runtime-graph-tvf-inline",
4606                        result: self.execute_inline_graph_function(
4607                            &name,
4608                            &nodes,
4609                            &edges,
4610                            &named_args,
4611                        )?,
4612                        affected_rows: 0,
4613                        statement_type: "select",
4614                        bookmark: None,
4615                    };
4616                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
4617                    return Ok(inline_result);
4618                }
4619                if super::red_schema::is_virtual_table(&table.table) {
4620                    return Ok(RuntimeQueryResult {
4621                        query: query.to_string(),
4622                        mode,
4623                        statement,
4624                        engine: "runtime-red-schema",
4625                        result: super::red_schema::red_query(
4626                            self,
4627                            &table.table,
4628                            &table,
4629                            &frame as &dyn super::statement_frame::ReadFrame,
4630                        )?,
4631                        affected_rows: 0,
4632                        statement_type: "select",
4633                        bookmark: None,
4634                    });
4635                }
4636
4637                // `<graph>.<output>` analytics virtual view (issue #800).
4638                // Recomputed on demand — intentionally not result-cached, so it
4639                // always reflects the current graph data.
4640                if let Some(view_result) = self.try_resolve_analytics_view(
4641                    &table,
4642                    &frame as &dyn super::statement_frame::ReadFrame,
4643                )? {
4644                    return Ok(RuntimeQueryResult {
4645                        query: query.to_string(),
4646                        mode,
4647                        statement,
4648                        engine: "runtime-graph-analytics-view",
4649                        result: view_result,
4650                        affected_rows: 0,
4651                        statement_type: "select",
4652                        bookmark: None,
4653                    });
4654                }
4655
4656                if let Some(result) = self.execute_probabilistic_select(&table)? {
4657                    return Ok(RuntimeQueryResult {
4658                        query: query.to_string(),
4659                        mode,
4660                        statement,
4661                        engine: "runtime-probabilistic",
4662                        result,
4663                        affected_rows: 0,
4664                        statement_type: "select",
4665                        bookmark: None,
4666                    });
4667                }
4668
4669                // Foreign-table intercept (Phase 3.2.2 PG parity).
4670                //
4671                // When the referenced table matches a `CREATE FOREIGN TABLE`
4672                // registration, short-circuit into the FDW scan. Phase 3.2
4673                // wrappers don't yet support pushdown, so filters/projections
4674                // apply post-scan via `apply_foreign_table_filters` — good
4675                // enough for correctness; perf work lands in 3.2.3.
4676                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4677                    let records = self
4678                        .inner
4679                        .foreign_tables
4680                        .scan(&table.table)
4681                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4682                    let result = apply_foreign_table_filters(records, &table);
4683                    return Ok(RuntimeQueryResult {
4684                        query: query.to_string(),
4685                        mode,
4686                        statement,
4687                        engine: "runtime-fdw",
4688                        result,
4689                        affected_rows: 0,
4690                        statement_type: "select",
4691                        bookmark: None,
4692                    });
4693                }
4694
4695                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4696                //
4697                // When RLS is enabled on this table, fetch every policy
4698                // that applies to the current (role, SELECT) pair and
4699                // fold them into the query's WHERE clause: policies
4700                // OR-combine (any of them admitting the row is enough),
4701                // then AND into the caller's existing filter.
4702                //
4703                // Anonymous callers (no thread-local identity) pass
4704                // `role = None`; policies with a specific `TO role`
4705                // clause skip, but `TO PUBLIC` policies still apply.
4706                //
4707                // When `inject_rls_filters` returns `None` the table has
4708                // RLS enabled but no policy admits the caller's role —
4709                // short-circuit with an empty result set instead of
4710                // synthesising a contradiction filter.
4711                let Some(table_with_rls) = self.authorize_relational_table_select(
4712                    table,
4713                    &frame as &dyn super::statement_frame::ReadFrame,
4714                )?
4715                else {
4716                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4717                    return Ok(RuntimeQueryResult {
4718                        query: query.to_string(),
4719                        mode,
4720                        statement,
4721                        engine: "runtime-table-rls",
4722                        result: empty,
4723                        affected_rows: 0,
4724                        statement_type: "select",
4725                        bookmark: None,
4726                    });
4727                };
4728                Ok(RuntimeQueryResult {
4729                    query: query.to_string(),
4730                    mode,
4731                    statement,
4732                    engine: "runtime-table",
4733                    // #885: lend the frame-owned row-buffer arena to the
4734                    // streaming path so chunk buffers are reused across
4735                    // this statement's chunk-fetches instead of allocated
4736                    // fresh per chunk. This is the table-query dispatch
4737                    // that runs under a `StatementExecutionFrame`; the
4738                    // frameless prepared/subquery paths keep `None`.
4739                    result: execute_runtime_table_query_in(
4740                        &self.inner.db,
4741                        &table_with_rls,
4742                        Some(&self.inner.index_store),
4743                        Some(frame.row_arena()),
4744                    )?,
4745                    affected_rows: 0,
4746                    statement_type: "select",
4747                    bookmark: None,
4748                })
4749            }
4750            QueryExpr::Join(join) => {
4751                // Fold per-table RLS filters into each `QueryExpr::Table`
4752                // leaf of the join tree before executing. Without this
4753                // the join executor scans both tables raw and ignores
4754                // policies — a `WITHIN TENANT 'x'` against a join of
4755                // two tenant-scoped tables would leak cross-tenant rows.
4756                // When any leaf has RLS enabled and zero matching policy,
4757                // short-circuit to an empty join result instead of
4758                // emitting a contradiction filter.
4759                let join_with_rls = match self.authorize_relational_join_select(
4760                    join,
4761                    &frame as &dyn super::statement_frame::ReadFrame,
4762                )? {
4763                    Some(j) => j,
4764                    None => {
4765                        return Ok(RuntimeQueryResult {
4766                            query: query.to_string(),
4767                            mode,
4768                            statement,
4769                            engine: "runtime-join-rls",
4770                            result: crate::storage::query::unified::UnifiedResult::empty(),
4771                            affected_rows: 0,
4772                            statement_type: "select",
4773                            bookmark: None,
4774                        });
4775                    }
4776                };
4777                Ok(RuntimeQueryResult {
4778                    query: query.to_string(),
4779                    mode,
4780                    statement,
4781                    engine: "runtime-join",
4782                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4783                    affected_rows: 0,
4784                    statement_type: "select",
4785                    bookmark: None,
4786                })
4787            }
4788            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4789                query: query.to_string(),
4790                mode,
4791                statement,
4792                engine: "runtime-vector",
4793                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4794                affected_rows: 0,
4795                statement_type: "select",
4796                bookmark: None,
4797            }),
4798            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4799                query: query.to_string(),
4800                mode,
4801                statement,
4802                engine: "runtime-hybrid",
4803                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4804                affected_rows: 0,
4805                statement_type: "select",
4806                bookmark: None,
4807            }),
4808            QueryExpr::RankOf(ref rank) => self.execute_rank_of(query, rank),
4809            QueryExpr::ApproxRankOf(ref rank) => self.execute_approx_rank_of(query, rank),
4810            QueryExpr::RankRange(ref range) => self.execute_rank_range(query, range),
4811            // DML execution
4812            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4813                Err(RedDBError::Query(
4814                    super::red_schema::READ_ONLY_ERROR.to_string(),
4815                ))
4816            }
4817            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4818                Err(RedDBError::Query(
4819                    super::red_schema::READ_ONLY_ERROR.to_string(),
4820                ))
4821            }
4822            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4823                Err(RedDBError::Query(
4824                    super::red_schema::READ_ONLY_ERROR.to_string(),
4825                ))
4826            }
4827            QueryExpr::Insert(ref insert) => self
4828                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4829                    self.execute_insert(query, insert)
4830                }),
4831            QueryExpr::Update(ref update) => self
4832                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4833                    self.execute_update(query, update)
4834                }),
4835            QueryExpr::Delete(ref delete) => self
4836                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4837                    self.execute_delete(query, delete)
4838                }),
4839            // DDL execution
4840            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4841            QueryExpr::CreateCollection(ref create) => {
4842                self.execute_create_collection(query, create)
4843            }
4844            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4845            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4846            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4847            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4848            QueryExpr::DropDocument(ref drop_document) => {
4849                self.execute_drop_document(query, drop_document)
4850            }
4851            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4852            QueryExpr::DropCollection(ref drop_collection) => {
4853                self.execute_drop_collection(query, drop_collection)
4854            }
4855            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4856            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4857            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4858            // Graph analytics commands
4859            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4860            // Search commands
4861            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4862            // ASK: RAG query with LLM synthesis
4863            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4864            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4865            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4866            QueryExpr::ProbabilisticCommand(ref cmd) => {
4867                self.execute_probabilistic_command(query, cmd)
4868            }
4869            // Time-series DDL
4870            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4871            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
4872            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
4873            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
4874            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4875            // Queue DDL and commands
4876            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4877            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4878            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4879            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4880            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4881            QueryExpr::EventsBackfill(ref backfill) => {
4882                self.execute_events_backfill(query, backfill)
4883            }
4884            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4885                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4886            ))),
4887            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4888            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4889            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4890            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4891            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4892            // SET CONFIG key = value
4893            QueryExpr::SetConfig { ref key, ref value } => {
4894                if key.starts_with("red.secret.") {
4895                    return Err(RedDBError::Query(
4896                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4897                    ));
4898                }
4899                if key.starts_with("red.secrets.") {
4900                    return Err(RedDBError::Query(
4901                        "red.secrets.* is reserved for vault secrets; use SET SECRET".to_string(),
4902                    ));
4903                }
4904                match self.check_managed_config_write_for_set_config(key) {
4905                    Err(err) => Err(err),
4906                    Ok(()) => {
4907                        let store = self.inner.db.store();
4908                        let json_val = match value {
4909                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4910                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4911                            Value::Float(n) => crate::serde_json::Value::Number(*n),
4912                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4913                            _ => crate::serde_json::Value::String(value.to_string()),
4914                        };
4915                        store.set_config_tree(key, &json_val);
4916                        update_current_config_value(key, value.clone());
4917                        // Config changes can flip runtime behavior mid-session
4918                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4919                        // result cache so subsequent reads re-execute against
4920                        // the new config.
4921                        self.invalidate_result_cache();
4922                        Ok(RuntimeQueryResult::ok_message(
4923                            query.to_string(),
4924                            &format!("config set: {key}"),
4925                            "set",
4926                        ))
4927                    }
4928                }
4929            }
4930            // SET SECRET key = value
4931            QueryExpr::SetSecret { ref key, ref value } => {
4932                if key.starts_with("red.config.") {
4933                    return Err(RedDBError::Query(
4934                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
4935                    ));
4936                }
4937                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4938                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
4939                })?;
4940                if matches!(value, Value::Null) {
4941                    auth_store
4942                        .vault_kv_try_delete(key)
4943                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4944                    update_current_secret_value(key, None);
4945                    self.invalidate_result_cache();
4946                    return Ok(RuntimeQueryResult::ok_message(
4947                        query.to_string(),
4948                        &format!("secret deleted: {key}"),
4949                        "delete_secret",
4950                    ));
4951                }
4952                let value = secret_sql_value_to_string(value)?;
4953                auth_store
4954                    .vault_kv_try_set(key.clone(), value.clone())
4955                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4956                update_current_secret_value(key, Some(value));
4957                self.invalidate_result_cache();
4958                Ok(RuntimeQueryResult::ok_message(
4959                    query.to_string(),
4960                    &format!("secret set: {key}"),
4961                    "set_secret",
4962                ))
4963            }
4964            // DELETE SECRET key
4965            QueryExpr::DeleteSecret { ref key } => {
4966                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4967                    RedDBError::Query(
4968                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4969                    )
4970                })?;
4971                let deleted = auth_store
4972                    .vault_kv_try_delete(key)
4973                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4974                if deleted {
4975                    update_current_secret_value(key, None);
4976                }
4977                self.invalidate_result_cache();
4978                Ok(RuntimeQueryResult::ok_message(
4979                    query.to_string(),
4980                    &format!("secret deleted: {key}"),
4981                    if deleted {
4982                        "delete_secret"
4983                    } else {
4984                        "delete_secret_not_found"
4985                    },
4986                ))
4987            }
4988            // SHOW SECRET[S] [prefix]
4989            QueryExpr::ShowSecrets { ref prefix } => {
4990                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4991                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
4992                })?;
4993                if !auth_store.is_vault_backed() {
4994                    return Err(RedDBError::Query(
4995                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
4996                    ));
4997                }
4998                let mut keys = auth_store.vault_kv_keys();
4999                keys.sort();
5000                let mut result = UnifiedResult::with_columns(vec![
5001                    "key".into(),
5002                    "value".into(),
5003                    "status".into(),
5004                ]);
5005                for key in keys {
5006                    if let Some(ref pfx) = prefix {
5007                        if !key.starts_with(pfx) {
5008                            continue;
5009                        }
5010                    }
5011                    let mut record = UnifiedRecord::new();
5012                    record.set("key", Value::text(key));
5013                    record.set("value", Value::text("***"));
5014                    record.set("status", Value::text("active"));
5015                    result.push(record);
5016                }
5017                Ok(RuntimeQueryResult {
5018                    query: query.to_string(),
5019                    mode,
5020                    statement: "show_secrets",
5021                    engine: "runtime-secret",
5022                    result,
5023                    affected_rows: 0,
5024                    statement_type: "select",
5025                    bookmark: None,
5026                })
5027            }
5028            // SHOW CONFIG [prefix] [AS JSON|FORMAT JSON]
5029            QueryExpr::ShowConfig {
5030                ref prefix,
5031                as_json,
5032            } => {
5033                let store = self.inner.db.store();
5034                let all_collections = store.list_collections();
5035                if !all_collections.contains(&"red_config".to_string()) {
5036                    if as_json {
5037                        return Ok(show_config_json_result(
5038                            query,
5039                            mode,
5040                            prefix,
5041                            crate::serde_json::Value::Object(crate::serde_json::Map::new()),
5042                        ));
5043                    }
5044                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5045                    return Ok(RuntimeQueryResult {
5046                        query: query.to_string(),
5047                        mode,
5048                        statement: "show_config",
5049                        engine: "runtime-config",
5050                        result,
5051                        affected_rows: 0,
5052                        statement_type: "select",
5053                        bookmark: None,
5054                    });
5055                }
5056                let manager = store
5057                    .get_collection("red_config")
5058                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
5059                let entities = manager.query_all(|_| true);
5060                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
5061                for entity in entities {
5062                    if let EntityData::Row(ref row) = entity.data {
5063                        if let Some(ref named) = row.named {
5064                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
5065                            let val = named.get("value").cloned().unwrap_or(Value::Null);
5066                            let key_str = match &key_val {
5067                                Value::Text(s) => s.as_ref(),
5068                                _ => continue,
5069                            };
5070                            if let Some(ref pfx) = prefix {
5071                                if !key_str.starts_with(pfx.as_str()) {
5072                                    continue;
5073                                }
5074                            }
5075                            let entity_id = entity.id.raw();
5076                            match latest.get(key_str) {
5077                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
5078                                _ => {
5079                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
5080                                }
5081                            }
5082                        }
5083                    }
5084                }
5085                if as_json {
5086                    let mut tree = crate::serde_json::Value::Object(crate::serde_json::Map::new());
5087                    for (key, (_, _, val)) in latest {
5088                        let relative = match prefix {
5089                            Some(pfx) if key == *pfx => "",
5090                            Some(pfx) => key
5091                                .strip_prefix(pfx.as_str())
5092                                .and_then(|tail| tail.strip_prefix('.'))
5093                                .unwrap_or(key.as_str()),
5094                            None => key.as_str(),
5095                        };
5096                        insert_config_json_path(
5097                            &mut tree,
5098                            relative,
5099                            crate::presentation::entity_json::storage_value_to_json(&val),
5100                        );
5101                    }
5102                    return Ok(show_config_json_result(query, mode, prefix, tree));
5103                }
5104                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
5105                for (_, key_val, val) in latest.into_values() {
5106                    let mut record = UnifiedRecord::new();
5107                    record.set("key", key_val);
5108                    record.set("value", val);
5109                    result.push(record);
5110                }
5111                Ok(RuntimeQueryResult {
5112                    query: query.to_string(),
5113                    mode,
5114                    statement: "show_config",
5115                    engine: "runtime-config",
5116                    result,
5117                    affected_rows: 0,
5118                    statement_type: "select",
5119                    bookmark: None,
5120                })
5121            }
5122            // Session-local multi-tenancy handle (Phase 2.5.3).
5123            //
5124            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
5125            // the thread-local; SHOW TENANT returns it. Paired with the
5126            // CURRENT_TENANT() scalar for use in RLS policies.
5127            QueryExpr::SetTenant(ref value) => {
5128                match value {
5129                    Some(id) => set_current_tenant(id.clone()),
5130                    None => clear_current_tenant(),
5131                }
5132                Ok(RuntimeQueryResult::ok_message(
5133                    query.to_string(),
5134                    &match value {
5135                        Some(id) => format!("tenant set: {id}"),
5136                        None => "tenant cleared".to_string(),
5137                    },
5138                    "set_tenant",
5139                ))
5140            }
5141            QueryExpr::ShowTenant => {
5142                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
5143                let mut record = UnifiedRecord::new();
5144                record.set(
5145                    "tenant",
5146                    current_tenant().map(Value::text).unwrap_or(Value::Null),
5147                );
5148                result.push(record);
5149                Ok(RuntimeQueryResult {
5150                    query: query.to_string(),
5151                    mode,
5152                    statement: "show_tenant",
5153                    engine: "runtime-tenant",
5154                    result,
5155                    affected_rows: 0,
5156                    statement_type: "select",
5157                    bookmark: None,
5158                })
5159            }
5160            // Transaction control (Phase 2.3 PG parity).
5161            //
5162            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
5163            // the current connection's id. COMMIT/ROLLBACK release it through
5164            // the `SnapshotManager` so future snapshots see the correct set of
5165            // active/aborted transactions.
5166            //
5167            // Tuple stamping (xmin/xmax) and read-path visibility filtering
5168            // land in Phase 2.3.2 — this dispatch only manages the snapshot
5169            // registry. Statements running outside a TxnContext still behave
5170            // as autocommit (xid=0 → visible to every snapshot).
5171            QueryExpr::TransactionControl(ref ctl) => {
5172                use crate::storage::query::ast::TxnControl;
5173                use crate::storage::transaction::snapshot::{TxnContext, Xid};
5174                use crate::storage::transaction::IsolationLevel;
5175
5176                // Phase 2.3 keys transactions by a thread-local connection id.
5177                // The stdio/gRPC paths wire a real per-connection id later;
5178                // for embedded use (one RedDBRuntime per process-ish caller)
5179                // we fall back to a deterministic placeholder.
5180                let conn_id = current_connection_id();
5181
5182                let (kind, msg) = match ctl {
5183                    TxnControl::Begin => {
5184                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5185                        let xid = mgr.begin();
5186                        let snapshot = mgr.snapshot(xid);
5187                        let ctx = TxnContext {
5188                            xid,
5189                            isolation: IsolationLevel::SnapshotIsolation,
5190                            snapshot,
5191                            savepoints: Vec::new(),
5192                            released_sub_xids: Vec::new(),
5193                        };
5194                        self.inner.tx_contexts.write().insert(conn_id, ctx);
5195                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
5196                    }
5197                    TxnControl::Commit => {
5198                        // SET LOCAL TENANT ends with the transaction.
5199                        self.inner.tx_local_tenants.write().remove(&conn_id);
5200                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5201                        match ctx {
5202                            Some(ctx) => {
5203                                let mut own_xids = std::collections::HashSet::new();
5204                                own_xids.insert(ctx.xid);
5205                                for (_, sub) in &ctx.savepoints {
5206                                    own_xids.insert(*sub);
5207                                }
5208                                for sub in &ctx.released_sub_xids {
5209                                    own_xids.insert(*sub);
5210                                }
5211                                if let Err(err) = self.check_table_row_write_conflicts(
5212                                    conn_id,
5213                                    &ctx.snapshot,
5214                                    &own_xids,
5215                                ) {
5216                                    for (_, sub) in &ctx.savepoints {
5217                                        self.inner.snapshot_manager.rollback(*sub);
5218                                    }
5219                                    for sub in &ctx.released_sub_xids {
5220                                        self.inner.snapshot_manager.rollback(*sub);
5221                                    }
5222                                    self.inner.snapshot_manager.rollback(ctx.xid);
5223                                    self.revive_pending_versioned_updates(conn_id);
5224                                    self.revive_pending_tombstones(conn_id);
5225                                    self.discard_pending_kv_watch_events(conn_id);
5226                                    self.discard_pending_queue_wakes(conn_id);
5227                                    self.discard_pending_store_wal_actions(conn_id);
5228                                    return Err(err);
5229                                }
5230                                self.restore_pending_write_stamps(conn_id);
5231                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5232                                    for (_, sub) in &ctx.savepoints {
5233                                        self.inner.snapshot_manager.rollback(*sub);
5234                                    }
5235                                    for sub in &ctx.released_sub_xids {
5236                                        self.inner.snapshot_manager.rollback(*sub);
5237                                    }
5238                                    self.inner.snapshot_manager.rollback(ctx.xid);
5239                                    self.revive_pending_versioned_updates(conn_id);
5240                                    self.revive_pending_tombstones(conn_id);
5241                                    self.discard_pending_kv_watch_events(conn_id);
5242                                    return Err(err);
5243                                }
5244                                // Phase 2.3.2e: commit every open sub-xid
5245                                // so they also become visible. Their
5246                                // work is promoted to the parent txn's
5247                                // result exactly like a RELEASE would
5248                                // have done.
5249                                for (_, sub) in &ctx.savepoints {
5250                                    self.inner.snapshot_manager.commit(*sub);
5251                                }
5252                                for sub in &ctx.released_sub_xids {
5253                                    self.inner.snapshot_manager.commit(*sub);
5254                                }
5255                                self.inner.snapshot_manager.commit(ctx.xid);
5256                                self.finalize_pending_versioned_updates(conn_id);
5257                                self.finalize_pending_tombstones(conn_id);
5258                                self.finalize_pending_kv_watch_events(conn_id);
5259                                self.finalize_pending_queue_wakes(conn_id);
5260                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5261                            }
5262                            None => (
5263                                "commit",
5264                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5265                            ),
5266                        }
5267                    }
5268                    TxnControl::Rollback => {
5269                        self.inner.tx_local_tenants.write().remove(&conn_id);
5270                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5271                        match ctx {
5272                            Some(ctx) => {
5273                                // Phase 2.3.2e: abort every open sub-xid
5274                                // too so their writes stay hidden.
5275                                for (_, sub) in &ctx.savepoints {
5276                                    self.inner.snapshot_manager.rollback(*sub);
5277                                }
5278                                for sub in &ctx.released_sub_xids {
5279                                    self.inner.snapshot_manager.rollback(*sub);
5280                                }
5281                                self.inner.snapshot_manager.rollback(ctx.xid);
5282                                // Phase 2.3.2b: tuples that the txn had
5283                                // xmax-stamped become live again — wipe xmax
5284                                // back to 0 so later snapshots see them.
5285                                self.revive_pending_versioned_updates(conn_id);
5286                                self.revive_pending_tombstones(conn_id);
5287                                self.discard_pending_kv_watch_events(conn_id);
5288                                self.discard_pending_queue_wakes(conn_id);
5289                                self.discard_pending_store_wal_actions(conn_id);
5290                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5291                            }
5292                            None => (
5293                                "rollback",
5294                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5295                            ),
5296                        }
5297                    }
5298                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5299                    // SAVEPOINT allocates a fresh xid and pushes it
5300                    // onto the per-txn stack so subsequent writes can
5301                    // be selectively rolled back. RELEASE pops without
5302                    // aborting; ROLLBACK TO aborts the sub-xid (and
5303                    // any nested ones) + revives their tombstones.
5304                    TxnControl::Savepoint(name) => {
5305                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5306                        let mut guard = self.inner.tx_contexts.write();
5307                        match guard.get_mut(&conn_id) {
5308                            Some(ctx) => {
5309                                let sub = mgr.begin();
5310                                ctx.savepoints.push((name.clone(), sub));
5311                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5312                            }
5313                            None => (
5314                                "savepoint",
5315                                "SAVEPOINT outside transaction — no-op".to_string(),
5316                            ),
5317                        }
5318                    }
5319                    TxnControl::ReleaseSavepoint(name) => {
5320                        let mut guard = self.inner.tx_contexts.write();
5321                        match guard.get_mut(&conn_id) {
5322                            Some(ctx) => {
5323                                let pos = ctx
5324                                    .savepoints
5325                                    .iter()
5326                                    .position(|(n, _)| n == name)
5327                                    .ok_or_else(|| {
5328                                        RedDBError::Internal(format!(
5329                                            "savepoint {name} does not exist"
5330                                        ))
5331                                    })?;
5332                                // RELEASE pops the named savepoint and
5333                                // any nested ones. Their sub-xids move
5334                                // to `released_sub_xids` so they commit
5335                                // (or roll back) alongside the parent
5336                                // xid — PG semantics: released
5337                                // savepoints still contribute their
5338                                // work, but their names are gone.
5339                                let released = ctx.savepoints.len() - pos;
5340                                let popped: Vec<Xid> = ctx
5341                                    .savepoints
5342                                    .split_off(pos)
5343                                    .into_iter()
5344                                    .map(|(_, x)| x)
5345                                    .collect();
5346                                ctx.released_sub_xids.extend(popped);
5347                                (
5348                                    "release_savepoint",
5349                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5350                                )
5351                            }
5352                            None => (
5353                                "release_savepoint",
5354                                "RELEASE outside transaction — no-op".to_string(),
5355                            ),
5356                        }
5357                    }
5358                    TxnControl::RollbackToSavepoint(name) => {
5359                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5360                        // Splice out the savepoint + nested ones under
5361                        // a narrow lock, then run the snapshot-manager
5362                        // + tombstone side-effects without the tx map
5363                        // held so nothing re-enters.
5364                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5365                            let mut guard = self.inner.tx_contexts.write();
5366                            if let Some(ctx) = guard.get_mut(&conn_id) {
5367                                let pos = ctx
5368                                    .savepoints
5369                                    .iter()
5370                                    .position(|(n, _)| n == name)
5371                                    .ok_or_else(|| {
5372                                        RedDBError::Internal(format!(
5373                                            "savepoint {name} does not exist"
5374                                        ))
5375                                    })?;
5376                                let savepoint_xid = ctx.savepoints[pos].1;
5377                                let aborted: Vec<Xid> = ctx
5378                                    .savepoints
5379                                    .split_off(pos)
5380                                    .into_iter()
5381                                    .map(|(_, x)| x)
5382                                    .collect();
5383                                Some((savepoint_xid, aborted))
5384                            } else {
5385                                None
5386                            }
5387                        };
5388
5389                        match drop_result {
5390                            Some((savepoint_xid, aborted)) => {
5391                                for x in &aborted {
5392                                    mgr.rollback(*x);
5393                                }
5394                                let reverted_updates =
5395                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5396                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5397                                (
5398                                    "rollback_to_savepoint",
5399                                    format!(
5400                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5401                                        aborted.len(),
5402                                    ),
5403                                )
5404                            }
5405                            None => (
5406                                "rollback_to_savepoint",
5407                                "ROLLBACK TO outside transaction — no-op".to_string(),
5408                            ),
5409                        }
5410                    }
5411                };
5412                Ok(RuntimeQueryResult::ok_message(
5413                    query.to_string(),
5414                    &msg,
5415                    kind,
5416                ))
5417            }
5418            // Schema + Sequence DDL (Phase 1.3 PG parity).
5419            //
5420            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5421            // just registers the name in `red_config` under `schema.{name}`.
5422            // Table lookups still happen by collection name; clients using
5423            // `schema.table` qualified names collapse to collection `schema.table`.
5424            //
5425            // Sequences persist a 64-bit counter + metadata (start, increment)
5426            // in `red_config` under `sequence.{name}.*`. Scalar callers
5427            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5428            // once we have a proper mutating-function dispatch path; for now the
5429            // DDL just establishes the catalog entry so clients don't error.
5430            QueryExpr::CreateSchema(ref q) => {
5431                let store = self.inner.db.store();
5432                let key = format!("schema.{}", q.name);
5433                if store.get_config(&key).is_some() {
5434                    if q.if_not_exists {
5435                        return Ok(RuntimeQueryResult::ok_message(
5436                            query.to_string(),
5437                            &format!("schema {} already exists — skipped", q.name),
5438                            "create_schema",
5439                        ));
5440                    }
5441                    return Err(RedDBError::Internal(format!(
5442                        "schema {} already exists",
5443                        q.name
5444                    )));
5445                }
5446                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5447                Ok(RuntimeQueryResult::ok_message(
5448                    query.to_string(),
5449                    &format!("schema {} created", q.name),
5450                    "create_schema",
5451                ))
5452            }
5453            QueryExpr::DropSchema(ref q) => {
5454                let store = self.inner.db.store();
5455                let key = format!("schema.{}", q.name);
5456                let existed = store.get_config(&key).is_some();
5457                if !existed && !q.if_exists {
5458                    return Err(RedDBError::Internal(format!(
5459                        "schema {} does not exist",
5460                        q.name
5461                    )));
5462                }
5463                // Remove the marker from red_config by setting it to null.
5464                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5465                let suffix = if q.cascade {
5466                    " (CASCADE accepted — tables untouched)"
5467                } else {
5468                    ""
5469                };
5470                Ok(RuntimeQueryResult::ok_message(
5471                    query.to_string(),
5472                    &format!("schema {} dropped{}", q.name, suffix),
5473                    "drop_schema",
5474                ))
5475            }
5476            QueryExpr::CreateSequence(ref q) => {
5477                let store = self.inner.db.store();
5478                let base = format!("sequence.{}", q.name);
5479                let start_key = format!("{base}.start");
5480                let incr_key = format!("{base}.increment");
5481                let curr_key = format!("{base}.current");
5482                if store.get_config(&start_key).is_some() {
5483                    if q.if_not_exists {
5484                        return Ok(RuntimeQueryResult::ok_message(
5485                            query.to_string(),
5486                            &format!("sequence {} already exists — skipped", q.name),
5487                            "create_sequence",
5488                        ));
5489                    }
5490                    return Err(RedDBError::Internal(format!(
5491                        "sequence {} already exists",
5492                        q.name
5493                    )));
5494                }
5495                // Persist start + increment, and set current so the first
5496                // nextval returns `start`.
5497                let initial_current = q.start - q.increment;
5498                store.set_config_tree(
5499                    &start_key,
5500                    &crate::serde_json::Value::Number(q.start as f64),
5501                );
5502                store.set_config_tree(
5503                    &incr_key,
5504                    &crate::serde_json::Value::Number(q.increment as f64),
5505                );
5506                store.set_config_tree(
5507                    &curr_key,
5508                    &crate::serde_json::Value::Number(initial_current as f64),
5509                );
5510                Ok(RuntimeQueryResult::ok_message(
5511                    query.to_string(),
5512                    &format!(
5513                        "sequence {} created (start={}, increment={})",
5514                        q.name, q.start, q.increment
5515                    ),
5516                    "create_sequence",
5517                ))
5518            }
5519            QueryExpr::DropSequence(ref q) => {
5520                let store = self.inner.db.store();
5521                let base = format!("sequence.{}", q.name);
5522                let existed = store.get_config(&format!("{base}.start")).is_some();
5523                if !existed && !q.if_exists {
5524                    return Err(RedDBError::Internal(format!(
5525                        "sequence {} does not exist",
5526                        q.name
5527                    )));
5528                }
5529                for k in ["start", "increment", "current"] {
5530                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5531                }
5532                Ok(RuntimeQueryResult::ok_message(
5533                    query.to_string(),
5534                    &format!("sequence {} dropped", q.name),
5535                    "drop_sequence",
5536                ))
5537            }
5538            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5539            //
5540            // The view definition is stored in-memory on RuntimeInner (not persisted, apart from
5541            // the catalog descriptor written for materialized views). SELECTs that reference the
5542            // view name will substitute the stored `QueryExpr` via `resolve_view_reference` during
5543            // planning (same entry point used by table-name resolution).
5544            //
5545            // Materialized views additionally allocate a slot in
5546            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5547            QueryExpr::CreateView(ref q) => {
5548                let mut views = self.inner.views.write();
5549                if views.contains_key(&q.name) && !q.or_replace {
5550                    if q.if_not_exists {
5551                        return Ok(RuntimeQueryResult::ok_message(
5552                            query.to_string(),
5553                            &format!("view {} already exists — skipped", q.name),
5554                            "create_view",
5555                        ));
5556                    }
5557                    return Err(RedDBError::Internal(format!(
5558                        "view {} already exists",
5559                        q.name
5560                    )));
5561                }
5562                views.insert(q.name.clone(), Arc::new(q.clone()));
5563                drop(views);
5564
5565                // Materialized view: register cache slot (data is empty until REFRESH).
5566                if q.materialized {
5567                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5568                    let refresh = match q.refresh_every_ms {
5569                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5570                        None => RefreshPolicy::Manual,
5571                    };
5572                    let dependencies = collect_table_refs(&q.query);
5573                    let def = MaterializedViewDef {
5574                        name: q.name.clone(),
5575                        query: format!("<parsed view {}>", q.name),
5576                        dependencies: dependencies.clone(),
5577                        refresh,
5578                        retention_duration_ms: q.retention_duration_ms,
5579                    };
5580                    self.inner.materialized_views.write().register(def);
5581
5582                    // Issue #593 slice 9a — persist the descriptor to
5583                    // the system catalog so the definition survives a
5584                    // restart. Upsert semantics (delete-then-insert by
5585                    // name) keep the catalog free of duplicate rows
5586                    // across `CREATE OR REPLACE` churn.
5587                    let descriptor =
5588                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5589                            name: q.name.clone(),
5590                            source_sql: query.to_string(),
5591                            source_collections: dependencies,
5592                            refresh_every_ms: q.refresh_every_ms,
5593                            retention_duration_ms: q.retention_duration_ms,
5594                        };
5595                    let store = self.inner.db.store();
5596                    crate::runtime::continuous_materialized_view::persist_descriptor(
5597                        store.as_ref(),
5598                        &descriptor,
5599                    )?;
5600
5601                    // Issue #594 slice 9b — provision a Table-shaped
5602                    // backing collection named after the view. The
5603                    // rewriter skips materialized views (see
5604                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5605                    // resolves to this collection directly. Empty
5606                    // until REFRESH wires through it in 9c.
5607                    self.ensure_materialized_view_backing(&q.name)?;
5608                }
5609                // Plan cache may have cached a plan that didn't know about this
5610                // view — invalidate so future references pick up the new binding.
5611                // Result cache gets flushed too: OR REPLACE must not serve a
5612                // prior execution of the obsolete body.
5613                self.invalidate_plan_cache();
5614                self.invalidate_result_cache();
5615
5616                Ok(RuntimeQueryResult::ok_message(
5617                    query.to_string(),
5618                    &format!(
5619                        "{}view {} created",
5620                        if q.materialized { "materialized " } else { "" },
5621                        q.name
5622                    ),
5623                    "create_view",
5624                ))
5625            }
5626            QueryExpr::DropView(ref q) => {
5627                let mut views = self.inner.views.write();
5628                let removed = views.remove(&q.name);
5629                let existed = removed.is_some();
5630                let removed_materialized =
5631                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5632                drop(views);
5633                if q.materialized || existed {
5634                    // Also try the materialized-view cache — removal is silent if absent.
5635                    self.inner.materialized_views.write().remove(&q.name);
5636                    // Issue #593 slice 9a — remove any persisted
5637                    // catalog row. Idempotent: a no-op when the view
5638                    // was never materialized (no row was ever written).
5639                    let store = self.inner.db.store();
5640                    crate::runtime::continuous_materialized_view::remove_by_name(
5641                        store.as_ref(),
5642                        &q.name,
5643                    )?;
5644                }
5645                // Issue #594 slice 9b — drop the backing collection
5646                // that was provisioned at CREATE time. Only mat views
5647                // ever had one; regular views never did.
5648                if removed_materialized || q.materialized {
5649                    self.drop_materialized_view_backing(&q.name)?;
5650                }
5651                // Drop any plan / result cache entries that baked the
5652                // view body into their QueryExpr.
5653                self.invalidate_plan_cache();
5654                self.invalidate_result_cache();
5655                if !existed && !q.if_exists {
5656                    return Err(RedDBError::Internal(format!(
5657                        "view {} does not exist",
5658                        q.name
5659                    )));
5660                }
5661                self.invalidate_plan_cache();
5662                Ok(RuntimeQueryResult::ok_message(
5663                    query.to_string(),
5664                    &format!("view {} dropped", q.name),
5665                    "drop_view",
5666                ))
5667            }
5668            QueryExpr::RefreshMaterializedView(ref q) => {
5669                // Look up the view definition, execute its underlying query,
5670                // and stash the serialized result in the materialised cache.
5671                let view = {
5672                    let views = self.inner.views.read();
5673                    views.get(&q.name).cloned()
5674                };
5675                let view = match view {
5676                    Some(v) => v,
5677                    None => {
5678                        return Err(RedDBError::Internal(format!(
5679                            "view {} does not exist",
5680                            q.name
5681                        )))
5682                    }
5683                };
5684                if !view.materialized {
5685                    return Err(RedDBError::Internal(format!(
5686                        "view {} is not materialized — REFRESH requires \
5687                         CREATE MATERIALIZED VIEW",
5688                        q.name
5689                    )));
5690                }
5691                // Execute the underlying query fresh.
5692                let started = std::time::Instant::now();
5693                let now_ms = std::time::SystemTime::now()
5694                    .duration_since(std::time::UNIX_EPOCH)
5695                    .map(|d| d.as_millis() as u64)
5696                    .unwrap_or(0);
5697                match self.execute_query_expr((*view.query).clone()) {
5698                    Ok(inner_result) => {
5699                        // Issue #595 slice 9c — atomically replace the
5700                        // backing collection's contents under a single
5701                        // WAL group. Concurrent SELECT from the view
5702                        // sees either the prior or new contents, never
5703                        // partial. A crash before the WAL commit lands
5704                        // leaves the prior contents intact on recovery.
5705                        let entities =
5706                            view_records_to_entities(&q.name, &inner_result.result.records);
5707                        let row_count = entities.len() as u64;
5708                        let store = self.inner.db.store();
5709                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5710                            Ok(records) => records,
5711                            Err(err) => {
5712                                let duration_ms = started.elapsed().as_millis() as u64;
5713                                let msg = err.to_string();
5714                                self.inner
5715                                    .materialized_views
5716                                    .write()
5717                                    .record_refresh_failure(
5718                                        &q.name,
5719                                        msg.clone(),
5720                                        duration_ms,
5721                                        now_ms,
5722                                    );
5723                                return Err(RedDBError::Internal(format!(
5724                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5725                                    q.name
5726                                )));
5727                            }
5728                        };
5729
5730                        // Issue #596 slice 9d — emit a Refresh
5731                        // ChangeRecord into the logical-WAL spool so
5732                        // replicas deterministically replay the same
5733                        // backing-collection contents via
5734                        // `LogicalChangeApplier::apply_record`.
5735                        if let Some(ref primary) = self.inner.db.replication {
5736                            let lsn = self.inner.cdc.emit(
5737                                crate::replication::cdc::ChangeOperation::Refresh,
5738                                &q.name,
5739                                0,
5740                                "refresh",
5741                            );
5742                            self.invalidate_result_cache_for_table(&q.name);
5743                            let timestamp = std::time::SystemTime::now()
5744                                .duration_since(std::time::UNIX_EPOCH)
5745                                .unwrap_or_default()
5746                                .as_millis() as u64;
5747                            let record = ChangeRecord::for_refresh(
5748                                lsn,
5749                                timestamp,
5750                                q.name.clone(),
5751                                serialized_records,
5752                            )
5753                            .with_term(self.current_replication_term());
5754                            let encoded = record.encode();
5755                            primary.append_logical_record(record.lsn, encoded);
5756                        }
5757
5758                        let duration_ms = started.elapsed().as_millis() as u64;
5759                        let serialized = format!("{:?}", inner_result.result);
5760                        self.inner
5761                            .materialized_views
5762                            .write()
5763                            .record_refresh_success(
5764                                &q.name,
5765                                serialized.into_bytes(),
5766                                row_count,
5767                                duration_ms,
5768                                now_ms,
5769                            );
5770                        // SELECT FROM v now reads through the rewriter
5771                        // skip into the backing collection — drop the
5772                        // result cache so prior empty-backing reads
5773                        // don't shadow the new contents.
5774                        self.invalidate_result_cache();
5775                        Ok(RuntimeQueryResult::ok_message(
5776                            query.to_string(),
5777                            &format!("materialized view {} refreshed", q.name),
5778                            "refresh_materialized_view",
5779                        ))
5780                    }
5781                    Err(err) => {
5782                        let duration_ms = started.elapsed().as_millis() as u64;
5783                        let msg = err.to_string();
5784                        self.inner
5785                            .materialized_views
5786                            .write()
5787                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
5788                        Err(err)
5789                    }
5790                }
5791            }
5792            // Row Level Security (Phase 2.5 PG parity).
5793            //
5794            // Policies live in an in-memory registry keyed by (table, name).
5795            // Enforcement (AND-ing the policy's USING clause into every
5796            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5797            // filter compiler; this dispatch only manages the catalog.
5798            QueryExpr::CreatePolicy(ref q) => {
5799                let key = (q.table.clone(), q.name.clone());
5800                self.inner
5801                    .rls_policies
5802                    .write()
5803                    .insert(key, Arc::new(q.clone()));
5804                self.invalidate_plan_cache();
5805                // Issue #120 — surface policy names in the
5806                // schema-vocabulary so AskPipeline (#121) can resolve
5807                // a policy reference back to its table.
5808                self.schema_vocabulary_apply(
5809                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5810                        collection: q.table.clone(),
5811                        policy: q.name.clone(),
5812                    },
5813                );
5814                Ok(RuntimeQueryResult::ok_message(
5815                    query.to_string(),
5816                    &format!("policy {} on {} created", q.name, q.table),
5817                    "create_policy",
5818                ))
5819            }
5820            QueryExpr::DropPolicy(ref q) => {
5821                let removed = self
5822                    .inner
5823                    .rls_policies
5824                    .write()
5825                    .remove(&(q.table.clone(), q.name.clone()))
5826                    .is_some();
5827                if !removed && !q.if_exists {
5828                    return Err(RedDBError::Internal(format!(
5829                        "policy {} on {} does not exist",
5830                        q.name, q.table
5831                    )));
5832                }
5833                self.invalidate_plan_cache();
5834                // Issue #120 — keep the schema-vocabulary policy
5835                // entry in sync.
5836                self.schema_vocabulary_apply(
5837                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5838                        collection: q.table.clone(),
5839                        policy: q.name.clone(),
5840                    },
5841                );
5842                Ok(RuntimeQueryResult::ok_message(
5843                    query.to_string(),
5844                    &format!("policy {} on {} dropped", q.name, q.table),
5845                    "drop_policy",
5846                ))
5847            }
5848            // Foreign Data Wrappers (Phase 3.2 PG parity).
5849            //
5850            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5851            // `ForeignTableRegistry`. The read path consults that registry
5852            // before dispatching a SELECT — when the table name matches a
5853            // registered foreign table, we forward the scan to the wrapper
5854            // and skip the normal collection lookup.
5855            //
5856            // Phase 3.2 is in-memory only; persistence across restarts is a
5857            // 3.2.2 follow-up that mirrors the view registry pattern.
5858            QueryExpr::CreateServer(ref q) => {
5859                use crate::storage::fdw::FdwOptions;
5860                let registry = Arc::clone(&self.inner.foreign_tables);
5861                if registry.server(&q.name).is_some() {
5862                    if q.if_not_exists {
5863                        return Ok(RuntimeQueryResult::ok_message(
5864                            query.to_string(),
5865                            &format!("server {} already exists — skipped", q.name),
5866                            "create_server",
5867                        ));
5868                    }
5869                    return Err(RedDBError::Internal(format!(
5870                        "server {} already exists",
5871                        q.name
5872                    )));
5873                }
5874                let mut opts = FdwOptions::new();
5875                for (k, v) in &q.options {
5876                    opts.values.insert(k.clone(), v.clone());
5877                }
5878                registry
5879                    .create_server(&q.name, &q.wrapper, opts)
5880                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5881                Ok(RuntimeQueryResult::ok_message(
5882                    query.to_string(),
5883                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5884                    "create_server",
5885                ))
5886            }
5887            QueryExpr::DropServer(ref q) => {
5888                let existed = self.inner.foreign_tables.drop_server(&q.name);
5889                if !existed && !q.if_exists {
5890                    return Err(RedDBError::Internal(format!(
5891                        "server {} does not exist",
5892                        q.name
5893                    )));
5894                }
5895                Ok(RuntimeQueryResult::ok_message(
5896                    query.to_string(),
5897                    &format!(
5898                        "server {} dropped{}",
5899                        q.name,
5900                        if q.cascade { " (cascade)" } else { "" }
5901                    ),
5902                    "drop_server",
5903                ))
5904            }
5905            QueryExpr::CreateForeignTable(ref q) => {
5906                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5907                let registry = Arc::clone(&self.inner.foreign_tables);
5908                if registry.foreign_table(&q.name).is_some() {
5909                    if q.if_not_exists {
5910                        return Ok(RuntimeQueryResult::ok_message(
5911                            query.to_string(),
5912                            &format!("foreign table {} already exists — skipped", q.name),
5913                            "create_foreign_table",
5914                        ));
5915                    }
5916                    return Err(RedDBError::Internal(format!(
5917                        "foreign table {} already exists",
5918                        q.name
5919                    )));
5920                }
5921                let mut opts = FdwOptions::new();
5922                for (k, v) in &q.options {
5923                    opts.values.insert(k.clone(), v.clone());
5924                }
5925                let columns: Vec<ForeignColumn> = q
5926                    .columns
5927                    .iter()
5928                    .map(|c| ForeignColumn {
5929                        name: c.name.clone(),
5930                        data_type: c.data_type.clone(),
5931                        not_null: c.not_null,
5932                    })
5933                    .collect();
5934                registry
5935                    .create_foreign_table(ForeignTable {
5936                        name: q.name.clone(),
5937                        server_name: q.server.clone(),
5938                        columns,
5939                        options: opts,
5940                    })
5941                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5942                self.invalidate_plan_cache();
5943                Ok(RuntimeQueryResult::ok_message(
5944                    query.to_string(),
5945                    &format!("foreign table {} created (server {})", q.name, q.server),
5946                    "create_foreign_table",
5947                ))
5948            }
5949            QueryExpr::DropForeignTable(ref q) => {
5950                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5951                if !existed && !q.if_exists {
5952                    return Err(RedDBError::Internal(format!(
5953                        "foreign table {} does not exist",
5954                        q.name
5955                    )));
5956                }
5957                self.invalidate_plan_cache();
5958                Ok(RuntimeQueryResult::ok_message(
5959                    query.to_string(),
5960                    &format!("foreign table {} dropped", q.name),
5961                    "drop_foreign_table",
5962                ))
5963            }
5964            // COPY table FROM 'path' (Phase 1.5 PG parity).
5965            //
5966            // Stream CSV rows through the shared `CsvImporter`. The collection
5967            // is auto-created on first insert (via `insert_auto`-style path);
5968            // VACUUM/ANALYZE afterwards is up to the caller.
5969            QueryExpr::CopyFrom(ref q) => {
5970                use crate::storage::import::{CsvConfig, CsvImporter};
5971                let store = self.inner.db.store();
5972                let cfg = CsvConfig {
5973                    collection: q.table.clone(),
5974                    has_header: q.has_header,
5975                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5976                    ..CsvConfig::default()
5977                };
5978                let importer = CsvImporter::new(cfg);
5979                let stats = importer
5980                    .import_file(&q.path, store.as_ref())
5981                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5982                // Tables are written → invalidate cached plans / result cache.
5983                self.note_table_write(&q.table);
5984                Ok(RuntimeQueryResult::ok_message(
5985                    query.to_string(),
5986                    &format!(
5987                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5988                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5989                    ),
5990                    "copy_from",
5991                ))
5992            }
5993            // Maintenance commands (Phase 1.2 PG parity).
5994            //
5995            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5996            //   collection(s) and — when FULL — triggers a full pager persist
5997            //   (flushes dirty pages + fsync). Also invalidates the result cache
5998            //   so subsequent reads re-execute against the freshly compacted
5999            //   storage. RedDB's segment/btree GC runs continuously via the
6000            //   background lifecycle; explicit space reclamation for sealed
6001            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
6002            // - ANALYZE [table]: reruns `analyze_collection` +
6003            //   `persist_table_stats` via `refresh_table_planner_stats` so the
6004            //   planner has fresh histograms, distinct estimates, null counts.
6005            //
6006            // Both commands accept an optional target; omitting the target
6007            // iterates every collection in the store.
6008            QueryExpr::MaintenanceCommand(ref cmd) => {
6009                use crate::storage::query::ast::MaintenanceCommand as Mc;
6010                let store = self.inner.db.store();
6011                let (kind, msg) = match cmd {
6012                    Mc::Analyze { target } => {
6013                        let targets: Vec<String> = match target {
6014                            Some(t) => vec![t.clone()],
6015                            None => store.list_collections(),
6016                        };
6017                        for t in &targets {
6018                            self.refresh_table_planner_stats(t);
6019                        }
6020                        (
6021                            "analyze",
6022                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
6023                        )
6024                    }
6025                    Mc::Vacuum { target, full } => {
6026                        let targets: Vec<String> = match target {
6027                            Some(t) => vec![t.clone()],
6028                            None => store.list_collections(),
6029                        };
6030                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
6031                        let mut vacuum_stats =
6032                            crate::storage::unified::store::MvccVacuumStats::default();
6033                        for t in &targets {
6034                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
6035                                RedDBError::Internal(format!(
6036                                    "VACUUM MVCC history failed for {t}: {e}"
6037                                ))
6038                            })?;
6039                            if stats.reclaimed_versions > 0 {
6040                                self.rebuild_runtime_indexes_for_table(t)?;
6041                            }
6042                            vacuum_stats.add(&stats);
6043                        }
6044                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
6045                        // Stats refresh covers every target (same as ANALYZE).
6046                        for t in &targets {
6047                            self.refresh_table_planner_stats(t);
6048                        }
6049                        // FULL forces a pager persist (dirty-page flush + fsync).
6050                        // Regular VACUUM relies on the background writer / segment
6051                        // lifecycle so the command is non-blocking.
6052                        let persisted = if *full {
6053                            match store.persist() {
6054                                Ok(()) => true,
6055                                Err(e) => {
6056                                    return Err(RedDBError::Internal(format!(
6057                                        "VACUUM FULL persist failed: {e:?}"
6058                                    )));
6059                                }
6060                            }
6061                        } else {
6062                            false
6063                        };
6064                        // Result cache depended on pre-vacuum state.
6065                        self.invalidate_result_cache();
6066                        (
6067                            "vacuum",
6068                            format!(
6069                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
6070                                if *full { " FULL" } else { "" },
6071                                targets.len(),
6072                                vacuum_stats.scanned_versions,
6073                                vacuum_stats.retained_versions,
6074                                vacuum_stats.reclaimed_versions,
6075                                vacuum_stats.retained_history_versions,
6076                                vacuum_stats.reclaimed_history_versions,
6077                                vacuum_stats.retained_tombstones,
6078                                vacuum_stats.reclaimed_tombstones,
6079                                if persisted {
6080                                    " (pages flushed to disk)"
6081                                } else {
6082                                    ""
6083                                }
6084                            ),
6085                        )
6086                    }
6087                };
6088                Ok(RuntimeQueryResult::ok_message(
6089                    query.to_string(),
6090                    &msg,
6091                    kind,
6092                ))
6093            }
6094            // GRANT / REVOKE / ALTER USER (RBAC milestone).
6095            //
6096            // These hit the AuthStore directly. The statement frame /
6097            // privilege gate has already decided whether the caller may
6098            // even run the statement; here we just translate the AST into
6099            // AuthStore calls.
6100            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
6101            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
6102            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
6103            QueryExpr::CreateUser(ref u) => self.execute_create_user_statement(query, u),
6104            QueryExpr::CreateIamPolicy { ref id, ref json } => {
6105                self.execute_create_iam_policy(query, id, json)
6106            }
6107            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
6108            QueryExpr::AttachPolicy {
6109                ref policy_id,
6110                ref principal,
6111            } => self.execute_attach_policy(query, policy_id, principal),
6112            QueryExpr::DetachPolicy {
6113                ref policy_id,
6114                ref principal,
6115            } => self.execute_detach_policy(query, policy_id, principal),
6116            QueryExpr::ShowPolicies { ref filter } => {
6117                self.execute_show_policies(query, filter.as_ref())
6118            }
6119            QueryExpr::ShowEffectivePermissions {
6120                ref user,
6121                ref resource,
6122            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
6123            QueryExpr::SimulatePolicy {
6124                ref user,
6125                ref action,
6126                ref resource,
6127            } => self.execute_simulate_policy(query, user, action, resource),
6128            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
6129            QueryExpr::MigratePolicyMode {
6130                ref target,
6131                dry_run,
6132            } => self.execute_migrate_policy_mode(query, target, dry_run),
6133            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
6134            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
6135            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
6136            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
6137        };
6138
6139        if !control_event_specs.is_empty() {
6140            let (outcome, reason) = match &query_result {
6141                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
6142                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
6143            };
6144            for spec in &control_event_specs {
6145                self.emit_control_event(
6146                    spec.kind,
6147                    outcome,
6148                    spec.action,
6149                    spec.resource.clone(),
6150                    reason.clone(),
6151                    spec.fields.clone(),
6152                )?;
6153            }
6154        }
6155
6156        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
6157            self.emit_query_audit(
6158                query,
6159                plan,
6160                query_audit_started.elapsed().as_millis() as u64,
6161                result,
6162            );
6163        }
6164
6165        // Decrypt Value::Secret columns in-place before caching, so
6166        // cached results match the post-decrypt shape and repeat
6167        // queries skip the per-row AES-GCM pass.
6168        let mut query_result = query_result;
6169        if let Ok(ref mut result) = query_result {
6170            if result.statement_type == "select" {
6171                self.apply_secret_decryption(result);
6172            }
6173        }
6174
6175        // Cache SELECT results for 30s.
6176        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
6177        // Large multi-row results (range scans, filtered scans) are rarely
6178        // repeated with the same literal values so the cache hit rate is near
6179        // zero while the clone cost (100 records × ~16 fields each) is high.
6180        // Aggregations (1 row) and point lookups (1 row) still benefit.
6181        if let Ok(ref result) = query_result {
6182            frame.write_result_cache(self, result, result_cache_scopes);
6183        }
6184
6185        query_result
6186    }
6187
6188    /// Snapshot of every registered materialized view's runtime
6189    /// state — feeds the `red.materialized_views` virtual table.
6190    /// Issue #583 slice 10.
6191    pub fn materialized_view_metadata(
6192        &self,
6193    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
6194        // Issue #595 slice 9c — `current_row_count` is now scraped
6195        // live from the backing collection rather than read from the
6196        // cache slot. Mirrors the slice-10 invariant on
6197        // `queue_pending_gauge` in #527: the live store is the source
6198        // of truth, the cache slot only carries last-refresh telemetry
6199        // (timing, error, refresh cadence).
6200        let store = self.inner.db.store();
6201        let mut entries = self.inner.materialized_views.read().metadata();
6202        for entry in &mut entries {
6203            if let Some(manager) = store.get_collection(&entry.name) {
6204                entry.current_row_count = manager.count() as u64;
6205            }
6206        }
6207        entries
6208    }
6209
    // Documentation for `refresh_due_materialized_views` (defined
    // further down, after `sweep_retention_tick`). Kept as a plain
    // comment so rustdoc does not attach it to
    // `retention_sweeper_snapshot`:
    //
    // Drive scheduled refreshes for materialized views with a
    // `REFRESH EVERY <duration>` clause. Called from the background
    // scheduler thread (and from unit tests with a fake clock via
    // `claim_due_at`). Each invocation atomically claims the set of
    // due views (so two concurrent ticks never double-fire the same
    // view) and runs each refresh through the standard execution
    // path — failures are captured in `last_error` and the prior
    // content stays intact. Issue #583 slice 10.
6218    /// Snapshot of every tracked retention sweeper state — feeds the
6219    /// three extra columns on `red.retention`. Issue #584 slice 12.
6220    pub(crate) fn retention_sweeper_snapshot(
6221        &self,
6222    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6223        self.inner.retention_sweeper.read().snapshot()
6224    }
6225
6226    /// Drive one tick of the retention sweeper. Iterates collections
6227    /// with a retention policy set, physically deletes at most
6228    /// `batch_size` expired rows per collection, and records the
6229    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6230    /// `red.retention` exposes. Called from the background sweeper
6231    /// thread; safe to invoke directly from tests with a small batch
6232    /// size to drain rows deterministically. Issue #584 slice 12.
6233    ///
6234    /// Deletes are issued as `DELETE FROM <collection> WHERE
6235    /// <ts_column> < <cutoff>` through the standard `execute_query`
6236    /// chokepoint so WAL participation and snapshot guards apply
6237    /// exactly as for a user-issued DELETE — replicas replay the
6238    /// sweeper's deletes via the same WAL stream with no special
6239    /// handling on the replication side.
6240    ///
6241    /// Batching is enforced by tightening the cutoff: if more than
6242    /// `batch_size` rows are expired, the cutoff is dropped to the
6243    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6244    /// matches roughly `batch_size` rows; the remainder is reported
6245    /// as `current_rows_pending_sweep_estimate` and drained on the
6246    /// next tick.
6247    pub fn sweep_retention_tick(&self, batch_size: usize) {
6248        if batch_size == 0 {
6249            return;
6250        }
6251        let now_ms = std::time::SystemTime::now()
6252            .duration_since(std::time::UNIX_EPOCH)
6253            .map(|d| d.as_millis() as u64)
6254            .unwrap_or(0);
6255
6256        let store = self.inner.db.store();
6257        let collections = store.list_collections();
6258        for name in collections {
6259            let Some(contract) = self.inner.db.collection_contract(&name) else {
6260                continue;
6261            };
6262            let Some(retention_ms) = contract.retention_duration_ms else {
6263                continue;
6264            };
6265            let Some(ts_column) =
6266                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6267            else {
6268                continue;
6269            };
6270            let Some(manager) = store.get_collection(&name) else {
6271                continue;
6272            };
6273            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6274
6275            // Single pass: collect expired timestamps. We keep the
6276            // full Vec rather than a bounded heap because the partial
6277            // sort below is the simplest correct way to find the
6278            // batch-th oldest; for the slice's "1000-row default
6279            // batch" target this is bounded enough for production
6280            // operation, and the alternative (in-place heap of size
6281            // batch+1) is a follow-up optimisation.
6282            let mut expired_ts: Vec<i64> = Vec::new();
6283            manager.for_each_entity(|entity| {
6284                let ts = match ts_column.as_str() {
6285                    "created_at" => Some(entity.created_at as i64),
6286                    "updated_at" => Some(entity.updated_at as i64),
6287                    other => entity
6288                        .data
6289                        .as_row()
6290                        .and_then(|row| row.get_field(other))
6291                        .and_then(|v| match v {
6292                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6293                            crate::storage::schema::Value::Timestamp(t) => {
6294                                Some(t.saturating_mul(1_000))
6295                            }
6296                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6297                            crate::storage::schema::Value::UnsignedInteger(t) => {
6298                                i64::try_from(*t).ok()
6299                            }
6300                            crate::storage::schema::Value::Integer(t) => Some(*t),
6301                            _ => None,
6302                        }),
6303                };
6304                if let Some(t) = ts {
6305                    if t < cutoff {
6306                        expired_ts.push(t);
6307                    }
6308                }
6309                true
6310            });
6311
6312            let total_expired = expired_ts.len() as u64;
6313            if total_expired == 0 {
6314                self.inner
6315                    .retention_sweeper
6316                    .write()
6317                    .record_tick(&name, 0, 0, now_ms);
6318                continue;
6319            }
6320
6321            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6322                (cutoff, 0u64)
6323            } else {
6324                // Tighten the cutoff to the (batch_size)-th oldest
6325                // expired timestamp + 1 so DELETE matches roughly
6326                // `batch_size` rows.
6327                expired_ts.sort_unstable();
6328                let nth = expired_ts[batch_size - 1];
6329                (
6330                    nth.saturating_add(1),
6331                    total_expired.saturating_sub(batch_size as u64),
6332                )
6333            };
6334
6335            let stmt = format!(
6336                "DELETE FROM {} WHERE {} < {}",
6337                name, ts_column, effective_cutoff
6338            );
6339            let deleted = match self.execute_query(&stmt) {
6340                Ok(r) => r.affected_rows,
6341                Err(_) => 0,
6342            };
6343
6344            self.inner
6345                .retention_sweeper
6346                .write()
6347                .record_tick(&name, deleted, pending, now_ms);
6348        }
6349    }
6350
6351    pub fn refresh_due_materialized_views(&self) {
6352        let due = {
6353            let mut cache = self.inner.materialized_views.write();
6354            cache.claim_due_at(std::time::Instant::now())
6355        };
6356        for name in due {
6357            // Round-trip through `execute_query` (rather than the
6358            // prepared-statement `execute_query_expr` fast path, which
6359            // explicitly rejects DDL/maintenance statements). Failures
6360            // are captured inside the RefreshMaterializedView handler
6361            // via `record_refresh_failure`; the scheduler ignores the
6362            // Result so one bad view doesn't halt the loop.
6363            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6364            let _ = self.execute_query(&stmt);
6365        }
6366    }
6367
6368    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6369    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6370    /// calls pay zero parse + cache overhead.
6371    ///
6372    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6373    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6374        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6375        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6376        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6377        // whose `tq.table` matches a registered view with the view's
6378        // underlying query. Safe to call even when no views are registered.
6379        let expr = self.rewrite_view_refs(expr);
6380
6381        self.validate_model_operations_before_auth(&expr)?;
6382        // Granular RBAC privilege check. Runs before dispatch so a
6383        // denied caller never reaches storage. Fail-closed: any error
6384        // resolving the action / resource produces PermissionDenied.
6385        if let Err(err) = self.check_query_privilege(&expr) {
6386            return Err(RedDBError::Query(format!("permission denied: {err}")));
6387        }
6388
6389        let statement = query_expr_name(&expr);
6390        let mode = detect_mode(statement);
6391        let query_str = statement;
6392
6393        let result = self.dispatch_expr(expr, query_str, mode)?;
6394        let mut r = result;
6395        if r.statement_type == "select" {
6396            self.apply_secret_decryption(&mut r);
6397        }
6398        Ok(r)
6399    }
6400
6401    pub(super) fn validate_model_operations_before_auth(
6402        &self,
6403        expr: &QueryExpr,
6404    ) -> RedDBResult<()> {
6405        use crate::catalog::CollectionModel;
6406        use crate::runtime::ddl::polymorphic_resolver;
6407        use crate::storage::query::ast::KvCommand;
6408
6409        let system_schema_target = match expr {
6410            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6411            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6412            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6413            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6414            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6415            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6416            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6417            _ => None,
6418        };
6419        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6420            return Err(RedDBError::Query("system schema is read-only".to_string()));
6421        }
6422
6423        let expected = match expr {
6424            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6425            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6426            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6427            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6428            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6429            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6430            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6431            QueryExpr::KvCommand(cmd) => {
6432                let (collection, model) = match cmd {
6433                    KvCommand::Put {
6434                        collection, model, ..
6435                    }
6436                    | KvCommand::Get {
6437                        collection, model, ..
6438                    }
6439                    | KvCommand::Incr {
6440                        collection, model, ..
6441                    }
6442                    | KvCommand::Cas {
6443                        collection, model, ..
6444                    }
6445                    | KvCommand::List {
6446                        collection, model, ..
6447                    }
6448                    | KvCommand::Delete {
6449                        collection, model, ..
6450                    } => (collection.as_str(), *model),
6451                    KvCommand::Rotate { collection, .. }
6452                    | KvCommand::History { collection, .. }
6453                    | KvCommand::Purge { collection, .. } => {
6454                        (collection.as_str(), CollectionModel::Vault)
6455                    }
6456                    KvCommand::InvalidateTags { collection, .. } => {
6457                        (collection.as_str(), CollectionModel::Kv)
6458                    }
6459                    KvCommand::Watch {
6460                        collection, model, ..
6461                    } => (collection.as_str(), *model),
6462                    KvCommand::Unseal { collection, .. } => {
6463                        (collection.as_str(), CollectionModel::Vault)
6464                    }
6465                };
6466                Some((collection, model))
6467            }
6468            QueryExpr::ConfigCommand(cmd) => {
6469                self.validate_config_command_before_auth(cmd)?;
6470                None
6471            }
6472            _ => None,
6473        };
6474
6475        let Some((name, expected_model)) = expected else {
6476            return Ok(());
6477        };
6478        let snapshot = self.inner.db.catalog_model_snapshot();
6479        let Some(actual_model) = snapshot
6480            .collections
6481            .iter()
6482            .find(|collection| collection.name == name)
6483            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6484        else {
6485            return Ok(());
6486        };
6487        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6488    }
6489
6490    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6491    /// `tq.table` matches a registered view name with the view's stored
6492    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6493    /// resolves correctly. Pure operation — no side effects.
6494    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6495        // Fast path: no views registered → return original expression.
6496        if self.inner.views.read().is_empty() {
6497            return expr;
6498        }
6499        self.rewrite_view_refs_inner(expr)
6500    }
6501
6502    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6503        use crate::storage::query::ast::{Filter, TableSource};
6504        match expr {
6505            QueryExpr::Table(mut tq) => {
6506                // 1. If the TableSource is a subquery, recurse into it so
6507                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6508                //    The legacy `table` field (set to a synthetic
6509                //    "__subq_NNNN" sentinel) stays as-is so callers that
6510                //    read it keep compiling.
6511                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6512                    tq.source = Some(TableSource::Subquery(Box::new(
6513                        self.rewrite_view_refs_inner(*body),
6514                    )));
6515                    return QueryExpr::Table(tq);
6516                }
6517
6518                // 2. Restore the source field (took it above for match).
6519                // When the source was `None` or `TableSource::Name(_)`, the
6520                // real lookup key is `tq.table` — check the view registry.
6521                let maybe_view = {
6522                    let views = self.inner.views.read();
6523                    views.get(&tq.table).cloned()
6524                };
6525                let Some(view) = maybe_view else {
6526                    return QueryExpr::Table(tq);
6527                };
6528
6529                // Issue #594 slice 9b — materialized views are read
6530                // from their backing collection, not by substituting
6531                // the body. Returning the TableQuery as-is lets the
6532                // normal table-read path resolve `SELECT FROM v`
6533                // against the collection provisioned at CREATE time.
6534                if view.materialized {
6535                    return QueryExpr::Table(tq);
6536                }
6537
6538                // Recurse into the view body — views may reference other
6539                // views. The recursion yields the final QueryExpr we need
6540                // to merge the outer's filter / limit / offset into.
6541                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6542
6543                // Phase 5: when the body is a Table we merge the outer
6544                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6545                // views filter recursively. Non-table bodies (Search,
6546                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6547                // with an outer Table query today — return the body
6548                // verbatim; outer predicates are lost. Full projection
6549                // merge lands in Phase 5.2.
6550                match inner_expr {
6551                    QueryExpr::Table(mut inner_tq) => {
6552                        if let Some(outer_filter) = tq.filter.take() {
6553                            inner_tq.filter = Some(match inner_tq.filter.take() {
6554                                Some(existing) => {
6555                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6556                                }
6557                                None => outer_filter,
6558                            });
6559                            // Keep the `Expr` form in lock-step with the
6560                            // merged `Filter`. The executor prefers
6561                            // `where_expr` and nulls `filter` when it is
6562                            // present (see `execute_query_inner`), so a
6563                            // stacked view whose outer predicate was only
6564                            // merged into `filter` would silently drop that
6565                            // predicate at eval time (#635).
6566                            inner_tq.where_expr = inner_tq
6567                                .filter
6568                                .as_ref()
6569                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6570                        }
6571                        if let Some(outer_limit) = tq.limit {
6572                            inner_tq.limit = Some(match inner_tq.limit {
6573                                Some(existing) => existing.min(outer_limit),
6574                                None => outer_limit,
6575                            });
6576                        }
6577                        if let Some(outer_offset) = tq.offset {
6578                            inner_tq.offset = Some(match inner_tq.offset {
6579                                Some(existing) => existing + outer_offset,
6580                                None => outer_offset,
6581                            });
6582                        }
6583                        QueryExpr::Table(inner_tq)
6584                    }
6585                    other => other,
6586                }
6587            }
6588            QueryExpr::Join(mut jq) => {
6589                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6590                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6591                QueryExpr::Join(jq)
6592            }
6593            // Other variants don't carry nested QueryExpr that can reference
6594            // a view by table name. Return as-is.
6595            other => other,
6596        }
6597    }
6598
6599    /// Apply table-level read authorization and RLS rewriting for a
6600    /// relational SELECT leaf.
6601    fn authorize_relational_table_select(
6602        &self,
6603        mut table: TableQuery,
6604        frame: &dyn super::statement_frame::ReadFrame,
6605    ) -> RedDBResult<Option<TableQuery>> {
6606        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6607            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6608            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6609            return Ok(Some(table));
6610        }
6611
6612        self.check_table_column_projection_authz(&table, frame)?;
6613
6614        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6615            return Ok(inject_rls_filters(self, frame, table));
6616        }
6617
6618        Ok(Some(table))
6619    }
6620
6621    fn authorize_relational_join_select(
6622        &self,
6623        mut join: JoinQuery,
6624        frame: &dyn super::statement_frame::ReadFrame,
6625    ) -> RedDBResult<Option<JoinQuery>> {
6626        self.check_join_column_projection_authz(&join, frame)?;
6627        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6628        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6629        Ok(inject_rls_into_join(self, frame, join))
6630    }
6631
6632    fn authorize_relational_join_child(
6633        &self,
6634        expr: QueryExpr,
6635        frame: &dyn super::statement_frame::ReadFrame,
6636    ) -> RedDBResult<QueryExpr> {
6637        match expr {
6638            QueryExpr::Table(mut table) => {
6639                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6640                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6641                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6642                }
6643                Ok(QueryExpr::Table(table))
6644            }
6645            QueryExpr::Join(join) => self
6646                .authorize_relational_join_select(join, frame)?
6647                .map(QueryExpr::Join)
6648                .ok_or_else(|| {
6649                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6650                }),
6651            other => Ok(other),
6652        }
6653    }
6654
6655    fn authorize_relational_select_expr(
6656        &self,
6657        expr: QueryExpr,
6658        frame: &dyn super::statement_frame::ReadFrame,
6659    ) -> RedDBResult<QueryExpr> {
6660        match expr {
6661            QueryExpr::Table(table) => self
6662                .authorize_relational_table_select(table, frame)?
6663                .map(QueryExpr::Table)
6664                .ok_or_else(|| {
6665                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6666                }),
6667            QueryExpr::Join(join) => self
6668                .authorize_relational_join_select(join, frame)?
6669                .map(QueryExpr::Join)
6670                .ok_or_else(|| {
6671                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6672                }),
6673            other => Ok(other),
6674        }
6675    }
6676
6677    fn check_table_column_projection_authz(
6678        &self,
6679        table: &TableQuery,
6680        frame: &dyn super::statement_frame::ReadFrame,
6681    ) -> RedDBResult<()> {
6682        let Some((username, role)) = frame.identity() else {
6683            return Ok(());
6684        };
6685        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6686            return Ok(());
6687        };
6688
6689        let columns = self.resolved_table_projection_columns(table)?;
6690        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6691        let principal = UserId::from_parts(frame.effective_scope(), username);
6692        let ctx = runtime_iam_context(role, frame.effective_scope());
6693        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6694        if outcome.allowed() {
6695            return Ok(());
6696        }
6697
6698        if let Some(denied) = outcome.first_denied_column() {
6699            return Err(RedDBError::Query(format!(
6700                "permission denied: principal=`{username}` cannot select column `{}`",
6701                denied.resource.name
6702            )));
6703        }
6704        Err(RedDBError::Query(format!(
6705            "permission denied: principal=`{username}` cannot select table `{}`",
6706            table.table
6707        )))
6708    }
6709
6710    fn check_join_column_projection_authz(
6711        &self,
6712        join: &JoinQuery,
6713        frame: &dyn super::statement_frame::ReadFrame,
6714    ) -> RedDBResult<()> {
6715        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6716        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6717        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6718
6719        for (table, columns) in by_table {
6720            let query = TableQuery {
6721                table,
6722                source: None,
6723                alias: None,
6724                select_items: Vec::new(),
6725                columns: columns.into_iter().map(Projection::Column).collect(),
6726                where_expr: None,
6727                filter: None,
6728                group_by_exprs: Vec::new(),
6729                group_by: Vec::new(),
6730                having_expr: None,
6731                having: None,
6732                order_by: Vec::new(),
6733                limit: None,
6734                limit_param: None,
6735                offset: None,
6736                offset_param: None,
6737                expand: None,
6738                as_of: None,
6739                sessionize: None,
6740                distinct: false,
6741            };
6742            self.check_table_column_projection_authz(&query, frame)?;
6743        }
6744        Ok(())
6745    }
6746
6747    fn collect_join_projection_columns(
6748        &self,
6749        join: &JoinQuery,
6750        projections: &[Projection],
6751        out: &mut HashMap<String, BTreeSet<String>>,
6752    ) -> RedDBResult<()> {
6753        let left = table_side_context(join.left.as_ref());
6754        let right = table_side_context(join.right.as_ref());
6755
6756        if projections
6757            .iter()
6758            .any(|projection| matches!(projection, Projection::All))
6759        {
6760            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6761                out.entry(side.table.clone())
6762                    .or_default()
6763                    .extend(self.table_all_projection_columns(&side.table)?);
6764            }
6765            return Ok(());
6766        }
6767
6768        for projection in projections {
6769            collect_projection_columns_for_join_side(
6770                projection,
6771                left.as_ref(),
6772                right.as_ref(),
6773                out,
6774            )?;
6775        }
6776        Ok(())
6777    }
6778
6779    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6780        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6781        if projections
6782            .iter()
6783            .any(|projection| matches!(projection, Projection::All))
6784        {
6785            return self.table_all_projection_columns(&table.table);
6786        }
6787
6788        let mut columns = BTreeSet::new();
6789        for projection in &projections {
6790            collect_projection_columns_for_table(
6791                projection,
6792                &table.table,
6793                table.alias.as_deref(),
6794                &mut columns,
6795            );
6796        }
6797        Ok(columns.into_iter().collect())
6798    }
6799
6800    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6801        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6802            let columns: Vec<String> = contract
6803                .declared_columns
6804                .iter()
6805                .map(|column| column.name.clone())
6806                .collect();
6807            if !columns.is_empty() {
6808                return Ok(columns);
6809            }
6810        }
6811
6812        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6813        Ok(records
6814            .first()
6815            .map(|record| {
6816                record
6817                    .column_names()
6818                    .into_iter()
6819                    .map(|column| column.to_string())
6820                    .collect()
6821            })
6822            .unwrap_or_default())
6823    }
6824
6825    fn resolve_table_expr_subqueries(
6826        &self,
6827        mut table: TableQuery,
6828        frame: &dyn super::statement_frame::ReadFrame,
6829    ) -> RedDBResult<TableQuery> {
6830        // Only a `Subquery` source needs recursive resolution. `.take()`
6831        // would otherwise drop a `Name` / `Function` source on the floor
6832        // (the `if let` skips the body but the take already cleared it),
6833        // which silently broke `SELECT * FROM components(g)` — the TVF
6834        // dispatch downstream keys off `TableSource::Function` and never
6835        // fired. Restore any non-subquery source unchanged (issue #795).
6836        match table.source.take() {
6837            Some(TableSource::Subquery(inner)) => {
6838                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6839                table.source = Some(TableSource::Subquery(Box::new(inner)));
6840            }
6841            other => table.source = other,
6842        }
6843
6844        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6845        for item in &mut table.select_items {
6846            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6847                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6848            }
6849        }
6850        if let Some(where_expr) = table.where_expr.take() {
6851            table.where_expr =
6852                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6853            table.filter = None;
6854        }
6855        if let Some(having_expr) = table.having_expr.take() {
6856            table.having_expr =
6857                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6858            table.having = None;
6859        }
6860        for expr in &mut table.group_by_exprs {
6861            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6862        }
6863        for clause in &mut table.order_by {
6864            if let Some(expr) = clause.expr.take() {
6865                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6866            }
6867        }
6868        Ok(table)
6869    }
6870
6871    fn resolve_select_expr_subqueries(
6872        &self,
6873        expr: QueryExpr,
6874        frame: &dyn super::statement_frame::ReadFrame,
6875    ) -> RedDBResult<QueryExpr> {
6876        match expr {
6877            QueryExpr::Table(table) => self
6878                .resolve_table_expr_subqueries(table, frame)
6879                .map(QueryExpr::Table),
6880            QueryExpr::Join(mut join) => {
6881                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6882                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6883                Ok(QueryExpr::Join(join))
6884            }
6885            other => Ok(other),
6886        }
6887    }
6888
6889    fn resolve_expr_subqueries(
6890        &self,
6891        expr: crate::storage::query::ast::Expr,
6892        outer_scopes: &[String],
6893        frame: &dyn super::statement_frame::ReadFrame,
6894    ) -> RedDBResult<crate::storage::query::ast::Expr> {
6895        use crate::storage::query::ast::Expr;
6896
6897        match expr {
6898            Expr::Subquery { query, span } => {
6899                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
6900                if values.len() > 1 {
6901                    return Err(RedDBError::Query(
6902                        "scalar subquery returned more than one row".to_string(),
6903                    ));
6904                }
6905                Ok(Expr::Literal {
6906                    value: values.into_iter().next().unwrap_or(Value::Null),
6907                    span,
6908                })
6909            }
6910            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
6911                op,
6912                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
6913                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
6914                span,
6915            }),
6916            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
6917                op,
6918                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6919                span,
6920            }),
6921            Expr::Cast {
6922                inner,
6923                target,
6924                span,
6925            } => Ok(Expr::Cast {
6926                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
6927                target,
6928                span,
6929            }),
6930            Expr::FunctionCall { name, args, span } => {
6931                let args = args
6932                    .into_iter()
6933                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
6934                    .collect::<RedDBResult<Vec<_>>>()?;
6935                Ok(Expr::FunctionCall { name, args, span })
6936            }
6937            Expr::Case {
6938                branches,
6939                else_,
6940                span,
6941            } => {
6942                let branches = branches
6943                    .into_iter()
6944                    .map(|(cond, value)| {
6945                        Ok((
6946                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
6947                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
6948                        ))
6949                    })
6950                    .collect::<RedDBResult<Vec<_>>>()?;
6951                let else_ = else_
6952                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
6953                    .transpose()?
6954                    .map(Box::new);
6955                Ok(Expr::Case {
6956                    branches,
6957                    else_,
6958                    span,
6959                })
6960            }
6961            Expr::IsNull {
6962                operand,
6963                negated,
6964                span,
6965            } => Ok(Expr::IsNull {
6966                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6967                negated,
6968                span,
6969            }),
6970            Expr::InList {
6971                target,
6972                values,
6973                negated,
6974                span,
6975            } => {
6976                let target =
6977                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
6978                let mut resolved = Vec::new();
6979                for value in values {
6980                    if let Expr::Subquery { query, .. } = value {
6981                        resolved.extend(
6982                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
6983                                .into_iter()
6984                                .map(Expr::lit),
6985                        );
6986                    } else {
6987                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
6988                    }
6989                }
6990                Ok(Expr::InList {
6991                    target,
6992                    values: resolved,
6993                    negated,
6994                    span,
6995                })
6996            }
6997            Expr::Between {
6998                target,
6999                low,
7000                high,
7001                negated,
7002                span,
7003            } => Ok(Expr::Between {
7004                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
7005                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
7006                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
7007                negated,
7008                span,
7009            }),
7010            other => Ok(other),
7011        }
7012    }
7013
7014    fn execute_expr_subquery_values(
7015        &self,
7016        subquery: crate::storage::query::ast::ExprSubquery,
7017        outer_scopes: &[String],
7018        frame: &dyn super::statement_frame::ReadFrame,
7019    ) -> RedDBResult<Vec<Value>> {
7020        let query = *subquery.query;
7021        if query_references_outer_scope(&query, outer_scopes) {
7022            return Err(RedDBError::Query(
7023                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
7024            ));
7025        }
7026        let query = self.rewrite_view_refs(query);
7027        let query = self.resolve_select_expr_subqueries(query, frame)?;
7028        let query = self.authorize_relational_select_expr(query, frame)?;
7029        let result = match query {
7030            QueryExpr::Table(table) => {
7031                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
7032            }
7033            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
7034            other => {
7035                return Err(RedDBError::Query(format!(
7036                    "expression subquery must be a SELECT query, got {}",
7037                    query_expr_name(&other)
7038                )))
7039            }
7040        };
7041        first_column_values(result)
7042    }
7043
7044    fn dispatch_expr(
7045        &self,
7046        expr: QueryExpr,
7047        query_str: &str,
7048        mode: QueryMode,
7049    ) -> RedDBResult<RuntimeQueryResult> {
7050        let statement = query_expr_name(&expr);
7051        match expr {
7052            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
7053                // Graph queries are not cacheable as prepared statements.
7054                Err(RedDBError::Query(
7055                    "graph queries cannot be used as prepared statements".to_string(),
7056                ))
7057            }
7058            QueryExpr::Table(table) => {
7059                let scope = self.ai_scope();
7060                let table = self.resolve_table_expr_subqueries(
7061                    table,
7062                    &scope as &dyn super::statement_frame::ReadFrame,
7063                )?;
7064                // Table-valued functions (e.g. components(g)) dispatch to a
7065                // read-only executor before any catalog/virtual-table routing
7066                // (issue #795).
7067                if let Some(TableSource::Function {
7068                    name,
7069                    args,
7070                    named_args,
7071                }) = table.source.clone()
7072                {
7073                    return Ok(RuntimeQueryResult {
7074                        query: query_str.to_string(),
7075                        mode,
7076                        statement,
7077                        engine: "runtime-graph-tvf",
7078                        result: self.execute_table_function(&name, &args, &named_args)?,
7079                        affected_rows: 0,
7080                        statement_type: "select",
7081                        bookmark: None,
7082                    });
7083                }
7084                // Inline-graph TVF (issue #799) on the prepared-statement /
7085                // direct-expr path. Result caching is wired on the
7086                // `execute_query_inner` path; here we just compute and return.
7087                if let Some(TableSource::InlineGraphFunction {
7088                    name,
7089                    nodes,
7090                    edges,
7091                    named_args,
7092                }) = table.source.clone()
7093                {
7094                    return Ok(RuntimeQueryResult {
7095                        query: query_str.to_string(),
7096                        mode,
7097                        statement,
7098                        engine: "runtime-graph-tvf-inline",
7099                        result: self.execute_inline_graph_function(
7100                            &name,
7101                            &nodes,
7102                            &edges,
7103                            &named_args,
7104                        )?,
7105                        affected_rows: 0,
7106                        statement_type: "select",
7107                        bookmark: None,
7108                    });
7109                }
7110                if super::red_schema::is_virtual_table(&table.table) {
7111                    return Ok(RuntimeQueryResult {
7112                        query: query_str.to_string(),
7113                        mode,
7114                        statement,
7115                        engine: "runtime-red-schema",
7116                        result: super::red_schema::red_query(
7117                            self,
7118                            &table.table,
7119                            &table,
7120                            &scope as &dyn super::statement_frame::ReadFrame,
7121                        )?,
7122                        affected_rows: 0,
7123                        statement_type: "select",
7124                        bookmark: None,
7125                    });
7126                }
7127                // `<graph>.<output>` analytics virtual view (issue #800).
7128                if let Some(view_result) = self.try_resolve_analytics_view(
7129                    &table,
7130                    &scope as &dyn super::statement_frame::ReadFrame,
7131                )? {
7132                    return Ok(RuntimeQueryResult {
7133                        query: query_str.to_string(),
7134                        mode,
7135                        statement,
7136                        engine: "runtime-graph-analytics-view",
7137                        result: view_result,
7138                        affected_rows: 0,
7139                        statement_type: "select",
7140                        bookmark: None,
7141                    });
7142                }
7143                let Some(table_with_rls) = self.authorize_relational_table_select(
7144                    table,
7145                    &scope as &dyn super::statement_frame::ReadFrame,
7146                )?
7147                else {
7148                    return Ok(RuntimeQueryResult {
7149                        query: query_str.to_string(),
7150                        mode,
7151                        statement,
7152                        engine: "runtime-table-rls",
7153                        result: crate::storage::query::unified::UnifiedResult::empty(),
7154                        affected_rows: 0,
7155                        statement_type: "select",
7156                        bookmark: None,
7157                    });
7158                };
7159                Ok(RuntimeQueryResult {
7160                    query: query_str.to_string(),
7161                    mode,
7162                    statement,
7163                    engine: "runtime-table",
7164                    result: execute_runtime_table_query(
7165                        &self.inner.db,
7166                        &table_with_rls,
7167                        Some(&self.inner.index_store),
7168                    )?,
7169                    affected_rows: 0,
7170                    statement_type: "select",
7171                    bookmark: None,
7172                })
7173            }
7174            QueryExpr::Join(join) => {
7175                let scope = self.ai_scope();
7176                let Some(join_with_rls) = self.authorize_relational_join_select(
7177                    join,
7178                    &scope as &dyn super::statement_frame::ReadFrame,
7179                )?
7180                else {
7181                    return Ok(RuntimeQueryResult {
7182                        query: query_str.to_string(),
7183                        mode,
7184                        statement,
7185                        engine: "runtime-join-rls",
7186                        result: crate::storage::query::unified::UnifiedResult::empty(),
7187                        affected_rows: 0,
7188                        statement_type: "select",
7189                        bookmark: None,
7190                    });
7191                };
7192                Ok(RuntimeQueryResult {
7193                    query: query_str.to_string(),
7194                    mode,
7195                    statement,
7196                    engine: "runtime-join",
7197                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7198                    affected_rows: 0,
7199                    statement_type: "select",
7200                    bookmark: None,
7201                })
7202            }
7203            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7204                query: query_str.to_string(),
7205                mode,
7206                statement,
7207                engine: "runtime-vector",
7208                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7209                affected_rows: 0,
7210                statement_type: "select",
7211                bookmark: None,
7212            }),
7213            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7214                query: query_str.to_string(),
7215                mode,
7216                statement,
7217                engine: "runtime-hybrid",
7218                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7219                affected_rows: 0,
7220                statement_type: "select",
7221                bookmark: None,
7222            }),
7223            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7224                Err(RedDBError::Query(
7225                    super::red_schema::READ_ONLY_ERROR.to_string(),
7226                ))
7227            }
7228            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7229                Err(RedDBError::Query(
7230                    super::red_schema::READ_ONLY_ERROR.to_string(),
7231                ))
7232            }
7233            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7234                Err(RedDBError::Query(
7235                    super::red_schema::READ_ONLY_ERROR.to_string(),
7236                ))
7237            }
7238            QueryExpr::Insert(ref insert) => self
7239                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7240                    self.execute_insert(query_str, insert)
7241                }),
7242            QueryExpr::Update(ref update) => self
7243                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7244                    self.execute_update(query_str, update)
7245                }),
7246            QueryExpr::Delete(ref delete) => self
7247                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7248                    self.execute_delete(query_str, delete)
7249                }),
7250            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7251            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7252            _ => Err(RedDBError::Query(format!(
7253                "prepared-statement execution does not support {statement} statements"
7254            ))),
7255        }
7256    }
7257
7258    /// Dispatch a graph-collection table-valued function call in FROM
7259    /// position (e.g. `SELECT * FROM components(g)`).
7260    ///
7261    /// Validates the function name and arity here, materializes the whole
7262    /// active graph read-only, then runs the algorithm via the shared
7263    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
7264    fn execute_table_function(
7265        &self,
7266        name: &str,
7267        args: &[String],
7268        named_args: &[(String, f64)],
7269    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7270        if !is_graph_tvf_name(name) {
7271            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7272        }
7273        // Every graph-collection TVF takes exactly one graph argument.
7274        if args.len() != 1 {
7275            return Err(RedDBError::Query(format!(
7276                "table function '{name}' takes exactly 1 graph argument, got {}",
7277                args.len()
7278            )));
7279        }
7280
7281        // Read-only materialization of the full active graph. Passing `None`
7282        // for the projection uses the full graph store. Like #795/#796, the
7283        // v0 form runs over the whole graph store regardless of the collection
7284        // argument value. Materialization never mutates any store.
7285        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7286        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7287    }
7288
7289    /// Dispatch an inline-graph table-valued function call in FROM position
7290    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
7291    /// #799).
7292    ///
7293    /// Materializes the two subqueries through the normal read path (so RLS,
7294    /// column authz, and MVCC visibility all apply), constructs the abstract
7295    /// graph — the first column of `nodes` is the node id; the first two-or-
7296    /// three columns of `edges` are `(source, target [, weight])` — then runs
7297    /// the same algorithm path used by the graph-collection form. Read-only.
7298    fn execute_inline_graph_function(
7299        &self,
7300        name: &str,
7301        nodes_query: &QueryExpr,
7302        edges_query: &QueryExpr,
7303        named_args: &[(String, f64)],
7304    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7305        if !is_graph_tvf_name(name) {
7306            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7307        }
7308
7309        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
7310        let nodes = inline_node_ids(name, &node_result)?;
7311
7312        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
7313        let edges = inline_edges(name, &edge_result)?;
7314
7315        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7316    }
7317
7318    /// Materialize the whole active graph read-only into the abstract
7319    /// `(nodes, edges)` inputs the pure graph algorithms consume.
7320    fn materialize_whole_graph_abstract(
7321        &self,
7322    ) -> RedDBResult<(
7323        Vec<String>,
7324        Vec<(
7325            String,
7326            String,
7327            crate::storage::engine::graph_algorithms::Weight,
7328        )>,
7329    )> {
7330        use crate::storage::engine::graph_algorithms;
7331
7332        let graph = super::graph_dsl::materialize_graph_with_projection(
7333            self.inner.db.store().as_ref(),
7334            None,
7335        )?;
7336        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7337        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7338            .iter_all_edges()
7339            .into_iter()
7340            .map(|e| (e.source_id, e.target_id, e.weight))
7341            .collect();
7342        Ok((nodes, edges))
7343    }
7344
7345    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
7346    ///
7347    /// Returns `Ok(None)` when `table` is not an analytics view — either the
7348    /// name is not dotted, a real collection of that exact name exists (a real
7349    /// collection always wins; no shadowing), the suffix is not a recognised
7350    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
7351    /// with the freshly computed result when it does resolve, and an error when
7352    /// the parent graph exists but the output is not enabled, a declared
7353    /// algorithm is unsupported, or the parent collection's policy denies the
7354    /// read.
7355    ///
7356    /// The view is recomputed on every call (no result-cache write) so it
7357    /// always reflects the current graph data, satisfying the on-demand
7358    /// recompute contract for this slice.
7359    fn try_resolve_analytics_view(
7360        &self,
7361        table: &TableQuery,
7362        frame: &dyn super::statement_frame::ReadFrame,
7363    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
7364        let full = table.table.as_str();
7365        let Some(dot) = full.rfind('.') else {
7366            return Ok(None);
7367        };
7368        // A real collection literally named `g.communities` always wins.
7369        if self.inner.db.store().get_collection(full).is_some() {
7370            return Ok(None);
7371        }
7372        let graph_name = &full[..dot];
7373        let output_name = &full[dot + 1..];
7374        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
7375            return Ok(None);
7376        };
7377
7378        let contracts = self.inner.db.collection_contracts();
7379        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
7380            return Ok(None);
7381        };
7382        if contract.declared_model != crate::catalog::CollectionModel::Graph {
7383            return Ok(None);
7384        }
7385        let Some(view) = contract
7386            .analytics_config
7387            .iter()
7388            .find(|view| view.output == output)
7389        else {
7390            // The parent graph exists but this output was not declared — a
7391            // clear error beats the misleading "collection not found".
7392            return Err(RedDBError::Query(format!(
7393                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
7394            )));
7395        };
7396
7397        // Policy inheritance (AC5): route through the parent graph collection's
7398        // read authorization. A policy or RLS rule that denies the parent
7399        // denies its analytics views transitively.
7400        let parent_query = TableQuery::new(graph_name);
7401        if self
7402            .authorize_relational_table_select(parent_query, frame)?
7403            .is_none()
7404        {
7405            return Err(RedDBError::Query(format!(
7406                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
7407            )));
7408        }
7409
7410        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
7411        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7412        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
7413        Ok(Some(result))
7414    }
7415
7416    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
7417    ///
7418    /// Both the graph-collection form and the inline-graph form route here so
7419    /// named-argument validation and the projected row shape stay identical
7420    /// across the two signatures (issue #799). Projects each algorithm's
7421    /// native output shape.
7422    fn dispatch_graph_algorithm(
7423        &self,
7424        name: &str,
7425        nodes: Vec<String>,
7426        edges: Vec<(
7427            String,
7428            String,
7429            crate::storage::engine::graph_algorithms::Weight,
7430        )>,
7431        named_args: &[(String, f64)],
7432    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7433        use crate::storage::engine::graph_algorithms;
7434        use crate::storage::query::unified::UnifiedResult;
7435        use crate::storage::schema::Value;
7436
7437        if name.eq_ignore_ascii_case("components") {
7438            reject_named_args(name, named_args)?;
7439            let assignment = graph_algorithms::connected_components(&nodes, &edges);
7440            let mut result =
7441                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7442            for (node_id, island_id) in assignment {
7443                let mut record = UnifiedRecord::new();
7444                record.set("node_id", Value::text(node_id));
7445                record.set("island_id", Value::Integer(island_id as i64));
7446                result.push(record);
7447            }
7448            return Ok(result);
7449        }
7450
7451        if name.eq_ignore_ascii_case("louvain") {
7452            // The only supported named argument is `resolution` (γ). It
7453            // defaults to 1.0 (classic modularity) and must be a finite,
7454            // strictly positive number — a non-positive (or NaN/inf)
7455            // resolution has no sensible meaning.
7456            let resolution = louvain_resolution(named_args)?;
7457            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7458            let mut result =
7459                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7460            for (node_id, community_id) in assignment {
7461                let mut record = UnifiedRecord::new();
7462                record.set("node_id", Value::text(node_id));
7463                record.set("community_id", Value::Integer(community_id as i64));
7464                result.push(record);
7465            }
7466            return Ok(result);
7467        }
7468
7469        if name.eq_ignore_ascii_case("degree_centrality") {
7470            reject_named_args(name, named_args)?;
7471            let assignment = abstract_degree_centrality(&nodes, &edges);
7472            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
7473            for (node_id, degree) in assignment {
7474                let mut record = UnifiedRecord::new();
7475                record.set("node_id", Value::text(node_id));
7476                record.set("degree", Value::Integer(degree as i64));
7477                result.push(record);
7478            }
7479            return Ok(result);
7480        }
7481
7482        if name.eq_ignore_ascii_case("shortest_path") {
7483            // Scalar named arguments: `src` and `dst` are required node ids,
7484            // `max_hops` is an optional non-negative edge-count cap. Node ids
7485            // in the graph store are integer entity ids rendered as strings, so
7486            // each id arg must be a non-negative whole number; reject anything
7487            // else (fractional, negative, NaN/inf) with a clear message.
7488            let mut src: Option<String> = None;
7489            let mut dst: Option<String> = None;
7490            let mut max_hops: Option<usize> = None;
7491            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
7492                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
7493                    return Err(RedDBError::Query(format!(
7494                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
7495                    )));
7496                }
7497                Ok((value as i64).to_string())
7498            };
7499            for (key, value) in named_args {
7500                if key.eq_ignore_ascii_case("src") {
7501                    src = Some(as_node_id("src", *value)?);
7502                } else if key.eq_ignore_ascii_case("dst") {
7503                    dst = Some(as_node_id("dst", *value)?);
7504                } else if key.eq_ignore_ascii_case("max_hops") {
7505                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
7506                        return Err(RedDBError::Query(format!(
7507                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
7508                        )));
7509                    }
7510                    max_hops = Some(*value as usize);
7511                } else {
7512                    return Err(RedDBError::Query(format!(
7513                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
7514                    )));
7515                }
7516            }
7517            let src = src.ok_or_else(|| {
7518                RedDBError::Query(
7519                    "table function 'shortest_path' requires named argument 'src'".to_string(),
7520                )
7521            })?;
7522            let dst = dst.ok_or_else(|| {
7523                RedDBError::Query(
7524                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
7525                )
7526            })?;
7527
7528            // Columns are always present; an unreachable pair (within the
7529            // optional `max_hops` budget) simply yields zero rows — never an
7530            // error. `hop` is the 0-based index from the source;
7531            // `cumulative_weight` is the running path weight (0 at the source,
7532            // the total at the destination). Edges are treated as undirected,
7533            // consistent with `components` / `louvain`.
7534            let mut result = UnifiedResult::with_columns(vec![
7535                "hop".into(),
7536                "node_id".into(),
7537                "cumulative_weight".into(),
7538            ]);
7539            if let Some(path) =
7540                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
7541            {
7542                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
7543                    let mut record = UnifiedRecord::new();
7544                    record.set("hop", Value::Integer(hop as i64));
7545                    record.set("node_id", Value::text(node_id));
7546                    record.set("cumulative_weight", Value::Float(cumulative_weight));
7547                    result.push(record);
7548                }
7549            }
7550            return Ok(result);
7551        }
7552        // ── Centrality family (issue #797): each returns rows `(node_id,
7553        // score)` over the abstract `(nodes, edges)` graph. Like the other
7554        // graph TVFs the graph is treated as undirected and scores are
7555        // deterministic; the inline-graph form shares this dispatch. ──
7556        if name.eq_ignore_ascii_case("betweenness") {
7557            reject_named_args(name, named_args)?;
7558            return Ok(Self::centrality_result(graph_algorithms::betweenness(
7559                &nodes, &edges,
7560            )));
7561        }
7562        if name.eq_ignore_ascii_case("eigenvector") {
7563            // Optional `max_iterations` (positive integer, default 100) and
7564            // `tolerance` (finite, strictly positive, default 1e-6).
7565            let mut max_iterations = 100_usize;
7566            let mut tolerance = 1e-6_f64;
7567            for (key, value) in named_args {
7568                if key.eq_ignore_ascii_case("max_iterations") {
7569                    max_iterations = parse_positive_iterations("eigenvector", value)?;
7570                } else if key.eq_ignore_ascii_case("tolerance") {
7571                    if !value.is_finite() || *value <= 0.0 {
7572                        return Err(RedDBError::Query(format!(
7573                            "table function 'eigenvector' tolerance must be > 0, got {value}"
7574                        )));
7575                    }
7576                    tolerance = *value;
7577                } else {
7578                    return Err(RedDBError::Query(format!(
7579                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
7580                    )));
7581                }
7582            }
7583            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
7584                &nodes,
7585                &edges,
7586                max_iterations,
7587                tolerance,
7588            )));
7589        }
7590        if name.eq_ignore_ascii_case("pagerank") {
7591            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
7592            // (positive integer, default 100).
7593            let mut damping = 0.85_f64;
7594            let mut max_iterations = 100_usize;
7595            for (key, value) in named_args {
7596                if key.eq_ignore_ascii_case("damping") {
7597                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
7598                        return Err(RedDBError::Query(format!(
7599                            "table function 'pagerank' damping must be in (0, 1), got {value}"
7600                        )));
7601                    }
7602                    damping = *value;
7603                } else if key.eq_ignore_ascii_case("max_iterations") {
7604                    max_iterations = parse_positive_iterations("pagerank", value)?;
7605                } else {
7606                    return Err(RedDBError::Query(format!(
7607                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
7608                    )));
7609                }
7610            }
7611            return Ok(Self::centrality_result(graph_algorithms::pagerank(
7612                &nodes,
7613                &edges,
7614                damping,
7615                max_iterations,
7616            )));
7617        }
7618        Err(RedDBError::Query(format!("unknown table function: {name}")))
7619    }
7620
7621    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
7622    ///
7623    /// Materializes the active graph (nodes + weighted edges) read-only and
7624    /// runs the pure `graph_algorithms::connected_components`. Edges are
7625    /// treated as undirected; island ids are deterministic (ascending order of
7626    /// each component's smallest node).
7627    fn execute_components_tvf(
7628        &self,
7629        _collection: &str,
7630    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7631        use crate::storage::engine::graph_algorithms;
7632        use crate::storage::query::unified::UnifiedResult;
7633        use crate::storage::schema::Value;
7634
7635        // Read-only materialization of the full active graph. The named
7636        // collection identifies the active graph scope; passing `None` for the
7637        // projection uses the full graph store (the same result
7638        // `active_graph_projection` yields when no projection is registered).
7639        // Materialization never mutates any store.
7640        let graph = super::graph_dsl::materialize_graph_with_projection(
7641            self.inner.db.store().as_ref(),
7642            None,
7643        )?;
7644
7645        // Materialize abstract inputs for the pure algorithm.
7646        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7647        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7648            .iter_all_edges()
7649            .into_iter()
7650            .map(|e| (e.source_id, e.target_id, e.weight))
7651            .collect();
7652
7653        let assignment = graph_algorithms::connected_components(&nodes, &edges);
7654
7655        // Project into a UnifiedResult with columns ["node_id", "island_id"].
7656        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7657        for (node_id, island_id) in assignment {
7658            let mut record = UnifiedRecord::new();
7659            record.set("node_id", Value::text(node_id));
7660            record.set("island_id", Value::Integer(island_id as i64));
7661            result.push(record);
7662        }
7663        Ok(result)
7664    }
7665
7666    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
7667    /// `(node_id, community_id)` (issue #796).
7668    ///
7669    /// Materializes the active graph (nodes + weighted edges) read-only and
7670    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
7671    /// treated as undirected; community ids are assigned in ascending order of
7672    /// each community's smallest node, so identical input + resolution always
7673    /// yields identical rows. Like `components`, the v0 form runs over the
7674    /// whole graph store regardless of the collection argument value.
7675    fn execute_louvain_tvf(
7676        &self,
7677        _collection: &str,
7678        resolution: f64,
7679    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7680        use crate::storage::engine::graph_algorithms;
7681        use crate::storage::query::unified::UnifiedResult;
7682        use crate::storage::schema::Value;
7683
7684        let graph = super::graph_dsl::materialize_graph_with_projection(
7685            self.inner.db.store().as_ref(),
7686            None,
7687        )?;
7688
7689        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7690        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7691            .iter_all_edges()
7692            .into_iter()
7693            .map(|e| (e.source_id, e.target_id, e.weight))
7694            .collect();
7695
7696        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7697
7698        // Project into a UnifiedResult with columns ["node_id", "community_id"].
7699        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7700        for (node_id, community_id) in assignment {
7701            let mut record = UnifiedRecord::new();
7702            record.set("node_id", Value::text(node_id));
7703            record.set("community_id", Value::Integer(community_id as i64));
7704            result.push(record);
7705        }
7706        Ok(result)
7707    }
7708
7709    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
7710    /// columns `["node_id", "score"]`; scores are `Value::Float`.
7711    fn centrality_result(
7712        rows: Vec<(String, f64)>,
7713    ) -> crate::storage::query::unified::UnifiedResult {
7714        use crate::storage::query::unified::UnifiedResult;
7715        use crate::storage::schema::Value;
7716        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
7717        for (node_id, score) in rows {
7718            let mut record = UnifiedRecord::new();
7719            record.set("node_id", Value::text(node_id));
7720            record.set("score", Value::Float(score));
7721            result.push(record);
7722        }
7723        result
7724    }
7725
7726    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7727    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7728    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7729        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7730        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7731        let q = query.trim();
7732        if !q.starts_with("SELECT") && !q.starts_with("select") {
7733            return None;
7734        }
7735
7736        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7737        let where_pos = q
7738            .find("WHERE _entity_id")
7739            .or_else(|| q.find("where _entity_id"))?;
7740        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7741        let after_eq = after_field.strip_prefix('=')?.trim_start();
7742
7743        // Parse the entity ID number
7744        let id_str = after_eq.trim();
7745        let entity_id: u64 = id_str.parse().ok()?;
7746
7747        // Extract table name: between "FROM " and " WHERE"
7748        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7749        let table = q[from_pos..where_pos].trim();
7750        if table.is_empty()
7751            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7752        {
7753            return None; // complex query, fall through
7754        }
7755        let table_name = table.split_whitespace().next()?;
7756
7757        // Direct entity lookup — skips SQL parse, plan cache, result
7758        // cache, view rewriter, RLS gate. Safe because the gating in
7759        // `execute_query` guarantees no scope override / no
7760        // transaction context is active. MVCC visibility is still
7761        // honoured against the current snapshot.
7762        let store = self.inner.db.store();
7763        let entity = store
7764            .get(
7765                table_name,
7766                crate::storage::unified::EntityId::new(entity_id),
7767            )
7768            .filter(entity_visible_under_current_snapshot)
7769            .filter(|entity| {
7770                self.inner
7771                    .db
7772                    .replica_allows_entity_at_read(table_name, entity)
7773            });
7774
7775        let count = if entity.is_some() { 1u64 } else { 0 };
7776
7777        // Materialize a record so downstream consumers that walk
7778        // `result.records` (embedded runtime API, decrypt pass, CLI)
7779        // see the row. Previously only `pre_serialized_json` was
7780        // filled, which caused those consumers to see zero rows and
7781        // skewed benchmarks.
7782        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7783            .as_ref()
7784            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7785            .into_iter()
7786            .collect();
7787
7788        let json = match entity {
7789            Some(ref e) => execute_runtime_serialize_single_entity(e),
7790            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7791                .to_string(),
7792        };
7793
7794        Some(Ok(RuntimeQueryResult {
7795            query: query.to_string(),
7796            mode: crate::storage::query::modes::QueryMode::Sql,
7797            statement: "select",
7798            engine: "fast-entity-lookup",
7799            result: crate::storage::query::unified::UnifiedResult {
7800                columns: Vec::new(),
7801                records,
7802                stats: crate::storage::query::unified::QueryStats {
7803                    rows_scanned: count,
7804                    ..Default::default()
7805                },
7806                pre_serialized_json: Some(json),
7807            },
7808            affected_rows: 0,
7809            statement_type: "select",
7810            bookmark: None,
7811        }))
7812    }
7813
7814    pub(crate) fn invalidate_plan_cache(&self) {
7815        self.inner.query_cache.write().clear();
7816        self.inner
7817            .ddl_epoch
7818            .fetch_add(1, std::sync::atomic::Ordering::Release);
7819    }
7820
7821    /// Read the monotonic DDL epoch counter. Bumped by every
7822    /// `invalidate_plan_cache` call so prepared-statement holders can
7823    /// detect schema drift between PREPARE and EXECUTE.
7824    pub fn ddl_epoch(&self) -> u64 {
7825        self.inner
7826            .ddl_epoch
7827            .load(std::sync::atomic::Ordering::Acquire)
7828    }
7829
7830    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7831        let store = self.inner.db.store();
7832        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7833        self.invalidate_plan_cache();
7834    }
7835
7836    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7837    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7838    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7839    /// collection, picks the keys matching the tenant-marker shape,
7840    /// and calls `register_tenant_table` for each.
7841    ///
7842    /// Safe no-op when `red_config` doesn't exist (first boot on a
7843    /// fresh datadir).
7844    pub(crate) fn rehydrate_tenant_tables(&self) {
7845        let store = self.inner.db.store();
7846        let Some(manager) = store.get_collection("red_config") else {
7847            return;
7848        };
7849        // Replay in insertion order (SegmentManager iteration). Multiple
7850        // toggles on the same table leave several rows behind — the
7851        // last one processed wins because each register/unregister
7852        // call overwrites the in-memory state.
7853        for entity in manager.query_all(|_| true) {
7854            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7855                continue;
7856            };
7857            let Some(named) = &row.named else { continue };
7858            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7859                continue;
7860            };
7861            // Shape: tenant_tables.{table}.column
7862            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7863                continue;
7864            };
7865            let Some((table, suffix)) = rest.rsplit_once('.') else {
7866                // Issue #205 — a `tenant_tables.*` row that doesn't
7867                // split cleanly is a schema-shape regression: the
7868                // metadata writer must always emit the `.column`
7869                // suffix, so reaching this branch means an upgrade
7870                // with incompatible state or external tampering.
7871                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7872                    collection: "red_config".to_string(),
7873                    detail: format!("malformed tenant_tables key: {key}"),
7874                }
7875                .emit_global();
7876                continue;
7877            };
7878            if suffix != "column" {
7879                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7880                    collection: "red_config".to_string(),
7881                    detail: format!("unexpected tenant_tables suffix: {key}"),
7882                }
7883                .emit_global();
7884                continue;
7885            }
7886            match named.get("value") {
7887                Some(crate::storage::schema::Value::Text(column)) => {
7888                    self.register_tenant_table(table, column);
7889                }
7890                // Null / missing value = DISABLE TENANCY marker.
7891                Some(crate::storage::schema::Value::Null) | None => {
7892                    self.unregister_tenant_table(table);
7893                }
7894                _ => {}
7895            }
7896        }
7897    }
7898
7899    /// Replay every persisted `MaterializedViewDescriptor` from the
7900    /// `red_materialized_view_defs` system collection (issue #593
7901    /// slice 9a). For each descriptor, re-parse the original SQL,
7902    /// extract the `QueryExpr::CreateView` it produced, and populate
7903    /// the in-memory registries (`inner.views` and
7904    /// `inner.materialized_views`) directly — no write paths run, so
7905    /// rehydrate does not re-persist what it just read.
7906    ///
7907    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
7908    /// skipped with a `SchemaCorruption` operator event so a single
7909    /// bad entry does not block startup.
7910    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
7911        let store = self.inner.db.store();
7912        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
7913        for descriptor in descriptors {
7914            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
7915                Ok(qc) => qc,
7916                Err(err) => {
7917                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7918                        collection:
7919                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7920                                .to_string(),
7921                        detail: format!(
7922                            "failed to re-parse materialized-view source for {}: {err}",
7923                            descriptor.name
7924                        ),
7925                    }
7926                    .emit_global();
7927                    continue;
7928                }
7929            };
7930            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
7931                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7932                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7933                        .to_string(),
7934                    detail: format!(
7935                        "materialized-view source for {} did not re-parse as CREATE VIEW",
7936                        descriptor.name
7937                    ),
7938                }
7939                .emit_global();
7940                continue;
7941            };
7942            // Populate in-memory view registry.
7943            let view_name = create.name.clone();
7944            self.inner
7945                .views
7946                .write()
7947                .insert(view_name.clone(), Arc::new(create));
7948            // Materialized cache slot (data empty until next REFRESH).
7949            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7950            let refresh = match descriptor.refresh_every_ms {
7951                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7952                None => RefreshPolicy::Manual,
7953            };
7954            let def = MaterializedViewDef {
7955                name: view_name.clone(),
7956                query: format!("<parsed view {}>", view_name),
7957                dependencies: descriptor.source_collections.clone(),
7958                refresh,
7959                retention_duration_ms: descriptor.retention_duration_ms,
7960            };
7961            self.inner.materialized_views.write().register(def);
7962            if let Err(err) = self.ensure_materialized_view_backing(&view_name) {
7963                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7964                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7965                        .to_string(),
7966                    detail: format!(
7967                        "failed to rehydrate backing collection for materialized view {view_name}: {err}"
7968                    ),
7969                }
7970                .emit_global();
7971            }
7972        }
7973        // A rehydrated view shape may differ from any plans the cache
7974        // bootstrapped before this method ran — flush to be safe.
7975        self.invalidate_plan_cache();
7976    }
7977
7978    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7979        let store = self.inner.db.store();
7980        for contract in self.inner.db.collection_contracts() {
7981            let columns: Vec<String> = contract
7982                .declared_columns
7983                .iter()
7984                .map(|column| column.name.clone())
7985                .collect();
7986            let Some(manager) = store.get_collection(&contract.name) else {
7987                continue;
7988            };
7989            manager.set_column_schema_if_empty(columns);
7990        }
7991    }
7992
7993    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7994    /// in-memory column mapping, the implicit RLS policy, and enables
7995    /// row-level security on the table. Idempotent — re-registering
7996    /// the same `(table, column)` replaces the prior auto-policy.
7997    pub fn register_tenant_table(&self, table: &str, column: &str) {
7998        use crate::storage::query::ast::{
7999            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
8000        };
8001        self.inner
8002            .tenant_tables
8003            .write()
8004            .insert(table.to_string(), column.to_string());
8005
8006        // Build the policy: col = CURRENT_TENANT()
8007        // Uses CompareExpr so the comparison happens at runtime against
8008        // the thread-local tenant value read by the CURRENT_TENANT
8009        // scalar. Spans are synthetic — there's no source location for
8010        // an auto-generated policy.
8011        let lhs = Expr::Column {
8012            field: FieldRef::TableColumn {
8013                table: table.to_string(),
8014                column: column.to_string(),
8015            },
8016            span: Span::synthetic(),
8017        };
8018        let rhs = Expr::FunctionCall {
8019            name: "CURRENT_TENANT".to_string(),
8020            args: Vec::new(),
8021            span: Span::synthetic(),
8022        };
8023        let policy_filter = Filter::CompareExpr {
8024            lhs,
8025            op: CompareOp::Eq,
8026            rhs,
8027        };
8028
8029        let policy = CreatePolicyQuery {
8030            name: "__tenant_iso".to_string(),
8031            table: table.to_string(),
8032            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
8033            role: None,   // None = every role
8034            using: Box::new(policy_filter),
8035            // Auto-tenancy defaults to Table targets. Collections of
8036            // other kinds (graph / vector / queue / timeseries) that
8037            // opt in via `ALTER ... ENABLE TENANCY` should use the
8038            // matching kind — but for now we keep the auto-policy
8039            // kind-agnostic so the evaluator can apply it to any
8040            // entity living in the collection.
8041            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
8042        };
8043
8044        // Replace any prior auto-policy for this table (column rename).
8045        self.inner.rls_policies.write().insert(
8046            (table.to_string(), "__tenant_iso".to_string()),
8047            Arc::new(policy),
8048        );
8049        self.inner
8050            .rls_enabled_tables
8051            .write()
8052            .insert(table.to_string());
8053
8054        // Auto-build a hash index on the tenant column. Every read/write
8055        // against a tenant-scoped table carries an implicit
8056        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
8057        // index on that column is on the hot path of every query. Without
8058        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
8059        self.ensure_tenant_index(table, column);
8060    }
8061
8062    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
8063    /// Skipped when:
8064    ///   * the column is dotted (nested path — flat secondary indices
8065    ///     don't cover those today; RLS still works via the policy)
8066    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
8067    ///   * the user already registered an index whose first column matches
8068    ///     (avoids redundant duplicates of a user-defined composite)
8069    fn ensure_tenant_index(&self, table: &str, column: &str) {
8070        if column.contains('.') {
8071            return;
8072        }
8073        let index_name = format!("__tenant_idx_{table}");
8074        let registry = self.inner.index_store.list_indices(table);
8075        if registry.iter().any(|idx| idx.name == index_name) {
8076            return;
8077        }
8078        if registry
8079            .iter()
8080            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
8081        {
8082            return;
8083        }
8084
8085        let store = self.inner.db.store();
8086        let Some(manager) = store.get_collection(table) else {
8087            return;
8088        };
8089        let entities = manager.query_all(|_| true);
8090        let entity_fields: Vec<(
8091            crate::storage::unified::EntityId,
8092            Vec<(String, crate::storage::schema::Value)>,
8093        )> = entities
8094            .iter()
8095            .map(|e| {
8096                let fields = match &e.data {
8097                    crate::storage::EntityData::Row(row) => {
8098                        if let Some(ref named) = row.named {
8099                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
8100                        } else if let Some(ref schema) = row.schema {
8101                            schema
8102                                .iter()
8103                                .zip(row.columns.iter())
8104                                .map(|(k, v)| (k.clone(), v.clone()))
8105                                .collect()
8106                        } else {
8107                            Vec::new()
8108                        }
8109                    }
8110                    crate::storage::EntityData::Node(node) => node
8111                        .properties
8112                        .iter()
8113                        .map(|(k, v)| (k.clone(), v.clone()))
8114                        .collect(),
8115                    _ => Vec::new(),
8116                };
8117                (e.id, fields)
8118            })
8119            .collect();
8120
8121        let columns = vec![column.to_string()];
8122        if self
8123            .inner
8124            .index_store
8125            .create_index(
8126                &index_name,
8127                table,
8128                &columns,
8129                super::index_store::IndexMethodKind::Hash,
8130                false,
8131                &entity_fields,
8132            )
8133            .is_err()
8134        {
8135            return;
8136        }
8137        self.inner
8138            .index_store
8139            .register(super::index_store::RegisteredIndex {
8140                name: index_name,
8141                collection: table.to_string(),
8142                columns,
8143                method: super::index_store::IndexMethodKind::Hash,
8144                unique: false,
8145            });
8146        self.invalidate_plan_cache();
8147    }
8148
8149    /// Drop the auto-generated tenant index, if one exists. Called from
8150    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
8151    fn drop_tenant_index(&self, table: &str) {
8152        let index_name = format!("__tenant_idx_{table}");
8153        self.inner.index_store.drop_index(&index_name, table);
8154    }
8155
8156    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
8157    /// Used by the INSERT auto-fill path to know which column to
8158    /// populate with `current_tenant()` when the user didn't name it.
8159    pub fn tenant_column(&self, table: &str) -> Option<String> {
8160        self.inner.tenant_tables.read().get(table).cloned()
8161    }
8162
8163    /// Remove a table's tenant registration (Phase 2.5.4). Called by
8164    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
8165    /// but leaves any user-installed explicit policies intact.
8166    pub fn unregister_tenant_table(&self, table: &str) {
8167        self.inner.tenant_tables.write().remove(table);
8168        self.inner
8169            .rls_policies
8170            .write()
8171            .remove(&(table.to_string(), "__tenant_iso".to_string()));
8172        self.drop_tenant_index(table);
8173        // Only clear RLS enablement if no other policies remain.
8174        let has_other_policies = self
8175            .inner
8176            .rls_policies
8177            .read()
8178            .keys()
8179            .any(|(t, _)| t == table);
8180        if !has_other_policies {
8181            self.inner.rls_enabled_tables.write().remove(table);
8182        }
8183    }
8184
8185    /// Record that the running transaction has marked `id` in `collection`
8186    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
8187    /// xid that was written into `xmax` — either the parent txn xid or
8188    /// the innermost savepoint sub-xid. Savepoint rollback filters by
8189    /// this xid to revive only its own tombstones.
8190    pub(crate) fn record_pending_tombstone(
8191        &self,
8192        conn_id: u64,
8193        collection: &str,
8194        id: crate::storage::unified::entity::EntityId,
8195        stamper_xid: crate::storage::transaction::snapshot::Xid,
8196        previous_xmax: crate::storage::transaction::snapshot::Xid,
8197    ) {
8198        self.inner
8199            .pending_tombstones
8200            .write()
8201            .entry(conn_id)
8202            .or_default()
8203            .push((collection.to_string(), id, stamper_xid, previous_xmax));
8204    }
8205
8206    pub(crate) fn record_pending_versioned_update(
8207        &self,
8208        conn_id: u64,
8209        collection: &str,
8210        old_id: crate::storage::unified::entity::EntityId,
8211        new_id: crate::storage::unified::entity::EntityId,
8212        stamper_xid: crate::storage::transaction::snapshot::Xid,
8213        previous_xmax: crate::storage::transaction::snapshot::Xid,
8214    ) {
8215        self.inner
8216            .pending_versioned_updates
8217            .write()
8218            .entry(conn_id)
8219            .or_default()
8220            .push((
8221                collection.to_string(),
8222                old_id,
8223                new_id,
8224                stamper_xid,
8225                previous_xmax,
8226            ));
8227    }
8228
8229    fn with_deferred_store_wal_if_transaction<T>(
8230        &self,
8231        f: impl FnOnce() -> RedDBResult<T>,
8232    ) -> RedDBResult<T> {
8233        let conn_id = current_connection_id();
8234        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8235            return f();
8236        }
8237
8238        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8239        let result = f();
8240        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8241        match result {
8242            Ok(value) => {
8243                self.record_pending_store_wal_actions(conn_id, captured);
8244                Ok(value)
8245            }
8246            Err(err) => Err(err),
8247        }
8248    }
8249
8250    fn with_deferred_store_wal_for_dml<T>(
8251        &self,
8252        capture_autocommit_events: bool,
8253        f: impl FnOnce() -> RedDBResult<T>,
8254    ) -> RedDBResult<T> {
8255        let conn_id = current_connection_id();
8256        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8257            return self.with_deferred_store_wal_if_transaction(f);
8258        }
8259        if !capture_autocommit_events {
8260            return f();
8261        }
8262
8263        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8264        let result = f();
8265        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8266        self.inner
8267            .db
8268            .store()
8269            .append_deferred_store_wal_actions(captured)
8270            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8271        result
8272    }
8273
8274    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8275        !query.suppress_events
8276            && self.collection_has_event_subscriptions_for_operation(
8277                &query.table,
8278                crate::catalog::SubscriptionOperation::Insert,
8279            )
8280    }
8281
8282    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8283        !query.suppress_events
8284            && self.collection_has_event_subscriptions_for_operation(
8285                &query.table,
8286                crate::catalog::SubscriptionOperation::Update,
8287            )
8288    }
8289
8290    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8291        !query.suppress_events
8292            && self.collection_has_event_subscriptions_for_operation(
8293                &query.table,
8294                crate::catalog::SubscriptionOperation::Delete,
8295            )
8296    }
8297
8298    fn collection_has_event_subscriptions_for_operation(
8299        &self,
8300        collection: &str,
8301        operation: crate::catalog::SubscriptionOperation,
8302    ) -> bool {
8303        let Some(contract) = self.db().collection_contract_arc(collection) else {
8304            return false;
8305        };
8306        contract.subscriptions.iter().any(|subscription| {
8307            subscription.enabled
8308                && (subscription.ops_filter.is_empty()
8309                    || subscription.ops_filter.contains(&operation))
8310        })
8311    }
8312
8313    fn record_pending_store_wal_actions(
8314        &self,
8315        conn_id: u64,
8316        actions: crate::storage::unified::DeferredStoreWalActions,
8317    ) {
8318        if actions.is_empty() {
8319            return;
8320        }
8321        let mut guard = self.inner.pending_store_wal_actions.write();
8322        guard.entry(conn_id).or_default().extend(actions);
8323    }
8324
8325    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8326        let Some(actions) = self
8327            .inner
8328            .pending_store_wal_actions
8329            .write()
8330            .remove(&conn_id)
8331        else {
8332            return Ok(());
8333        };
8334        self.inner
8335            .db
8336            .store()
8337            .append_deferred_store_wal_actions(actions)
8338            .map_err(|err| RedDBError::Internal(err.to_string()))
8339    }
8340
8341    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8342        self.inner
8343            .pending_store_wal_actions
8344            .write()
8345            .remove(&conn_id);
8346    }
8347
8348    fn xid_conflicts_with_snapshot(
8349        &self,
8350        xid: crate::storage::transaction::snapshot::Xid,
8351        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8352        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8353    ) -> bool {
8354        xid != 0
8355            && !own_xids.contains(&xid)
8356            && !self.inner.snapshot_manager.is_aborted(xid)
8357            && !self.inner.snapshot_manager.is_active(xid)
8358            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8359    }
8360
8361    fn conflict_error(
8362        collection: &str,
8363        logical_id: crate::storage::unified::entity::EntityId,
8364        xid: crate::storage::transaction::snapshot::Xid,
8365    ) -> RedDBError {
8366        RedDBError::Query(format!(
8367            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8368            logical_id.raw()
8369        ))
8370    }
8371
8372    fn check_logical_row_conflict(
8373        &self,
8374        collection: &str,
8375        logical_id: crate::storage::unified::entity::EntityId,
8376        excluded_ids: &[crate::storage::unified::entity::EntityId],
8377        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8378        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8379    ) -> RedDBResult<()> {
8380        let store = self.inner.db.store();
8381        let Some(manager) = store.get_collection(collection) else {
8382            return Ok(());
8383        };
8384
8385        for candidate in manager.query_all(|_| true) {
8386            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8387                continue;
8388            }
8389            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8390                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8391            }
8392            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8393                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8394            }
8395        }
8396        Ok(())
8397    }
8398
8399    pub(crate) fn check_table_row_write_conflicts(
8400        &self,
8401        conn_id: u64,
8402        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8403        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8404    ) -> RedDBResult<()> {
8405        let versioned_updates = self
8406            .inner
8407            .pending_versioned_updates
8408            .read()
8409            .get(&conn_id)
8410            .cloned()
8411            .unwrap_or_default();
8412        let tombstones = self
8413            .inner
8414            .pending_tombstones
8415            .read()
8416            .get(&conn_id)
8417            .cloned()
8418            .unwrap_or_default();
8419
8420        let store = self.inner.db.store();
8421        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8422            let Some(manager) = store.get_collection(&collection) else {
8423                continue;
8424            };
8425            let Some(old) = manager.get(old_id) else {
8426                continue;
8427            };
8428            let logical_id = old.logical_id();
8429            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8430                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8431            }
8432            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8433                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8434            }
8435            self.check_logical_row_conflict(
8436                &collection,
8437                logical_id,
8438                &[old_id, new_id],
8439                snapshot,
8440                own_xids,
8441            )?;
8442        }
8443
8444        for (collection, id, xid, previous_xmax) in tombstones {
8445            let Some(manager) = store.get_collection(&collection) else {
8446                continue;
8447            };
8448            let Some(entity) = manager.get(id) else {
8449                continue;
8450            };
8451            let logical_id = entity.logical_id();
8452            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8453                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8454            }
8455            if entity.xmax != xid
8456                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8457            {
8458                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8459            }
8460            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8461        }
8462
8463        Ok(())
8464    }
8465
8466    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8467        let versioned_updates = self
8468            .inner
8469            .pending_versioned_updates
8470            .read()
8471            .get(&conn_id)
8472            .cloned()
8473            .unwrap_or_default();
8474        let tombstones = self
8475            .inner
8476            .pending_tombstones
8477            .read()
8478            .get(&conn_id)
8479            .cloned()
8480            .unwrap_or_default();
8481
8482        let store = self.inner.db.store();
8483        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8484            if let Some(manager) = store.get_collection(&collection) {
8485                if let Some(mut entity) = manager.get(old_id) {
8486                    entity.set_xmax(xid);
8487                    let _ = manager.update(entity);
8488                }
8489            }
8490        }
8491        for (collection, id, xid, _previous_xmax) in tombstones {
8492            if let Some(manager) = store.get_collection(&collection) {
8493                if let Some(mut entity) = manager.get(id) {
8494                    entity.set_xmax(xid);
8495                    let _ = manager.update(entity);
8496                }
8497            }
8498        }
8499    }
8500
8501    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8502        self.inner
8503            .pending_versioned_updates
8504            .write()
8505            .remove(&conn_id);
8506    }
8507
8508    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8509        let Some(pending) = self
8510            .inner
8511            .pending_versioned_updates
8512            .write()
8513            .remove(&conn_id)
8514        else {
8515            return;
8516        };
8517
8518        let store = self.inner.db.store();
8519        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8520            if let Some(manager) = store.get_collection(&collection) {
8521                if let Some(mut old) = manager.get(old_id) {
8522                    if old.xmax == xid {
8523                        old.set_xmax(previous_xmax);
8524                        let _ = manager.update(old);
8525                    }
8526                }
8527            }
8528            let _ = store.delete_batch(&collection, &[new_id]);
8529        }
8530    }
8531
8532    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8533        let mut guard = self.inner.pending_versioned_updates.write();
8534        let Some(pending) = guard.get_mut(&conn_id) else {
8535            return 0;
8536        };
8537
8538        let store = self.inner.db.store();
8539        let mut reverted = 0usize;
8540        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8541            if *xid < stamper_xid {
8542                return true;
8543            }
8544            if let Some(manager) = store.get_collection(collection) {
8545                if let Some(mut old) = manager.get(*old_id) {
8546                    if old.xmax == *xid {
8547                        old.set_xmax(*previous_xmax);
8548                        let _ = manager.update(old);
8549                    }
8550                }
8551            }
8552            let _ = store.delete_batch(collection, &[*new_id]);
8553            reverted += 1;
8554            false
8555        });
8556        if pending.is_empty() {
8557            guard.remove(&conn_id);
8558        }
8559        reverted
8560    }
8561
8562    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8563    /// delete marker; commit only drops the rollback journal and emits
8564    /// side effects. Physical reclamation is left for VACUUM so old
8565    /// snapshots can still resolve the pre-delete row version.
8566    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8567        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8568            return;
8569        };
8570        if pending.is_empty() {
8571            return;
8572        }
8573
8574        let store = self.inner.db.store();
8575        for (collection, id, _xid, _previous_xmax) in pending {
8576            store.context_index().remove_entity(id);
8577            self.cdc_emit(
8578                crate::replication::cdc::ChangeOperation::Delete,
8579                &collection,
8580                id.raw(),
8581                "entity",
8582            );
8583        }
8584    }
8585
8586    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8587    /// become visible again to future snapshots. Best-effort: a row
8588    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8589    /// never reclaims tuples whose xmax is still referenced by any
8590    /// active snapshot, so this case is only reachable via external
8591    /// storage corruption.
8592    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8593        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8594            return;
8595        };
8596
8597        let store = self.inner.db.store();
8598        for (collection, id, xid, previous_xmax) in pending {
8599            let Some(manager) = store.get_collection(&collection) else {
8600                continue;
8601            };
8602            if let Some(mut entity) = manager.get(id) {
8603                if entity.xmax == xid {
8604                    entity.set_xmax(previous_xmax);
8605                    let _ = manager.update(entity);
8606                }
8607            }
8608        }
8609    }
8610
8611    /// Slice C of PRD #718 — accessor for the local wait registry.
8612    pub fn queue_wait_registry(
8613        &self,
8614    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
8615        self.inner.queue_wait_registry.clone()
8616    }
8617
8618    /// Buffer a `(scope, queue)` wake on the current connection so it
8619    /// fires post-COMMIT, or notify immediately if no transaction is
8620    /// open (autocommit path). The wait registry only ever observes
8621    /// notifies for committed work — rollback drops the buffer.
8622    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
8623        if self.current_xid().is_some() {
8624            let conn_id = current_connection_id();
8625            self.inner
8626                .pending_queue_wakes
8627                .write()
8628                .entry(conn_id)
8629                .or_default()
8630                .push((scope.to_string(), queue.to_string()));
8631            return;
8632        }
8633        self.inner.queue_wait_registry.notify(scope, queue);
8634    }
8635
8636    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
8637        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
8638            return;
8639        };
8640        for (scope, queue) in pending {
8641            self.inner.queue_wait_registry.notify(&scope, &queue);
8642        }
8643    }
8644
8645    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
8646        self.inner.pending_queue_wakes.write().remove(&conn_id);
8647    }
8648
8649    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8650        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8651            return;
8652        };
8653        for event in pending {
8654            self.cdc_emit_kv(
8655                event.op,
8656                &event.collection,
8657                &event.key,
8658                0,
8659                event.before,
8660                event.after,
8661            );
8662        }
8663    }
8664
8665    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8666        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8667    }
8668
8669    /// Materialise the entire graph store while applying MVCC visibility
8670    /// AND per-collection RLS to each candidate node and edge. Mirrors
8671    /// `materialize_graph` but routes every entity through the same
8672    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8673    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8674    /// edges). Returns the filtered `GraphStore` plus the
8675    /// `node_id → properties` map the executor needs for `RETURN n.*`
8676    /// projections.
8677    fn materialize_graph_with_rls(
8678        &self,
8679    ) -> RedDBResult<(
8680        crate::storage::engine::GraphStore,
8681        std::collections::HashMap<
8682            String,
8683            std::collections::HashMap<String, crate::storage::schema::Value>,
8684        >,
8685        crate::storage::query::unified::EdgeProperties,
8686    )> {
8687        use crate::storage::engine::GraphStore;
8688        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8689        use crate::storage::unified::entity::{EntityData, EntityKind};
8690        use std::collections::{HashMap, HashSet};
8691
8692        let store = self.inner.db.store();
8693        let snap_ctx = capture_current_snapshot();
8694        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8695
8696        let graph = GraphStore::new();
8697        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8698            HashMap::new();
8699        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8700        let mut allowed_nodes: HashSet<String> = HashSet::new();
8701
8702        // Per-collection cached compiled filters — Nodes-kind for
8703        // first pass, Edges-kind for the second. None entries mean
8704        // "RLS enabled, zero matching policy → deny all of this kind".
8705        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8706            HashMap::new();
8707        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8708            HashMap::new();
8709
8710        let collections = store.list_collections();
8711
8712        // First pass — gather nodes.
8713        for collection in &collections {
8714            let Some(manager) = store.get_collection(collection) else {
8715                continue;
8716            };
8717            let entities = manager.query_all(|_| true);
8718            for entity in entities {
8719                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8720                    continue;
8721                }
8722                let EntityKind::GraphNode(ref node) = entity.kind else {
8723                    continue;
8724                };
8725                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8726                    continue;
8727                }
8728                let id_str = entity.id.raw().to_string();
8729                graph
8730                    .add_node_with_label(
8731                        &id_str,
8732                        &node.label,
8733                        &super::graph_node_label(&node.node_type),
8734                    )
8735                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8736                allowed_nodes.insert(id_str.clone());
8737                if let EntityData::Node(node_data) = &entity.data {
8738                    node_properties.insert(id_str, node_data.properties.clone());
8739                }
8740            }
8741        }
8742
8743        // Second pass — gather edges. An edge appears only when both
8744        // endpoint nodes survived the RLS pass AND the edge itself
8745        // passes its own RLS gate.
8746        for collection in &collections {
8747            let Some(manager) = store.get_collection(collection) else {
8748                continue;
8749            };
8750            let entities = manager.query_all(|_| true);
8751            for entity in entities {
8752                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8753                    continue;
8754                }
8755                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8756                    continue;
8757                };
8758                if !allowed_nodes.contains(&edge.from_node)
8759                    || !allowed_nodes.contains(&edge.to_node)
8760                {
8761                    continue;
8762                }
8763                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8764                    continue;
8765                }
8766                let weight = match &entity.data {
8767                    EntityData::Edge(e) => e.weight,
8768                    _ => edge.weight as f32 / 1000.0,
8769                };
8770                let edge_label = super::graph_edge_label(&edge.label);
8771                graph
8772                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8773                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8774                if let EntityData::Edge(edge_data) = &entity.data {
8775                    edge_properties.insert(
8776                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8777                        edge_data.properties.clone(),
8778                    );
8779                }
8780            }
8781        }
8782
8783        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8784        // are used inside the helper closures via the per-kind helpers
8785        // declared at the bottom of this file.
8786        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8787
8788        Ok((graph, node_properties, edge_properties))
8789    }
8790
8791    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8792    /// freshly-inserted entity when the current connection holds an
8793    /// open transaction. Used by graph / vector / queue / timeseries
8794    /// write paths that go through the DevX builder API (`db.node(...)
8795    /// .save()` and friends) — those live in the storage crate and
8796    /// can't reach `current_xid()` without crossing layers, so the
8797    /// application layer calls this helper right after `save()` to
8798    /// finalise the MVCC stamp.
8799    ///
8800    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8801    /// write, so the non-transactional hot path stays untouched.
8802    ///
8803    /// Best-effort: if the collection or entity disappears between
8804    /// the save and the stamp (concurrent DROP), we silently skip.
8805    pub(crate) fn stamp_xmin_if_in_txn(
8806        &self,
8807        collection: &str,
8808        id: crate::storage::unified::entity::EntityId,
8809    ) {
8810        let Some(xid) = self.current_xid() else {
8811            return;
8812        };
8813        let store = self.inner.db.store();
8814        let Some(manager) = store.get_collection(collection) else {
8815            return;
8816        };
8817        if let Some(mut entity) = manager.get(id) {
8818            entity.set_xmin(xid);
8819            let _ = manager.update(entity);
8820        }
8821    }
8822
8823    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8824    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8825    /// pending entries with `xid < stamper_xid` stay queued because
8826    /// they belong to the enclosing scope — they'll either flush on
8827    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8828    ///
8829    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8830    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8831        let mut guard = self.inner.pending_tombstones.write();
8832        let Some(pending) = guard.get_mut(&conn_id) else {
8833            return 0;
8834        };
8835
8836        let store = self.inner.db.store();
8837        let mut revived = 0usize;
8838        pending.retain(|(collection, id, xid, previous_xmax)| {
8839            if *xid < stamper_xid {
8840                // Stamped before the savepoint — keep in queue.
8841                return true;
8842            }
8843            if let Some(manager) = store.get_collection(collection) {
8844                if let Some(mut entity) = manager.get(*id) {
8845                    if entity.xmax == *xid {
8846                        entity.set_xmax(*previous_xmax);
8847                        let _ = manager.update(entity);
8848                        revived += 1;
8849                    }
8850                }
8851            }
8852            false
8853        });
8854        if pending.is_empty() {
8855            guard.remove(&conn_id);
8856        }
8857        revived
8858    }
8859
8860    /// Return the snapshot the current connection should use for visibility
8861    /// checks (Phase 2.3 PG parity).
8862    ///
8863    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8864    ///   the snapshot stored in its `TxnContext`.
8865    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8866    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8867    ///   visible so this degrades to "see everything committed".
8868    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8869        let conn_id = current_connection_id();
8870        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8871            return ctx.snapshot;
8872        }
8873        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8874        // every already-committed xid (which is strictly less) passes the
8875        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8876        // the `in_progress` set and stay hidden until they commit. Using
8877        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8878        let high_water = self.inner.snapshot_manager.peek_next_xid();
8879        self.inner.snapshot_manager.snapshot(high_water)
8880    }
8881
8882    /// Xid of the current connection's active transaction, or `None` when
8883    /// running outside a BEGIN/COMMIT block. Write paths call this to
8884    /// decide whether to stamp `xmin`/`xmax` on tuples.
8885    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8886    /// sub-xid so new writes can be selectively rolled back. Otherwise
8887    /// the parent txn's xid is returned, matching pre-savepoint
8888    /// behaviour. Callers that need the enclosing *transaction* xid
8889    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8890    /// directly.
8891    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8892        let conn_id = current_connection_id();
8893        self.inner
8894            .tx_contexts
8895            .read()
8896            .get(&conn_id)
8897            .map(|ctx| ctx.writer_xid())
8898    }
8899
8900    /// `true` when the given connection id has an open `BEGIN`. Issue
8901    /// #760 — `OpenStream` consults this to refuse output streams that
8902    /// would otherwise collide with an interactive transaction (see
8903    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
8904    /// connection-id plumbing run with id `0`, which never carries a
8905    /// transaction context, so this returns `false` on those paths.
8906    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
8907        self.inner.tx_contexts.read().contains_key(&conn_id)
8908    }
8909
8910    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8911    /// the oldest-active xid when reclaiming dead tuples.
8912    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8913        Arc::clone(&self.inner.snapshot_manager)
8914    }
8915
8916    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8917        let manager = &self.inner.snapshot_manager;
8918        let next_xid = manager.peek_next_xid();
8919        let mut cutoff = next_xid;
8920        if let Some(oldest_active) = manager.oldest_active_xid() {
8921            cutoff = cutoff.min(oldest_active);
8922        }
8923        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8924            cutoff = cutoff.min(oldest_pinned);
8925        }
8926        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8927        if retention_xids > 0 {
8928            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8929        }
8930        cutoff
8931    }
8932
8933    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8934        let registered = self.inner.index_store.list_indices(table);
8935        if registered.is_empty() {
8936            return Ok(());
8937        }
8938        let store = self.inner.db.store();
8939        let Some(manager) = store.get_collection(table) else {
8940            return Ok(());
8941        };
8942        let entity_fields = manager
8943            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8944            .into_iter()
8945            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8946            .collect::<Vec<_>>();
8947
8948        for index in registered {
8949            self.inner.index_store.drop_index(&index.name, table);
8950            self.inner
8951                .index_store
8952                .create_index(
8953                    &index.name,
8954                    table,
8955                    &index.columns,
8956                    index.method,
8957                    index.unique,
8958                    &entity_fields,
8959                )
8960                .map_err(RedDBError::Internal)?;
8961            self.inner.index_store.register(index);
8962        }
8963        self.invalidate_plan_cache();
8964        Ok(())
8965    }
8966
8967    pub(crate) fn persist_runtime_index_descriptor(
8968        &self,
8969        index: super::index_store::RegisteredIndex,
8970    ) -> RedDBResult<()> {
8971        let store = self.inner.db.store();
8972        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
8973        let entity = crate::storage::UnifiedEntity::new(
8974            crate::storage::EntityId::new(0),
8975            crate::storage::EntityKind::TableRow {
8976                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
8977                row_id: 0,
8978            },
8979            crate::storage::EntityData::Row(crate::storage::RowData {
8980                columns: Vec::new(),
8981                named: Some(
8982                    [
8983                        (
8984                            "collection".to_string(),
8985                            crate::storage::schema::Value::text(index.collection.clone()),
8986                        ),
8987                        (
8988                            "name".to_string(),
8989                            crate::storage::schema::Value::text(index.name.clone()),
8990                        ),
8991                        (
8992                            "columns".to_string(),
8993                            crate::storage::schema::Value::text(index.columns.join("\u{1f}")),
8994                        ),
8995                        (
8996                            "method".to_string(),
8997                            crate::storage::schema::Value::text(index_method_kind_as_str(
8998                                index.method,
8999                            )),
9000                        ),
9001                        (
9002                            "unique".to_string(),
9003                            crate::storage::schema::Value::Boolean(index.unique),
9004                        ),
9005                        (
9006                            "dropped".to_string(),
9007                            crate::storage::schema::Value::Boolean(false),
9008                        ),
9009                    ]
9010                    .into_iter()
9011                    .collect(),
9012                ),
9013                schema: None,
9014            }),
9015        );
9016        store
9017            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9018            .map(|_| ())
9019            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9020    }
9021
9022    pub(crate) fn persist_runtime_index_drop(
9023        &self,
9024        collection: &str,
9025        name: &str,
9026    ) -> RedDBResult<()> {
9027        let store = self.inner.db.store();
9028        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
9029        let entity = crate::storage::UnifiedEntity::new(
9030            crate::storage::EntityId::new(0),
9031            crate::storage::EntityKind::TableRow {
9032                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
9033                row_id: 0,
9034            },
9035            crate::storage::EntityData::Row(crate::storage::RowData {
9036                columns: Vec::new(),
9037                named: Some(
9038                    [
9039                        (
9040                            "collection".to_string(),
9041                            crate::storage::schema::Value::text(collection.to_string()),
9042                        ),
9043                        (
9044                            "name".to_string(),
9045                            crate::storage::schema::Value::text(name.to_string()),
9046                        ),
9047                        (
9048                            "dropped".to_string(),
9049                            crate::storage::schema::Value::Boolean(true),
9050                        ),
9051                    ]
9052                    .into_iter()
9053                    .collect(),
9054                ),
9055                schema: None,
9056            }),
9057        );
9058        store
9059            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
9060            .map(|_| ())
9061            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
9062    }
9063
9064    fn rehydrate_runtime_index_registry(&self) -> RedDBResult<()> {
9065        let store = self.inner.db.store();
9066        let Some(manager) = store.get_collection(RUNTIME_INDEX_REGISTRY_COLLECTION) else {
9067            return Ok(());
9068        };
9069        let mut rows = manager.query_all(|_| true);
9070        rows.sort_by_key(|entity| entity.id.raw());
9071
9072        let mut latest = std::collections::HashMap::<
9073            (String, String),
9074            Option<super::index_store::RegisteredIndex>,
9075        >::new();
9076        for entity in rows {
9077            let crate::storage::EntityData::Row(row) = &entity.data else {
9078                continue;
9079            };
9080            let Some(named) = &row.named else {
9081                continue;
9082            };
9083            let Some(collection) = named_text(named, "collection") else {
9084                continue;
9085            };
9086            let Some(name) = named_text(named, "name") else {
9087                continue;
9088            };
9089            let dropped = named_bool(named, "dropped").unwrap_or(false);
9090            let key = (collection.clone(), name.clone());
9091            if dropped {
9092                latest.insert(key, None);
9093                continue;
9094            }
9095            let columns = named_text(named, "columns")
9096                .map(|raw| {
9097                    raw.split('\u{1f}')
9098                        .filter(|part| !part.is_empty())
9099                        .map(str::to_string)
9100                        .collect::<Vec<_>>()
9101                })
9102                .unwrap_or_default();
9103            let Some(method) =
9104                named_text(named, "method").and_then(|raw| index_method_kind_from_str(&raw))
9105            else {
9106                continue;
9107            };
9108            latest.insert(
9109                key,
9110                Some(super::index_store::RegisteredIndex {
9111                    name,
9112                    collection,
9113                    columns,
9114                    method,
9115                    unique: named_bool(named, "unique").unwrap_or(false),
9116                }),
9117            );
9118        }
9119
9120        for index in latest.into_values().flatten() {
9121            let Some(manager) = store.get_collection(&index.collection) else {
9122                continue;
9123            };
9124            let entity_fields = manager
9125                .query_all(|entity| {
9126                    matches!(entity.kind, crate::storage::EntityKind::TableRow { .. })
9127                })
9128                .into_iter()
9129                .map(|entity| (entity.id, table_row_index_fields(&entity)))
9130                .collect::<Vec<_>>();
9131            self.inner
9132                .index_store
9133                .create_index(
9134                    &index.name,
9135                    &index.collection,
9136                    &index.columns,
9137                    index.method,
9138                    index.unique,
9139                    &entity_fields,
9140                )
9141                .map_err(RedDBError::Internal)?;
9142            self.inner.index_store.register(index);
9143        }
9144        self.invalidate_plan_cache();
9145        Ok(())
9146    }
9147
9148    /// Own-tx xids (parent + open/released savepoints) for the current
9149    /// connection. Transports + tests that build a `SnapshotContext`
9150    /// manually (outside the `execute_query` scope) need this set so
9151    /// the writer's own uncommitted tuples stay visible to self.
9152    pub fn current_txn_own_xids(
9153        &self,
9154    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
9155        let mut set = std::collections::HashSet::new();
9156        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
9157            set.insert(ctx.xid);
9158            for (_, sub) in &ctx.savepoints {
9159                set.insert(*sub);
9160            }
9161            for sub in &ctx.released_sub_xids {
9162                set.insert(*sub);
9163            }
9164        }
9165        set
9166    }
9167
9168    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
9169    ///
9170    /// Callers use this to check whether a table name is a registered
9171    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
9172    /// scan it (`registry.scan(name)`). The read-path rewriter consults
9173    /// this before dispatching into native-collection lookup.
9174    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
9175        Arc::clone(&self.inner.foreign_tables)
9176    }
9177
9178    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
9179    pub fn is_rls_enabled(&self, table: &str) -> bool {
9180        self.inner.rls_enabled_tables.read().contains(table)
9181    }
9182
9183    /// Collect the USING predicates that apply to this `(table, role, action)`.
9184    ///
9185    /// Returned filters should be OR-combined (a row passes RLS when *any*
9186    /// matching policy accepts it) and then AND-ed into the query's WHERE.
9187    /// When the table has RLS disabled this returns an empty Vec — callers
9188    /// can fast-path back to the unfiltered read.
9189    pub fn matching_rls_policies(
9190        &self,
9191        table: &str,
9192        role: Option<&str>,
9193        action: crate::storage::query::ast::PolicyAction,
9194    ) -> Vec<crate::storage::query::ast::Filter> {
9195        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
9196        // callers that don't name a kind only see Table-scoped
9197        // policies (which is what execute SELECT / UPDATE / DELETE
9198        // expect).
9199        self.matching_rls_policies_for_kind(
9200            table,
9201            role,
9202            action,
9203            crate::storage::query::ast::PolicyTargetKind::Table,
9204        )
9205    }
9206
9207    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
9208    ///
9209    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
9210    /// `Vectors`, queue consumers request `Messages`, and timeseries
9211    /// range scans request `Points`. Policies tagged with a
9212    /// different kind are skipped so a graph-scoped policy doesn't
9213    /// accidentally gate a table SELECT on the same collection.
9214    pub fn matching_rls_policies_for_kind(
9215        &self,
9216        table: &str,
9217        role: Option<&str>,
9218        action: crate::storage::query::ast::PolicyAction,
9219        kind: crate::storage::query::ast::PolicyTargetKind,
9220    ) -> Vec<crate::storage::query::ast::Filter> {
9221        if !self.is_rls_enabled(table) {
9222            return Vec::new();
9223        }
9224        let policies = self.inner.rls_policies.read();
9225        policies
9226            .iter()
9227            .filter_map(|((t, _), p)| {
9228                if t != table {
9229                    return None;
9230                }
9231                // Kind gate — Table policies also apply to every
9232                // other kind *iff* the policy predicate evaluates
9233                // against entity fields that exist uniformly; the
9234                // caller's kind filter is the stricter check, so
9235                // match literally. Auto-tenancy policies stamp
9236                // Table and the caller passes the concrete kind —
9237                // we allow Table policies to apply cross-kind for
9238                // backwards compat.
9239                if p.target_kind != kind
9240                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
9241                {
9242                    return None;
9243                }
9244                // Action gate — `None` means "ALL" actions.
9245                if let Some(a) = p.action {
9246                    if a != action {
9247                        return None;
9248                    }
9249                }
9250                // Role gate — `None` means "any role".
9251                if let Some(p_role) = p.role.as_deref() {
9252                    match role {
9253                        Some(r) if r == p_role => {}
9254                        _ => return None,
9255                    }
9256                }
9257                Some((*p.using).clone())
9258            })
9259            .collect()
9260    }
9261
9262    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
9263        let store = self.inner.db.store();
9264        if let Some(stats) =
9265            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
9266        {
9267            crate::storage::query::planner::stats_catalog::persist_table_stats(
9268                store.as_ref(),
9269                &stats,
9270            );
9271        } else {
9272            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9273        }
9274        self.invalidate_plan_cache();
9275    }
9276
9277    pub(crate) fn note_table_write(&self, table: &str) {
9278        // Skip the write lock when the table is already marked
9279        // dirty. With single-row UPDATEs in a loop this used to
9280        // grab the planner_dirty_tables write lock N times even
9281        // though the first call already flipped the flag.
9282        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
9283        if !already_dirty {
9284            self.inner
9285                .planner_dirty_tables
9286                .write()
9287                .insert(table.to_string());
9288        }
9289        self.invalidate_result_cache_for_table(table);
9290    }
9291
9292    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
9293    /// `RuntimeQueryResult` so callers over the SQL interface see the
9294    /// plan tree in the same shape a SELECT produces.
9295    ///
9296    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
9297    /// Nodes are walked depth-first; `depth` counts from 0 at the
9298    /// root so a text renderer can indent without re-walking.
9299    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
9300        let explain = self.explain_query(inner_sql)?;
9301
9302        let columns = vec![
9303            "op".to_string(),
9304            "source".to_string(),
9305            "est_rows".to_string(),
9306            "est_cost".to_string(),
9307            "depth".to_string(),
9308        ];
9309
9310        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
9311
9312        // Prepend `CteScan` markers when the query carried a leading
9313        // WITH clause. The CTE bodies are already inlined into the
9314        // main plan tree, but operators reading EXPLAIN need to see
9315        // which named CTEs were resolved — without this row the plan
9316        // would look indistinguishable from a hand-inlined query.
9317        for name in &explain.cte_materializations {
9318            use std::sync::Arc;
9319            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9320            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
9321            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
9322            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
9323            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
9324            rec.set_arc(Arc::from("depth"), Value::Integer(0));
9325            records.push(rec);
9326        }
9327
9328        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
9329
9330        let result = crate::storage::query::unified::UnifiedResult {
9331            columns,
9332            records,
9333            stats: Default::default(),
9334            pre_serialized_json: None,
9335        };
9336
9337        Ok(RuntimeQueryResult {
9338            query: raw_query.to_string(),
9339            mode: explain.mode,
9340            statement: "explain",
9341            engine: "runtime-explain",
9342            result,
9343            affected_rows: 0,
9344            statement_type: "select",
9345            bookmark: None,
9346        })
9347    }
9348
9349    // -----------------------------------------------------------------
9350    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
9351    // -----------------------------------------------------------------
9352
9353    /// Project a `QueryExpr` to the (action, resource) pair the
9354    /// privilege engine cares about. Returns `Ok(())` for statements
9355    /// that don't touch user data (transaction control, SHOW, SET, etc.).
9356    pub(crate) fn check_query_privilege(
9357        &self,
9358        expr: &crate::storage::query::ast::QueryExpr,
9359    ) -> Result<(), String> {
9360        use crate::auth::privileges::{Action, AuthzContext, Resource};
9361        use crate::auth::UserId;
9362        use crate::storage::query::ast::QueryExpr;
9363
9364        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
9365        // The bootstrap path itself goes through `execute_query` so this
9366        // is the only sensible default; once auth is wired, the gate
9367        // becomes active.
9368        let auth_store = match self.inner.auth_store.read().clone() {
9369            Some(s) => s,
9370            None => return Ok(()),
9371        };
9372
9373        // Resolve principal + role from the thread-local identity.
9374        // Anonymous (no identity) is allowed to read the bootstrap path
9375        // only when auth_store says so; we treat missing identity as
9376        // platform-admin-equivalent here so embedded test harnesses
9377        // continue to work without setting an identity.
9378        let (username, role) = match current_auth_identity() {
9379            Some(p) => p,
9380            None => return Ok(()),
9381        };
9382        let tenant = current_tenant();
9383
9384        let ctx = AuthzContext {
9385            principal: &username,
9386            effective_role: role,
9387            tenant: tenant.as_deref(),
9388        };
9389        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
9390
9391        // Map QueryExpr → (Action, Resource).
9392        let (action, resource) = match expr {
9393            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9394            QueryExpr::RankOf(_) | QueryExpr::ApproxRankOf(_) | QueryExpr::RankRange(_) => {
9395                (Action::Select, Resource::Database)
9396            }
9397            QueryExpr::QueueSelect(q) => {
9398                return self.check_queue_op_privilege(
9399                    &auth_store,
9400                    &principal_id,
9401                    role,
9402                    tenant.as_deref(),
9403                    "queue:peek",
9404                    &q.queue,
9405                );
9406            }
9407            QueryExpr::QueueCommand(cmd) => {
9408                use crate::storage::query::ast::QueueCommand;
9409                let (queue, action_verb) = match cmd {
9410                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
9411                    QueueCommand::Pop { queue, .. }
9412                    | QueueCommand::GroupRead { queue, .. }
9413                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
9414                    QueueCommand::Peek { queue, .. }
9415                    | QueueCommand::Len { queue }
9416                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
9417                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
9418                    QueueCommand::Nack {
9419                        queue, delay_ms, ..
9420                    } => {
9421                        // Per-failure retry overrides re-shape retry
9422                        // behaviour for everyone draining the queue and
9423                        // gate on the dedicated `queue:retry` verb so
9424                        // operators can grant base NACK without granting
9425                        // the override capability.
9426                        let verb = if delay_ms.is_some() {
9427                            "queue:retry"
9428                        } else {
9429                            "queue:nack"
9430                        };
9431                        (queue.as_str(), verb)
9432                    }
9433                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
9434                    // `GroupCreate` is part of the consumer-setup
9435                    // surface — read-side, never destructive.
9436                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
9437                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
9438                };
9439                return self.check_queue_op_privilege(
9440                    &auth_store,
9441                    &principal_id,
9442                    role,
9443                    tenant.as_deref(),
9444                    action_verb,
9445                    queue,
9446                );
9447            }
9448            QueryExpr::Graph(g) => {
9449                // MATCH … RETURN is the explorer's pattern-traversal
9450                // surface — gate on `graph:traverse` (#757).
9451                self.check_graph_op_privilege(
9452                    &auth_store,
9453                    &principal_id,
9454                    role,
9455                    tenant.as_deref(),
9456                    "graph:traverse",
9457                )?;
9458                if auth_store.iam_authorization_enabled() {
9459                    self.check_graph_property_projection_privilege(
9460                        &auth_store,
9461                        &principal_id,
9462                        role,
9463                        tenant.as_deref(),
9464                        g,
9465                    )?;
9466                    return Ok(());
9467                }
9468                return Ok(());
9469            }
9470            QueryExpr::Path(_) => {
9471                // PATH FROM … TO … is a path-traversal query — gates
9472                // on `graph:traverse` like neighborhood/shortest-path
9473                // (#757).
9474                return self.check_graph_op_privilege(
9475                    &auth_store,
9476                    &principal_id,
9477                    role,
9478                    tenant.as_deref(),
9479                    "graph:traverse",
9480                );
9481            }
9482            QueryExpr::GraphCommand(cmd) => {
9483                use crate::storage::query::ast::GraphCommand;
9484                let action_verb = match cmd {
9485                    // Metadata / property reads.
9486                    GraphCommand::Properties { .. } => "graph:read",
9487                    // Traversal / pattern-walk surface.
9488                    GraphCommand::Neighborhood { .. }
9489                    | GraphCommand::Traverse { .. }
9490                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
9491                    // Analytics algorithms — expensive enough that Red
9492                    // UI needs to gate the runner independently of
9493                    // ordinary traversal.
9494                    GraphCommand::Centrality { .. }
9495                    | GraphCommand::Community { .. }
9496                    | GraphCommand::Components { .. }
9497                    | GraphCommand::Cycles { .. }
9498                    | GraphCommand::Clustering
9499                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
9500                };
9501                return self.check_graph_op_privilege(
9502                    &auth_store,
9503                    &principal_id,
9504                    role,
9505                    tenant.as_deref(),
9506                    action_verb,
9507                );
9508            }
9509            QueryExpr::Vector(v) => {
9510                if auth_store.iam_authorization_enabled() {
9511                    self.check_vector_op_privilege(
9512                        &auth_store,
9513                        &principal_id,
9514                        role,
9515                        tenant.as_deref(),
9516                        "vector:search",
9517                        &v.collection,
9518                    )?;
9519                    self.check_table_like_column_projection_privilege(
9520                        &auth_store,
9521                        &principal_id,
9522                        role,
9523                        tenant.as_deref(),
9524                        &v.collection,
9525                        &["content".to_string()],
9526                    )?;
9527                    return Ok(());
9528                }
9529                return Ok(());
9530            }
9531            QueryExpr::SearchCommand(cmd) => {
9532                use crate::storage::query::ast::SearchCommand;
9533                if auth_store.iam_authorization_enabled() {
9534                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
9535                    // HYBRID ... COLLECTION <c>` are the same UI
9536                    // affordances as `VECTOR SEARCH` / hybrid joins —
9537                    // Red UI must see the same `vector:search` envelope
9538                    // so a single toolbar grant is sufficient.
9539                    let collection = match cmd {
9540                        SearchCommand::Similar { collection, .. }
9541                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
9542                        _ => None,
9543                    };
9544                    if let Some(c) = collection {
9545                        self.check_vector_op_privilege(
9546                            &auth_store,
9547                            &principal_id,
9548                            role,
9549                            tenant.as_deref(),
9550                            "vector:search",
9551                            c,
9552                        )?;
9553                        return Ok(());
9554                    }
9555                }
9556                return Ok(());
9557            }
9558            QueryExpr::Hybrid(h) => {
9559                if auth_store.iam_authorization_enabled() {
9560                    // The vector half of a hybrid search is gated under
9561                    // the same `vector:search` verb as a standalone
9562                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
9563                    // must surface the same UI-safe denial envelope
9564                    // when the principal lacks the grant. The
9565                    // structured half is dispatched to its own gate via
9566                    // the inner query during execution.
9567                    self.check_vector_op_privilege(
9568                        &auth_store,
9569                        &principal_id,
9570                        role,
9571                        tenant.as_deref(),
9572                        "vector:search",
9573                        &h.vector.collection,
9574                    )?;
9575                    return Ok(());
9576                }
9577                return Ok(());
9578            }
9579            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9580            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9581            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9582            // Joins inherit the read privilege from any constituent
9583            // table — for now we emit a single Select on the database
9584            // (admins bypass; non-admins need a Database/Schema grant).
9585            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9586            // GRANT / REVOKE / USER DDL are authority statements;
9587            // require Admin (the helper methods enforce).
9588            QueryExpr::Grant(_)
9589            | QueryExpr::Revoke(_)
9590            | QueryExpr::AlterUser(_)
9591            | QueryExpr::CreateUser(_) => {
9592                return if role == crate::auth::Role::Admin {
9593                    Ok(())
9594                } else {
9595                    Err(format!(
9596                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9597                        username, role
9598                    ))
9599                };
9600            }
9601            QueryExpr::CreateIamPolicy { id, .. } => {
9602                return self.check_policy_management_privilege(
9603                    &auth_store,
9604                    &principal_id,
9605                    role,
9606                    tenant.as_deref(),
9607                    "policy:put",
9608                    "policy",
9609                    id,
9610                );
9611            }
9612            QueryExpr::DropIamPolicy { id } => {
9613                return self.check_policy_management_privilege(
9614                    &auth_store,
9615                    &principal_id,
9616                    role,
9617                    tenant.as_deref(),
9618                    "policy:drop",
9619                    "policy",
9620                    id,
9621                );
9622            }
9623            QueryExpr::AttachPolicy { policy_id, .. } => {
9624                return self.check_policy_management_privilege(
9625                    &auth_store,
9626                    &principal_id,
9627                    role,
9628                    tenant.as_deref(),
9629                    "policy:attach",
9630                    "policy",
9631                    policy_id,
9632                );
9633            }
9634            QueryExpr::DetachPolicy { policy_id, .. } => {
9635                return self.check_policy_management_privilege(
9636                    &auth_store,
9637                    &principal_id,
9638                    role,
9639                    tenant.as_deref(),
9640                    "policy:detach",
9641                    "policy",
9642                    policy_id,
9643                );
9644            }
9645            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9646                return Ok(());
9647            }
9648            QueryExpr::SimulatePolicy { .. } => {
9649                return self.check_policy_management_privilege(
9650                    &auth_store,
9651                    &principal_id,
9652                    role,
9653                    tenant.as_deref(),
9654                    "policy:simulate",
9655                    "policy",
9656                    "*",
9657                );
9658            }
9659            QueryExpr::LintPolicy { .. } => {
9660                // Linting is a read-only inspection — gate it like
9661                // simulate (policy management role).
9662                return self.check_policy_management_privilege(
9663                    &auth_store,
9664                    &principal_id,
9665                    role,
9666                    tenant.as_deref(),
9667                    "policy:simulate",
9668                    "policy",
9669                    "*",
9670                );
9671            }
9672            QueryExpr::MigratePolicyMode { dry_run, .. } => {
9673                // DRY RUN is a pre-flight inspection (policy:simulate).
9674                // The actual mode flip is a privileged mutation under
9675                // the policy:put action (it persists a new enforcement
9676                // mode to the vault KV through `set_enforcement_mode`).
9677                let action = if *dry_run {
9678                    "policy:simulate"
9679                } else {
9680                    "policy:put"
9681                };
9682                return self.check_policy_management_privilege(
9683                    &auth_store,
9684                    &principal_id,
9685                    role,
9686                    tenant.as_deref(),
9687                    action,
9688                    "policy",
9689                    "*",
9690                );
9691            }
9692            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9693            // when IAM mode is active. Remaining DDL is gated by the arms below.
9694            QueryExpr::DropTable(q) => {
9695                return self.check_ddl_collection_privilege(
9696                    &auth_store,
9697                    &principal_id,
9698                    role,
9699                    tenant.as_deref(),
9700                    &username,
9701                    "drop",
9702                    &q.name,
9703                );
9704            }
9705            QueryExpr::DropGraph(q) => {
9706                return self.check_ddl_collection_privilege(
9707                    &auth_store,
9708                    &principal_id,
9709                    role,
9710                    tenant.as_deref(),
9711                    &username,
9712                    "drop",
9713                    &q.name,
9714                );
9715            }
9716            QueryExpr::DropVector(q) => {
9717                return self.check_ddl_collection_privilege(
9718                    &auth_store,
9719                    &principal_id,
9720                    role,
9721                    tenant.as_deref(),
9722                    &username,
9723                    "drop",
9724                    &q.name,
9725                );
9726            }
9727            QueryExpr::DropDocument(q) => {
9728                return self.check_ddl_collection_privilege(
9729                    &auth_store,
9730                    &principal_id,
9731                    role,
9732                    tenant.as_deref(),
9733                    &username,
9734                    "drop",
9735                    &q.name,
9736                );
9737            }
9738            QueryExpr::DropKv(q) => {
9739                return self.check_ddl_collection_privilege(
9740                    &auth_store,
9741                    &principal_id,
9742                    role,
9743                    tenant.as_deref(),
9744                    &username,
9745                    "drop",
9746                    &q.name,
9747                );
9748            }
9749            QueryExpr::DropCollection(q) => {
9750                return self.check_ddl_collection_privilege(
9751                    &auth_store,
9752                    &principal_id,
9753                    role,
9754                    tenant.as_deref(),
9755                    &username,
9756                    "drop",
9757                    &q.name,
9758                );
9759            }
9760            QueryExpr::Truncate(q) => {
9761                return self.check_ddl_collection_privilege(
9762                    &auth_store,
9763                    &principal_id,
9764                    role,
9765                    tenant.as_deref(),
9766                    &username,
9767                    "truncate",
9768                    &q.name,
9769                );
9770            }
9771            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
9772            // create/alter/drop verbs gate operations with a clear
9773            // per-collection target so Red UI can author fine-grained
9774            // policies (`create on collection:users`). Namespace-level
9775            // and grouped DDL fall back to broader `schema:admin` /
9776            // `schema:write` verbs against a `schema:<name>` resource.
9777            // All branches share the [`check_ddl_object_privilege`]
9778            // helper so allows / denies produce the same structured
9779            // "principal=… action=… resource=<kind>:<name> denied by
9780            // IAM policy" reason the Red UI security read contracts
9781            // (#740) already render.
9782            QueryExpr::CreateTable(q) => {
9783                return self.check_ddl_object_privilege(
9784                    &auth_store,
9785                    &principal_id,
9786                    role,
9787                    tenant.as_deref(),
9788                    &username,
9789                    "create",
9790                    "collection",
9791                    &q.name,
9792                    crate::auth::Role::Write,
9793                );
9794            }
9795            QueryExpr::CreateCollection(q) => {
9796                return self.check_ddl_object_privilege(
9797                    &auth_store,
9798                    &principal_id,
9799                    role,
9800                    tenant.as_deref(),
9801                    &username,
9802                    "create",
9803                    "collection",
9804                    &q.name,
9805                    crate::auth::Role::Write,
9806                );
9807            }
9808            QueryExpr::CreateVector(q) => {
9809                return self.check_ddl_object_privilege(
9810                    &auth_store,
9811                    &principal_id,
9812                    role,
9813                    tenant.as_deref(),
9814                    &username,
9815                    "create",
9816                    "collection",
9817                    &q.name,
9818                    crate::auth::Role::Write,
9819                );
9820            }
9821            QueryExpr::AlterTable(q) => {
9822                return self.check_ddl_object_privilege(
9823                    &auth_store,
9824                    &principal_id,
9825                    role,
9826                    tenant.as_deref(),
9827                    &username,
9828                    "alter",
9829                    "collection",
9830                    &q.name,
9831                    crate::auth::Role::Write,
9832                );
9833            }
9834            QueryExpr::CreateIndex(q) => {
9835                return self.check_ddl_object_privilege(
9836                    &auth_store,
9837                    &principal_id,
9838                    role,
9839                    tenant.as_deref(),
9840                    &username,
9841                    "create",
9842                    "collection",
9843                    &q.table,
9844                    crate::auth::Role::Write,
9845                );
9846            }
9847            QueryExpr::DropIndex(q) => {
9848                return self.check_ddl_object_privilege(
9849                    &auth_store,
9850                    &principal_id,
9851                    role,
9852                    tenant.as_deref(),
9853                    &username,
9854                    "drop",
9855                    "collection",
9856                    &q.table,
9857                    crate::auth::Role::Write,
9858                );
9859            }
9860            QueryExpr::CreateSchema(q) => {
9861                return self.check_ddl_object_privilege(
9862                    &auth_store,
9863                    &principal_id,
9864                    role,
9865                    tenant.as_deref(),
9866                    &username,
9867                    "schema:admin",
9868                    "schema",
9869                    &q.name,
9870                    crate::auth::Role::Admin,
9871                );
9872            }
9873            QueryExpr::DropSchema(q) => {
9874                return self.check_ddl_object_privilege(
9875                    &auth_store,
9876                    &principal_id,
9877                    role,
9878                    tenant.as_deref(),
9879                    &username,
9880                    "schema:admin",
9881                    "schema",
9882                    &q.name,
9883                    crate::auth::Role::Admin,
9884                );
9885            }
9886            QueryExpr::CreateSequence(q) => {
9887                return self.check_ddl_object_privilege(
9888                    &auth_store,
9889                    &principal_id,
9890                    role,
9891                    tenant.as_deref(),
9892                    &username,
9893                    "create",
9894                    "collection",
9895                    &q.name,
9896                    crate::auth::Role::Write,
9897                );
9898            }
9899            QueryExpr::DropSequence(q) => {
9900                return self.check_ddl_object_privilege(
9901                    &auth_store,
9902                    &principal_id,
9903                    role,
9904                    tenant.as_deref(),
9905                    &username,
9906                    "drop",
9907                    "collection",
9908                    &q.name,
9909                    crate::auth::Role::Write,
9910                );
9911            }
9912            QueryExpr::CreateView(q) => {
9913                return self.check_ddl_object_privilege(
9914                    &auth_store,
9915                    &principal_id,
9916                    role,
9917                    tenant.as_deref(),
9918                    &username,
9919                    "create",
9920                    "collection",
9921                    &q.name,
9922                    crate::auth::Role::Write,
9923                );
9924            }
9925            QueryExpr::DropView(q) => {
9926                return self.check_ddl_object_privilege(
9927                    &auth_store,
9928                    &principal_id,
9929                    role,
9930                    tenant.as_deref(),
9931                    &username,
9932                    "drop",
9933                    "collection",
9934                    &q.name,
9935                    crate::auth::Role::Write,
9936                );
9937            }
9938            QueryExpr::RefreshMaterializedView(q) => {
9939                return self.check_ddl_object_privilege(
9940                    &auth_store,
9941                    &principal_id,
9942                    role,
9943                    tenant.as_deref(),
9944                    &username,
9945                    "alter",
9946                    "collection",
9947                    &q.name,
9948                    crate::auth::Role::Write,
9949                );
9950            }
9951            QueryExpr::CreatePolicy(q) => {
9952                return self.check_ddl_object_privilege(
9953                    &auth_store,
9954                    &principal_id,
9955                    role,
9956                    tenant.as_deref(),
9957                    &username,
9958                    "create",
9959                    "collection",
9960                    &q.table,
9961                    crate::auth::Role::Write,
9962                );
9963            }
9964            QueryExpr::DropPolicy(q) => {
9965                return self.check_ddl_object_privilege(
9966                    &auth_store,
9967                    &principal_id,
9968                    role,
9969                    tenant.as_deref(),
9970                    &username,
9971                    "drop",
9972                    "collection",
9973                    &q.table,
9974                    crate::auth::Role::Write,
9975                );
9976            }
9977            QueryExpr::CreateServer(q) => {
9978                return self.check_ddl_object_privilege(
9979                    &auth_store,
9980                    &principal_id,
9981                    role,
9982                    tenant.as_deref(),
9983                    &username,
9984                    "schema:admin",
9985                    "schema",
9986                    &q.name,
9987                    crate::auth::Role::Admin,
9988                );
9989            }
9990            QueryExpr::DropServer(q) => {
9991                return self.check_ddl_object_privilege(
9992                    &auth_store,
9993                    &principal_id,
9994                    role,
9995                    tenant.as_deref(),
9996                    &username,
9997                    "schema:admin",
9998                    "schema",
9999                    &q.name,
10000                    crate::auth::Role::Admin,
10001                );
10002            }
10003            QueryExpr::CreateForeignTable(q) => {
10004                return self.check_ddl_object_privilege(
10005                    &auth_store,
10006                    &principal_id,
10007                    role,
10008                    tenant.as_deref(),
10009                    &username,
10010                    "schema:write",
10011                    "schema",
10012                    &q.name,
10013                    crate::auth::Role::Write,
10014                );
10015            }
10016            QueryExpr::DropForeignTable(q) => {
10017                return self.check_ddl_object_privilege(
10018                    &auth_store,
10019                    &principal_id,
10020                    role,
10021                    tenant.as_deref(),
10022                    &username,
10023                    "schema:write",
10024                    "schema",
10025                    &q.name,
10026                    crate::auth::Role::Write,
10027                );
10028            }
10029            QueryExpr::CreateTimeSeries(q) => {
10030                return self.check_ddl_object_privilege(
10031                    &auth_store,
10032                    &principal_id,
10033                    role,
10034                    tenant.as_deref(),
10035                    &username,
10036                    "create",
10037                    "collection",
10038                    &q.name,
10039                    crate::auth::Role::Write,
10040                );
10041            }
10042            QueryExpr::CreateMetric(q) => {
10043                return self.check_ddl_object_privilege(
10044                    &auth_store,
10045                    &principal_id,
10046                    role,
10047                    tenant.as_deref(),
10048                    &username,
10049                    "create",
10050                    "collection",
10051                    &q.path,
10052                    crate::auth::Role::Write,
10053                );
10054            }
10055            QueryExpr::AlterMetric(q) => {
10056                return self.check_ddl_object_privilege(
10057                    &auth_store,
10058                    &principal_id,
10059                    role,
10060                    tenant.as_deref(),
10061                    &username,
10062                    "alter",
10063                    "collection",
10064                    &q.path,
10065                    crate::auth::Role::Write,
10066                );
10067            }
10068            QueryExpr::CreateSlo(q) => {
10069                return self.check_ddl_object_privilege(
10070                    &auth_store,
10071                    &principal_id,
10072                    role,
10073                    tenant.as_deref(),
10074                    &username,
10075                    "create",
10076                    "collection",
10077                    &q.path,
10078                    crate::auth::Role::Write,
10079                );
10080            }
10081            QueryExpr::DropTimeSeries(q) => {
10082                return self.check_ddl_object_privilege(
10083                    &auth_store,
10084                    &principal_id,
10085                    role,
10086                    tenant.as_deref(),
10087                    &username,
10088                    "drop",
10089                    "collection",
10090                    &q.name,
10091                    crate::auth::Role::Write,
10092                );
10093            }
10094            QueryExpr::CreateQueue(q) => {
10095                return self.check_ddl_object_privilege(
10096                    &auth_store,
10097                    &principal_id,
10098                    role,
10099                    tenant.as_deref(),
10100                    &username,
10101                    "create",
10102                    "collection",
10103                    &q.name,
10104                    crate::auth::Role::Write,
10105                );
10106            }
10107            QueryExpr::AlterQueue(q) => {
10108                return self.check_ddl_object_privilege(
10109                    &auth_store,
10110                    &principal_id,
10111                    role,
10112                    tenant.as_deref(),
10113                    &username,
10114                    "alter",
10115                    "collection",
10116                    &q.name,
10117                    crate::auth::Role::Write,
10118                );
10119            }
10120            QueryExpr::DropQueue(q) => {
10121                return self.check_ddl_object_privilege(
10122                    &auth_store,
10123                    &principal_id,
10124                    role,
10125                    tenant.as_deref(),
10126                    &username,
10127                    "drop",
10128                    "collection",
10129                    &q.name,
10130                    crate::auth::Role::Write,
10131                );
10132            }
10133            QueryExpr::CreateTree(q) => {
10134                return self.check_ddl_object_privilege(
10135                    &auth_store,
10136                    &principal_id,
10137                    role,
10138                    tenant.as_deref(),
10139                    &username,
10140                    "create",
10141                    "collection",
10142                    &q.collection,
10143                    crate::auth::Role::Write,
10144                );
10145            }
10146            QueryExpr::DropTree(q) => {
10147                return self.check_ddl_object_privilege(
10148                    &auth_store,
10149                    &principal_id,
10150                    role,
10151                    tenant.as_deref(),
10152                    &username,
10153                    "drop",
10154                    "collection",
10155                    &q.collection,
10156                    crate::auth::Role::Write,
10157                );
10158            }
10159            // Migration DDL — CREATE MIGRATION is grouped DDL on the
10160            // schema namespace; uses the `schema:write` fallback verb
10161            // (no obvious per-collection target).
10162            QueryExpr::CreateMigration(q) => {
10163                return self.check_ddl_object_privilege(
10164                    &auth_store,
10165                    &principal_id,
10166                    role,
10167                    tenant.as_deref(),
10168                    &username,
10169                    "schema:write",
10170                    "schema",
10171                    &q.name,
10172                    crate::auth::Role::Write,
10173                );
10174            }
10175            // APPLY / ROLLBACK change data and schema — require Admin.
10176            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
10177                return if role == crate::auth::Role::Admin {
10178                    Ok(())
10179                } else {
10180                    Err(format!(
10181                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
10182                        username, role
10183                    ))
10184                };
10185            }
10186            // EXPLAIN MIGRATION is read-only — any authenticated principal.
10187            QueryExpr::ExplainMigration(_) => return Ok(()),
10188            // Everything else (SET, SHOW, transaction control, tree
10189            // commands, MaintenanceCommand …) is allowed for any
10190            // authenticated principal; graph/queue commands are gated above.
10191            _ => return Ok(()),
10192        };
10193
10194        if auth_store.iam_authorization_enabled() {
10195            let iam_action = legacy_action_to_iam(action);
10196            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
10197            let iam_ctx = runtime_iam_context(role, tenant.as_deref());
10198            if !auth_store.check_policy_authz_with_role(
10199                &principal_id,
10200                iam_action,
10201                &iam_resource,
10202                &iam_ctx,
10203                role,
10204            ) {
10205                return Err(format!(
10206                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10207                    username, iam_action, iam_resource.kind, iam_resource.name
10208                ));
10209            }
10210
10211            if let QueryExpr::Table(table) = expr {
10212                self.check_table_column_projection_privilege(
10213                    &auth_store,
10214                    &principal_id,
10215                    &iam_ctx,
10216                    table,
10217                )?;
10218            }
10219
10220            if let QueryExpr::Update(update) = expr {
10221                let columns = update_set_target_columns(update);
10222                if !columns.is_empty() {
10223                    let request = column_access_request_for_table_update(&update.table, columns);
10224                    let outcome =
10225                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10226                    if let Some(denied) = outcome.first_denied_column() {
10227                        return Err(format!(
10228                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
10229                            username, iam_action, denied.resource.kind, denied.resource.name
10230                        ));
10231                    }
10232                    if !outcome.allowed() {
10233                        return Err(format!(
10234                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10235                            username,
10236                            iam_action,
10237                            outcome.table_resource.kind,
10238                            outcome.table_resource.name
10239                        ));
10240                    }
10241                }
10242
10243                if let Some(columns) = update_returning_columns_for_policy(self, update) {
10244                    let request = column_access_request_for_table_select(&update.table, columns);
10245                    let outcome =
10246                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10247                    if let Some(denied) = outcome.first_denied_column() {
10248                        return Err(format!(
10249                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
10250                            username, denied.resource.kind, denied.resource.name
10251                        ));
10252                    }
10253                    if !outcome.allowed() {
10254                        return Err(format!(
10255                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10256                            username, outcome.table_resource.kind, outcome.table_resource.name
10257                        ));
10258                    }
10259                }
10260            }
10261
10262            Ok(())
10263        } else {
10264            auth_store
10265                .check_grant(&ctx, action, &resource)
10266                .map_err(|e| e.to_string())
10267        }
10268    }
10269
10270    fn check_table_column_projection_privilege(
10271        &self,
10272        auth_store: &Arc<crate::auth::store::AuthStore>,
10273        principal: &crate::auth::UserId,
10274        ctx: &crate::auth::policies::EvalContext,
10275        table: &crate::storage::query::ast::TableQuery,
10276    ) -> Result<(), String> {
10277        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
10278
10279        let columns = requested_table_columns_for_policy(table);
10280        if columns.is_empty() {
10281            return Ok(());
10282        }
10283
10284        let request = ColumnAccessRequest::select(table.table.clone(), columns);
10285        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
10286        if outcome.allowed() {
10287            return Ok(());
10288        }
10289
10290        if !matches!(
10291            outcome.table_decision,
10292            crate::auth::policies::Decision::Allow { .. }
10293                | crate::auth::policies::Decision::AdminBypass
10294        ) {
10295            return Err(format!(
10296                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10297                principal, outcome.table_resource.kind, outcome.table_resource.name
10298            ));
10299        }
10300
10301        let denied = outcome
10302            .first_denied_column()
10303            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
10304        match denied {
10305            Some(decision) => Err(format!(
10306                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10307                principal, decision.resource.kind, decision.resource.name
10308            )),
10309            None => Ok(()),
10310        }
10311    }
10312
10313    fn check_graph_property_projection_privilege(
10314        &self,
10315        auth_store: &Arc<crate::auth::store::AuthStore>,
10316        principal: &crate::auth::UserId,
10317        role: crate::auth::Role,
10318        tenant: Option<&str>,
10319        query: &crate::storage::query::ast::GraphQuery,
10320    ) -> Result<(), String> {
10321        let columns = explicit_graph_projection_properties(query);
10322        if columns.is_empty() {
10323            return Ok(());
10324        }
10325        self.check_table_like_column_projection_privilege(
10326            auth_store, principal, role, tenant, "graph", &columns,
10327        )
10328    }
10329
10330    fn check_table_like_column_projection_privilege(
10331        &self,
10332        auth_store: &Arc<crate::auth::store::AuthStore>,
10333        principal: &crate::auth::UserId,
10334        role: crate::auth::Role,
10335        tenant: Option<&str>,
10336        table: &str,
10337        columns: &[String],
10338    ) -> Result<(), String> {
10339        let iam_ctx = runtime_iam_context(role, tenant);
10340        let request =
10341            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
10342        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
10343        if outcome.allowed() {
10344            return Ok(());
10345        }
10346        let denied = outcome
10347            .first_denied_column()
10348            .map(|d| d.resource.name.clone())
10349            .unwrap_or_else(|| format!("{table}.<unknown>"));
10350        Err(format!(
10351            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
10352            principal, denied
10353        ))
10354    }
10355
10356    fn check_policy_management_privilege(
10357        &self,
10358        auth_store: &Arc<crate::auth::store::AuthStore>,
10359        principal: &crate::auth::UserId,
10360        role: crate::auth::Role,
10361        tenant: Option<&str>,
10362        action: &str,
10363        resource_kind: &str,
10364        resource_name: &str,
10365    ) -> Result<(), String> {
10366        let ctx = runtime_iam_context(role, tenant);
10367
10368        if !auth_store.iam_authorization_enabled() {
10369            return if role == crate::auth::Role::Admin {
10370                Ok(())
10371            } else {
10372                Err(format!(
10373                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
10374                    principal, role
10375                ))
10376            };
10377        }
10378
10379        let mut resource = crate::auth::policies::ResourceRef::new(
10380            resource_kind.to_string(),
10381            resource_name.to_string(),
10382        );
10383        if let Some(t) = tenant {
10384            resource = resource.with_tenant(t.to_string());
10385        }
10386        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10387            Ok(())
10388        } else {
10389            Err(format!(
10390                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10391                principal, action, resource.kind, resource.name
10392            ))
10393        }
10394    }
10395
10396    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
10397        let Some(auth_store) = self.inner.auth_store.read().clone() else {
10398            return Ok(());
10399        };
10400        let (username, role) = current_auth_identity()
10401            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10402        let tenant = current_tenant();
10403        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
10404        let ctx = runtime_iam_context(role, tenant.as_deref());
10405        let gate = crate::auth::managed_config::ManagedConfigGate::new(
10406            self.inner.config_registry.as_ref(),
10407        );
10408        match gate.check_write(&auth_store, &principal, &ctx, key) {
10409            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
10410            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
10411            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
10412                Err(RedDBError::Query(format!(
10413                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
10414                )))
10415            }
10416        }
10417    }
10418
10419    /// IAM privilege check for a granular queue operation (issue #755 /
10420    /// PRD #735).
10421    ///
10422    /// Each queue operation maps to a stable verb in
10423    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
10424    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
10425    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
10426    /// resource is `queue:<name>` scoped to the current tenant. In
10427    /// legacy mode (no IAM authorization configured) the check is a
10428    /// no-op — the role gates in `execute_queue_command` still apply
10429    /// and the legacy `select` / `write` grant table continues to
10430    /// govern queue access. In IAM-enabled mode a missing granular
10431    /// grant yields a structured, UI-safe error of the form
10432    /// `principal=… action=queue:… resource=queue:… denied by IAM
10433    /// policy` so Red UI can surface the failing toolbar action.
10434    fn check_queue_op_privilege(
10435        &self,
10436        auth_store: &Arc<crate::auth::store::AuthStore>,
10437        principal: &crate::auth::UserId,
10438        role: crate::auth::Role,
10439        tenant: Option<&str>,
10440        action: &str,
10441        queue: &str,
10442    ) -> Result<(), String> {
10443        if !auth_store.iam_authorization_enabled() {
10444            return Ok(());
10445        }
10446        let mut resource =
10447            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
10448        if let Some(t) = tenant {
10449            resource = resource.with_tenant(t.to_string());
10450        }
10451        let ctx = runtime_iam_context(role, tenant);
10452        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10453            Ok(())
10454        } else {
10455            Err(format!(
10456                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
10457                principal, action, queue
10458            ))
10459        }
10460    }
10461
10462    /// IAM privilege check for a graph operation (issue #757 / PRD
10463    /// #735).
10464    ///
10465    /// Each graph operation maps to a stable verb in
10466    /// [`crate::auth::action_catalog`] — `graph:read` for
10467    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
10468    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
10469    /// `graph:algorithm:run` for analytics algorithms (centrality,
10470    /// community, components, cycles, clustering, topological sort).
10471    /// The resource is `graph:*` scoped to the current tenant — the
10472    /// runtime today operates on a singleton graph store so the name
10473    /// has no concrete identifier; policies grant the explorer
10474    /// surface by writing `graph:*` as the resource pattern.
10475    ///
10476    /// In legacy mode (no IAM authorization configured) the check is
10477    /// a no-op so the existing role-based defaults continue to
10478    /// govern. In IAM-enabled mode a missing grant produces the
10479    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
10480    /// denied by IAM policy` Red UI keys on.
10481    fn check_graph_op_privilege(
10482        &self,
10483        auth_store: &Arc<crate::auth::store::AuthStore>,
10484        principal: &crate::auth::UserId,
10485        role: crate::auth::Role,
10486        tenant: Option<&str>,
10487        action: &str,
10488    ) -> Result<(), String> {
10489        if !auth_store.iam_authorization_enabled() {
10490            return Ok(());
10491        }
10492        let mut resource =
10493            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
10494        if let Some(t) = tenant {
10495            resource = resource.with_tenant(t.to_string());
10496        }
10497        let ctx = runtime_iam_context(role, tenant);
10498        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10499            Ok(())
10500        } else {
10501            Err(format!(
10502                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
10503                principal, action
10504            ))
10505        }
10506    }
10507
10508    /// IAM privilege check for a granular vector operation (issue #756
10509    /// / PRD #735).
10510    ///
10511    /// Each vector operation maps to a stable verb in
10512    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
10513    /// `vector:artifact:read`, `vector:artifact:rebuild`,
10514    /// `vector:admin`). The resource is `vector:<collection>` scoped to
10515    /// the current tenant. In legacy mode (no IAM authorization
10516    /// configured) the check is a no-op — the role gates and existing
10517    /// `select` / column-projection grants continue to govern access.
10518    /// In IAM-enabled mode a missing granular grant yields a
10519    /// structured, UI-safe error of the form `principal=…
10520    /// action=vector:… resource=vector:… denied by IAM policy` so Red
10521    /// UI can surface the failing toolbar action.
10522    fn check_vector_op_privilege(
10523        &self,
10524        auth_store: &Arc<crate::auth::store::AuthStore>,
10525        principal: &crate::auth::UserId,
10526        role: crate::auth::Role,
10527        tenant: Option<&str>,
10528        action: &str,
10529        collection: &str,
10530    ) -> Result<(), String> {
10531        if !auth_store.iam_authorization_enabled() {
10532            return Ok(());
10533        }
10534        let mut resource =
10535            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
10536        if let Some(t) = tenant {
10537            resource = resource.with_tenant(t.to_string());
10538        }
10539        let ctx = runtime_iam_context(role, tenant);
10540        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10541            Ok(())
10542        } else {
10543            Err(format!(
10544                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
10545                principal, action, collection
10546            ))
10547        }
10548    }
10549
10550    /// IAM privilege check for DROP / TRUNCATE on a named collection.
10551    ///
10552    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
10553    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
10554    /// callsites stay readable.
10555    fn check_ddl_collection_privilege(
10556        &self,
10557        auth_store: &Arc<crate::auth::store::AuthStore>,
10558        principal: &crate::auth::UserId,
10559        role: crate::auth::Role,
10560        tenant: Option<&str>,
10561        username: &str,
10562        action: &str,
10563        collection: &str,
10564    ) -> Result<(), String> {
10565        self.check_ddl_object_privilege(
10566            auth_store,
10567            principal,
10568            role,
10569            tenant,
10570            username,
10571            action,
10572            "collection",
10573            collection,
10574            crate::auth::Role::Write,
10575        )
10576    }
10577
10578    /// Generalised IAM privilege check for DDL on a named object.
10579    ///
10580    /// `action` is the stable verb advertised through the action catalog
10581    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
10582    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
10583    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
10584    /// legacy gate when IAM is not yet enabled.
10585    ///
10586    /// Behaviour:
10587    /// * Role below `min_role` → structured "principal=… role=… cannot
10588    ///   issue DDL" denial, audit recorded.
10589    /// * IAM disabled → audit-record success and allow (legacy path).
10590    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
10591    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
10592    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
10593    ///   policy" string. Explicit Allow and the LegacyRbac fallback
10594    ///   allow the action.
10595    #[allow(clippy::too_many_arguments)]
10596    fn check_ddl_object_privilege(
10597        &self,
10598        auth_store: &Arc<crate::auth::store::AuthStore>,
10599        principal: &crate::auth::UserId,
10600        role: crate::auth::Role,
10601        tenant: Option<&str>,
10602        username: &str,
10603        action: &str,
10604        resource_kind: &str,
10605        resource_name: &str,
10606        min_role: crate::auth::Role,
10607    ) -> Result<(), String> {
10608        if role < min_role {
10609            let msg = format!(
10610                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
10611                username, role, action, resource_kind, resource_name
10612            );
10613            self.inner.audit_log.record(
10614                action,
10615                username,
10616                resource_name,
10617                "denied",
10618                crate::json::Value::Null,
10619            );
10620            return Err(msg);
10621        }
10622
10623        if !auth_store.iam_authorization_enabled() {
10624            self.inner.audit_log.record(
10625                action,
10626                username,
10627                resource_name,
10628                "ok",
10629                crate::json::Value::Null,
10630            );
10631            return Ok(());
10632        }
10633
10634        let mut resource = crate::auth::policies::ResourceRef::new(
10635            resource_kind.to_string(),
10636            resource_name.to_string(),
10637        );
10638        if let Some(t) = tenant {
10639            resource = resource.with_tenant(t.to_string());
10640        }
10641        let ctx = runtime_iam_context(role, tenant);
10642        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10643            self.inner.audit_log.record(
10644                action,
10645                username,
10646                resource_name,
10647                "ok",
10648                crate::json::Value::Null,
10649            );
10650            Ok(())
10651        } else {
10652            self.inner.audit_log.record(
10653                action,
10654                username,
10655                resource_name,
10656                "denied",
10657                crate::json::Value::Null,
10658            );
10659            Err(format!(
10660                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10661                username, action, resource_kind, resource_name
10662            ))
10663        }
10664    }
10665
10666    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
10667    fn execute_grant_statement(
10668        &self,
10669        query: &str,
10670        stmt: &crate::storage::query::ast::GrantStmt,
10671    ) -> RedDBResult<RuntimeQueryResult> {
10672        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10673        use crate::auth::UserId;
10674        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10675
10676        let auth_store = self
10677            .inner
10678            .auth_store
10679            .read()
10680            .clone()
10681            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10682
10683        // Granter identity + role.
10684        let (gname, grole) = current_auth_identity().ok_or_else(|| {
10685            RedDBError::Query("GRANT requires an authenticated principal".to_string())
10686        })?;
10687        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
10688        let granter_role = grole;
10689
10690        // Build the action set.
10691        let mut actions: Vec<Action> = Vec::new();
10692        if stmt.all {
10693            actions.push(Action::All);
10694        } else {
10695            for kw in &stmt.actions {
10696                let a = Action::from_keyword(kw).ok_or_else(|| {
10697                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
10698                })?;
10699                actions.push(a);
10700            }
10701        }
10702
10703        // Audit emit (printed; structured emission is Agent #4's lane).
10704        let mut applied = 0usize;
10705        for obj in &stmt.objects {
10706            let resource = match stmt.object_kind {
10707                GrantObjectKind::Table => Resource::Table {
10708                    schema: obj.schema.clone(),
10709                    table: obj.name.clone(),
10710                },
10711                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10712                GrantObjectKind::Database => Resource::Database,
10713                GrantObjectKind::Function => Resource::Function {
10714                    schema: obj.schema.clone(),
10715                    name: obj.name.clone(),
10716                },
10717            };
10718            for principal in &stmt.principals {
10719                let p = match principal {
10720                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10721                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10722                    GrantPrincipalRef::User { tenant, name } => {
10723                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10724                    }
10725                };
10726                // Tenant of the grant follows the granter's tenant
10727                // (cross-tenant guard inside `AuthStore::grant`).
10728                let tenant = granter.tenant.clone();
10729                auth_store
10730                    .grant(
10731                        &granter,
10732                        granter_role,
10733                        p.clone(),
10734                        resource.clone(),
10735                        actions.clone(),
10736                        stmt.with_grant_option,
10737                        tenant.clone(),
10738                    )
10739                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10740
10741                // IAM policy translation: every GRANT also lands as a
10742                // synthetic `_grant_<id>` policy attached to the
10743                // principal so the new evaluator sees it.
10744                if let Some(policy) =
10745                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
10746                {
10747                    let pid = policy.id.clone();
10748                    auth_store
10749                        .put_policy_internal(policy)
10750                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10751                    let attachment = match &p {
10752                        GrantPrincipal::User(uid) => {
10753                            crate::auth::store::PrincipalRef::User(uid.clone())
10754                        }
10755                        GrantPrincipal::Group(group) => {
10756                            crate::auth::store::PrincipalRef::Group(group.clone())
10757                        }
10758                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
10759                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
10760                        ),
10761                    };
10762                    auth_store
10763                        .attach_policy(attachment, &pid)
10764                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10765                }
10766                applied += 1;
10767                tracing::info!(
10768                    target: "audit",
10769                    principal = %granter,
10770                    action = "grant",
10771                    "GRANT applied"
10772                );
10773            }
10774        }
10775
10776        self.invalidate_result_cache();
10777        Ok(RuntimeQueryResult::ok_message(
10778            query.to_string(),
10779            &format!("GRANT applied to {} target(s)", applied),
10780            "grant",
10781        ))
10782    }
10783
10784    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
10785    fn execute_revoke_statement(
10786        &self,
10787        query: &str,
10788        stmt: &crate::storage::query::ast::RevokeStmt,
10789    ) -> RedDBResult<RuntimeQueryResult> {
10790        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10791        use crate::auth::UserId;
10792        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10793
10794        let auth_store = self
10795            .inner
10796            .auth_store
10797            .read()
10798            .clone()
10799            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10800
10801        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10802            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
10803        })?;
10804        let granter_role = grole;
10805
10806        let actions: Vec<Action> = if stmt.all {
10807            vec![Action::All]
10808        } else {
10809            stmt.actions
10810                .iter()
10811                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
10812                .collect()
10813        };
10814
10815        let mut total_removed = 0usize;
10816        for obj in &stmt.objects {
10817            let resource = match stmt.object_kind {
10818                GrantObjectKind::Table => Resource::Table {
10819                    schema: obj.schema.clone(),
10820                    table: obj.name.clone(),
10821                },
10822                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10823                GrantObjectKind::Database => Resource::Database,
10824                GrantObjectKind::Function => Resource::Function {
10825                    schema: obj.schema.clone(),
10826                    name: obj.name.clone(),
10827                },
10828            };
10829            for principal in &stmt.principals {
10830                let p = match principal {
10831                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10832                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10833                    GrantPrincipalRef::User { tenant, name } => {
10834                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10835                    }
10836                };
10837                let removed = auth_store
10838                    .revoke(granter_role, &p, &resource, &actions)
10839                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10840                let _removed_policies =
10841                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
10842                total_removed += removed;
10843            }
10844        }
10845
10846        self.invalidate_result_cache();
10847        Ok(RuntimeQueryResult::ok_message(
10848            query.to_string(),
10849            &format!("REVOKE removed {} grant(s)", total_removed),
10850            "revoke",
10851        ))
10852    }
10853
10854    /// Translate the parsed [`CreateUserStmt`] into an AuthStore user.
10855    fn execute_create_user_statement(
10856        &self,
10857        query: &str,
10858        stmt: &crate::storage::query::ast::CreateUserStmt,
10859    ) -> RedDBResult<RuntimeQueryResult> {
10860        let auth_store = self
10861            .inner
10862            .auth_store
10863            .read()
10864            .clone()
10865            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10866
10867        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10868            RedDBError::Query("CREATE USER requires an authenticated principal".to_string())
10869        })?;
10870        if grole != crate::auth::Role::Admin {
10871            return Err(RedDBError::Query(
10872                "CREATE USER requires Admin role".to_string(),
10873            ));
10874        }
10875
10876        let role = crate::auth::Role::from_str(&stmt.role)
10877            .ok_or_else(|| RedDBError::Query(format!("invalid role `{}`", stmt.role)))?;
10878        let user = auth_store
10879            .create_user_in_tenant(stmt.tenant.as_deref(), &stmt.username, &stmt.password, role)
10880            .map_err(|e| RedDBError::Query(e.to_string()))?;
10881
10882        self.invalidate_result_cache();
10883        let target = crate::auth::UserId::from_parts(user.tenant_id.as_deref(), &user.username);
10884        tracing::info!(
10885            target: "audit",
10886            principal = %target,
10887            role = %role,
10888            action = "create_user",
10889            "CREATE USER applied"
10890        );
10891
10892        Ok(RuntimeQueryResult::ok_message(
10893            query.to_string(),
10894            &format!("CREATE USER {} applied", target),
10895            "create_user",
10896        ))
10897    }
10898
10899    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
10900    fn execute_alter_user_statement(
10901        &self,
10902        query: &str,
10903        stmt: &crate::storage::query::ast::AlterUserStmt,
10904    ) -> RedDBResult<RuntimeQueryResult> {
10905        use crate::auth::privileges::UserAttributes;
10906        use crate::auth::UserId;
10907        use crate::storage::query::ast::AlterUserAttribute;
10908
10909        let auth_store = self
10910            .inner
10911            .auth_store
10912            .read()
10913            .clone()
10914            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10915
10916        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10917            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
10918        })?;
10919        if grole != crate::auth::Role::Admin {
10920            return Err(RedDBError::Query(
10921                "ALTER USER requires Admin role".to_string(),
10922            ));
10923        }
10924
10925        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
10926
10927        // Apply attributes incrementally — each one reads the current
10928        // record, mutates the relevant field, writes back.
10929        let mut attrs = auth_store.user_attributes(&target);
10930        let mut enable_change: Option<bool> = None;
10931
10932        for a in &stmt.attributes {
10933            match a {
10934                AlterUserAttribute::ValidUntil(ts) => {
10935                    // Parse ISO-ish timestamp → ms since epoch. Fall
10936                    // back to integer-ms parsing for callers that pass
10937                    // `'1234567890123'`.
10938                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
10939                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
10940                    })?;
10941                    attrs.valid_until = Some(ms);
10942                }
10943                AlterUserAttribute::ConnectionLimit(n) => {
10944                    if *n < 0 {
10945                        return Err(RedDBError::Query(
10946                            "CONNECTION LIMIT must be non-negative".to_string(),
10947                        ));
10948                    }
10949                    attrs.connection_limit = Some(*n as u32);
10950                }
10951                AlterUserAttribute::SetSearchPath(p) => {
10952                    attrs.search_path = Some(p.clone());
10953                }
10954                AlterUserAttribute::AddGroup(g) => {
10955                    if !attrs.groups.iter().any(|existing| existing == g) {
10956                        attrs.groups.push(g.clone());
10957                        attrs.groups.sort();
10958                    }
10959                }
10960                AlterUserAttribute::DropGroup(g) => {
10961                    attrs.groups.retain(|existing| existing != g);
10962                }
10963                AlterUserAttribute::Enable => enable_change = Some(true),
10964                AlterUserAttribute::Disable => enable_change = Some(false),
10965                AlterUserAttribute::Password(_) => {
10966                    // Out of scope — accept the AST but no-op so the
10967                    // parser stays compatible with future password
10968                    // rotation work.
10969                }
10970            }
10971        }
10972
10973        auth_store
10974            .set_user_attributes(&target, attrs)
10975            .map_err(|e| RedDBError::Query(e.to_string()))?;
10976        if let Some(en) = enable_change {
10977            auth_store
10978                .set_user_enabled(&target, en)
10979                .map_err(|e| RedDBError::Query(e.to_string()))?;
10980        }
10981        self.invalidate_result_cache();
10982        tracing::info!(
10983            target: "audit",
10984            principal = %target,
10985            action = "alter_user",
10986            "ALTER USER applied"
10987        );
10988
10989        Ok(RuntimeQueryResult::ok_message(
10990            query.to_string(),
10991            &format!("ALTER USER {} applied", target),
10992            "alter_user",
10993        ))
10994    }
10995
10996    // -----------------------------------------------------------------
10997    // IAM policy executors
10998    // -----------------------------------------------------------------
10999
11000    fn execute_create_iam_policy(
11001        &self,
11002        query: &str,
11003        id: &str,
11004        json: &str,
11005    ) -> RedDBResult<RuntimeQueryResult> {
11006        use crate::auth::policies::Policy;
11007
11008        let auth_store = self
11009            .inner
11010            .auth_store
11011            .read()
11012            .clone()
11013            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11014
11015        // Parse + validate. The kernel rejects oversize / bad shape /
11016        // bad action keywords. If the supplied id differs from the JSON
11017        // id, override it with the SQL-provided id (the JSON id is
11018        // optional context — the SQL DDL form is authoritative).
11019        let mut policy = Policy::from_json_str(json)
11020            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
11021        if policy.id != id {
11022            policy.id = id.to_string();
11023        }
11024        let pid = policy.id.clone();
11025        let tenant = current_tenant();
11026        let (actor_name, actor_role) = current_auth_identity()
11027            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11028        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11029        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11030        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11031        let ledger = self.inner.control_event_ledger.read();
11032        let control = crate::auth::store::PolicyMutationControl {
11033            ctx: &event_ctx,
11034            ledger: ledger.as_ref(),
11035            config: self.inner.control_event_config,
11036            registry: Some(self.inner.config_registry.as_ref()),
11037            actor: &actor,
11038            eval_ctx: &eval_ctx,
11039        };
11040        auth_store
11041            .put_policy_with_control_events(policy, &control)
11042            .map_err(|e| RedDBError::Query(e.to_string()))?;
11043
11044        let principal = actor_name;
11045        tracing::info!(
11046            target: "audit",
11047            principal = %principal,
11048            action = "iam:policy.put",
11049            matched_policy_id = %pid,
11050            "CREATE POLICY applied"
11051        );
11052        self.inner.audit_log.record(
11053            "iam/policy.put",
11054            &principal,
11055            &pid,
11056            "ok",
11057            crate::json::Value::Null,
11058        );
11059
11060        self.invalidate_result_cache();
11061        Ok(RuntimeQueryResult::ok_message(
11062            query.to_string(),
11063            &format!("policy `{pid}` stored"),
11064            "create_iam_policy",
11065        ))
11066    }
11067
11068    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
11069        let auth_store = self
11070            .inner
11071            .auth_store
11072            .read()
11073            .clone()
11074            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11075        let tenant = current_tenant();
11076        let (actor_name, actor_role) = current_auth_identity()
11077            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11078        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11079        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11080        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11081        let ledger = self.inner.control_event_ledger.read();
11082        let control = crate::auth::store::PolicyMutationControl {
11083            ctx: &event_ctx,
11084            ledger: ledger.as_ref(),
11085            config: self.inner.control_event_config,
11086            registry: Some(self.inner.config_registry.as_ref()),
11087            actor: &actor,
11088            eval_ctx: &eval_ctx,
11089        };
11090        auth_store
11091            .delete_policy_with_control_events(id, &control)
11092            .map_err(|e| RedDBError::Query(e.to_string()))?;
11093
11094        let principal = actor_name;
11095        tracing::info!(
11096            target: "audit",
11097            principal = %principal,
11098            action = "iam:policy.drop",
11099            matched_policy_id = %id,
11100            "DROP POLICY applied"
11101        );
11102        self.inner.audit_log.record(
11103            "iam/policy.drop",
11104            &principal,
11105            id,
11106            "ok",
11107            crate::json::Value::Null,
11108        );
11109
11110        self.invalidate_result_cache();
11111        Ok(RuntimeQueryResult::ok_message(
11112            query.to_string(),
11113            &format!("policy `{id}` dropped"),
11114            "drop_iam_policy",
11115        ))
11116    }
11117
11118    fn execute_attach_policy(
11119        &self,
11120        query: &str,
11121        policy_id: &str,
11122        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11123    ) -> RedDBResult<RuntimeQueryResult> {
11124        use crate::auth::store::PrincipalRef;
11125        use crate::auth::UserId;
11126        use crate::storage::query::ast::PolicyPrincipalRef;
11127
11128        let auth_store = self
11129            .inner
11130            .auth_store
11131            .read()
11132            .clone()
11133            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11134        let p = match principal {
11135            PolicyPrincipalRef::User(u) => {
11136                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11137            }
11138            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11139        };
11140        let pretty_target = principal_label(principal);
11141        let tenant = current_tenant();
11142        let (actor_name, actor_role) = current_auth_identity()
11143            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11144        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11145        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11146        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11147        let ledger = self.inner.control_event_ledger.read();
11148        let control = crate::auth::store::PolicyMutationControl {
11149            ctx: &event_ctx,
11150            ledger: ledger.as_ref(),
11151            config: self.inner.control_event_config,
11152            registry: Some(self.inner.config_registry.as_ref()),
11153            actor: &actor,
11154            eval_ctx: &eval_ctx,
11155        };
11156        auth_store
11157            .attach_policy_with_control_events(p, policy_id, &control)
11158            .map_err(|e| RedDBError::Query(e.to_string()))?;
11159
11160        let principal_str = actor_name;
11161        tracing::info!(
11162            target: "audit",
11163            principal = %principal_str,
11164            action = "iam:policy.attach",
11165            matched_policy_id = %policy_id,
11166            target = %pretty_target,
11167            "ATTACH POLICY applied"
11168        );
11169        self.inner.audit_log.record(
11170            "iam/policy.attach",
11171            &principal_str,
11172            &pretty_target,
11173            "ok",
11174            crate::json::Value::Null,
11175        );
11176
11177        self.invalidate_result_cache();
11178        Ok(RuntimeQueryResult::ok_message(
11179            query.to_string(),
11180            &format!("policy `{policy_id}` attached to {pretty_target}"),
11181            "attach_policy",
11182        ))
11183    }
11184
11185    fn execute_detach_policy(
11186        &self,
11187        query: &str,
11188        policy_id: &str,
11189        principal: &crate::storage::query::ast::PolicyPrincipalRef,
11190    ) -> RedDBResult<RuntimeQueryResult> {
11191        use crate::auth::store::PrincipalRef;
11192        use crate::auth::UserId;
11193        use crate::storage::query::ast::PolicyPrincipalRef;
11194
11195        let auth_store = self
11196            .inner
11197            .auth_store
11198            .read()
11199            .clone()
11200            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11201        let p = match principal {
11202            PolicyPrincipalRef::User(u) => {
11203                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
11204            }
11205            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
11206        };
11207        let pretty_target = principal_label(principal);
11208        let tenant = current_tenant();
11209        let (actor_name, actor_role) = current_auth_identity()
11210            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
11211        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11212        let eval_ctx = runtime_iam_context(actor_role, tenant.as_deref());
11213        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11214        let ledger = self.inner.control_event_ledger.read();
11215        let control = crate::auth::store::PolicyMutationControl {
11216            ctx: &event_ctx,
11217            ledger: ledger.as_ref(),
11218            config: self.inner.control_event_config,
11219            registry: Some(self.inner.config_registry.as_ref()),
11220            actor: &actor,
11221            eval_ctx: &eval_ctx,
11222        };
11223        auth_store
11224            .detach_policy_with_control_events(p, policy_id, &control)
11225            .map_err(|e| RedDBError::Query(e.to_string()))?;
11226
11227        let principal_str = actor_name;
11228        tracing::info!(
11229            target: "audit",
11230            principal = %principal_str,
11231            action = "iam:policy.detach",
11232            matched_policy_id = %policy_id,
11233            target = %pretty_target,
11234            "DETACH POLICY applied"
11235        );
11236        self.inner.audit_log.record(
11237            "iam/policy.detach",
11238            &principal_str,
11239            &pretty_target,
11240            "ok",
11241            crate::json::Value::Null,
11242        );
11243
11244        self.invalidate_result_cache();
11245        Ok(RuntimeQueryResult::ok_message(
11246            query.to_string(),
11247            &format!("policy `{policy_id}` detached from {pretty_target}"),
11248            "detach_policy",
11249        ))
11250    }
11251
11252    fn execute_show_policies(
11253        &self,
11254        query: &str,
11255        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
11256    ) -> RedDBResult<RuntimeQueryResult> {
11257        use crate::auth::UserId;
11258        use crate::storage::query::ast::PolicyPrincipalRef;
11259        use crate::storage::query::unified::UnifiedRecord;
11260        use crate::storage::schema::Value as SchemaValue;
11261        use std::sync::Arc;
11262
11263        let auth_store = self
11264            .inner
11265            .auth_store
11266            .read()
11267            .clone()
11268            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11269
11270        let pols = match filter {
11271            None => auth_store.list_policies(),
11272            Some(PolicyPrincipalRef::User(u)) => {
11273                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
11274                auth_store.effective_policies(&id)
11275            }
11276            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
11277        };
11278
11279        let mut records = Vec::with_capacity(pols.len() + 1);
11280
11281        // Header row (#712 / S5A): synthetic record at index 0 that
11282        // reports the active PolicyEnforcementMode and the hard-cutover
11283        // version, so an operator running SHOW POLICIES can see the
11284        // current posture without a separate command.
11285        let mode = auth_store.enforcement_mode();
11286        let mut header = UnifiedRecord::default();
11287        header.set_arc(
11288            Arc::from("id"),
11289            SchemaValue::text("<enforcement_mode>".to_string()),
11290        );
11291        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
11292        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
11293        let header_json = format!(
11294            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
11295            mode.as_str(),
11296            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
11297        );
11298        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
11299        records.push(header);
11300
11301        for p in pols.iter() {
11302            let mut rec = UnifiedRecord::default();
11303            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
11304            rec.set_arc(
11305                Arc::from("statements"),
11306                SchemaValue::Integer(p.statements.len() as i64),
11307            );
11308            rec.set_arc(
11309                Arc::from("tenant"),
11310                p.tenant
11311                    .as_deref()
11312                    .map(|t| SchemaValue::text(t.to_string()))
11313                    .unwrap_or(SchemaValue::Null),
11314            );
11315            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
11316            records.push(rec);
11317        }
11318        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11319        result.records = records;
11320        Ok(RuntimeQueryResult {
11321            query: query.to_string(),
11322            mode: crate::storage::query::modes::QueryMode::Sql,
11323            statement: "show_policies",
11324            engine: "iam-policies",
11325            result,
11326            affected_rows: 0,
11327            statement_type: "select",
11328            bookmark: None,
11329        })
11330    }
11331
11332    fn execute_show_effective_permissions(
11333        &self,
11334        query: &str,
11335        user: &crate::storage::query::ast::PolicyUserRef,
11336        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
11337    ) -> RedDBResult<RuntimeQueryResult> {
11338        use crate::auth::UserId;
11339        use crate::storage::query::unified::UnifiedRecord;
11340        use crate::storage::schema::Value as SchemaValue;
11341        use std::sync::Arc;
11342
11343        let auth_store = self
11344            .inner
11345            .auth_store
11346            .read()
11347            .clone()
11348            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11349        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11350        let pols = auth_store.effective_policies(&id);
11351
11352        // Show one row per (policy, statement) tuple, plus any
11353        // resource-level filter passed by the caller.
11354        let mut records = Vec::new();
11355        for p in pols.iter() {
11356            for (idx, st) in p.statements.iter().enumerate() {
11357                if let Some(_r) = resource {
11358                    // Naive filter: render statement targets to strings
11359                    // and skip if no match. Conservative default = include
11360                    // (the simulator handles fine-grained matching).
11361                }
11362                let mut rec = UnifiedRecord::default();
11363                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
11364                rec.set_arc(
11365                    Arc::from("statement_index"),
11366                    SchemaValue::Integer(idx as i64),
11367                );
11368                rec.set_arc(
11369                    Arc::from("sid"),
11370                    st.sid
11371                        .as_deref()
11372                        .map(|s| SchemaValue::text(s.to_string()))
11373                        .unwrap_or(SchemaValue::Null),
11374                );
11375                rec.set_arc(
11376                    Arc::from("effect"),
11377                    SchemaValue::text(match st.effect {
11378                        crate::auth::policies::Effect::Allow => "allow",
11379                        crate::auth::policies::Effect::Deny => "deny",
11380                    }),
11381                );
11382                rec.set_arc(
11383                    Arc::from("actions"),
11384                    SchemaValue::Integer(st.actions.len() as i64),
11385                );
11386                rec.set_arc(
11387                    Arc::from("resources"),
11388                    SchemaValue::Integer(st.resources.len() as i64),
11389                );
11390                records.push(rec);
11391            }
11392        }
11393        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11394        result.records = records;
11395        Ok(RuntimeQueryResult {
11396            query: query.to_string(),
11397            mode: crate::storage::query::modes::QueryMode::Sql,
11398            statement: "show_effective_permissions",
11399            engine: "iam-policies",
11400            result,
11401            affected_rows: 0,
11402            statement_type: "select",
11403            bookmark: None,
11404        })
11405    }
11406
11407    fn execute_lint_policy(
11408        &self,
11409        query: &str,
11410        source: &crate::storage::query::ast::LintPolicySource,
11411    ) -> RedDBResult<RuntimeQueryResult> {
11412        use crate::auth::policy_linter::lint;
11413        use crate::storage::query::ast::LintPolicySource;
11414        use crate::storage::query::unified::UnifiedRecord;
11415        use crate::storage::schema::Value as SchemaValue;
11416        use std::sync::Arc;
11417
11418        // Resolve the policy text. `JSON` source lints the literal
11419        // verbatim; `Id` source fetches the stored document so
11420        // operators can lint a policy by name without rebuilding the
11421        // JSON from `SHOW POLICY`.
11422        let policy_text = match source {
11423            LintPolicySource::Json(text) => text.clone(),
11424            LintPolicySource::Id(id) => {
11425                let auth_store =
11426                    self.inner.auth_store.read().clone().ok_or_else(|| {
11427                        RedDBError::Query("auth store not configured".to_string())
11428                    })?;
11429                let policy = auth_store
11430                    .get_policy(id)
11431                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
11432                policy.to_json_string()
11433            }
11434        };
11435        let diagnostics = lint(&policy_text);
11436
11437        let principal_str = current_auth_identity()
11438            .map(|(u, _)| u)
11439            .unwrap_or_else(|| "anonymous".into());
11440        tracing::info!(
11441            target: "audit",
11442            principal = %principal_str,
11443            action = "iam:policy.lint",
11444            diagnostic_count = diagnostics.len(),
11445            "LINT POLICY issued"
11446        );
11447        self.inner.audit_log.record(
11448            "iam/policy.lint",
11449            &principal_str,
11450            match source {
11451                LintPolicySource::Id(id) => id.as_str(),
11452                LintPolicySource::Json(_) => "<json>",
11453            },
11454            "ok",
11455            crate::json::Value::Null,
11456        );
11457
11458        // One row per diagnostic. Column order matches the HTTP
11459        // surface's JSON keys so the two contracts line up.
11460        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
11461        let schema = Arc::new(
11462            COLUMNS
11463                .iter()
11464                .map(|name| Arc::<str>::from(*name))
11465                .collect::<Vec<_>>(),
11466        );
11467        let records: Vec<UnifiedRecord> = diagnostics
11468            .iter()
11469            .map(|d| {
11470                UnifiedRecord::with_schema(
11471                    Arc::clone(&schema),
11472                    vec![
11473                        SchemaValue::text(d.severity.as_str()),
11474                        SchemaValue::text(d.code.as_str()),
11475                        SchemaValue::text(d.message.clone()),
11476                        d.suggested_fix
11477                            .as_deref()
11478                            .map(SchemaValue::text)
11479                            .unwrap_or(SchemaValue::Null),
11480                        d.location
11481                            .as_deref()
11482                            .map(SchemaValue::text)
11483                            .unwrap_or(SchemaValue::Null),
11484                    ],
11485                )
11486            })
11487            .collect();
11488        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11489            COLUMNS.iter().map(|c| c.to_string()).collect(),
11490        );
11491        result.records = records;
11492        Ok(RuntimeQueryResult {
11493            query: query.to_string(),
11494            mode: crate::storage::query::modes::QueryMode::Sql,
11495            statement: "lint_policy",
11496            engine: "iam-policies",
11497            result,
11498            affected_rows: 0,
11499            statement_type: "select",
11500            bookmark: None,
11501        })
11502    }
11503
11504    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
11505    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
11506    /// simulator confirms no non-admin principal would lose access.
11507    /// Issue #714.
11508    fn execute_migrate_policy_mode(
11509        &self,
11510        query: &str,
11511        target: &str,
11512        dry_run: bool,
11513    ) -> RedDBResult<RuntimeQueryResult> {
11514        use crate::auth::enforcement_mode::PolicyEnforcementMode;
11515        use crate::auth::migrate_policy_mode::{
11516            principal_label, simulate_migration_delta, MigratePolicyDelta,
11517        };
11518        use crate::auth::policies::ResourceRef;
11519        use crate::storage::query::unified::UnifiedRecord;
11520        use crate::storage::schema::Value as SchemaValue;
11521        use std::sync::Arc;
11522
11523        // Only `policy_only` is a meaningful destination for this
11524        // command — flipping back to `legacy_rbac` is supported via
11525        // direct config writes (it doesn't need a pre-flight). We
11526        // reject everything else with the same allowlist `parse` uses.
11527        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
11528            RedDBError::Query(format!(
11529                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
11530            ))
11531        })?;
11532        if parsed != PolicyEnforcementMode::PolicyOnly {
11533            return Err(RedDBError::Query(format!(
11534                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
11535            )));
11536        }
11537
11538        let auth_store = self
11539            .inner
11540            .auth_store
11541            .read()
11542            .clone()
11543            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11544
11545        // Resource enumeration: every existing collection probed as
11546        // `table:<name>`. This is the realistic resource surface for
11547        // the legacy_rbac fallback (the role floors gate per-table
11548        // actions). Wildcard / column-scoped resources are still
11549        // covered by the policy evaluator because evaluate() resolves
11550        // resource patterns relative to the concrete resources we
11551        // probe here.
11552        let snapshot = self.inner.db.catalog_model_snapshot();
11553        let resources: Vec<ResourceRef> = snapshot
11554            .collections
11555            .iter()
11556            .map(|c| ResourceRef::new("table", c.name.clone()))
11557            .collect();
11558
11559        let now_ms = crate::utils::now_unix_millis() as u128;
11560        let deltas: Vec<MigratePolicyDelta> =
11561            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
11562
11563        let principal_str = current_auth_identity()
11564            .map(|(u, _)| u)
11565            .unwrap_or_else(|| "anonymous".into());
11566
11567        // Audit every issuance. The outcome line differentiates
11568        // dry-run, refused, and applied — operators can grep for these
11569        // strings in the audit log.
11570        let outcome_str = if dry_run {
11571            "dry_run"
11572        } else if deltas.is_empty() {
11573            "applied"
11574        } else {
11575            "refused"
11576        };
11577        tracing::info!(
11578            target: "audit",
11579            principal = %principal_str,
11580            action = "iam:policy.migrate_mode",
11581            target = %target,
11582            dry_run,
11583            delta_count = deltas.len(),
11584            outcome = outcome_str,
11585            "MIGRATE POLICY MODE issued"
11586        );
11587        self.inner.audit_log.record(
11588            "iam/policy.migrate_mode",
11589            &principal_str,
11590            target,
11591            outcome_str,
11592            crate::json::Value::Null,
11593        );
11594
11595        // Refuse the non-dry-run path when any principal would lose
11596        // access. The error string carries a compact summary plus the
11597        // delta count so operators can re-run with DRY RUN to inspect.
11598        if !dry_run && !deltas.is_empty() {
11599            let summary = deltas
11600                .iter()
11601                .take(5)
11602                .map(|d| {
11603                    format!(
11604                        "{}:{}/{}:{}",
11605                        principal_label(&d.principal),
11606                        d.action,
11607                        d.resource_kind,
11608                        d.resource_name
11609                    )
11610                })
11611                .collect::<Vec<_>>()
11612                .join(", ");
11613            let more = if deltas.len() > 5 {
11614                format!(" (and {} more)", deltas.len() - 5)
11615            } else {
11616                String::new()
11617            };
11618            return Err(RedDBError::Query(format!(
11619                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
11620                n = deltas.len(),
11621            )));
11622        }
11623
11624        // Mutate the live enforcement mode only on the non-dry-run
11625        // path with an empty delta. `set_enforcement_mode` also
11626        // persists to vault_kv so the new mode survives restart.
11627        if !dry_run {
11628            auth_store.set_enforcement_mode(parsed);
11629        }
11630
11631        const COLUMNS: [&str; 5] = [
11632            "principal",
11633            "role",
11634            "action",
11635            "resource_kind",
11636            "resource_name",
11637        ];
11638        let schema = Arc::new(
11639            COLUMNS
11640                .iter()
11641                .map(|name| Arc::<str>::from(*name))
11642                .collect::<Vec<_>>(),
11643        );
11644        let records: Vec<UnifiedRecord> = deltas
11645            .iter()
11646            .map(|d| {
11647                UnifiedRecord::with_schema(
11648                    Arc::clone(&schema),
11649                    vec![
11650                        SchemaValue::text(principal_label(&d.principal)),
11651                        SchemaValue::text(d.role.as_str()),
11652                        SchemaValue::text(d.action.clone()),
11653                        SchemaValue::text(d.resource_kind.clone()),
11654                        SchemaValue::text(d.resource_name.clone()),
11655                    ],
11656                )
11657            })
11658            .collect();
11659        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11660            COLUMNS.iter().map(|c| c.to_string()).collect(),
11661        );
11662        result.records = records;
11663        Ok(RuntimeQueryResult {
11664            query: query.to_string(),
11665            mode: crate::storage::query::modes::QueryMode::Sql,
11666            statement: "migrate_policy_mode",
11667            engine: "iam-policies",
11668            result,
11669            affected_rows: 0,
11670            statement_type: "select",
11671            bookmark: None,
11672        })
11673    }
11674
    /// Run the auth store's policy simulator for `user` performing `action`
    /// on `resource` and report the outcome as a query result.
    ///
    /// Every call emits an `audit`-target tracing event and appends an
    /// `iam/policy.simulate` entry to the runtime audit log. Both are
    /// attributed to the identity returned by `current_auth_identity()`, or
    /// to `"anonymous"` when there is none.
    ///
    /// The result holds exactly one record with the fields `decision`,
    /// `matched_policy_id`, `matched_sid`, `reason` and `trail_len`.
    ///
    /// # Errors
    ///
    /// Returns `RedDBError::Query` when no auth store is configured.
    fn execute_simulate_policy(
        &self,
        query: &str,
        user: &crate::storage::query::ast::PolicyUserRef,
        action: &str,
        resource: &crate::storage::query::ast::PolicyResourceRef,
    ) -> RedDBResult<RuntimeQueryResult> {
        use crate::auth::policies::ResourceRef;
        use crate::auth::store::SimCtx;
        use crate::auth::UserId;
        use crate::storage::query::unified::UnifiedRecord;
        use crate::storage::schema::Value as SchemaValue;
        use std::sync::Arc;

        // Take a clone of the configured store (if any); bail out with a
        // query error when the runtime has none.
        let auth_store = self
            .inner
            .auth_store
            .read()
            .clone()
            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
        // `id` is the *simulated* subject, not the caller issuing SIMULATE.
        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());

        // `principal_str` is the caller issuing the statement.
        let principal_str = current_auth_identity()
            .map(|(u, _)| u)
            .unwrap_or_else(|| "anonymous".into());
        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
        tracing::info!(
            target: "audit",
            principal = %principal_str,
            action = "iam:policy.simulate",
            decision = %decision_str,
            matched_policy_id = ?matched_pid,
            matched_sid = ?matched_sid,
            "SIMULATE issued"
        );
        // NOTE(review): the fourth argument is always the literal "ok",
        // whatever the simulated decision was — presumably it describes the
        // SIMULATE call itself rather than the decision; confirm.
        self.inner.audit_log.record(
            "iam/policy.simulate",
            &principal_str,
            &id.to_string(),
            "ok",
            crate::json::Value::Null,
        );

        // Build the single response record; absent policy id / sid become NULL.
        let mut rec = UnifiedRecord::default();
        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
        rec.set_arc(
            Arc::from("matched_policy_id"),
            matched_pid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(
            Arc::from("matched_sid"),
            matched_sid
                .map(SchemaValue::text)
                .unwrap_or(SchemaValue::Null),
        );
        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
        // Only the length of the evaluation trail is returned, not its entries.
        rec.set_arc(
            Arc::from("trail_len"),
            SchemaValue::Integer(outcome.trail.len() as i64),
        );
        // NOTE(review): starts from `UnifiedResult::empty()`, so no explicit
        // column list is supplied here — verify that is what clients expect.
        let mut result = crate::storage::query::unified::UnifiedResult::empty();
        result.records = vec![rec];
        Ok(RuntimeQueryResult {
            query: query.to_string(),
            mode: crate::storage::query::modes::QueryMode::Sql,
            statement: "simulate_policy",
            engine: "iam-policies",
            result,
            affected_rows: 0,
            statement_type: "select",
            bookmark: None,
        })
    }
11752}
11753
11754/// Translate a parsed GRANT into a synthetic IAM policy whose id
11755/// starts with `_grant_<unique>`. PUBLIC is represented as an
11756/// implicit IAM group; legacy GROUP grants are still rejected by the
11757/// grant store and are not translated here.
11758fn grant_to_iam_policy(
11759    principal: &crate::auth::privileges::GrantPrincipal,
11760    resource: &crate::auth::privileges::Resource,
11761    actions: &[crate::auth::privileges::Action],
11762    tenant: Option<&str>,
11763) -> Option<crate::auth::policies::Policy> {
11764    use crate::auth::policies::{
11765        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
11766    };
11767    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
11768
11769    if matches!(principal, GrantPrincipal::Group(_)) {
11770        return None;
11771    }
11772
11773    let now = crate::auth::now_ms();
11774    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
11775
11776    let resource_str = match resource {
11777        Resource::Database => "table:*".to_string(),
11778        Resource::Schema(s) => format!("table:{s}.*"),
11779        Resource::Table { schema, table } => match schema {
11780            Some(s) => format!("table:{s}.{table}"),
11781            None => format!("table:{table}"),
11782        },
11783        Resource::Function { schema, name } => match schema {
11784            Some(s) => format!("function:{s}.{name}"),
11785            None => format!("function:{name}"),
11786        },
11787    };
11788
11789    // Compile actions — fall back to `*` only when the grant included
11790    // `Action::All`. Map every other action keyword to its lowercase
11791    // form so it lines up with the kernel's allowlist.
11792    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
11793        vec![ActionPattern::Wildcard]
11794    } else {
11795        actions
11796            .iter()
11797            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
11798            .collect()
11799    };
11800    if action_patterns.is_empty() {
11801        return None;
11802    }
11803
11804    // Inline resource compilation matching the kernel's `compile_resource`:
11805    //   * `*` → wildcard
11806    //   * contains `*` → glob
11807    //   * `kind:name` → exact
11808    let resource_patterns = if resource_str == "*" {
11809        vec![ResourcePattern::Wildcard]
11810    } else if resource_str.contains('*') {
11811        vec![ResourcePattern::Glob(resource_str.clone())]
11812    } else if let Some((kind, name)) = resource_str.split_once(':') {
11813        vec![ResourcePattern::Exact {
11814            kind: kind.to_string(),
11815            name: name.to_string(),
11816        }]
11817    } else {
11818        vec![ResourcePattern::Wildcard]
11819    };
11820
11821    let policy = Policy {
11822        id,
11823        version: 1,
11824        tenant: tenant.map(|t| t.to_string()),
11825        created_at: now,
11826        updated_at: now,
11827        statements: vec![Statement {
11828            sid: None,
11829            effect: Effect::Allow,
11830            actions: action_patterns,
11831            resources: resource_patterns,
11832            condition: None,
11833        }],
11834    };
11835    if policy.validate().is_err() {
11836        return None;
11837    }
11838    Some(policy)
11839}
11840
11841/// Coerce a `key => <number>` table-function named argument into a positive
11842/// iteration count for the centrality TVFs (issue #797). The parser lexes all
11843/// named values as `f64`, so an integral, finite, strictly-positive value is
11844/// required here; anything else (fractional, zero, negative, NaN/inf) is a
11845/// clear query error. `func` names the function for the message.
11846fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
11847    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
11848        return Err(RedDBError::Query(format!(
11849            "table function '{func}' max_iterations must be a positive integer, got {value}"
11850        )));
11851    }
11852    Ok(*value as usize)
11853}
11854
11855fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
11856    use crate::auth::privileges::Action;
11857    match action {
11858        Action::Select => "select",
11859        Action::Insert => "insert",
11860        Action::Update => "update",
11861        Action::Delete => "delete",
11862        Action::Truncate => "truncate",
11863        Action::References => "references",
11864        Action::Execute => "execute",
11865        Action::Usage => "usage",
11866        Action::All => "*",
11867    }
11868}
11869
11870fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
11871    let mut columns = Vec::new();
11872    for (column, _) in &query.assignment_exprs {
11873        if !columns.iter().any(|seen| seen == column) {
11874            columns.push(column.clone());
11875        }
11876    }
11877    columns
11878}
11879
11880fn column_access_request_for_table_update(
11881    table_name: &str,
11882    columns: Vec<String>,
11883) -> crate::auth::ColumnAccessRequest {
11884    match table_name.split_once('.') {
11885        Some((schema, table)) => {
11886            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
11887                .with_schema(schema.to_string())
11888        }
11889        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
11890    }
11891}
11892
11893fn column_access_request_for_table_select(
11894    table_name: &str,
11895    columns: Vec<String>,
11896) -> crate::auth::ColumnAccessRequest {
11897    match table_name.split_once('.') {
11898        Some((schema, table)) => {
11899            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
11900                .with_schema(schema.to_string())
11901        }
11902        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
11903    }
11904}
11905
11906fn update_returning_columns_for_policy(
11907    runtime: &RedDBRuntime,
11908    query: &crate::storage::query::ast::UpdateQuery,
11909) -> Option<Vec<String>> {
11910    let items = query.returning.as_ref()?;
11911    let mut columns = Vec::new();
11912    let project_all = items
11913        .iter()
11914        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
11915    if project_all {
11916        collect_returning_star_columns(runtime, query, &mut columns);
11917    } else {
11918        for item in items {
11919            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
11920                continue;
11921            };
11922            push_returning_policy_column(&mut columns, column);
11923        }
11924    }
11925    (!columns.is_empty()).then_some(columns)
11926}
11927
11928fn collect_returning_star_columns(
11929    runtime: &RedDBRuntime,
11930    query: &crate::storage::query::ast::UpdateQuery,
11931    columns: &mut Vec<String>,
11932) {
11933    let store = runtime.db().store();
11934    let Some(manager) = store.get_collection(&query.table) else {
11935        return;
11936    };
11937    if let Some(schema) = manager.column_schema() {
11938        for column in schema.iter() {
11939            push_returning_policy_column(columns, column);
11940        }
11941    }
11942    for entity in manager.query_all(|_| true) {
11943        if !returning_entity_matches_update_target(&entity, query.target) {
11944            continue;
11945        }
11946        match &entity.data {
11947            crate::storage::EntityData::Row(row) => {
11948                for (column, _) in row.iter_fields() {
11949                    push_returning_policy_column(columns, column);
11950                }
11951            }
11952            crate::storage::EntityData::Node(node) => {
11953                push_returning_policy_column(columns, "label");
11954                push_returning_policy_column(columns, "node_type");
11955                for column in node.properties.keys() {
11956                    push_returning_policy_column(columns, column);
11957                }
11958            }
11959            crate::storage::EntityData::Edge(edge) => {
11960                push_returning_policy_column(columns, "label");
11961                push_returning_policy_column(columns, "from_rid");
11962                push_returning_policy_column(columns, "to_rid");
11963                push_returning_policy_column(columns, "weight");
11964                for column in edge.properties.keys() {
11965                    push_returning_policy_column(columns, column);
11966                }
11967            }
11968            _ => {}
11969        }
11970    }
11971}
11972
11973fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
11974    if returning_public_envelope_column(column) {
11975        return;
11976    }
11977    if !columns.iter().any(|seen| seen == column) {
11978        columns.push(column.to_string());
11979    }
11980}
11981
/// Report whether `column` is one of the public envelope columns. The
/// comparison ignores ASCII case.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|reserved| column.eq_ignore_ascii_case(reserved))
}
11988
11989fn returning_entity_matches_update_target(
11990    entity: &crate::storage::UnifiedEntity,
11991    target: crate::storage::query::ast::UpdateTarget,
11992) -> bool {
11993    use crate::storage::query::ast::UpdateTarget;
11994    match target {
11995        UpdateTarget::Rows => {
11996            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
11997        }
11998        UpdateTarget::Documents => {
11999            matches!(
12000                returning_row_item_kind(entity),
12001                Some(ReturningRowKind::Document)
12002            )
12003        }
12004        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
12005        UpdateTarget::Nodes => matches!(
12006            (&entity.kind, &entity.data),
12007            (
12008                crate::storage::EntityKind::GraphNode(_),
12009                crate::storage::EntityData::Node(_)
12010            )
12011        ),
12012        UpdateTarget::Edges => matches!(
12013            (&entity.kind, &entity.data),
12014            (
12015                crate::storage::EntityKind::GraphEdge(_),
12016                crate::storage::EntityData::Edge(_)
12017            )
12018        ),
12019    }
12020}
12021
/// Shape of a row-backed entity as classified by `returning_row_item_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReturningRowKind {
    /// Has a field other than `key`/`value` and no JSON-valued field.
    Row,
    /// Has a field other than `key`/`value` and at least one JSON-valued field.
    Document,
    /// Every field is named `key` or `value` (ASCII case-insensitive).
    Kv,
}
12028
12029fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
12030    let row = entity.data.as_row()?;
12031    let is_kv = row.iter_fields().all(|(column, _)| {
12032        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
12033    });
12034    if is_kv {
12035        return Some(ReturningRowKind::Kv);
12036    }
12037    let is_document = row
12038        .iter_fields()
12039        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
12040    if is_document {
12041        Some(ReturningRowKind::Document)
12042    } else {
12043        Some(ReturningRowKind::Row)
12044    }
12045}
12046
12047fn requested_table_columns_for_policy(
12048    table: &crate::storage::query::ast::TableQuery,
12049) -> Vec<String> {
12050    use crate::storage::query::sql_lowering::{
12051        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
12052        effective_table_projections,
12053    };
12054
12055    let table_name = table.table.as_str();
12056    let table_alias = table.alias.as_deref();
12057    let mut columns = std::collections::BTreeSet::new();
12058
12059    for projection in effective_table_projections(table) {
12060        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
12061    }
12062    if let Some(filter) = effective_table_filter(table) {
12063        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12064    }
12065    for expr in effective_table_group_by_exprs(table) {
12066        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
12067    }
12068    if let Some(filter) = effective_table_having_filter(table) {
12069        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
12070    }
12071    for order in &table.order_by {
12072        if let Some(expr) = order.expr.as_ref() {
12073            collect_expr_columns(expr, table_name, table_alias, &mut columns);
12074        } else {
12075            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
12076        }
12077    }
12078
12079    columns.into_iter().collect()
12080}
12081
12082fn collect_projection_columns(
12083    projection: &crate::storage::query::ast::Projection,
12084    table_name: &str,
12085    table_alias: Option<&str>,
12086    columns: &mut std::collections::BTreeSet<String>,
12087) {
12088    use crate::storage::query::ast::Projection;
12089    match projection {
12090        Projection::All => {
12091            columns.insert("*".to_string());
12092        }
12093        Projection::Column(column) | Projection::Alias(column, _) => {
12094            if column != "*" {
12095                columns.insert(column.clone());
12096            }
12097        }
12098        Projection::Function(_, args) => {
12099            for arg in args {
12100                collect_projection_columns(arg, table_name, table_alias, columns);
12101            }
12102        }
12103        Projection::Expression(filter, _) => {
12104            collect_filter_columns(filter, table_name, table_alias, columns);
12105        }
12106        Projection::Field(field, _) => {
12107            collect_field_ref_column(field, table_name, table_alias, columns);
12108        }
12109        // Slice 7a (#589): no runtime support yet; recurse into args so
12110        // any column references are still tracked in case a future
12111        // executor needs the column set.
12112        Projection::Window { args, .. } => {
12113            for arg in args {
12114                collect_projection_columns(arg, table_name, table_alias, columns);
12115            }
12116        }
12117    }
12118}
12119
/// Record every column a filter tree references into `columns`.
///
/// Single-field predicates contribute their field, field-to-field
/// comparisons contribute both sides, expression comparisons are walked via
/// `collect_expr_columns`, and `And`/`Or`/`Not` recurse into their operands.
fn collect_filter_columns(
    filter: &crate::storage::query::ast::Filter,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    use crate::storage::query::ast::Filter;
    match filter {
        // Predicates over exactly one field.
        Filter::Compare { field, .. }
        | Filter::IsNull(field)
        | Filter::IsNotNull(field)
        | Filter::In { field, .. }
        | Filter::Between { field, .. }
        | Filter::Like { field, .. }
        | Filter::StartsWith { field, .. }
        | Filter::EndsWith { field, .. }
        | Filter::Contains { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        // Field-to-field comparison: both sides are column references.
        Filter::CompareFields { left, right, .. } => {
            collect_field_ref_column(left, table_name, table_alias, columns);
            collect_field_ref_column(right, table_name, table_alias, columns);
        }
        // Expression comparison: walk both expression trees.
        Filter::CompareExpr { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        // Boolean connectives: recurse into the operands.
        Filter::And(left, right) | Filter::Or(left, right) => {
            collect_filter_columns(left, table_name, table_alias, columns);
            collect_filter_columns(right, table_name, table_alias, columns);
        }
        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
    }
}
12154
/// Record every column an expression tree references into `columns`.
///
/// Column references are resolved through `collect_field_ref_column`;
/// composite expressions recurse into all of their operands. Literals,
/// parameters and subqueries contribute nothing here.
fn collect_expr_columns(
    expr: &crate::storage::query::ast::Expr,
    table_name: &str,
    table_alias: Option<&str>,
    columns: &mut std::collections::BTreeSet<String>,
) {
    use crate::storage::query::ast::Expr;
    match expr {
        Expr::Column { field, .. } => {
            collect_field_ref_column(field, table_name, table_alias, columns);
        }
        // Leaves that cannot name a column.
        Expr::Literal { .. } | Expr::Parameter { .. } => {}
        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::BinaryOp { lhs, rhs, .. } => {
            collect_expr_columns(lhs, table_name, table_alias, columns);
            collect_expr_columns(rhs, table_name, table_alias, columns);
        }
        Expr::FunctionCall { args, .. } => {
            for arg in args {
                collect_expr_columns(arg, table_name, table_alias, columns);
            }
        }
        // CASE: every branch condition, every branch value, and the ELSE value.
        Expr::Case {
            branches, else_, ..
        } => {
            for (condition, value) in branches {
                collect_expr_columns(condition, table_name, table_alias, columns);
                collect_expr_columns(value, table_name, table_alias, columns);
            }
            if let Some(value) = else_ {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::IsNull { operand, .. } => {
            collect_expr_columns(operand, table_name, table_alias, columns);
        }
        Expr::InList { target, values, .. } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            for value in values {
                collect_expr_columns(value, table_name, table_alias, columns);
            }
        }
        Expr::Between {
            target, low, high, ..
        } => {
            collect_expr_columns(target, table_name, table_alias, columns);
            collect_expr_columns(low, table_name, table_alias, columns);
            collect_expr_columns(high, table_name, table_alias, columns);
        }
        // Columns referenced inside a subquery are not collected by this walk.
        Expr::Subquery { .. } => {}
        // Window call: arguments plus the PARTITION BY and ORDER BY expressions.
        Expr::WindowFunctionCall { args, window, .. } => {
            for arg in args {
                collect_expr_columns(arg, table_name, table_alias, columns);
            }
            for e in &window.partition_by {
                collect_expr_columns(e, table_name, table_alias, columns);
            }
            for o in &window.order_by {
                collect_expr_columns(&o.expr, table_name, table_alias, columns);
            }
        }
    }
}
12220
12221fn collect_field_ref_column(
12222    field: &crate::storage::query::ast::FieldRef,
12223    table_name: &str,
12224    table_alias: Option<&str>,
12225    columns: &mut std::collections::BTreeSet<String>,
12226) {
12227    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
12228        if column != "*" {
12229            columns.insert(column);
12230        }
12231    }
12232}
12233
12234fn policy_column_name_from_field_ref(
12235    field: &crate::storage::query::ast::FieldRef,
12236    table_name: &str,
12237    table_alias: Option<&str>,
12238) -> Option<String> {
12239    match field {
12240        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
12241            if column == "*" {
12242                return Some("*".to_string());
12243            }
12244            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
12245                Some(column.clone())
12246            } else {
12247                Some(format!("{table}.{column}"))
12248            }
12249        }
12250        _ => None,
12251    }
12252}
12253
12254fn legacy_resource_to_iam(
12255    resource: &crate::auth::privileges::Resource,
12256    tenant: Option<&str>,
12257) -> crate::auth::policies::ResourceRef {
12258    use crate::auth::privileges::Resource;
12259
12260    let (kind, name) = match resource {
12261        Resource::Database => ("database".to_string(), "*".to_string()),
12262        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
12263        Resource::Table { schema, table } => (
12264            "table".to_string(),
12265            match schema {
12266                Some(s) => format!("{s}.{table}"),
12267                None => table.clone(),
12268            },
12269        ),
12270        Resource::Function { schema, name } => (
12271            "function".to_string(),
12272            match schema {
12273                Some(s) => format!("{s}.{name}"),
12274                None => name.clone(),
12275            },
12276        ),
12277    };
12278
12279    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
12280    if let Some(t) = tenant {
12281        out = out.with_tenant(t.to_string());
12282    }
12283    out
12284}
12285
/// One table operand of a join, as built by `table_side_context`.
#[derive(Debug)]
struct JoinTableSide {
    /// Name of the underlying table.
    table: String,
    /// Alias the query uses for the table; `table_side_context` fills in the
    /// table name itself when the query gave no alias.
    alias: String,
}
12291
12292fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
12293    match expr {
12294        QueryExpr::Table(table) => Some(JoinTableSide {
12295            table: table.table.clone(),
12296            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
12297        }),
12298        _ => None,
12299    }
12300}
12301
/// Collect the policy columns that `projection` reads from one specific
/// table into `out`.
///
/// A qualified reference (`q.col`) counts only when `q` equals `table` or
/// `alias`. Unqualified names are always counted. Function and window
/// arguments are walked recursively. Expressions, `*`, and any field
/// reference not handled by an earlier arm contribute nothing.
///
/// Arm order matters: the trailing `Projection::Field(_, _)` pattern only
/// sees the field references the more specific arms above did not match.
fn collect_projection_columns_for_table(
    projection: &Projection,
    table: &str,
    alias: Option<&str>,
    out: &mut BTreeSet<String>,
) {
    match projection {
        Projection::Column(column) | Projection::Alias(column, _) => {
            match split_qualified_column(column) {
                // `q.col` that names this table (or its alias): keep `col`.
                Some((qualifier, column))
                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
                {
                    push_policy_column(column, out);
                }
                // `q.col` that names some other table: not ours.
                Some(_) => {}
                // Not a simple `q.col` name: keep the whole string.
                None => push_policy_column(column, out),
            }
        }
        Projection::Field(
            FieldRef::TableColumn {
                table: qualifier,
                column,
            },
            _,
        ) => {
            // An empty qualifier is treated as referring to this table.
            if qualifier.is_empty()
                || qualifier == table
                || alias.is_some_and(|alias| qualifier == alias)
            {
                push_policy_column(column, out);
            }
        }
        // Graph property references: unlike table columns, these count only
        // when the alias names this table or its alias.
        Projection::Field(
            FieldRef::NodeProperty {
                alias: qualifier,
                property,
            },
            _,
        )
        | Projection::Field(
            FieldRef::EdgeProperty {
                alias: qualifier,
                property,
            },
            _,
        ) => {
            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
                push_policy_column(property, out);
            }
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_table(arg, table, alias, out);
            }
        }
        // Columns referenced inside expression projections are not collected here.
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
        Projection::Window { args, .. } => {
            for arg in args {
                collect_projection_columns_for_table(arg, table, alias, out);
            }
        }
    }
}
12365
/// Attribute the columns a join projection reads to the join side they
/// belong to, keyed in `out` by table name.
///
/// Qualified references go to the side whose table name or alias matches the
/// qualifier, and are dropped when neither side matches. Unqualified
/// references go to every side that is present. Function and window
/// arguments are walked recursively. Expressions, `*`, and any field
/// reference not handled by an earlier arm contribute nothing.
///
/// No arm in this function constructs an error; the `?` only propagates
/// results from the recursive calls.
fn collect_projection_columns_for_join_side(
    projection: &Projection,
    left: Option<&JoinTableSide>,
    right: Option<&JoinTableSide>,
    out: &mut HashMap<String, BTreeSet<String>>,
) -> RedDBResult<()> {
    match projection {
        Projection::Column(column) | Projection::Alias(column, _) => {
            if let Some((qualifier, column)) = split_qualified_column(column) {
                push_qualified_join_column(qualifier, column, left, right, out);
            } else {
                push_unqualified_join_column(column, left, right, out);
            }
        }
        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
            if table.is_empty() {
                push_unqualified_join_column(column, left, right, out);
            } else if let Some(side) = [left, right]
                .into_iter()
                .flatten()
                // The left side is checked first, so it wins when both match.
                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
            {
                push_join_column(&side.table, column, out);
            }
        }
        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
            push_qualified_join_column(alias, property, left, right, out);
        }
        Projection::Function(_, args) => {
            for arg in args {
                collect_projection_columns_for_join_side(arg, left, right, out)?;
            }
        }
        // Must stay below the specific `Projection::Field` arms above: this
        // catch-all only handles the field references they did not match.
        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
        Projection::Window { args, .. } => {
            for arg in args {
                collect_projection_columns_for_join_side(arg, left, right, out)?;
            }
        }
    }
    Ok(())
}
12409
/// Split a `qualifier.column` name at its single dot.
///
/// Returns `None` unless there is exactly one `.` with non-empty text on
/// both sides of it.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column
        .split_once('.')
        .filter(|(qualifier, name)| {
            !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
        })
}
12417
12418fn push_qualified_join_column(
12419    qualifier: &str,
12420    column: &str,
12421    left: Option<&JoinTableSide>,
12422    right: Option<&JoinTableSide>,
12423    out: &mut HashMap<String, BTreeSet<String>>,
12424) {
12425    if let Some(side) = [left, right]
12426        .into_iter()
12427        .flatten()
12428        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
12429    {
12430        push_join_column(&side.table, column, out);
12431    }
12432}
12433
12434fn push_unqualified_join_column(
12435    column: &str,
12436    left: Option<&JoinTableSide>,
12437    right: Option<&JoinTableSide>,
12438    out: &mut HashMap<String, BTreeSet<String>>,
12439) {
12440    for side in [left, right].into_iter().flatten() {
12441        push_join_column(&side.table, column, out);
12442    }
12443}
12444
12445fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
12446    if is_policy_column_name(column) {
12447        out.entry(table.to_string())
12448            .or_default()
12449            .insert(column.to_string());
12450    }
12451}
12452
12453fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
12454    if is_policy_column_name(column) {
12455        out.insert(column.to_string());
12456    }
12457}
12458
/// Report whether `column` is a name the column-policy check should see.
/// Rejected are the empty string, the `*` wildcard, and names starting with
/// `LIT:` or `TYPE:`.
fn is_policy_column_name(column: &str) -> bool {
    if column.is_empty() || column == "*" {
        return false;
    }
    !["LIT:", "TYPE:"]
        .iter()
        .any(|marker| column.starts_with(*marker))
}
12465
12466fn runtime_iam_context(
12467    role: crate::auth::Role,
12468    tenant: Option<&str>,
12469) -> crate::auth::policies::EvalContext {
12470    crate::auth::policies::EvalContext {
12471        principal_tenant: tenant.map(|t| t.to_string()),
12472        current_tenant: tenant.map(|t| t.to_string()),
12473        peer_ip: None,
12474        mfa_present: false,
12475        now_ms: crate::auth::now_ms(),
12476        principal_is_admin_role: role == crate::auth::Role::Admin,
12477        principal_is_platform_scoped: tenant.is_none(),
12478    }
12479}
12480
12481fn explicit_table_projection_columns(
12482    query: &crate::storage::query::ast::TableQuery,
12483) -> Vec<String> {
12484    use crate::storage::query::ast::{FieldRef, Projection};
12485
12486    let mut columns = Vec::new();
12487    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
12488        match projection {
12489            Projection::Column(column) | Projection::Alias(column, _) => {
12490                push_unique(&mut columns, column)
12491            }
12492            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
12493                push_unique(&mut columns, column)
12494            }
12495            // SELECT * and expression/function projections need the
12496            // executor-wide column-policy context mapped in
12497            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
12498            _ => {}
12499        }
12500    }
12501    columns
12502}
12503
12504fn explicit_graph_projection_properties(
12505    query: &crate::storage::query::ast::GraphQuery,
12506) -> Vec<String> {
12507    use crate::storage::query::ast::{FieldRef, Projection};
12508
12509    let mut columns = Vec::new();
12510    for projection in &query.return_ {
12511        match projection {
12512            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
12513            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
12514                push_unique(&mut columns, property.clone())
12515            }
12516            _ => {}
12517        }
12518    }
12519    columns
12520}
12521
/// Appends `column` to `columns` unless an equal entry is already present,
/// preserving the existing order.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
12527
12528fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
12529    use crate::storage::query::ast::PolicyPrincipalRef;
12530    match p {
12531        PolicyPrincipalRef::User(u) => match &u.tenant {
12532            Some(t) => format!("user:{t}/{}", u.username),
12533            None => format!("user:{}", u.username),
12534        },
12535        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
12536    }
12537}
12538
12539/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
12540/// shape used by every audit emit + the simulator response.
12541pub(crate) fn decision_to_strings(
12542    d: &crate::auth::policies::Decision,
12543) -> (String, Option<String>, Option<String>) {
12544    use crate::auth::policies::Decision;
12545    match d {
12546        Decision::Allow {
12547            matched_policy_id,
12548            matched_sid,
12549        } => (
12550            "allow".into(),
12551            Some(matched_policy_id.clone()),
12552            matched_sid.clone(),
12553        ),
12554        Decision::Deny {
12555            matched_policy_id,
12556            matched_sid,
12557        } => (
12558            "deny".into(),
12559            Some(matched_policy_id.clone()),
12560            matched_sid.clone(),
12561        ),
12562        Decision::DefaultDeny => ("default_deny".into(), None, None),
12563        Decision::AdminBypass => ("admin_bypass".into(), None, None),
12564    }
12565}
12566
12567fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
12568    let mut scopes = Vec::new();
12569    collect_relation_scopes(query, &mut scopes);
12570    scopes.sort();
12571    scopes.dedup();
12572    scopes
12573}
12574
12575fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
12576    match query {
12577        QueryExpr::Table(table) => {
12578            if !table.table.is_empty() {
12579                scopes.push(table.table.clone());
12580            }
12581            if let Some(alias) = &table.alias {
12582                scopes.push(alias.clone());
12583            }
12584        }
12585        QueryExpr::Join(join) => {
12586            collect_relation_scopes(&join.left, scopes);
12587            collect_relation_scopes(&join.right, scopes);
12588        }
12589        _ => {}
12590    }
12591}
12592
12593fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
12594    let inner_scopes = relation_scopes_for_query(query);
12595    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
12596}
12597
12598fn query_expr_references_outer_scope(
12599    query: &QueryExpr,
12600    outer_scopes: &[String],
12601    inner_scopes: &[String],
12602) -> bool {
12603    match query {
12604        QueryExpr::Table(table) => {
12605            table.select_items.iter().any(|item| match item {
12606                crate::storage::query::ast::SelectItem::Wildcard => false,
12607                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12608                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12609                }
12610            }) || table
12611                .where_expr
12612                .as_ref()
12613                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12614                || table.filter.as_ref().is_some_and(|filter| {
12615                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12616                })
12617                || table.having_expr.as_ref().is_some_and(|expr| {
12618                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12619                })
12620                || table.having.as_ref().is_some_and(|filter| {
12621                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12622                })
12623                || table
12624                    .group_by_exprs
12625                    .iter()
12626                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12627                || table.order_by.iter().any(|clause| {
12628                    clause.expr.as_ref().is_some_and(|expr| {
12629                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12630                    })
12631                })
12632        }
12633        QueryExpr::Join(join) => {
12634            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
12635                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
12636                || join.filter.as_ref().is_some_and(|filter| {
12637                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12638                })
12639                || join.return_items.iter().any(|item| match item {
12640                    crate::storage::query::ast::SelectItem::Wildcard => false,
12641                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12642                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12643                    }
12644                })
12645        }
12646        _ => false,
12647    }
12648}
12649
12650fn filter_references_outer_scope(
12651    filter: &crate::storage::query::ast::Filter,
12652    outer_scopes: &[String],
12653    inner_scopes: &[String],
12654) -> bool {
12655    use crate::storage::query::ast::Filter;
12656    match filter {
12657        Filter::Compare { field, .. }
12658        | Filter::IsNull(field)
12659        | Filter::IsNotNull(field)
12660        | Filter::In { field, .. }
12661        | Filter::Between { field, .. }
12662        | Filter::Like { field, .. }
12663        | Filter::StartsWith { field, .. }
12664        | Filter::EndsWith { field, .. }
12665        | Filter::Contains { field, .. } => {
12666            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12667        }
12668        Filter::CompareFields { left, right, .. } => {
12669            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
12670                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
12671        }
12672        Filter::CompareExpr { lhs, rhs, .. } => {
12673            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12674                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12675        }
12676        Filter::And(left, right) | Filter::Or(left, right) => {
12677            filter_references_outer_scope(left, outer_scopes, inner_scopes)
12678                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
12679        }
12680        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
12681    }
12682}
12683
12684fn expr_references_outer_scope(
12685    expr: &crate::storage::query::ast::Expr,
12686    outer_scopes: &[String],
12687    inner_scopes: &[String],
12688) -> bool {
12689    use crate::storage::query::ast::Expr;
12690    match expr {
12691        Expr::Column { field, .. } => {
12692            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12693        }
12694        Expr::BinaryOp { lhs, rhs, .. } => {
12695            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12696                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12697        }
12698        Expr::UnaryOp { operand, .. }
12699        | Expr::Cast { inner: operand, .. }
12700        | Expr::IsNull { operand, .. } => {
12701            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
12702        }
12703        Expr::FunctionCall { args, .. } => args
12704            .iter()
12705            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
12706        Expr::Case {
12707            branches, else_, ..
12708        } => {
12709            branches.iter().any(|(cond, value)| {
12710                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
12711                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
12712            }) || else_
12713                .as_ref()
12714                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12715        }
12716        Expr::InList { target, values, .. } => {
12717            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12718                || values
12719                    .iter()
12720                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
12721        }
12722        Expr::Between {
12723            target, low, high, ..
12724        } => {
12725            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12726                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
12727                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
12728        }
12729        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
12730        Expr::Literal { .. } | Expr::Parameter { .. } => false,
12731        Expr::WindowFunctionCall { args, window, .. } => {
12732            args.iter()
12733                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
12734                || window
12735                    .partition_by
12736                    .iter()
12737                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
12738                || window
12739                    .order_by
12740                    .iter()
12741                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
12742        }
12743    }
12744}
12745
12746fn field_ref_references_outer_scope(
12747    field: &crate::storage::query::ast::FieldRef,
12748    outer_scopes: &[String],
12749    inner_scopes: &[String],
12750) -> bool {
12751    match field {
12752        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
12753            outer_scopes.iter().any(|scope| scope == table)
12754                && !inner_scopes.iter().any(|scope| scope == table)
12755        }
12756        _ => false,
12757    }
12758}
12759
12760fn first_column_values(
12761    result: crate::storage::query::unified::UnifiedResult,
12762) -> RedDBResult<Vec<Value>> {
12763    if result.columns.len() > 1 {
12764        return Err(RedDBError::Query(
12765            "expression subquery must return exactly one column".to_string(),
12766        ));
12767    }
12768    let fallback_column = result
12769        .records
12770        .first()
12771        .and_then(|record| record.column_names().into_iter().next())
12772        .map(|name| name.to_string());
12773    let column = result.columns.first().cloned().or(fallback_column);
12774    let Some(column) = column else {
12775        return Ok(Vec::new());
12776    };
12777    Ok(result
12778        .records
12779        .iter()
12780        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
12781        .collect())
12782}
12783
/// Parses a timestamp literal into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
///
/// * a bare non-negative integer, returned unchanged as milliseconds;
/// * a `YYYY-MM-DD` date, optionally followed by whitespace and further
///   text (such as a time of day) which is ignored — the result is
///   midnight UTC of that date. Full RFC 3339 parsing is not implemented.
///
/// Returns `None` for anything else. In particular, dates that cannot be
/// expressed as an unsigned millisecond count or that are not plausible
/// calendar dates are rejected instead of panicking or wrapping: years
/// before 1970, months outside `1..=12`, days outside `1..=31`, and years
/// so large that the day count would overflow `i64`. Day-of-month is not
/// checked against the month's length, so e.g. `2030-02-30` still
/// normalizes to `2030-03-02`.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    // A year never spans more than 366 days, so any year up to this bound
    // keeps the day count produced by `days_from_civil` inside `i64`.
    const MAX_YEAR: i64 = i64::MAX / 366;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date portion only: everything after the first whitespace is ignored.
    let date = s.split_whitespace().next()?;
    let mut parts = date.split('-');
    let (year, month, day) = (parts.next()?, parts.next()?, parts.next()?);
    if parts.next().is_some() {
        return None;
    }
    let year: i64 = year.parse().ok()?;
    let month: u32 = month.parse().ok()?;
    let day: u32 = day.parse().ok()?;

    // Reject inputs that would previously overflow, underflow, or silently
    // map to an unrelated date.
    if !(1970..=MAX_YEAR).contains(&year)
        || !(1..=12).contains(&month)
        || !(1..=31).contains(&day)
    {
        return None;
    }

    let days = days_from_civil(year, month, day);
    if days < 0 {
        return None;
    }
    // Non-negative `i64` to `u128` is a lossless widening.
    Some(days as u128 * MS_PER_DAY)
}

/// Days from the Unix epoch (1970-01-01) to the given proleptic Gregorian
/// date, using H. Hinnant's `days_from_civil` algorithm. Negative for dates
/// before the epoch. Used by `parse_timestamp_to_ms`.
///
/// `m` is expected in `1..=12` and `d` in `1..=31`. All intermediate
/// arithmetic is signed, so an out-of-range day (such as `0`) shifts the
/// result instead of underflowing. Years whose day count does not fit in
/// `i64` are outside the supported range.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    // Treat January and February as months 11 and 12 of the previous year
    // so the leap day falls at the end of the shifted year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = (if y >= 0 { y } else { y - 399 }) / 400;
    let yoe = y - era * 400; // [0, 399]
    let shifted_month = i64::from(if m > 2 { m - 3 } else { m + 9 }); // March = 0
    let doy = (153 * shifted_month + 2) / 5 + i64::from(d) - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146_097 + doe - 719_468
}
12818
12819fn walk_plan_node(
12820    node: &crate::storage::query::planner::CanonicalLogicalNode,
12821    depth: usize,
12822    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
12823) {
12824    use std::sync::Arc;
12825    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
12826    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
12827    rec.set_arc(
12828        Arc::from("source"),
12829        node.source.clone().map(Value::text).unwrap_or(Value::Null),
12830    );
12831    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
12832    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
12833    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
12834    out.push(rec);
12835    for child in &node.children {
12836        walk_plan_node(child, depth + 1, out);
12837    }
12838}
12839
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parses `sql` and returns the result-cache scopes derived from the
    /// parsed query expression.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let parsed = crate::storage::query::parser::parse(sql).expect("parse");
        query_expr_result_cache_scopes(&parsed.query)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // The result-cache key for the inline form must derive from the
        // `nodes`/`edges` source collections so a write to either invalidates
        // the cached result (issue #799).
        let sql = "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_cache_scope_is_graph_argument() {
        // The graph-collection form still materializes the active graph, but
        // result-cache invalidation is scoped to the named graph argument so
        // INSERT INTO g NODE/EDGE invalidates cached TVF rows.
        let sql = "SELECT * FROM components(g)";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("g"), "collection form scoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches both edges.
        let nodes: Vec<String> = ["a", "b", "c"].iter().map(|id| id.to_string()).collect();
        let edges: Vec<(String, String, f32)> = [("a", "b"), ("b", "c")]
            .iter()
            .map(|(src, dst)| (src.to_string(), dst.to_string(), 1.0_f32))
            .collect();

        let degrees = abstract_degree_centrality(&nodes, &edges);
        let expected = vec![
            ("a".to_string(), 1),
            ("b".to_string(), 2),
            ("c".to_string(), 1),
        ];
        assert_eq!(degrees, expected);
    }
}