Skip to main content

reddb_server/runtime/
impl_core.rs

1use super::*;
2use crate::auth::column_policy_gate::ColumnAccessRequest;
3use crate::auth::UserId;
4use crate::replication::cdc::ChangeRecord;
5use crate::storage::query::ast::TableSource;
6
7/// Read a numeric score column out of a result record as `f64`, matching
8/// the column name case-insensitively. Used by the leaderboard-rank head
9/// walk (#918) to compare scores; non-numeric / missing columns yield
10/// `None` so a row with no comparable score never shifts a rank.
11fn record_column_f64(
12    rec: &crate::storage::query::unified::UnifiedRecord,
13    column: &str,
14) -> Option<f64> {
15    let value = rec
16        .get(column)
17        .or_else(|| rec.get(&column.to_lowercase()))?;
18    match value {
19        Value::Integer(n) => Some(*n as f64),
20        Value::UnsignedInteger(n) => Some(*n as f64),
21        Value::Float(n) => Some(*n),
22        Value::Timestamp(n) | Value::Duration(n) => Some(*n as f64),
23        _ => None,
24    }
25}
26
27fn record_rid_u64(rec: &crate::storage::query::unified::UnifiedRecord) -> Option<u64> {
28    match rec.get("rid") {
29        Some(Value::UnsignedInteger(n)) => Some(*n),
30        Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
31        _ => None,
32    }
33}
34
35fn seed_storage_deploy_config(
36    store: &crate::storage::UnifiedStore,
37    selection: crate::storage::StorageProfileSelection,
38) {
39    store.set_config_tree(
40        "storage.deploy",
41        &crate::json!({
42            "profile": selection.deploy_profile.as_str(),
43            "packaging": selection.packaging.as_str(),
44            "preset": selection.preset_name(),
45            "replica_count": selection.replica_count,
46            "managed_backup": selection.managed_backup,
47            "wal_retention": selection.wal_retention,
48        }),
49    );
50}
51
/// One row of a ranking's exact head together with the rank assigned to it.
///
/// Produced by `compute_ranked_head_entries` and consumed by the `RANK OF` /
/// `RANK RANGE` reads.
struct RankedHeadEntry {
    /// Rank within the head; rows whose scores compare equal share a rank.
    rank: u64,
    /// The row as returned by the head query.
    record: crate::storage::query::unified::UnifiedRecord,
}
56
57fn secret_sql_value_to_string(value: &Value) -> RedDBResult<String> {
58    match value {
59        Value::Text(s) => Ok(s.to_string()),
60        Value::Integer(n) => Ok(n.to_string()),
61        Value::UnsignedInteger(n) => Ok(n.to_string()),
62        Value::Float(n) => Ok(n.to_string()),
63        Value::Boolean(b) => Ok(b.to_string()),
64        Value::Null => Err(RedDBError::Query(
65            "SET SECRET key = NULL deletes the secret; use DELETE SECRET for explicit deletes"
66                .to_string(),
67        )),
68        Value::Password(_) | Value::Secret(_) => Err(RedDBError::Query(
69            "SET SECRET accepts plain scalar literals; PASSWORD() and SECRET() are for typed columns"
70                .to_string(),
71        )),
72        _ => Err(RedDBError::Query(format!(
73            "SET SECRET does not support value type {:?} yet",
74            value.data_type()
75        ))),
76    }
77}
78
/// Description of one control event implied by a query, as built by
/// `query_control_event_specs`.
#[derive(Clone)]
struct QueryControlEventSpec {
    /// Event category (schema DDL, RLS / tenant governance, config, user, ...).
    kind: crate::runtime::control_events::EventKind,
    /// Static action label, e.g. `"create_table"` or `"config:write"`.
    action: &'static str,
    /// Affected resource as a prefixed identifier (e.g. `table:` + table name), if any.
    resource: Option<String>,
    /// Extra named fields, each wrapped in its sensitivity classification.
    fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
}
86
/// Audit summary of a data statement, as built by `query_audit_plan`.
#[derive(Clone)]
struct QueryAuditPlan {
    /// One of `"select"`, `"insert"`, `"update"` or `"delete"`.
    statement_kind: &'static str,
    /// Collections the statement touches, deduplicated in first-seen order and
    /// never empty (a plan is only produced when at least one collection remains
    /// after internal names are filtered out).
    collections: Vec<String>,
}
92
93fn query_audit_plan(expr: &QueryExpr) -> Option<QueryAuditPlan> {
94    let mut collections = Vec::new();
95    let statement_kind = match expr {
96        QueryExpr::Table(table) => {
97            push_query_audit_collection(&mut collections, &table.table);
98            "select"
99        }
100        QueryExpr::Join(join) => {
101            collect_query_audit_collections(&join.left, &mut collections);
102            collect_query_audit_collections(&join.right, &mut collections);
103            "select"
104        }
105        QueryExpr::Insert(insert) => {
106            push_query_audit_collection(&mut collections, &insert.table);
107            "insert"
108        }
109        QueryExpr::Update(update) => {
110            push_query_audit_collection(&mut collections, &update.table);
111            "update"
112        }
113        QueryExpr::Delete(delete) => {
114            push_query_audit_collection(&mut collections, &delete.table);
115            "delete"
116        }
117        _ => return None,
118    };
119    if collections.is_empty() {
120        None
121    } else {
122        Some(QueryAuditPlan {
123            statement_kind,
124            collections,
125        })
126    }
127}
128
129fn collect_query_audit_collections(expr: &QueryExpr, collections: &mut Vec<String>) {
130    match expr {
131        QueryExpr::Table(table) => push_query_audit_collection(collections, &table.table),
132        QueryExpr::Join(join) => {
133            collect_query_audit_collections(&join.left, collections);
134            collect_query_audit_collections(&join.right, collections);
135        }
136        _ => {}
137    }
138}
139
/// Record `name` as an audited collection unless it is internal or already listed.
///
/// Internal names are `red` itself, anything starting with `red.`, and anything
/// starting with `__red_schema_`. Insertion order of the remaining names is kept.
fn push_query_audit_collection(collections: &mut Vec<String>, name: &str) {
    let is_internal =
        name == "red" || name.starts_with("red.") || name.starts_with("__red_schema_");
    let already_listed = collections.iter().any(|seen| seen == name);
    if is_internal || already_listed {
        return;
    }
    collections.push(name.to_owned());
}
148
/// Collection name `red_index_registry`.
// NOTE(review): presumably the collection backing the runtime index registry;
// no use of this constant is visible in this part of the file — confirm.
const RUNTIME_INDEX_REGISTRY_COLLECTION: &str = "red_index_registry";
150
151impl RedDBRuntime {
152    fn execute_create_metric(
153        &self,
154        raw_query: &str,
155        query: &crate::storage::query::ast::CreateMetricQuery,
156    ) -> RedDBResult<RuntimeQueryResult> {
157        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
158        let store = self.inner.db.store();
159        super::metric_descriptor_catalog::create(
160            store.as_ref(),
161            &query.path,
162            &query.kind,
163            &query.role,
164            super::metric_descriptor_catalog::DerivedSpec {
165                source: query.source.clone(),
166                query: query.query.clone(),
167                window_ms: query.window_ms,
168                time_field: query.time_field.clone(),
169            },
170        )?;
171        self.invalidate_result_cache();
172        Ok(RuntimeQueryResult::ok_message(
173            raw_query.to_string(),
174            &format!("metric descriptor '{}' created", query.path),
175            "create",
176        ))
177    }
178
179    /// `CREATE RANKING <name> ON <table> (<column> [ASC|DESC]) [TOP <k>]`
180    /// — declare a Ranking capability over an ordinary table's score
181    /// column (issue #918 / ADR 0035). Persists a WAL-backed catalog
182    /// record; no new Collection model is introduced. Authorized through
183    /// the same DDL write gate as `CREATE METRIC`/`CREATE INDEX`.
184    fn execute_create_ranking(
185        &self,
186        raw_query: &str,
187        req: super::ranking_descriptor_catalog::CreateRankingRequest,
188    ) -> RedDBResult<RuntimeQueryResult> {
189        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
190        let store = self.inner.db.store();
191        let descriptor = super::ranking_descriptor_catalog::create(store.as_ref(), &req)?;
192        self.invalidate_result_cache();
193        Ok(RuntimeQueryResult::ok_message(
194            raw_query.to_string(),
195            &format!(
196                "ranking '{}' created on {}({})",
197                descriptor.name, descriptor.table, descriptor.column
198            ),
199            "create",
200        ))
201    }
202
203    /// `SHOW RANKINGS` — project the declared Ranking capabilities back as
204    /// rows, so a declared capability is observable (the Analytics
205    /// "prefer SELECT over admin verbs" rule).
206    fn execute_show_rankings(&self, raw_query: &str) -> RedDBResult<RuntimeQueryResult> {
207        let store = self.inner.db.store();
208        let entries = super::ranking_descriptor_catalog::list(store.as_ref());
209        let columns = vec![
210            "name".to_string(),
211            "table".to_string(),
212            "column".to_string(),
213            "direction".to_string(),
214            "top_k".to_string(),
215        ];
216        let rows = entries
217            .into_iter()
218            .map(|e| {
219                vec![
220                    ("name".to_string(), Value::text(e.name)),
221                    ("table".to_string(), Value::text(e.table)),
222                    ("column".to_string(), Value::text(e.column)),
223                    (
224                        "direction".to_string(),
225                        Value::text(if e.descending { "DESC" } else { "ASC" }.to_string()),
226                    ),
227                    ("top_k".to_string(), Value::UnsignedInteger(e.top_k)),
228                ]
229            })
230            .collect();
231        let mut result =
232            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
233        result.statement = "rank_of";
234        result.engine = "runtime-rank";
235        Ok(result)
236    }
237
238    /// `RANK OF <id> IN <name>` — exact, MVCC-correct rank of a specific
239    /// row within the capability's bounded top-K head (issue #918).
240    ///
241    /// Returns a single `rank` row when the row is visible *and* falls
242    /// inside the exact head; an empty result otherwise (not visible, or
243    /// in the approximate tail — a separate slice). The computation runs
244    /// entirely over the regular read pipeline so it inherits MVCC
245    /// visibility, RLS/policy, and tenant scope from ordinary reads.
246    fn execute_rank_of(
247        &self,
248        raw_query: &str,
249        req: &crate::storage::query::ast::RankOfQuery,
250    ) -> RedDBResult<RuntimeQueryResult> {
251        let store = self.inner.db.store();
252        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
253            .ok_or_else(|| {
254                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
255            })?;
256        let rank = self.compute_exact_head_rank(&descriptor, req.entity_id)?;
257        let columns = vec!["rank".to_string()];
258        let rows = match rank {
259            Some(rank) => vec![vec![("rank".to_string(), Value::UnsignedInteger(rank))]],
260            None => Vec::new(),
261        };
262        let mut result =
263            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
264        result.statement = "rank_range";
265        result.engine = "runtime-rank";
266        Ok(result)
267    }
268
269    /// `RANK RANGE <lo> TO <hi> IN <name>` — exact, MVCC-correct entries
270    /// occupying a contiguous rank range within the bounded top-K head.
271    ///
272    /// The output is in leaderboard order and includes `rank` plus the
273    /// row columns returned by the canonical exact-head SQL read.
274    fn execute_rank_range(
275        &self,
276        raw_query: &str,
277        req: &crate::storage::query::ast::RankRangeQuery,
278    ) -> RedDBResult<RuntimeQueryResult> {
279        let store = self.inner.db.store();
280        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
281            .ok_or_else(|| {
282                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
283            })?;
284        let (head_columns, entries) = self.compute_ranked_head_entries(&descriptor)?;
285
286        let mut columns = Vec::with_capacity(head_columns.len() + 1);
287        columns.push("rank".to_string());
288        for column in &head_columns {
289            if column != "rank" {
290                columns.push(column.clone());
291            }
292        }
293
294        let rows = entries
295            .into_iter()
296            .filter(|entry| entry.rank >= req.lo && entry.rank <= req.hi)
297            .map(|entry| {
298                let mut row = Vec::with_capacity(columns.len());
299                row.push(("rank".to_string(), Value::UnsignedInteger(entry.rank)));
300                for column in &head_columns {
301                    if column == "rank" {
302                        continue;
303                    }
304                    if let Some(value) = entry.record.get(column) {
305                        row.push((column.clone(), value.clone()));
306                    }
307                }
308                row
309            })
310            .collect();
311        let mut result =
312            RuntimeQueryResult::ok_records(raw_query.to_string(), columns, rows, "select");
313        result.statement = "approx_rank_of";
314        result.engine = "runtime-rank";
315        Ok(result)
316    }
317
318    /// Compute the exact rank of `target_id` within the descriptor's
319    /// bounded top-K head, or `None` if the row is invisible to the
320    /// querying snapshot or beyond the exact head.
321    ///
322    /// Faithful to ADR 0035: it walks the sorted index head
323    /// (`ORDER BY <col> {DESC|ASC} LIMIT k`, served by
324    /// `try_sorted_index_lookup` + the per-row MVCC visibility re-check)
325    /// and counts only rows visible to the current snapshot. Running the
326    /// head scan through `execute_query_inner` keeps it on the same
327    /// snapshot/tenant/policy frame as ordinary reads, so the rank agrees
328    /// with `ORDER BY <col> {DESC|ASC} LIMIT` under that snapshot by
329    /// construction. RANK semantics: tied scores share a rank, so the
330    /// rank is `1 + (number of strictly-better visible rows)`.
331    fn compute_exact_head_rank(
332        &self,
333        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
334        target_id: u64,
335    ) -> RedDBResult<Option<u64>> {
336        let (_columns, entries) = self.compute_ranked_head_entries(descriptor)?;
337        Ok(entries
338            .into_iter()
339            .find(|entry| record_rid_u64(&entry.record) == Some(target_id))
340            .map(|entry| entry.rank))
341    }
342
343    /// Return the exact head rows in deterministic rank order.
344    fn compute_ranked_head_entries(
345        &self,
346        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
347    ) -> RedDBResult<(Vec<String>, Vec<RankedHeadEntry>)> {
348        let table = &descriptor.table;
349        let column = &descriptor.column;
350
351        // The exact head: top-K rows in rank order. Each row here already
352        // passed MVCC visibility *and* RLS/tenant filtering during the
353        // scan, so identifying the target *within* this result (rather
354        // than via a separate `red_entity_id` lookup, which takes the
355        // direct entity-fetch path that bypasses the RLS gate) is what
356        // makes the rank honor policy/tenant scope (criterion 5).
357        let dir = if descriptor.descending { "DESC" } else { "ASC" };
358        let head_sql = format!(
359            "SELECT * FROM {table} ORDER BY {column} {dir}, rid ASC LIMIT {}",
360            descriptor.top_k
361        );
362        let head_result = self.execute_query_inner(&head_sql)?;
363
364        let mut entries = Vec::with_capacity(head_result.result.records.len());
365        let mut row_position = 0u64;
366        let mut current_rank = 0u64;
367        let mut previous_score: Option<f64> = None;
368        for rec in &head_result.result.records {
369            let Some(score) = record_column_f64(rec, column) else {
370                continue;
371            };
372            row_position += 1;
373            current_rank = if previous_score == Some(score) {
374                current_rank
375            } else {
376                row_position
377            };
378            previous_score = Some(score);
379            entries.push(RankedHeadEntry {
380                rank: current_rank,
381                record: rec.clone(),
382            });
383        }
384        Ok((head_result.result.columns, entries))
385    }
386
387    /// `APPROX RANK OF <id> IN <name>` — the *approximate tail* read
388    /// (issue #923 / ADR 0035). Serves an explicitly-approximate
389    /// percentile / rank for an entry below the exact top-K head from a
390    /// per-`(table, column)` score sketch.
391    ///
392    /// The result is always labeled approximate (`approximate = true`,
393    /// distinct from the exact `RANK OF` surface which returns only a bare
394    /// `rank`) so a caller never reads a tail estimate as an exact head
395    /// position. An invisible / non-existent row yields no row, exactly
396    /// like the exact surface.
397    fn execute_approx_rank_of(
398        &self,
399        raw_query: &str,
400        req: &crate::storage::query::ast::RankOfQuery,
401    ) -> RedDBResult<RuntimeQueryResult> {
402        let store = self.inner.db.store();
403        let descriptor = super::ranking_descriptor_catalog::get(store.as_ref(), &req.ranking)
404            .ok_or_else(|| {
405                RedDBError::Query(format!("ranking '{}' does not exist", req.ranking))
406            })?;
407
408        let approx = self.compute_approx_rank(&descriptor, req.entity_id)?;
409        let columns = vec![
410            "rank".to_string(),
411            "percentile".to_string(),
412            "approximate".to_string(),
413        ];
414        let rows = match approx {
415            Some(approx) => vec![vec![
416                ("rank".to_string(), Value::UnsignedInteger(approx.rank)),
417                ("percentile".to_string(), Value::Float(approx.percentile)),
418                ("approximate".to_string(), Value::Boolean(true)),
419            ]],
420            None => Vec::new(),
421        };
422        Ok(RuntimeQueryResult::ok_records(
423            raw_query.to_string(),
424            columns,
425            rows,
426            "select",
427        ))
428    }
429
430    /// Refresh the per-`(table, column)` score sketch from the rows visible
431    /// to the current snapshot and return the target's approximate rank, or
432    /// `None` if the target row is invisible to this snapshot / tenant.
433    ///
434    /// The sketch is rebuilt from the live column on each read and persisted
435    /// back to `red_config` keyed by `(table, column)` — so it is maintained
436    /// per `(collection, score column)` and stays current as scores change
437    /// (criterion 4). The scan runs through `execute_query_inner`, inheriting
438    /// the same MVCC snapshot, RLS/tenant scope, and policy as ordinary
439    /// reads. The *approximation* is the histogram bucketing in
440    /// [`super::score_sketch::ScoreSketch`], not the data freshness, so the
441    /// estimate carries the documented error band even though it is built
442    /// from a full scan in this v0 (incremental maintenance is an ADR-0035
443    /// implementation detail, left open and reversible).
444    fn compute_approx_rank(
445        &self,
446        descriptor: &super::ranking_descriptor_catalog::RankingDescriptor,
447        target_id: u64,
448    ) -> RedDBResult<Option<super::score_sketch::ApproxRank>> {
449        let table = &descriptor.table;
450        let column = &descriptor.column;
451
452        // Scan the visible rows once: it both feeds the sketch and locates
453        // the target's score under the same snapshot/tenant/policy frame.
454        let scan_sql = format!("SELECT * FROM {table}");
455        let scan = self.execute_query_inner(&scan_sql)?;
456        let records = &scan.result.records;
457
458        let mut scores: Vec<f64> = Vec::with_capacity(records.len());
459        let mut target_score: Option<f64> = None;
460        for rec in records {
461            let Some(score) = record_column_f64(rec, column) else {
462                continue;
463            };
464            scores.push(score);
465            let rid = match rec.get("rid") {
466                Some(Value::UnsignedInteger(n)) => Some(*n),
467                Some(Value::Integer(n)) if *n >= 0 => Some(*n as u64),
468                _ => None,
469            };
470            if rid == Some(target_id) {
471                target_score = Some(score);
472            }
473        }
474
475        let sketch = super::score_sketch::ScoreSketch::from_scores(&scores);
476        // Persist the refreshed sketch per (table, column).
477        super::ranking_descriptor_catalog::save_sketch(
478            self.inner.db.store().as_ref(),
479            table,
480            column,
481            &sketch,
482        );
483
484        let Some(target_score) = target_score else {
485            // Not visible to this snapshot/tenant ⇒ no rank (matches exact).
486            return Ok(None);
487        };
488        Ok(sketch.approx_rank(target_score, descriptor.descending))
489    }
490
491    fn execute_alter_metric(
492        &self,
493        raw_query: &str,
494        query: &crate::storage::query::ast::AlterMetricQuery,
495    ) -> RedDBResult<RuntimeQueryResult> {
496        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
497        let store = self.inner.db.store();
498        super::metric_descriptor_catalog::update(
499            store.as_ref(),
500            &query.path,
501            query.set_role.as_deref(),
502            query.attempted_kind.as_deref(),
503            query.attempted_path.as_deref(),
504        )?;
505        self.invalidate_result_cache();
506        Ok(RuntimeQueryResult::ok_message(
507            raw_query.to_string(),
508            &format!("metric descriptor '{}' updated", query.path),
509            "alter",
510        ))
511    }
512
513    fn execute_create_slo(
514        &self,
515        raw_query: &str,
516        query: &crate::storage::query::ast::CreateSloQuery,
517    ) -> RedDBResult<RuntimeQueryResult> {
518        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
519        let store = self.inner.db.store();
520        super::slo_descriptor_catalog::create(
521            store.as_ref(),
522            &query.path,
523            &query.metric_path,
524            query.target,
525            query.window_ms,
526        )?;
527        self.invalidate_result_cache();
528        Ok(RuntimeQueryResult::ok_message(
529            raw_query.to_string(),
530            &format!("SLO descriptor '{}' created", query.path),
531            "create",
532        ))
533    }
534
535    fn execute_create_analytics_source(
536        &self,
537        raw_query: &str,
538        query: super::analytics_source_catalog::CreateAnalyticsSourceProfile,
539    ) -> RedDBResult<RuntimeQueryResult> {
540        self.check_write(crate::runtime::write_gate::WriteKind::Ddl)?;
541        let store = self.inner.db.store();
542        let profile = super::analytics_source_catalog::create(
543            store.as_ref(),
544            &self.inner.db.collection_contracts(),
545            query,
546        )?;
547        self.invalidate_result_cache();
548        Ok(RuntimeQueryResult::ok_message(
549            raw_query.to_string(),
550            &format!("analytics source '{}' created", profile.name),
551            "create",
552        ))
553    }
554}
555
556fn query_control_event_specs(expr: &QueryExpr) -> Vec<QueryControlEventSpec> {
557    use crate::runtime::control_events::{EventKind, Sensitivity};
558
559    let mut specs = Vec::new();
560    let mut schema = |action: &'static str, resource: Option<String>| {
561        specs.push(QueryControlEventSpec {
562            kind: EventKind::SchemaDdl,
563            action,
564            resource,
565            fields: Vec::new(),
566        });
567    };
568    match expr {
569        QueryExpr::CreateTable(q) => {
570            schema("create_table", Some(format!("table:{}", q.name)));
571            if let Some(column) = &q.tenant_by {
572                specs.push(QueryControlEventSpec {
573                    kind: EventKind::TenantGovernance,
574                    action: "create_table_tenant_by",
575                    resource: Some(format!("table:{}", q.name)),
576                    fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
577                });
578            }
579        }
580        QueryExpr::CreateCollection(q) => {
581            schema("create_collection", Some(format!("collection:{}", q.name)));
582        }
583        QueryExpr::CreateVector(q) => schema("create_vector", Some(format!("vector:{}", q.name))),
584        QueryExpr::DropTable(q) => schema("drop_table", Some(format!("table:{}", q.name))),
585        QueryExpr::DropGraph(q) => schema("drop_graph", Some(format!("graph:{}", q.name))),
586        QueryExpr::DropVector(q) => schema("drop_vector", Some(format!("vector:{}", q.name))),
587        QueryExpr::DropDocument(q) => {
588            schema("drop_document", Some(format!("document:{}", q.name)));
589        }
590        QueryExpr::DropKv(q) => schema("drop_kv", Some(format!("kv:{}", q.name))),
591        QueryExpr::DropCollection(q) => {
592            schema("drop_collection", Some(format!("collection:{}", q.name)));
593        }
594        QueryExpr::Truncate(q) => schema("truncate", Some(format!("collection:{}", q.name))),
595        QueryExpr::AlterTable(q) => {
596            schema("alter_table", Some(format!("table:{}", q.name)));
597            for op in &q.operations {
598                match op {
599                    crate::storage::query::ast::AlterOperation::EnableRowLevelSecurity => {
600                        specs.push(QueryControlEventSpec {
601                            kind: EventKind::RlsGovernance,
602                            action: "enable_rls",
603                            resource: Some(format!("table:{}", q.name)),
604                            fields: Vec::new(),
605                        });
606                    }
607                    crate::storage::query::ast::AlterOperation::DisableRowLevelSecurity => {
608                        specs.push(QueryControlEventSpec {
609                            kind: EventKind::RlsGovernance,
610                            action: "disable_rls",
611                            resource: Some(format!("table:{}", q.name)),
612                            fields: Vec::new(),
613                        });
614                    }
615                    crate::storage::query::ast::AlterOperation::EnableTenancy { column } => {
616                        specs.push(QueryControlEventSpec {
617                            kind: EventKind::TenantGovernance,
618                            action: "enable_tenancy",
619                            resource: Some(format!("table:{}", q.name)),
620                            fields: vec![("tenant_column".to_string(), Sensitivity::raw(column))],
621                        });
622                    }
623                    crate::storage::query::ast::AlterOperation::DisableTenancy => {
624                        specs.push(QueryControlEventSpec {
625                            kind: EventKind::TenantGovernance,
626                            action: "disable_tenancy",
627                            resource: Some(format!("table:{}", q.name)),
628                            fields: Vec::new(),
629                        });
630                    }
631                    _ => {}
632                }
633            }
634        }
635        QueryExpr::CreateIndex(q) => {
636            schema(
637                "create_index",
638                Some(format!("index:{}:{}", q.table, q.name)),
639            );
640        }
641        QueryExpr::DropIndex(q) => {
642            schema("drop_index", Some(format!("index:{}:{}", q.table, q.name)));
643        }
644        QueryExpr::CreateTimeSeries(q) => {
645            schema("create_timeseries", Some(format!("timeseries:{}", q.name)));
646        }
647        QueryExpr::CreateMetric(q) => {
648            schema("create_metric", Some(format!("metric:{}", q.path)));
649        }
650        QueryExpr::AlterMetric(q) => {
651            schema("alter_metric", Some(format!("metric:{}", q.path)));
652        }
653        QueryExpr::CreateSlo(q) => {
654            schema("create_slo", Some(format!("slo:{}", q.path)));
655        }
656        QueryExpr::DropTimeSeries(q) => {
657            schema("drop_timeseries", Some(format!("timeseries:{}", q.name)));
658        }
659        QueryExpr::CreateQueue(q) => schema("create_queue", Some(format!("queue:{}", q.name))),
660        QueryExpr::AlterQueue(q) => schema("alter_queue", Some(format!("queue:{}", q.name))),
661        QueryExpr::DropQueue(q) => schema("drop_queue", Some(format!("queue:{}", q.name))),
662        QueryExpr::CreateTree(q) => {
663            schema(
664                "create_tree",
665                Some(format!("tree:{}:{}", q.collection, q.name)),
666            );
667        }
668        QueryExpr::DropTree(q) => {
669            schema(
670                "drop_tree",
671                Some(format!("tree:{}:{}", q.collection, q.name)),
672            );
673        }
674        QueryExpr::CreateSchema(q) => schema("create_schema", Some(format!("schema:{}", q.name))),
675        QueryExpr::DropSchema(q) => schema("drop_schema", Some(format!("schema:{}", q.name))),
676        QueryExpr::CreateSequence(q) => {
677            schema("create_sequence", Some(format!("sequence:{}", q.name)));
678        }
679        QueryExpr::DropSequence(q) => schema("drop_sequence", Some(format!("sequence:{}", q.name))),
680        QueryExpr::CreateView(q) => schema("create_view", Some(format!("view:{}", q.name))),
681        QueryExpr::DropView(q) => schema("drop_view", Some(format!("view:{}", q.name))),
682        QueryExpr::RefreshMaterializedView(q) => {
683            schema(
684                "refresh_materialized_view",
685                Some(format!("view:{}", q.name)),
686            );
687        }
688        QueryExpr::CreatePolicy(q) => {
689            specs.push(QueryControlEventSpec {
690                kind: EventKind::RlsGovernance,
691                action: "create_policy",
692                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
693                fields: vec![(
694                    "target_kind".to_string(),
695                    Sensitivity::raw(q.target_kind.as_ident()),
696                )],
697            });
698        }
699        QueryExpr::DropPolicy(q) => {
700            specs.push(QueryControlEventSpec {
701                kind: EventKind::RlsGovernance,
702                action: "drop_policy",
703                resource: Some(format!("table:{}:policy:{}", q.table, q.name)),
704                fields: Vec::new(),
705            });
706        }
707        QueryExpr::SetTenant(value) => {
708            let mut fields = Vec::new();
709            if let Some(value) = value {
710                fields.push(("tenant".to_string(), Sensitivity::raw(value)));
711            }
712            specs.push(QueryControlEventSpec {
713                kind: EventKind::TenantGovernance,
714                action: "set_tenant",
715                resource: Some("tenant:session".to_string()),
716                fields,
717            });
718        }
719        QueryExpr::SetConfig { key, .. } => {
720            specs.push(QueryControlEventSpec {
721                kind: EventKind::ConfigWrite,
722                action: "config:write",
723                resource: Some(format!("config:{key}")),
724                fields: vec![("key".to_string(), Sensitivity::raw(key))],
725            });
726        }
727        QueryExpr::ConfigCommand(cmd) => match cmd {
728            crate::storage::query::ast::ConfigCommand::Put {
729                collection, key, ..
730            }
731            | crate::storage::query::ast::ConfigCommand::Rotate {
732                collection, key, ..
733            } => {
734                let target = format!("{collection}/{key}");
735                specs.push(QueryControlEventSpec {
736                    kind: EventKind::ConfigWrite,
737                    action: "config:write",
738                    resource: Some(format!("config:{target}")),
739                    fields: vec![
740                        ("collection".to_string(), Sensitivity::raw(collection)),
741                        ("key".to_string(), Sensitivity::raw(key)),
742                    ],
743                });
744            }
745            crate::storage::query::ast::ConfigCommand::Delete { collection, key } => {
746                let target = format!("{collection}/{key}");
747                specs.push(QueryControlEventSpec {
748                    kind: EventKind::ConfigDelete,
749                    action: "config:write",
750                    resource: Some(format!("config:{target}")),
751                    fields: vec![
752                        ("collection".to_string(), Sensitivity::raw(collection)),
753                        ("key".to_string(), Sensitivity::raw(key)),
754                    ],
755                });
756            }
757            _ => {}
758        },
759        QueryExpr::AlterUser(stmt) => {
760            let disables = stmt.attributes.iter().any(|attr| {
761                matches!(
762                    attr,
763                    crate::storage::query::ast::AlterUserAttribute::Disable
764                )
765            });
766            specs.push(QueryControlEventSpec {
767                kind: if disables {
768                    EventKind::UserDisable
769                } else {
770                    EventKind::UserUpdate
771                },
772                action: "alter_user",
773                resource: Some(format!("user:{}", stmt.username)),
774                fields: Vec::new(),
775            });
776        }
777        _ => {}
778    }
779    specs
780}
781
782pub(crate) fn control_event_outcome_for_error(
783    err: &RedDBError,
784) -> crate::runtime::control_events::Outcome {
785    match err {
786        RedDBError::ReadOnly(_) => crate::runtime::control_events::Outcome::Denied,
787        RedDBError::Query(msg)
788            if msg.contains("permission denied")
789                || msg.contains("cannot issue")
790                || msg.contains("lacks") =>
791        {
792            crate::runtime::control_events::Outcome::Denied
793        }
794        _ => crate::runtime::control_events::Outcome::Error,
795    }
796}
797
798/// Convert the rows produced by a materialized-view body into
799/// `UnifiedEntity` table rows targeting the backing collection.
800/// Issue #595 slice 9c — feeds `UnifiedStore::refresh_collection`.
801///
802/// Graph fragments and vector hits are ignored: a materialized view
803/// is a relational result set (SELECT-shaped); slices 11+ may extend
804/// this once we have a richer view body shape. Each row materialises
805/// the union of its schema-bound columns + overflow.
806fn view_records_to_entities(
807    table: &str,
808    records: &[crate::storage::query::unified::UnifiedRecord],
809) -> Vec<crate::storage::UnifiedEntity> {
810    use std::collections::HashMap;
811    let table_arc: std::sync::Arc<str> = std::sync::Arc::from(table);
812    let mut out = Vec::with_capacity(records.len());
813    for record in records {
814        let mut named: HashMap<String, crate::storage::schema::Value> = HashMap::new();
815        for (name, value) in record.iter_fields() {
816            named.insert(name.to_string(), value.clone());
817        }
818        let entity = crate::storage::UnifiedEntity::new(
819            crate::storage::EntityId::new(0),
820            crate::storage::EntityKind::TableRow {
821                table: std::sync::Arc::clone(&table_arc),
822                row_id: 0,
823            },
824            crate::storage::EntityData::Row(crate::storage::RowData {
825                columns: Vec::new(),
826                named: Some(named),
827                schema: None,
828            }),
829        );
830        out.push(entity);
831    }
832    out
833}
834
835fn system_keyed_collection_contract(
836    name: &str,
837    model: crate::catalog::CollectionModel,
838) -> crate::physical::CollectionContract {
839    let now = crate::utils::now_unix_millis() as u128;
840    crate::physical::CollectionContract {
841        name: name.to_string(),
842        declared_model: model,
843        schema_mode: crate::catalog::SchemaMode::Dynamic,
844        origin: crate::physical::ContractOrigin::Implicit,
845        version: 1,
846        created_at_unix_ms: now,
847        updated_at_unix_ms: now,
848        default_ttl_ms: None,
849        vector_dimension: None,
850        vector_metric: None,
851        context_index_fields: Vec::new(),
852        declared_columns: Vec::new(),
853        table_def: None,
854        timestamps_enabled: false,
855        context_index_enabled: false,
856        metrics_raw_retention_ms: None,
857        metrics_rollup_policies: Vec::new(),
858        metrics_tenant_identity: None,
859        metrics_namespace: None,
860        append_only: false,
861        subscriptions: Vec::new(),
862        analytics_config: Vec::new(),
863        session_key: None,
864        session_gap_ms: None,
865        retention_duration_ms: None,
866        analytical_storage: None,
867    }
868}
869
870pub use super::execution_context::{
871    capture_current_snapshot, clear_current_auth_identity, clear_current_connection_id,
872    clear_current_snapshot, clear_current_tenant, current_auth_identity_for_audit,
873    current_connection_id, current_tenant, entity_visible_under_current_snapshot,
874    entity_visible_with_context, set_current_auth_identity, set_current_connection_id,
875    set_current_snapshot, set_current_tenant, snapshot_bundle, with_snapshot_bundle,
876    SnapshotBundle, SnapshotContext,
877};
878pub(crate) use super::execution_context::{
879    current_auth_identity, current_config_value, current_role_projected, current_scope_override,
880    current_secret_value, current_snapshot_requires_index_fallback, current_user_projected,
881    has_scope_override_active, parse_set_local_tenant, update_current_config_value,
882    update_current_secret_value, xids_visible_under_current_snapshot, ConfigSnapshotGuard,
883    CurrentSnapshotGuard, ScopeOverrideGuard, SecretStoreGuard, TxLocalTenantGuard,
884};
885
886fn table_row_index_fields(
887    entity: &crate::storage::unified::entity::UnifiedEntity,
888) -> Vec<(String, crate::storage::schema::Value)> {
889    let crate::storage::EntityData::Row(row) = &entity.data else {
890        return Vec::new();
891    };
892    if let Some(named) = &row.named {
893        return named
894            .iter()
895            .map(|(name, value)| (name.clone(), value.clone()))
896            .collect();
897    }
898    if let Some(schema) = &row.schema {
899        return schema
900            .iter()
901            .zip(row.columns.iter())
902            .map(|(name, value)| (name.clone(), value.clone()))
903            .collect();
904    }
905    Vec::new()
906}
907
908fn named_text(
909    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
910    key: &str,
911) -> Option<String> {
912    match named.get(key) {
913        Some(crate::storage::schema::Value::Text(value)) => Some(value.to_string()),
914        _ => None,
915    }
916}
917
918fn named_bool(
919    named: &std::collections::HashMap<String, crate::storage::schema::Value>,
920    key: &str,
921) -> Option<bool> {
922    match named.get(key) {
923        Some(crate::storage::schema::Value::Boolean(value)) => Some(*value),
924        _ => None,
925    }
926}
927
928fn index_method_kind_as_str(method: super::index_store::IndexMethodKind) -> &'static str {
929    match method {
930        super::index_store::IndexMethodKind::Hash => "hash",
931        super::index_store::IndexMethodKind::Bitmap => "bitmap",
932        super::index_store::IndexMethodKind::Spatial => "spatial",
933        super::index_store::IndexMethodKind::BTree => "btree",
934    }
935}
936
937fn index_method_kind_from_str(raw: &str) -> Option<super::index_store::IndexMethodKind> {
938    match raw {
939        "hash" => Some(super::index_store::IndexMethodKind::Hash),
940        "bitmap" => Some(super::index_store::IndexMethodKind::Bitmap),
941        "spatial" | "rtree" => Some(super::index_store::IndexMethodKind::Spatial),
942        "btree" => Some(super::index_store::IndexMethodKind::BTree),
943        _ => None,
944    }
945}
946
947fn runtime_pool_lock(runtime: &RedDBRuntime) -> std::sync::MutexGuard<'_, PoolState> {
948    runtime
949        .inner
950        .pool
951        .lock()
952        .unwrap_or_else(|poisoned| poisoned.into_inner())
953}
954
/// Whether `name` is one of the graph-analytics table-valued functions
/// recognized in FROM position. The comparison ignores ASCII case.
///
/// Both the graph-collection form and the inline `nodes => / edges =>` form
/// (issue #799) accept these names.
fn is_graph_tvf_name(name: &str) -> bool {
    const GRAPH_TVF_NAMES: [&str; 7] = [
        "components",
        "louvain",
        "degree_centrality",
        "shortest_path",
        "betweenness",
        "eigenvector",
        "pagerank",
    ];

    GRAPH_TVF_NAMES
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
967
968/// Map a declared `WITH ANALYTICS` view to the concrete graph algorithm name
969/// and named-argument list that [`RedDBRuntime::dispatch_graph_algorithm`]
970/// consumes (issue #800). The `using` option selects the algorithm inside the
971/// output family; unsupported algorithms and the options that do not apply to
972/// the chosen algorithm are rejected so a view never silently ignores a
973/// declared parameter.
974fn analytics_view_algorithm(
975    graph: &str,
976    view: &crate::catalog::AnalyticsViewDescriptor,
977) -> RedDBResult<(String, Vec<(String, f64)>)> {
978    use crate::catalog::AnalyticsOutput;
979
980    let mut named_args: Vec<(String, f64)> = Vec::new();
981    let algorithm = match view.output {
982        AnalyticsOutput::Communities => {
983            let algo = view.algorithm.as_deref().unwrap_or("louvain");
984            if !algo.eq_ignore_ascii_case("louvain") {
985                return Err(RedDBError::Query(format!(
986                    "analytics output 'communities' on graph '{graph}' has unsupported algorithm '{algo}' (expected louvain)"
987                )));
988            }
989            if let Some(resolution) = view.resolution {
990                named_args.push(("resolution".to_string(), resolution));
991            }
992            "louvain".to_string()
993        }
994        AnalyticsOutput::Components => {
995            if let Some(algo) = view.algorithm.as_deref() {
996                if !algo.eq_ignore_ascii_case("components")
997                    && !algo.eq_ignore_ascii_case("connected_components")
998                {
999                    return Err(RedDBError::Query(format!(
1000                        "analytics output 'components' on graph '{graph}' has unsupported algorithm '{algo}' (expected connected_components)"
1001                    )));
1002                }
1003            }
1004            "components".to_string()
1005        }
1006        AnalyticsOutput::Centrality => {
1007            let algo = view
1008                .algorithm
1009                .as_deref()
1010                .unwrap_or("pagerank")
1011                .to_ascii_lowercase();
1012            match algo.as_str() {
1013                "pagerank" => {
1014                    if let Some(max_iterations) = view.max_iterations {
1015                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1016                    }
1017                }
1018                "eigenvector" => {
1019                    if let Some(max_iterations) = view.max_iterations {
1020                        named_args.push(("max_iterations".to_string(), max_iterations as f64));
1021                    }
1022                    if let Some(tolerance) = view.tolerance {
1023                        named_args.push(("tolerance".to_string(), tolerance));
1024                    }
1025                }
1026                "betweenness" => {}
1027                other => {
1028                    return Err(RedDBError::Query(format!(
1029                        "analytics output 'centrality' on graph '{graph}' has unsupported algorithm '{other}' (expected pagerank, betweenness, or eigenvector)"
1030                    )));
1031                }
1032            }
1033            algo
1034        }
1035    };
1036    Ok((algorithm, named_args))
1037}
1038
1039/// Reject any named arguments for a TVF that accepts none.
1040fn reject_named_args(name: &str, named_args: &[(String, f64)]) -> RedDBResult<()> {
1041    if let Some((key, _)) = named_args.first() {
1042        return Err(RedDBError::Query(format!(
1043            "table function '{name}' has no named argument '{key}'"
1044        )));
1045    }
1046    Ok(())
1047}
1048
1049/// Resolve louvain's optional `resolution` named arg (γ, default 1.0). Any
1050/// other named key, or a non-finite / non-positive resolution, is rejected.
1051fn louvain_resolution(named_args: &[(String, f64)]) -> RedDBResult<f64> {
1052    let mut resolution = 1.0_f64;
1053    for (key, value) in named_args {
1054        if key.eq_ignore_ascii_case("resolution") {
1055            if !value.is_finite() || *value <= 0.0 {
1056                return Err(RedDBError::Query(format!(
1057                    "table function 'louvain' resolution must be > 0, got {value}"
1058                )));
1059            }
1060            resolution = *value;
1061        } else {
1062            return Err(RedDBError::Query(format!(
1063                "table function 'louvain' has no named argument '{key}' (expected 'resolution')"
1064            )));
1065        }
1066    }
1067    Ok(resolution)
1068}
1069
1070/// Undirected degree centrality over abstract inputs: each edge contributes
1071/// 1 to both of its endpoints. Returns `(node_id, degree)` deterministically
1072/// in ascending node-id order, so identical input always yields identical
1073/// rows.
1074fn abstract_degree_centrality(
1075    nodes: &[String],
1076    edges: &[(
1077        String,
1078        String,
1079        crate::storage::engine::graph_algorithms::Weight,
1080    )],
1081) -> Vec<(String, usize)> {
1082    let mut degree: std::collections::BTreeMap<String, usize> = std::collections::BTreeMap::new();
1083    for n in nodes {
1084        degree.entry(n.clone()).or_insert(0);
1085    }
1086    for (a, b, _w) in edges {
1087        *degree.entry(a.clone()).or_insert(0) += 1;
1088        *degree.entry(b.clone()).or_insert(0) += 1;
1089    }
1090    degree.into_iter().collect()
1091}
1092
1093/// Ordered column names for a materialized subquery result: the projection
1094/// columns when present, else the first record's field order.
1095fn ordered_result_columns(result: &crate::storage::query::unified::UnifiedResult) -> Vec<String> {
1096    if !result.columns.is_empty() {
1097        return result.columns.clone();
1098    }
1099    result
1100        .records
1101        .first()
1102        .map(|record| {
1103            record
1104                .column_names()
1105                .iter()
1106                .map(|column| column.to_string())
1107                .collect()
1108        })
1109        .unwrap_or_default()
1110}
1111
1112/// Canonical node-id string for a cell value, so the node universe (from the
1113/// `nodes` subquery) and the edge endpoints (from the `edges` subquery)
1114/// compare equal regardless of integer-vs-text typing. `Null` is not a node.
1115fn value_to_node_id(value: &crate::storage::schema::Value) -> Option<String> {
1116    use crate::storage::schema::Value;
1117    match value {
1118        Value::Null => None,
1119        Value::Text(s) => Some(s.to_string()),
1120        Value::Integer(n) => Some(n.to_string()),
1121        Value::UnsignedInteger(n) => Some(n.to_string()),
1122        Value::NodeRef(s) => Some(s.clone()),
1123        other => Some(other.to_string()),
1124    }
1125}
1126
1127/// Numeric edge weight from a cell value (the optional third `edges` column).
1128fn value_to_weight(value: &crate::storage::schema::Value) -> Option<f32> {
1129    use crate::storage::schema::Value;
1130    match value {
1131        Value::Float(f) => Some(*f as f32),
1132        Value::Integer(n) => Some(*n as f32),
1133        Value::UnsignedInteger(n) => Some(*n as f32),
1134        _ => None,
1135    }
1136}
1137
1138/// Build the node universe from a materialized `nodes` subquery result: the
1139/// first projected column of each row is the node id (issue #799). Zero rows
1140/// is a valid empty node set; a row set with no columns is a shape error.
1141fn inline_node_ids(
1142    name: &str,
1143    result: &crate::storage::query::unified::UnifiedResult,
1144) -> RedDBResult<Vec<String>> {
1145    if result.records.is_empty() {
1146        return Ok(Vec::new());
1147    }
1148    let columns = ordered_result_columns(result);
1149    let Some(first_col) = columns.first() else {
1150        return Err(RedDBError::Query(format!(
1151            "table function '{name}' inline form: `nodes` subquery must project at least one column (the node id)"
1152        )));
1153    };
1154    let mut ids = Vec::with_capacity(result.records.len());
1155    for record in &result.records {
1156        if let Some(id) = record.get(first_col).and_then(value_to_node_id) {
1157            ids.push(id);
1158        }
1159    }
1160    Ok(ids)
1161}
1162
1163/// Build the edge list from a materialized `edges` subquery result: the first
1164/// two projected columns are `(source, target)` and an optional third column
1165/// is the numeric weight (defaulting to 1.0). Fewer than two columns is a
1166/// shape error (issue #799).
1167fn inline_edges(
1168    name: &str,
1169    result: &crate::storage::query::unified::UnifiedResult,
1170) -> RedDBResult<
1171    Vec<(
1172        String,
1173        String,
1174        crate::storage::engine::graph_algorithms::Weight,
1175    )>,
1176> {
1177    if result.records.is_empty() {
1178        return Ok(Vec::new());
1179    }
1180    let columns = ordered_result_columns(result);
1181    if columns.len() < 2 {
1182        return Err(RedDBError::Query(format!(
1183            "table function '{name}' inline form: `edges` subquery must project at least two columns (source, target), got {}",
1184            columns.len()
1185        )));
1186    }
1187    let src_col = &columns[0];
1188    let dst_col = &columns[1];
1189    let weight_col = columns.get(2);
1190    let mut edges = Vec::with_capacity(result.records.len());
1191    for record in &result.records {
1192        let (Some(src), Some(dst)) = (
1193            record.get(src_col).and_then(value_to_node_id),
1194            record.get(dst_col).and_then(value_to_node_id),
1195        ) else {
1196            // A null/absent endpoint is not a valid edge; skip it.
1197            continue;
1198        };
1199        let weight = match weight_col {
1200            Some(col) => match record.get(col) {
1201                None | Some(crate::storage::schema::Value::Null) => 1.0,
1202                Some(value) => value_to_weight(value).ok_or_else(|| {
1203                    RedDBError::Query(format!(
1204                        "table function '{name}' inline form: `edges` weight column must be numeric"
1205                    ))
1206                })?,
1207            },
1208            None => 1.0,
1209        };
1210        edges.push((src, dst, weight));
1211    }
1212    Ok(edges)
1213}
1214
1215fn cache_scope_insert(scopes: &mut HashSet<String>, name: &str) {
1216    if name.is_empty() || name.starts_with("__subq_") || is_universal_query_source(name) {
1217        return;
1218    }
1219    scopes.insert(name.to_string());
1220}
1221
1222fn collect_table_source_scopes(scopes: &mut HashSet<String>, query: &TableQuery) {
1223    match query.source.as_ref() {
1224        Some(crate::storage::query::ast::TableSource::Name(name)) => {
1225            cache_scope_insert(scopes, name)
1226        }
1227        Some(crate::storage::query::ast::TableSource::Subquery(subquery)) => {
1228            collect_query_expr_result_cache_scopes(scopes, subquery);
1229        }
1230        // Graph-collection TVFs (e.g. `louvain(g)`) read the graph store
1231        // read-only. The result is now cached (issue #802) and scoped to the
1232        // graph collection named in the first argument, so any mutation on
1233        // that collection (`INSERT INTO g NODE/EDGE …`) invalidates the
1234        // entry via `invalidate_result_cache_for_table`. Non-graph or
1235        // zero-arg functions contribute no scope.
1236        Some(crate::storage::query::ast::TableSource::Function { name, args, .. }) => {
1237            if is_graph_tvf_name(name) {
1238                if let Some(graph) = args.first() {
1239                    cache_scope_insert(scopes, graph);
1240                }
1241            }
1242        }
1243        // The inline-graph form reads ordinary tables/docs through its
1244        // `nodes`/`edges` subqueries, so its result cache must be scoped to
1245        // those source collections — mutating any of them invalidates the
1246        // cached result (issue #799).
1247        Some(crate::storage::query::ast::TableSource::InlineGraphFunction {
1248            nodes, edges, ..
1249        }) => {
1250            collect_query_expr_result_cache_scopes(scopes, nodes);
1251            collect_query_expr_result_cache_scopes(scopes, edges);
1252        }
1253        None => cache_scope_insert(scopes, &query.table),
1254    }
1255}
1256
1257fn collect_vector_source_scopes(
1258    scopes: &mut HashSet<String>,
1259    source: &crate::storage::query::ast::VectorSource,
1260) {
1261    match source {
1262        crate::storage::query::ast::VectorSource::Reference { collection, .. } => {
1263            cache_scope_insert(scopes, collection);
1264        }
1265        crate::storage::query::ast::VectorSource::Subquery(subquery) => {
1266            collect_query_expr_result_cache_scopes(scopes, subquery);
1267        }
1268        crate::storage::query::ast::VectorSource::Literal(_)
1269        | crate::storage::query::ast::VectorSource::Text(_) => {}
1270    }
1271}
1272
1273fn collect_path_selector_scopes(
1274    scopes: &mut HashSet<String>,
1275    selector: &crate::storage::query::ast::NodeSelector,
1276) {
1277    if let crate::storage::query::ast::NodeSelector::ByRow { table, .. } = selector {
1278        cache_scope_insert(scopes, table);
1279    }
1280}
1281
1282fn collect_query_expr_result_cache_scopes(scopes: &mut HashSet<String>, expr: &QueryExpr) {
1283    match expr {
1284        QueryExpr::Table(query) => collect_table_source_scopes(scopes, query),
1285        QueryExpr::Join(query) => {
1286            collect_query_expr_result_cache_scopes(scopes, &query.left);
1287            collect_query_expr_result_cache_scopes(scopes, &query.right);
1288        }
1289        QueryExpr::Path(query) => {
1290            collect_path_selector_scopes(scopes, &query.from);
1291            collect_path_selector_scopes(scopes, &query.to);
1292        }
1293        QueryExpr::Vector(query) => {
1294            cache_scope_insert(scopes, &query.collection);
1295            collect_vector_source_scopes(scopes, &query.query_vector);
1296        }
1297        QueryExpr::Hybrid(query) => {
1298            collect_query_expr_result_cache_scopes(scopes, &query.structured);
1299            cache_scope_insert(scopes, &query.vector.collection);
1300            collect_vector_source_scopes(scopes, &query.vector.query_vector);
1301        }
1302        QueryExpr::Insert(query) => cache_scope_insert(scopes, &query.table),
1303        QueryExpr::Update(query) => cache_scope_insert(scopes, &query.table),
1304        QueryExpr::Delete(query) => cache_scope_insert(scopes, &query.table),
1305        QueryExpr::CreateTable(query) => cache_scope_insert(scopes, &query.name),
1306        QueryExpr::CreateCollection(query) => cache_scope_insert(scopes, &query.name),
1307        QueryExpr::CreateVector(query) => cache_scope_insert(scopes, &query.name),
1308        QueryExpr::DropTable(query) => cache_scope_insert(scopes, &query.name),
1309        QueryExpr::DropGraph(query) => cache_scope_insert(scopes, &query.name),
1310        QueryExpr::DropVector(query) => cache_scope_insert(scopes, &query.name),
1311        QueryExpr::DropDocument(query) => cache_scope_insert(scopes, &query.name),
1312        QueryExpr::DropKv(query) => cache_scope_insert(scopes, &query.name),
1313        QueryExpr::DropCollection(query) => cache_scope_insert(scopes, &query.name),
1314        QueryExpr::Truncate(query) => cache_scope_insert(scopes, &query.name),
1315        QueryExpr::AlterTable(query) => cache_scope_insert(scopes, &query.name),
1316        QueryExpr::CreateIndex(query) => cache_scope_insert(scopes, &query.table),
1317        QueryExpr::DropIndex(query) => cache_scope_insert(scopes, &query.table),
1318        QueryExpr::CreateTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1319        QueryExpr::CreateMetric(query) => cache_scope_insert(scopes, &query.path),
1320        QueryExpr::AlterMetric(query) => cache_scope_insert(scopes, &query.path),
1321        QueryExpr::CreateSlo(query) => cache_scope_insert(scopes, &query.path),
1322        QueryExpr::DropTimeSeries(query) => cache_scope_insert(scopes, &query.name),
1323        QueryExpr::CreateQueue(query) => cache_scope_insert(scopes, &query.name),
1324        QueryExpr::AlterQueue(query) => cache_scope_insert(scopes, &query.name),
1325        QueryExpr::DropQueue(query) => cache_scope_insert(scopes, &query.name),
1326        QueryExpr::QueueSelect(query) => cache_scope_insert(scopes, &query.queue),
1327        QueryExpr::QueueCommand(query) => match query {
1328            QueueCommand::Push { queue, .. }
1329            | QueueCommand::Pop { queue, .. }
1330            | QueueCommand::Peek { queue, .. }
1331            | QueueCommand::Len { queue }
1332            | QueueCommand::Purge { queue }
1333            | QueueCommand::GroupCreate { queue, .. }
1334            | QueueCommand::GroupRead { queue, .. }
1335            | QueueCommand::Pending { queue, .. }
1336            | QueueCommand::Claim { queue, .. }
1337            | QueueCommand::Ack { queue, .. }
1338            | QueueCommand::Nack { queue, .. } => cache_scope_insert(scopes, queue),
1339            QueueCommand::Move {
1340                source,
1341                destination,
1342                ..
1343            } => {
1344                cache_scope_insert(scopes, source);
1345                cache_scope_insert(scopes, destination);
1346            }
1347        },
1348        QueryExpr::EventsBackfill(query) => {
1349            cache_scope_insert(scopes, &query.collection);
1350            cache_scope_insert(scopes, &query.target_queue);
1351        }
1352        QueryExpr::CreateTree(query) => cache_scope_insert(scopes, &query.collection),
1353        QueryExpr::DropTree(query) => cache_scope_insert(scopes, &query.collection),
1354        QueryExpr::TreeCommand(query) => match query {
1355            TreeCommand::Insert { collection, .. }
1356            | TreeCommand::Move { collection, .. }
1357            | TreeCommand::Delete { collection, .. }
1358            | TreeCommand::Validate { collection, .. }
1359            | TreeCommand::Rebalance { collection, .. } => cache_scope_insert(scopes, collection),
1360        },
1361        QueryExpr::SearchCommand(query) => match query {
1362            SearchCommand::Similar { collection, .. }
1363            | SearchCommand::Hybrid { collection, .. }
1364            | SearchCommand::SpatialRadius { collection, .. }
1365            | SearchCommand::SpatialBbox { collection, .. }
1366            | SearchCommand::SpatialNearest { collection, .. } => {
1367                cache_scope_insert(scopes, collection);
1368            }
1369            SearchCommand::Text { collection, .. }
1370            | SearchCommand::Multimodal { collection, .. }
1371            | SearchCommand::Index { collection, .. }
1372            | SearchCommand::Context { collection, .. } => {
1373                if let Some(collection) = collection.as_deref() {
1374                    cache_scope_insert(scopes, collection);
1375                }
1376            }
1377        },
1378        QueryExpr::Ask(query) => {
1379            if let Some(collection) = query.collection.as_deref() {
1380                cache_scope_insert(scopes, collection);
1381            }
1382        }
1383        QueryExpr::ExplainAlter(query) => cache_scope_insert(scopes, &query.target.name),
1384        QueryExpr::MaintenanceCommand(cmd) => match cmd {
1385            crate::storage::query::ast::MaintenanceCommand::Vacuum { target, .. }
1386            | crate::storage::query::ast::MaintenanceCommand::Analyze { target } => {
1387                if let Some(t) = target {
1388                    cache_scope_insert(scopes, t);
1389                }
1390            }
1391        },
1392        QueryExpr::CopyFrom(cmd) => cache_scope_insert(scopes, &cmd.table),
1393        QueryExpr::CreateView(cmd) => {
1394            cache_scope_insert(scopes, &cmd.name);
1395            // Invalidating the view should also invalidate its dependencies.
1396            collect_query_expr_result_cache_scopes(scopes, &cmd.query);
1397        }
1398        QueryExpr::DropView(cmd) => cache_scope_insert(scopes, &cmd.name),
1399        QueryExpr::RefreshMaterializedView(cmd) => cache_scope_insert(scopes, &cmd.name),
1400        QueryExpr::CreatePolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1401        QueryExpr::DropPolicy(cmd) => cache_scope_insert(scopes, &cmd.table),
1402        QueryExpr::CreateServer(_) | QueryExpr::DropServer(_) => {}
1403        QueryExpr::CreateForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1404        QueryExpr::DropForeignTable(cmd) => cache_scope_insert(scopes, &cmd.name),
1405        QueryExpr::Graph(_)
1406        | QueryExpr::GraphCommand(_)
1407        | QueryExpr::ProbabilisticCommand(_)
1408        | QueryExpr::SetConfig { .. }
1409        | QueryExpr::ShowConfig { .. }
1410        | QueryExpr::SetSecret { .. }
1411        | QueryExpr::DeleteSecret { .. }
1412        | QueryExpr::ShowSecrets { .. }
1413        | QueryExpr::SetTenant(_)
1414        | QueryExpr::ShowTenant
1415        | QueryExpr::TransactionControl(_)
1416        | QueryExpr::CreateSchema(_)
1417        | QueryExpr::DropSchema(_)
1418        | QueryExpr::CreateSequence(_)
1419        | QueryExpr::DropSequence(_)
1420        | QueryExpr::Grant(_)
1421        | QueryExpr::Revoke(_)
1422        | QueryExpr::AlterUser(_)
1423        | QueryExpr::CreateIamPolicy { .. }
1424        | QueryExpr::DropIamPolicy { .. }
1425        | QueryExpr::AttachPolicy { .. }
1426        | QueryExpr::DetachPolicy { .. }
1427        | QueryExpr::ShowPolicies { .. }
1428        | QueryExpr::ShowEffectivePermissions { .. }
1429        | QueryExpr::RankOf(_)
1430        | QueryExpr::ApproxRankOf(_)
1431        | QueryExpr::RankRange(_)
1432        | QueryExpr::SimulatePolicy { .. }
1433        | QueryExpr::LintPolicy { .. }
1434        | QueryExpr::MigratePolicyMode { .. }
1435        | QueryExpr::CreateMigration(_)
1436        | QueryExpr::ApplyMigration(_)
1437        | QueryExpr::RollbackMigration(_)
1438        | QueryExpr::ExplainMigration(_)
1439        | QueryExpr::EventsBackfillStatus { .. } => {}
1440        QueryExpr::KvCommand(cmd) => {
1441            use crate::storage::query::ast::KvCommand;
1442            match cmd {
1443                KvCommand::Put { collection, .. }
1444                | KvCommand::InvalidateTags { collection, .. }
1445                | KvCommand::Get { collection, .. }
1446                | KvCommand::Unseal { collection, .. }
1447                | KvCommand::Rotate { collection, .. }
1448                | KvCommand::History { collection, .. }
1449                | KvCommand::List { collection, .. }
1450                | KvCommand::Purge { collection, .. }
1451                | KvCommand::Watch { collection, .. }
1452                | KvCommand::Delete { collection, .. }
1453                | KvCommand::Incr { collection, .. }
1454                | KvCommand::Cas { collection, .. } => cache_scope_insert(scopes, collection),
1455            }
1456        }
1457        QueryExpr::ConfigCommand(cmd) => {
1458            use crate::storage::query::ast::ConfigCommand;
1459            match cmd {
1460                ConfigCommand::Put { collection, .. }
1461                | ConfigCommand::Get { collection, .. }
1462                | ConfigCommand::Resolve { collection, .. }
1463                | ConfigCommand::Rotate { collection, .. }
1464                | ConfigCommand::Delete { collection, .. }
1465                | ConfigCommand::History { collection, .. }
1466                | ConfigCommand::List { collection, .. }
1467                | ConfigCommand::Watch { collection, .. }
1468                | ConfigCommand::InvalidVolatileOperation { collection, .. } => {
1469                    cache_scope_insert(scopes, collection)
1470                }
1471            }
1472        }
1473    }
1474}
1475
1476/// Combine matching RLS policies for a table + action into a single
1477/// `Filter` suitable for AND-ing into a caller's `WHERE` clause.
1478///
1479/// Returns `None` when RLS is disabled or no policy admits the caller's
1480/// role — callers use that to short-circuit the mutation (for DELETE /
1481/// UPDATE we simply skip the operation, which PG expresses as "no rows
1482/// match the policy + predicate combination").
1483pub(crate) fn rls_policy_filter(
1484    runtime: &RedDBRuntime,
1485    table: &str,
1486    action: crate::storage::query::ast::PolicyAction,
1487) -> Option<crate::storage::query::ast::Filter> {
1488    rls_policy_filter_for_kind(
1489        runtime,
1490        table,
1491        action,
1492        crate::storage::query::ast::PolicyTargetKind::Table,
1493    )
1494}
1495
1496/// Kind-aware policy filter combiner (Phase 2.5.5 RLS universal).
1497/// Graph / vector / queue / timeseries scans pass the concrete kind;
1498/// policies targeting other kinds are ignored. Legacy Table-scoped
1499/// policies still apply cross-kind — callers register auto-tenancy
1500/// policies as Table today.
1501pub(crate) fn rls_policy_filter_for_kind(
1502    runtime: &RedDBRuntime,
1503    table: &str,
1504    action: crate::storage::query::ast::PolicyAction,
1505    kind: crate::storage::query::ast::PolicyTargetKind,
1506) -> Option<crate::storage::query::ast::Filter> {
1507    use crate::storage::query::ast::Filter;
1508
1509    if !runtime.inner.rls_enabled_tables.read().contains(table) {
1510        return None;
1511    }
1512    let role = current_auth_identity().map(|(_, role)| role);
1513    let role_str = role.map(|r| r.as_str().to_string());
1514    let policies = runtime.matching_rls_policies_for_kind(table, role_str.as_deref(), action, kind);
1515    if policies.is_empty() {
1516        return None;
1517    }
1518    policies
1519        .into_iter()
1520        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1521}
1522
1523/// Returns true when the table has RLS enforcement enabled. Convenience
1524/// shortcut so DML paths can gate the AND-combine work without reaching
1525/// into `runtime.inner.rls_enabled_tables` directly.
1526pub(crate) fn rls_is_enabled(runtime: &RedDBRuntime, table: &str) -> bool {
1527    runtime.inner.rls_enabled_tables.read().contains(table)
1528}
1529
1530/// Per-entity gate used by the graph materialiser for `GraphNode`
1531/// entities. RLS is checked against the source collection with
1532/// `kind = Nodes`, which `matching_rls_policies_for_kind` resolves to
1533/// either `Nodes`-targeted policies or legacy `Table`-targeted ones
1534/// (for back-compat with auto-tenancy declarations). Cached per
1535/// collection so big graphs only resolve the policy chain once.
1536fn node_passes_rls(
1537    runtime: &RedDBRuntime,
1538    collection: &str,
1539    role: Option<&str>,
1540    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1541    entity: &crate::storage::unified::entity::UnifiedEntity,
1542) -> bool {
1543    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1544
1545    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1546        return true;
1547    }
1548    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1549        let policies = runtime.matching_rls_policies_for_kind(
1550            collection,
1551            role,
1552            PolicyAction::Select,
1553            PolicyTargetKind::Nodes,
1554        );
1555        if policies.is_empty() {
1556            None
1557        } else {
1558            policies
1559                .into_iter()
1560                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1561        }
1562    });
1563    let Some(filter) = filter else {
1564        return false;
1565    };
1566    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1567        Some(&runtime.inner.db),
1568        entity,
1569        filter,
1570        collection,
1571        collection,
1572    )
1573}
1574
1575/// Edge counterpart of `node_passes_rls`. Same caching strategy with
1576/// `kind = Edges`.
1577fn edge_passes_rls(
1578    runtime: &RedDBRuntime,
1579    collection: &str,
1580    role: Option<&str>,
1581    cache: &mut std::collections::HashMap<String, Option<crate::storage::query::ast::Filter>>,
1582    entity: &crate::storage::unified::entity::UnifiedEntity,
1583) -> bool {
1584    use crate::storage::query::ast::{Filter, PolicyAction, PolicyTargetKind};
1585
1586    if !runtime.inner.rls_enabled_tables.read().contains(collection) {
1587        return true;
1588    }
1589    let filter = cache.entry(collection.to_string()).or_insert_with(|| {
1590        let policies = runtime.matching_rls_policies_for_kind(
1591            collection,
1592            role,
1593            PolicyAction::Select,
1594            PolicyTargetKind::Edges,
1595        );
1596        if policies.is_empty() {
1597            None
1598        } else {
1599            policies
1600                .into_iter()
1601                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1602        }
1603    });
1604    let Some(filter) = filter else {
1605        return false;
1606    };
1607    crate::runtime::query_exec::evaluate_entity_filter_with_db(
1608        Some(&runtime.inner.db),
1609        entity,
1610        filter,
1611        collection,
1612        collection,
1613    )
1614}
1615
1616/// RLS policy injection (Phase 2.5.2 PG parity).
1617///
1618/// Fetch every matching policy for the current thread-local role and
1619/// fold them into the query's filter. Semantics mirror PostgreSQL:
1620///
1621/// * Multiple policies on the same table combine with **OR** — a row is
1622///   visible if *any* policy admits it.
1623/// * The combined policy predicate is **AND**-ed into the caller's
1624///   existing `WHERE` clause so explicit predicates continue to trim
1625///   the policy-allowed set.
1626/// * No matching policies + RLS enabled = zero rows (PG's
1627///   restrictive-default). Callers get `None` and return an empty
1628///   `UnifiedResult` without ever dispatching the scan.
1629///
1630/// This runs only when `RuntimeInner::rls_enabled_tables` already
1631/// contains the table name — callers gate the hot path upfront to
1632/// avoid the lock acquisition on tables without RLS.
1633///
1634/// Returns `None` when no policy admits the current role; returns
1635/// `Some(mutated_table)` with policy filters folded in otherwise.
1636fn inject_rls_filters(
1637    runtime: &RedDBRuntime,
1638    frame: &dyn super::statement_frame::ReadFrame,
1639    mut table: crate::storage::query::ast::TableQuery,
1640) -> Option<crate::storage::query::ast::TableQuery> {
1641    use crate::storage::query::ast::{Filter, PolicyAction};
1642
1643    // `None` role falls through to policies with no `TO role` clause.
1644    let role = frame.identity().map(|(_, role)| role);
1645    let role_str = role.map(|r| r.as_str().to_string());
1646    let policies =
1647        runtime.matching_rls_policies(&table.table, role_str.as_deref(), PolicyAction::Select);
1648
1649    if policies.is_empty() {
1650        // RLS enabled + no policy match = deny everything. Signal the
1651        // caller to short-circuit with an empty result set.
1652        return None;
1653    }
1654
1655    // Combine policy predicates with OR (PG's permissive default).
1656    let combined = policies
1657        .into_iter()
1658        .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1659        .expect("policies non-empty");
1660
1661    // AND into the caller's existing predicate. The predicate may live
1662    // in `where_expr` rather than `filter`: `resolve_table_expr_subqueries`
1663    // nulls `filter` whenever `where_expr` is present (the case for a
1664    // view body rewritten into `SELECT … WHERE …`). Folding only into
1665    // `filter` here would silently drop that `where_expr` predicate at
1666    // eval time because `effective_table_filter` prefers `filter` —
1667    // e.g. `WITHIN TENANT … SELECT * FROM <view>` would apply the
1668    // tenant policy but lose the view's own WHERE (#635).
1669    use crate::storage::query::sql_lowering::{expr_to_filter, filter_to_expr};
1670    let had_where_expr = table.where_expr.is_some();
1671    let existing = table
1672        .filter
1673        .take()
1674        .or_else(|| table.where_expr.as_ref().map(expr_to_filter));
1675    let new_filter = match existing {
1676        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1677        None => combined,
1678    };
1679    // Keep `where_expr` in lock-step with the merged `filter` so
1680    // whichever the executor consults sees the full predicate.
1681    if had_where_expr {
1682        table.where_expr = Some(filter_to_expr(&new_filter));
1683    }
1684    table.filter = Some(new_filter);
1685    Some(table)
1686}
1687
1688/// Apply per-table RLS to a `JoinQuery` by folding each side's policy
1689/// predicate into the join's outer filter. Walking the merged record
1690/// at the join layer (rather than mutating the per-side scan filter)
1691/// keeps the planner's strategy choice and per-side index selection
1692/// undisturbed — the policy predicate uses the qualified `t.col` form
1693/// that resolves cleanly against the merged record's keys.
1694///
1695/// Returns `None` when any leaf has RLS enabled and no policy admits
1696/// the caller — the join short-circuits to an empty result.
1697fn inject_rls_into_join(
1698    runtime: &RedDBRuntime,
1699    frame: &dyn super::statement_frame::ReadFrame,
1700    mut join: crate::storage::query::ast::JoinQuery,
1701) -> Option<crate::storage::query::ast::JoinQuery> {
1702    use crate::storage::query::ast::Filter;
1703
1704    let mut policy_filters: Vec<Filter> = Vec::new();
1705    if !collect_join_side_policy(runtime, frame, join.left.as_ref(), &mut policy_filters) {
1706        return None;
1707    }
1708    if !collect_join_side_policy(runtime, frame, join.right.as_ref(), &mut policy_filters) {
1709        return None;
1710    }
1711
1712    if policy_filters.is_empty() {
1713        return Some(join);
1714    }
1715
1716    let combined = policy_filters
1717        .into_iter()
1718        .reduce(|acc, f| Filter::And(Box::new(acc), Box::new(f)))
1719        .expect("policy_filters non-empty");
1720
1721    join.filter = Some(match join.filter.take() {
1722        Some(existing) => Filter::And(Box::new(existing), Box::new(combined)),
1723        None => combined,
1724    });
1725
1726    Some(join)
1727}
1728
1729/// For each `Table` leaf reachable through nested joins, append the
1730/// RLS-policy filter (combined with OR across that side's matching
1731/// policies) into `out`. Returns `false` when a side has RLS enabled
1732/// but no policy admits the caller — the join must short-circuit.
1733fn collect_join_side_policy(
1734    runtime: &RedDBRuntime,
1735    frame: &dyn super::statement_frame::ReadFrame,
1736    expr: &crate::storage::query::ast::QueryExpr,
1737    out: &mut Vec<crate::storage::query::ast::Filter>,
1738) -> bool {
1739    use crate::storage::query::ast::{Filter, PolicyAction, QueryExpr};
1740    match expr {
1741        QueryExpr::Table(t) => {
1742            if !runtime.inner.rls_enabled_tables.read().contains(&t.table) {
1743                return true;
1744            }
1745            let role = frame.identity().map(|(_, role)| role);
1746            let role_str = role.map(|r| r.as_str().to_string());
1747            let policies =
1748                runtime.matching_rls_policies(&t.table, role_str.as_deref(), PolicyAction::Select);
1749            if policies.is_empty() {
1750                return false;
1751            }
1752            let combined = policies
1753                .into_iter()
1754                .reduce(|acc, f| Filter::Or(Box::new(acc), Box::new(f)))
1755                .expect("policies non-empty");
1756            out.push(combined);
1757            true
1758        }
1759        QueryExpr::Join(inner) => {
1760            collect_join_side_policy(runtime, frame, inner.left.as_ref(), out)
1761                && collect_join_side_policy(runtime, frame, inner.right.as_ref(), out)
1762        }
1763        _ => true,
1764    }
1765}
1766
1767/// Foreign-table post-scan filter application (Phase 3.2.2 PG parity).
1768///
1769/// Phase 3.2 FDW wrappers don't advertise filter pushdown, so the runtime
1770/// applies `WHERE` / `ORDER BY` / `LIMIT` / `OFFSET` after the wrapper
1771/// materialises all rows. Projections are best-effort — when the query
1772/// lists explicit columns we keep only those; a `SELECT *` keeps every
1773/// wrapper-emitted field verbatim.
1774///
1775/// When a wrapper later opts into pushdown (`supports_pushdown = true`)
1776/// the runtime will pass the compiled filter down instead of post-filtering.
1777fn apply_foreign_table_filters(
1778    records: Vec<crate::storage::query::unified::UnifiedRecord>,
1779    query: &crate::storage::query::ast::TableQuery,
1780) -> crate::storage::query::unified::UnifiedResult {
1781    use crate::storage::query::sql_lowering::{
1782        effective_table_filter, effective_table_projections,
1783    };
1784    use crate::storage::query::unified::UnifiedResult;
1785
1786    let filter = effective_table_filter(query);
1787    let projections = effective_table_projections(query);
1788
1789    // Step 1 — WHERE. Reuse the cross-store evaluator so the semantics
1790    // match native-collection queries (same operators, same NULL handling).
1791    let mut filtered: Vec<_> = records
1792        .into_iter()
1793        .filter(|record| match &filter {
1794            Some(f) => {
1795                super::join_filter::evaluate_runtime_filter_with_db(None, record, f, None, None)
1796            }
1797            None => true,
1798        })
1799        .collect();
1800
1801    // Step 2 — LIMIT / OFFSET. Applied after filter to match SQL semantics.
1802    if let Some(offset) = query.offset {
1803        let offset = offset as usize;
1804        if offset >= filtered.len() {
1805            filtered.clear();
1806        } else {
1807            filtered.drain(0..offset);
1808        }
1809    }
1810    if let Some(limit) = query.limit {
1811        filtered.truncate(limit as usize);
1812    }
1813
1814    // Step 3 — columns list. `SELECT *` (no explicit projections) keeps
1815    // the wrapper's column set; an explicit list trims to those names.
1816    let columns: Vec<String> = if projections.is_empty() {
1817        filtered
1818            .first()
1819            .map(|r| r.column_names().iter().map(|k| k.to_string()).collect())
1820            .unwrap_or_default()
1821    } else {
1822        projections
1823            .iter()
1824            .map(super::join_filter::projection_name)
1825            .collect()
1826    };
1827
1828    let mut result = UnifiedResult::empty();
1829    result.columns = columns;
1830    result.records = filtered;
1831    result
1832}
1833
1834/// Collect every concrete table reference inside a `QueryExpr`.
1835///
1836/// Used by view bookkeeping (dependency tracking for materialised
1837/// invalidation) and any other rewriter that needs to know the base
1838/// tables a query pulls from. Does not descend into projections/filters;
1839/// only the `FROM` side.
1840pub(crate) fn collect_table_refs(expr: &QueryExpr) -> Vec<String> {
1841    let mut scopes: HashSet<String> = HashSet::new();
1842    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1843    scopes.into_iter().collect()
1844}
1845
1846fn query_expr_result_cache_scopes(expr: &QueryExpr) -> HashSet<String> {
1847    let mut scopes = HashSet::new();
1848    collect_query_expr_result_cache_scopes(&mut scopes, expr);
1849    scopes
1850}
1851
/// If `sql` starts with the keyword `EXPLAIN` (ASCII case-insensitive)
/// followed by a statement whose first token is neither `ALTER` nor
/// `ASK`, return that inner statement with leading whitespace removed;
/// otherwise return `None`.
///
/// `None` is also returned when nothing but whitespace follows `EXPLAIN`.
/// Trailing whitespace of the inner statement is preserved.
///
/// Two forms are deliberately left to the normal SQL parser:
/// * `EXPLAIN ALTER FOR CREATE TABLE ...` — a separate schema-diff
///   command.
/// * `EXPLAIN ASK ...` — an executable read path that runs retrieval and
///   provider selection, then short-circuits before the LLM call.
fn strip_explain_prefix(sql: &str) -> Option<&str> {
    /// Split `s` at its first whitespace character into
    /// `(first_token, remainder)`; the remainder keeps that whitespace.
    fn split_first_token(s: &str) -> (&str, &str) {
        let end = s.find(char::is_whitespace).unwrap_or(s.len());
        s.split_at(end)
    }

    let (keyword, after_keyword) = split_first_token(sql.trim_start());
    if !keyword.eq_ignore_ascii_case("EXPLAIN") {
        return None;
    }

    let inner = after_keyword.trim_start();
    if inner.is_empty() {
        return None;
    }

    // Peek the next token: ALTER and ASK stay with the normal parser.
    let (next_token, _) = split_first_token(inner);
    let deferred = ["ALTER", "ASK"]
        .iter()
        .any(|reserved| next_token.eq_ignore_ascii_case(reserved));
    if deferred {
        None
    } else {
        Some(inner)
    }
}
1890
1891/// Cheap prefix check for a leading `WITH` keyword. Used to gate the
1892/// CTE-aware parse in `execute_query` without paying for a full
1893/// lexer pass on every statement. Treats `WITHIN` as not-a-CTE so
1894/// `WITHIN TENANT '...' SELECT ...` doesn't mis-route.
1895pub(super) fn has_with_prefix(sql: &str) -> bool {
1896    let trimmed = sql.trim_start();
1897    let head_end = trimmed
1898        .find(|c: char| c.is_whitespace() || c == '(')
1899        .unwrap_or(trimmed.len());
1900    trimmed[..head_end].eq_ignore_ascii_case("WITH")
1901}
1902
1903/// If the query is a plain SELECT whose top-level `TableQuery`
1904/// carries an `AS OF` clause, return a typed spec that the runtime
1905/// can feed to `vcs_resolve_as_of`. Returns `None` for any other
1906/// shape — joins, DML, EXPLAIN, or parse failures — so callers fall
1907/// back to the connection's regular MVCC snapshot. A cheap textual
1908/// prefilter skips the parse entirely when the source doesn't
1909/// mention `AS OF` / `as of`, keeping the autocommit hot path free.
1910fn peek_top_level_as_of(sql: &str) -> Option<crate::application::vcs::AsOfSpec> {
1911    peek_top_level_as_of_with_table(sql).map(|(spec, _)| spec)
1912}
1913
1914/// Same as `peek_top_level_as_of` but also returns the table name
1915/// targeted by the AS OF clause (when the FROM clause names a
1916/// concrete table). `None` for the table slot means scalar SELECT
1917/// or a subquery source — callers treat those as "no enforcement".
1918pub(super) fn peek_top_level_as_of_with_table(
1919    sql: &str,
1920) -> Option<(crate::application::vcs::AsOfSpec, Option<String>)> {
1921    if !sql
1922        .as_bytes()
1923        .windows(5)
1924        .any(|w| w.eq_ignore_ascii_case(b"as of"))
1925    {
1926        return None;
1927    }
1928    let parsed = crate::storage::query::parser::parse(sql).ok()?;
1929    let crate::storage::query::ast::QueryExpr::Table(table) = parsed.query else {
1930        return None;
1931    };
1932    let clause = table.as_of?;
1933    let table_name = if table.table.is_empty() || table.table == "any" {
1934        None
1935    } else {
1936        Some(table.table.clone())
1937    };
1938    let spec = match clause {
1939        crate::storage::query::ast::AsOfClause::Commit(h) => {
1940            crate::application::vcs::AsOfSpec::Commit(h)
1941        }
1942        crate::storage::query::ast::AsOfClause::Branch(b) => {
1943            crate::application::vcs::AsOfSpec::Branch(b)
1944        }
1945        crate::storage::query::ast::AsOfClause::Tag(t) => crate::application::vcs::AsOfSpec::Tag(t),
1946        crate::storage::query::ast::AsOfClause::TimestampMs(ts) => {
1947            crate::application::vcs::AsOfSpec::TimestampMs(ts)
1948        }
1949        crate::storage::query::ast::AsOfClause::Snapshot(x) => {
1950            crate::application::vcs::AsOfSpec::Snapshot(x)
1951        }
1952    };
1953    Some((spec, table_name))
1954}
1955
1956pub(super) fn query_has_volatile_builtin(sql: &str) -> bool {
1957    // Lowercase the bytes up to the first null/newline into a small
1958    // stack buffer for cheap contains() checks. Most SQL fits in the
1959    // buffer; longer queries fall back to owned lowercase.
1960    const VOLATILE_TOKENS: &[&str] = &[
1961        "pg_advisory_lock",
1962        "pg_try_advisory_lock",
1963        "pg_advisory_unlock",
1964        "random()",
1965        // NOW() / CURRENT_TIMESTAMP / CURRENT_DATE intentionally
1966        // omitted for now — they ARE volatile but today's tests rely
1967        // on caching them. Revisit once a tighter volatility story
1968        // lands.
1969    ];
1970    let lowered = sql.to_ascii_lowercase();
1971    VOLATILE_TOKENS.iter().any(|t| lowered.contains(t))
1972}
1973
1974pub(super) fn query_is_ask_statement(sql: &str) -> bool {
1975    let trimmed = sql.trim_start();
1976    let head_end = trimmed
1977        .find(|c: char| c.is_whitespace() || c == '(' || c == ';')
1978        .unwrap_or(trimmed.len());
1979    trimmed[..head_end].eq_ignore_ascii_case("ASK")
1980}
1981
1982/// Pick the `(global_mode, collection_mode)` pair for an expression,
1983/// or `None` for variants that opt out of intent-locking entirely
1984/// (admin statements like `SHOW CONFIG`, transaction control, tenant
1985/// toggles).
1986///
1987/// Phase-1 contract:
1988/// - Reads  — `(IX-compatible) (Global, IS) → (Collection, IS)`
1989/// - Writes — `(IX-compatible) (Global, IX) → (Collection, IX)`
1990/// - DDL    — `(strong)        (Global, IX) → (Collection, X)`
1991pub(super) fn intent_lock_modes_for(
1992    expr: &QueryExpr,
1993) -> Option<(
1994    crate::storage::transaction::lock::LockMode,
1995    crate::storage::transaction::lock::LockMode,
1996)> {
1997    use crate::storage::transaction::lock::LockMode::{Exclusive, IntentExclusive, IntentShared};
1998
1999    match expr {
2000        // Reads — IS / IS.
2001        QueryExpr::Table(_)
2002        | QueryExpr::Join(_)
2003        | QueryExpr::Vector(_)
2004        | QueryExpr::Hybrid(_)
2005        | QueryExpr::Graph(_)
2006        | QueryExpr::Path(_)
2007        | QueryExpr::Ask(_)
2008        | QueryExpr::SearchCommand(_)
2009        | QueryExpr::GraphCommand(_)
2010        | QueryExpr::RankOf(_)
2011        | QueryExpr::ApproxRankOf(_)
2012        | QueryExpr::RankRange(_)
2013        | QueryExpr::QueueSelect(_) => Some((IntentShared, IntentShared)),
2014
2015        // Writes — IX / IX. Non-tabular mutations (vector insert,
2016        // graph node insert, queue push, timeseries point insert)
2017        // don't carry their own dispatch arm here; they ride through
2018        // the Insert variant or a command variant covered by the
2019        // read-side arm above. P1.T4 expands only the TableQuery-ish
2020        // writes; non-tabular kinds inherit when their DML variants
2021        // land in later phases.
2022        QueryExpr::Insert(_)
2023        | QueryExpr::Update(_)
2024        | QueryExpr::Delete(_)
2025        | QueryExpr::QueueCommand(QueueCommand::Move { .. }) => {
2026            Some((IntentExclusive, IntentExclusive))
2027        }
2028        QueryExpr::QueueCommand(_) => Some((IntentShared, IntentShared)),
2029
2030        // DDL — IX / X. A DDL against collection `c` blocks all
2031        // other writers + readers on `c` but leaves other collections
2032        // running (because Global stays IX, not X).
2033        QueryExpr::CreateTable(_)
2034        | QueryExpr::CreateCollection(_)
2035        | QueryExpr::CreateVector(_)
2036        | QueryExpr::DropTable(_)
2037        | QueryExpr::DropGraph(_)
2038        | QueryExpr::DropVector(_)
2039        | QueryExpr::DropDocument(_)
2040        | QueryExpr::DropKv(_)
2041        | QueryExpr::DropCollection(_)
2042        | QueryExpr::Truncate(_)
2043        | QueryExpr::AlterTable(_)
2044        | QueryExpr::CreateIndex(_)
2045        | QueryExpr::DropIndex(_)
2046        | QueryExpr::CreateTimeSeries(_)
2047        | QueryExpr::CreateMetric(_)
2048        | QueryExpr::AlterMetric(_)
2049        | QueryExpr::CreateSlo(_)
2050        | QueryExpr::DropTimeSeries(_)
2051        | QueryExpr::CreateQueue(_)
2052        | QueryExpr::AlterQueue(_)
2053        | QueryExpr::DropQueue(_)
2054        | QueryExpr::CreateTree(_)
2055        | QueryExpr::DropTree(_)
2056        | QueryExpr::CreatePolicy(_)
2057        | QueryExpr::DropPolicy(_)
2058        | QueryExpr::CreateView(_)
2059        | QueryExpr::DropView(_)
2060        | QueryExpr::RefreshMaterializedView(_)
2061        | QueryExpr::CreateSchema(_)
2062        | QueryExpr::DropSchema(_)
2063        | QueryExpr::CreateSequence(_)
2064        | QueryExpr::DropSequence(_)
2065        | QueryExpr::CreateServer(_)
2066        | QueryExpr::DropServer(_)
2067        | QueryExpr::CreateForeignTable(_)
2068        | QueryExpr::DropForeignTable(_) => Some((IntentExclusive, Exclusive)),
2069
2070        // Admin / control — skip intent locks. `SET TENANT`,
2071        // `BEGIN / COMMIT / ROLLBACK`, `SET CONFIG`, `SHOW CONFIG`,
2072        // `VACUUM`, etc. don't touch collection data the same way
2073        // and the existing transaction layer already serialises the
2074        // pieces that matter.
2075        _ => None,
2076    }
2077}
2078
2079/// Best-effort collection inventory for an expression. Used to pick
2080/// `Collection(...)` resources for the intent-lock guard. Overshoots
2081/// are fine (take an extra IS, benign); undershoots leak writes past
2082/// DDL X locks, so err on the side of listing more names.
2083pub(super) fn collections_referenced(expr: &QueryExpr) -> Vec<String> {
2084    let mut out = Vec::new();
2085    walk_collections(expr, &mut out);
2086    out.sort();
2087    out.dedup();
2088    out
2089}
2090
2091fn walk_collections(expr: &QueryExpr, out: &mut Vec<String>) {
2092    match expr {
2093        QueryExpr::Table(t) => out.push(t.table.clone()),
2094        QueryExpr::Join(j) => {
2095            walk_collections(&j.left, out);
2096            walk_collections(&j.right, out);
2097        }
2098        QueryExpr::Insert(i) => out.push(i.table.clone()),
2099        QueryExpr::Update(u) => out.push(u.table.clone()),
2100        QueryExpr::Delete(d) => out.push(d.table.clone()),
2101        QueryExpr::QueueSelect(q) => out.push(q.queue.clone()),
2102
2103        // DDL — include the target collection so DDL takes
2104        // `(Collection, X)` and blocks concurrent readers / writers
2105        // on the same collection. Other collections stay live
2106        // because Global is still IX.
2107        QueryExpr::CreateTable(q) => out.push(q.name.clone()),
2108        QueryExpr::CreateCollection(q) => out.push(q.name.clone()),
2109        QueryExpr::CreateVector(q) => out.push(q.name.clone()),
2110        QueryExpr::DropTable(q) => out.push(q.name.clone()),
2111        QueryExpr::DropGraph(q) => out.push(q.name.clone()),
2112        QueryExpr::DropVector(q) => out.push(q.name.clone()),
2113        QueryExpr::DropDocument(q) => out.push(q.name.clone()),
2114        QueryExpr::DropKv(q) => out.push(q.name.clone()),
2115        QueryExpr::DropCollection(q) => out.push(q.name.clone()),
2116        QueryExpr::Truncate(q) => out.push(q.name.clone()),
2117        QueryExpr::AlterTable(q) => out.push(q.name.clone()),
2118        QueryExpr::CreateIndex(q) => out.push(q.table.clone()),
2119        QueryExpr::DropIndex(q) => out.push(q.table.clone()),
2120        QueryExpr::CreateTimeSeries(q) => out.push(q.name.clone()),
2121        QueryExpr::CreateMetric(q) => out.push(q.path.clone()),
2122        QueryExpr::AlterMetric(q) => out.push(q.path.clone()),
2123        QueryExpr::CreateSlo(q) => out.push(q.path.clone()),
2124        QueryExpr::DropTimeSeries(q) => out.push(q.name.clone()),
2125        QueryExpr::CreateQueue(q) => out.push(q.name.clone()),
2126        QueryExpr::AlterQueue(q) => out.push(q.name.clone()),
2127        QueryExpr::DropQueue(q) => out.push(q.name.clone()),
2128        QueryExpr::QueueCommand(QueueCommand::Move {
2129            source,
2130            destination,
2131            ..
2132        }) => {
2133            out.push(source.clone());
2134            out.push(destination.clone());
2135        }
2136        QueryExpr::CreatePolicy(q) => out.push(q.table.clone()),
2137        QueryExpr::CreateView(q) => out.push(q.name.clone()),
2138        QueryExpr::DropView(q) => out.push(q.name.clone()),
2139        QueryExpr::RefreshMaterializedView(q) => out.push(q.name.clone()),
2140
2141        // Vector / Hybrid / Graph / Path / commands reference
2142        // collections through fields whose shape varies; without a
2143        // uniform accessor we fall back to the global lock only —
2144        // benign because every runtime path still holds the global
2145        // mode.
2146        _ => {}
2147    }
2148}
2149
2150impl RedDBRuntime {
2151    pub fn in_memory() -> RedDBResult<Self> {
2152        Self::with_options(RedDBOptions::in_memory())
2153    }
2154
2155    pub fn flush(&self) -> RedDBResult<()> {
2156        self.inner
2157            .db
2158            .flush()
2159            .map_err(|err| RedDBError::Internal(err.to_string()))
2160    }
2161
2162    /// Handle to the intent-lock manager for tests + introspection.
2163    /// Production code acquires via `LockerGuard::new(rt.lock_manager())`
2164    /// rather than touching the manager directly.
2165    pub fn lock_manager(&self) -> std::sync::Arc<crate::storage::transaction::lock::LockManager> {
2166        self.inner.lock_manager.clone()
2167    }
2168
2169    /// Process-local governance registry for managed policy/config guardrails.
2170    pub fn config_registry(&self) -> std::sync::Arc<crate::auth::registry::ConfigRegistry> {
2171        self.inner.config_registry.clone()
2172    }
2173
2174    pub fn query_audit(&self) -> std::sync::Arc<crate::runtime::query_audit::QueryAuditStream> {
2175        self.inner.query_audit.clone()
2176    }
2177
2178    pub fn control_events_require_persistence(&self) -> bool {
2179        self.inner.control_event_config.require_persistence()
2180    }
2181
2182    pub fn control_event_config(&self) -> crate::runtime::control_events::ControlEventConfig {
2183        self.inner.control_event_config
2184    }
2185
2186    pub fn control_event_ledger(
2187        &self,
2188    ) -> Arc<dyn crate::runtime::control_events::ControlEventLedger> {
2189        self.inner.control_event_ledger.read().clone()
2190    }
2191
2192    #[doc(hidden)]
2193    pub fn replace_control_event_ledger_for_tests(
2194        &self,
2195        ledger: Arc<dyn crate::runtime::control_events::ControlEventLedger>,
2196    ) {
2197        *self.inner.control_event_ledger.write() = ledger;
2198    }
2199
2200    #[inline(never)]
2201    pub fn with_options(options: RedDBOptions) -> RedDBResult<Self> {
2202        Self::with_pool(options, ConnectionPoolConfig::default())
2203    }
2204
2205    pub fn with_pool(
2206        options: RedDBOptions,
2207        pool_config: ConnectionPoolConfig,
2208    ) -> RedDBResult<Self> {
2209        // PLAN.md Phase 9.1 — capture wall-clock before storage
2210        // open so the cold-start phase markers can be backfilled
2211        // once Lifecycle is constructed below. Storage open
2212        // encapsulates auto-restore + WAL replay; we treat the
2213        // whole window as one combined "restore" + "wal_replay"
2214        // phase split at the same boundary because the storage
2215        // layer doesn't yet emit a finer signal.
2216        let boot_open_start_ms = std::time::SystemTime::now()
2217            .duration_since(std::time::UNIX_EPOCH)
2218            .map(|d| d.as_millis() as u64)
2219            .unwrap_or(0);
2220        let embedded_single_file = options.storage_profile.deploy_profile
2221            == crate::storage::DeployProfile::Embedded
2222            && options.storage_profile.packaging == crate::storage::StoragePackaging::SingleFile;
2223        let db = Arc::new(
2224            RedDB::open_with_options(&options)
2225                .map_err(|err| RedDBError::Internal(err.to_string()))?,
2226        );
2227        let result_blob_cache_config = if embedded_single_file {
2228            crate::storage::cache::BlobCacheConfig::default()
2229        } else {
2230            crate::storage::cache::BlobCacheConfig::default().with_l2_path(
2231                reddb_file::layout::result_cache_l2_path(
2232                    &options.resolved_path(reddb_file::default_database_path()),
2233                ),
2234            )
2235        };
2236        let result_blob_cache =
2237            crate::storage::cache::BlobCache::open_with_l2(result_blob_cache_config).map_err(
2238                |err| RedDBError::Internal(format!("open result Blob Cache L2 failed: {err:?}")),
2239            )?;
2240        let storage_ready_ms = std::time::SystemTime::now()
2241            .duration_since(std::time::UNIX_EPOCH)
2242            .map(|d| d.as_millis() as u64)
2243            .unwrap_or(0);
2244
2245        let runtime = Self {
2246            inner: Arc::new(RuntimeInner {
2247                db: db.clone(),
2248                layout: PhysicalLayout::from_options(&options),
2249                embedded_single_file,
2250                indices: IndexCatalog::register_default_vector_graph(
2251                    options.has_capability(crate::api::Capability::Table),
2252                    options.has_capability(crate::api::Capability::Graph),
2253                ),
2254                pool_config,
2255                pool: Mutex::new(PoolState::default()),
2256                started_at_unix_ms: SystemTime::now()
2257                    .duration_since(UNIX_EPOCH)
2258                    .unwrap_or_default()
2259                    .as_millis(),
2260                probabilistic: super::probabilistic_store::ProbabilisticStore::new(),
2261                index_store: super::index_store::IndexStore::new(),
2262                cdc: crate::replication::cdc::CdcBuffer::new(100_000),
2263                backup_scheduler: crate::replication::scheduler::BackupScheduler::new(3600),
2264                query_cache: parking_lot::RwLock::new(
2265                    crate::storage::query::planner::cache::PlanCache::new(1000),
2266                ),
2267                result_cache: parking_lot::RwLock::new((
2268                    HashMap::new(),
2269                    std::collections::VecDeque::new(),
2270                )),
2271                result_blob_cache,
2272                result_blob_entries: parking_lot::RwLock::new((
2273                    HashMap::new(),
2274                    std::collections::VecDeque::new(),
2275                )),
2276                ask_answer_cache_entries: parking_lot::RwLock::new((
2277                    HashSet::new(),
2278                    std::collections::VecDeque::new(),
2279                )),
2280                result_cache_shadow_divergences: std::sync::atomic::AtomicU64::new(0),
2281                result_cache_hits: std::sync::atomic::AtomicU64::new(0),
2282                result_cache_misses: std::sync::atomic::AtomicU64::new(0),
2283                result_cache_evictions: std::sync::atomic::AtomicU64::new(0),
2284                ask_daily_spend: parking_lot::RwLock::new(HashMap::new()),
2285                queue_message_locks: parking_lot::RwLock::new(HashMap::new()),
2286                rmw_locks: RmwLockTable::new(),
2287                planner_dirty_tables: parking_lot::RwLock::new(HashSet::new()),
2288                ec_registry: Arc::new(crate::ec::config::EcRegistry::new()),
2289                config_registry: Arc::new(crate::auth::registry::ConfigRegistry::new()),
2290                ec_worker: crate::ec::worker::EcWorker::new(),
2291                auth_store: parking_lot::RwLock::new(None),
2292                oauth_validator: parking_lot::RwLock::new(None),
2293                browser_token_authority: parking_lot::RwLock::new(None),
2294                views: parking_lot::RwLock::new(HashMap::new()),
2295                materialized_views: parking_lot::RwLock::new(
2296                    crate::storage::cache::result::MaterializedViewCache::new(),
2297                ),
2298                retention_sweeper: parking_lot::RwLock::new(
2299                    crate::runtime::retention_sweeper::RetentionSweeperState::new(),
2300                ),
2301                snapshot_manager: Arc::new(
2302                    crate::storage::transaction::snapshot::SnapshotManager::new(),
2303                ),
2304                tx_contexts: parking_lot::RwLock::new(HashMap::new()),
2305                tx_local_tenants: parking_lot::RwLock::new(HashMap::new()),
2306                env_config_overrides: crate::runtime::config_overlay::collect_env_overrides(),
2307                lock_manager: Arc::new({
2308                    // Sourced from the matrix: Tier B key
2309                    // `concurrency.locking.deadlock_timeout_ms`
2310                    // (default 5000). Env var wins at boot so
2311                    // operators can tune without touching red_config.
2312                    let env = crate::runtime::config_overlay::collect_env_overrides();
2313                    let timeout_ms = env
2314                        .get("concurrency.locking.deadlock_timeout_ms")
2315                        .and_then(|raw| raw.parse::<u64>().ok())
2316                        .unwrap_or_else(|| {
2317                            match crate::runtime::config_matrix::default_for(
2318                                "concurrency.locking.deadlock_timeout_ms",
2319                            ) {
2320                                Some(crate::serde_json::Value::Number(n)) => n as u64,
2321                                _ => 5000,
2322                            }
2323                        });
2324                    let cfg = crate::storage::transaction::lock::LockConfig {
2325                        default_timeout: std::time::Duration::from_millis(timeout_ms),
2326                        ..Default::default()
2327                    };
2328                    crate::storage::transaction::lock::LockManager::new(cfg)
2329                }),
2330                rls_policies: parking_lot::RwLock::new(HashMap::new()),
2331                rls_enabled_tables: parking_lot::RwLock::new(HashSet::new()),
2332                foreign_tables: Arc::new(crate::storage::fdw::ForeignTableRegistry::with_builtins()),
2333                pending_tombstones: parking_lot::RwLock::new(HashMap::new()),
2334                pending_versioned_updates: parking_lot::RwLock::new(HashMap::new()),
2335                pending_kv_watch_events: parking_lot::RwLock::new(HashMap::new()),
2336                pending_store_wal_actions: parking_lot::RwLock::new(HashMap::new()),
2337                queue_wait_registry: std::sync::Arc::new(
2338                    crate::runtime::queue_wait_registry::QueueWaitRegistry::new(),
2339                ),
2340                pending_queue_wakes: parking_lot::RwLock::new(HashMap::new()),
2341                tenant_tables: parking_lot::RwLock::new(HashMap::new()),
2342                ddl_epoch: std::sync::atomic::AtomicU64::new(0),
2343                write_gate: Arc::new(crate::runtime::write_gate::WriteGate::from_options(
2344                    &options,
2345                )),
2346                lifecycle: crate::runtime::lifecycle::Lifecycle::new(),
2347                resource_limits: crate::runtime::resource_limits::ResourceLimits::from_env(),
2348                audit_log: {
2349                    // Default audit-log path for the in-memory case
2350                    // sits in the system temp dir; persistent runs
2351                    // place it next to the resolved data file.
2352                    //
2353                    // gh-471 iter 2: route through the resolved
2354                    // `LogDestination`. Performance/Max tiers emit a
2355                    // file-backed log destination under the file-owned
2356                    // support-directory logs tier;
2357                    // lower tiers / ephemeral runs report `Stderr`
2358                    // and we keep the legacy file-next-to-data sink.
2359                    let data_path = if embedded_single_file {
2360                        std::env::temp_dir()
2361                            .join("reddb-embedded-runtime")
2362                            .join(format!("audit-{}", std::process::id()))
2363                    } else {
2364                        options
2365                            .data_path
2366                            .clone()
2367                            .unwrap_or_else(|| std::env::temp_dir().join("reddb"))
2368                    };
2369                    let (audit_dest, _) = crate::api::tier_wiring::current_log_destinations();
2370                    Arc::new(crate::runtime::audit_log::AuditLogger::for_destination(
2371                        &audit_dest,
2372                        &data_path,
2373                    ))
2374                },
2375                control_event_ledger: parking_lot::RwLock::new(Arc::new(
2376                    crate::runtime::control_events::RuntimeLedger::new(db.store()),
2377                )),
2378                control_event_config: options.control_events,
2379                query_audit: Arc::new(crate::runtime::query_audit::QueryAuditStream::new(
2380                    db.store(),
2381                    options.query_audit.clone(),
2382                )),
2383                lease_lifecycle: std::sync::OnceLock::new(),
2384                replica_apply_metrics: std::sync::Arc::new(
2385                    crate::replication::logical::ReplicaApplyMetrics::default(),
2386                ),
2387                quota_bucket: crate::runtime::quota_bucket::QuotaBucket::from_env(),
2388                schema_vocabulary: parking_lot::RwLock::new(
2389                    crate::runtime::schema_vocabulary::SchemaVocabulary::new(),
2390                ),
2391                slow_query_logger: {
2392                    // Issue #205 — slow-query sink lives in the same
2393                    // directory the audit log uses, so backup/restore
2394                    // ships them together. Threshold + sample-pct
2395                    // default conservatively (1 s, 100% sampling) so
2396                    // emitted lines are rare and complete. Operators
2397                    // tune via env / config matrix in a follow-up.
2398                    //
2399                    // gh-471 iter 2: same routing as the audit log —
2400                    // `LogDestination::File(...)` for Performance/Max
2401                    // lands under the file-owned support-directory logs tier;
2402                    // lower tiers fall back to `red-slow.log` in the
2403                    // data directory.
2404                    let fallback_dir = if embedded_single_file {
2405                        std::env::temp_dir()
2406                            .join("reddb-embedded-runtime")
2407                            .join(format!("slow-{}", std::process::id()))
2408                    } else {
2409                        options
2410                            .data_path
2411                            .as_ref()
2412                            .and_then(|p| p.parent().map(std::path::PathBuf::from))
2413                            .unwrap_or_else(|| std::env::temp_dir().join("reddb"))
2414                    };
2415                    let threshold_ms = std::env::var("RED_SLOW_QUERY_THRESHOLD_MS")
2416                        .ok()
2417                        .and_then(|s| s.parse::<u64>().ok())
2418                        .unwrap_or(1000);
2419                    let sample_pct = std::env::var("RED_SLOW_QUERY_SAMPLE_PCT")
2420                        .ok()
2421                        .and_then(|s| s.parse::<u8>().ok())
2422                        .unwrap_or(100);
2423                    let (_, slow_dest) = crate::api::tier_wiring::current_log_destinations();
2424                    crate::telemetry::slow_query_logger::SlowQueryLogger::for_destination(
2425                        &slow_dest,
2426                        &fallback_dir,
2427                        threshold_ms,
2428                        sample_pct,
2429                    )
2430                },
2431                kv_stats: crate::runtime::KvStatsCounters::default(),
2432                metrics_ingest_stats: crate::runtime::MetricsIngestCounters::default(),
2433                metrics_tenant_activity_stats:
2434                    crate::runtime::MetricsTenantActivityCounters::default(),
2435                queue_telemetry: Arc::new(
2436                    crate::runtime::queue_telemetry::QueueTelemetryCounters::default(),
2437                ),
2438                queue_presence: Arc::new(
2439                    crate::storage::queue::presence::ConsumerPresenceRegistry::new(),
2440                ),
2441                vector_introspection: Arc::new(
2442                    crate::storage::vector::introspection::VectorIntrospectionRegistry::new(),
2443                ),
2444                kv_tag_index: crate::runtime::KvTagIndex::default(),
2445                chain_tip_cache: parking_lot::Mutex::new(HashMap::new()),
2446                chain_integrity_broken: parking_lot::Mutex::new(HashMap::new()),
2447                integrity_tombstones: parking_lot::Mutex::new(Vec::new()),
2448                integrity_tombstones_state: std::sync::atomic::AtomicU8::new(0),
2449            }),
2450        };
2451
2452        // Issue #205 — install the process-wide OperatorEvent sink so
2453        // emit sites buried in storage / replication / signal handlers
2454        // can record without threading an `&AuditLogger` through every
2455        // call stack. First registration wins; subsequent in-memory
2456        // runtimes (test harnesses) fall through to tracing+eprintln.
2457        crate::telemetry::operator_event::install_global_audit_sink(Arc::clone(
2458            &runtime.inner.audit_log,
2459        ));
2460
2461        // PLAN.md Phase 9.1 — backfill cold-start phase markers
2462        // from the wall-clock captured before storage open. The
2463        // entire `RedDB::open_with_options` call covers both
2464        // auto-restore (when configured) and WAL replay. We
2465        // record both phases against the same boundary today;
2466        // a follow-up will split them once the storage layer
2467        // surfaces a finer-grained event.
2468        runtime
2469            .inner
2470            .lifecycle
2471            .set_restore_started_at_ms(boot_open_start_ms);
2472        runtime
2473            .inner
2474            .lifecycle
2475            .set_restore_ready_at_ms(storage_ready_ms);
2476        runtime
2477            .inner
2478            .lifecycle
2479            .set_wal_replay_started_at_ms(boot_open_start_ms);
2480        runtime
2481            .inner
2482            .lifecycle
2483            .set_wal_replay_ready_at_ms(storage_ready_ms);
2484
2485        let restored_cdc_lsn = runtime
2486            .inner
2487            .db
2488            .replication
2489            .as_ref()
2490            .map(|repl| {
2491                repl.logical_wal_spool
2492                    .as_ref()
2493                    .map(|spool| spool.current_lsn())
2494                    .unwrap_or(0)
2495            })
2496            .unwrap_or(0)
2497            .max(runtime.config_u64("red.config.timeline.last_archived_lsn", 0));
2498        runtime.inner.cdc.set_current_lsn(restored_cdc_lsn);
2499        runtime.rehydrate_snapshot_xid_floor();
2500        runtime
2501            .bootstrap_system_keyed_collections()
2502            .map_err(|err| RedDBError::Internal(format!("bootstrap system collections: {err}")))?;
2503        runtime.rehydrate_declared_column_schemas();
2504        runtime.rehydrate_runtime_index_registry()?;
2505        runtime
2506            .load_probabilistic_state()
2507            .map_err(|err| RedDBError::Internal(format!("load probabilistic state: {err}")))?;
2508
2509        // Phase 2.5.4: replay `tenant_tables.{table}.column` markers so
2510        // tables declared via `TENANT BY (col)` survive restart. Each
2511        // entry re-registers the auto-policy and flips RLS on again.
2512        runtime.rehydrate_tenant_tables();
2513        // Issue #593 slice 9a — replay persisted materialized-view
2514        // descriptors so `CREATE MATERIALIZED VIEW v AS …` survives a
2515        // restart. Runs after the system-keyed collections bootstrap
2516        // and before the API opens.
2517        runtime.rehydrate_materialized_view_descriptors();
2518        if let Some(repl) = &runtime.inner.db.replication {
2519            repl.wal_buffer.set_current_lsn(restored_cdc_lsn);
2520        }
2521
2522        // Save system info to red_config on boot
2523        {
2524            let sys = SystemInfo::collect();
2525            runtime.inner.db.store().set_config_tree(
2526                "red.system",
2527                &crate::serde_json::json!({
2528                    "pid": sys.pid,
2529                    "cpu_cores": sys.cpu_cores,
2530                    "total_memory_bytes": sys.total_memory_bytes,
2531                    "available_memory_bytes": sys.available_memory_bytes,
2532                    "os": sys.os,
2533                    "arch": sys.arch,
2534                    "hostname": sys.hostname,
2535                    "started_at": SystemTime::now()
2536                        .duration_since(UNIX_EPOCH)
2537                        .unwrap_or_default()
2538                        .as_millis() as u64
2539                }),
2540            );
2541
2542            // Seed defaults on first boot (only if red_config is empty or missing defaults)
2543            let store = runtime.inner.db.store();
2544            if store
2545                .get_collection("red_config")
2546                .map(|m| m.query_all(|_| true).len())
2547                .unwrap_or(0)
2548                <= 10
2549            {
2550                store.set_config_tree("red.ai", &crate::json!({
2551                    "default": crate::json!({
2552                        "provider": "openai",
2553                        "model": crate::ai::DEFAULT_OPENAI_PROMPT_MODEL
2554                    }),
2555                    "max_embedding_inputs": 256,
2556                    "max_prompt_batch": 256,
2557                    "timeout": crate::json!({ "connect_secs": 10, "read_secs": 90, "write_secs": 30 })
2558                }));
2559                store.set_config_tree(
2560                    "red.server",
2561                    &crate::json!({
2562                        "max_scan_limit": 1000,
2563                        "max_body_size": 1048576,
2564                        "read_timeout_ms": 5000,
2565                        "write_timeout_ms": 5000
2566                    }),
2567                );
2568                store.set_config_tree(
2569                    "red.storage",
2570                    &crate::json!({
2571                        "page_size": 4096,
2572                        "page_cache_capacity": 100000,
2573                        "auto_checkpoint_pages": 1000,
2574                        "snapshot_retention": 16,
2575                        "verify_checksums": true,
2576                        "segment": crate::json!({
2577                            "max_entities": 100000,
2578                            "max_bytes": 268435456_u64,
2579                            "compression_level": 6
2580                        }),
2581                        "hnsw": crate::json!({ "m": 16, "ef_construction": 100, "ef_search": 50 }),
2582                        "ivf": crate::json!({ "n_lists": 100, "n_probes": 10 }),
2583                        "bm25": crate::json!({ "k1": 1.2, "b": 0.75 })
2584                    }),
2585                );
2586                store.set_config_tree(
2587                    "red.search",
2588                    &crate::json!({
2589                        "rag": crate::json!({
2590                            "max_chunks_per_source": 10,
2591                            "max_total_chunks": 25,
2592                            "similarity_threshold": 0.8,
2593                            "graph_depth": 2,
2594                            "min_relevance": 0.3
2595                        }),
2596                        "fusion": crate::json!({
2597                            "vector_weight": 0.5,
2598                            "graph_weight": 0.3,
2599                            "table_weight": 0.2,
2600                            "dedup_threshold": 0.85
2601                        })
2602                    }),
2603                );
2604                store.set_config_tree(
2605                    "red.auth",
2606                    &crate::json!({
2607                        "enabled": false,
2608                        "session_ttl_secs": 3600,
2609                        "require_auth": false
2610                    }),
2611                );
2612                store.set_config_tree(
2613                    "red.query",
2614                    &crate::json!({
2615                        "connection_pool": crate::json!({ "max_connections": 64, "max_idle": 16 }),
2616                        "max_recursion_depth": 1000
2617                    }),
2618                );
2619                store.set_config_tree(
2620                    "red.indexes",
2621                    &crate::json!({
2622                        "auto_select": true,
2623                        "bloom_filter": crate::json!({
2624                            "enabled": true,
2625                            "false_positive_rate": 0.01,
2626                            "prune_on_scan": true
2627                        }),
2628                        "hash": crate::json!({ "enabled": true }),
2629                        "bitmap": crate::json!({ "enabled": true, "max_cardinality": 1000 }),
2630                        "spatial": crate::json!({ "enabled": true })
2631                    }),
2632                );
2633                store.set_config_tree(
2634                    "red.memtable",
2635                    &crate::json!({
2636                        "enabled": true,
2637                        "max_bytes": 67108864_u64,
2638                        "flush_threshold": 0.75
2639                    }),
2640                );
2641                store.set_config_tree(
2642                    "red.probabilistic",
2643                    &crate::json!({
2644                        "hll_registers": 16384,
2645                        "sketch_default_width": 1000,
2646                        "sketch_default_depth": 5,
2647                        "filter_default_capacity": 100000
2648                    }),
2649                );
2650                store.set_config_tree(
2651                    "red.timeseries",
2652                    &crate::json!({
2653                        "default_chunk_size": 1024,
2654                        "compression": crate::json!({
2655                            "timestamps": "delta_of_delta",
2656                            "values": "gorilla_xor"
2657                        }),
2658                        "default_retention_days": 0
2659                    }),
2660                );
2661                store.set_config_tree(
2662                    "red.queue",
2663                    &crate::json!({
2664                        "default_max_size": 0,
2665                        "default_max_attempts": 3,
2666                        "visibility_timeout_ms": 30000,
2667                        "consumer_idle_timeout_ms": 60000
2668                    }),
2669                );
2670                store.set_config_tree(
2671                    "red.backup",
2672                    &crate::json!({
2673                        "enabled": false,
2674                        "interval_secs": 3600,
2675                        "retention_count": 24,
2676                        "upload": false,
2677                        "backend": "local"
2678                    }),
2679                );
2680                store.set_config_tree(
2681                    "red.wal",
2682                    &crate::json!({
2683                        "archive": crate::json!({
2684                            "enabled": false,
2685                            "retention_hours": 168,
2686                            "prefix": reddb_file::backup_wal_prefix("")
2687                        })
2688                    }),
2689                );
2690                store.set_config_tree(
2691                    "red.cdc",
2692                    &crate::json!({
2693                        "enabled": true,
2694                        "buffer_size": 100000
2695                    }),
2696                );
2697                store.set_config_tree(
2698                    "red.config.secret",
2699                    &crate::json!({
2700                        "auto_encrypt": true,
2701                        "auto_decrypt": true
2702                    }),
2703                );
2704            }
2705
2706            // Perf-parity config matrix: heal the Tier A (critical)
2707            // keys unconditionally on every boot. Idempotent — only
2708            // writes the default when the key is missing. Keeps
2709            // `SHOW CONFIG` showing every guarantee the operator has
2710            // (durability.mode, concurrency.locking.enabled, …) even
2711            // on long-running datadirs that predate the matrix.
2712            crate::runtime::config_matrix::heal_critical_keys(store.as_ref());
2713            seed_storage_deploy_config(store.as_ref(), options.storage_profile);
2714
2715            // Phase 5 — Lehman-Yao runtime flag. Read the Tier A
2716            // `storage.btree.lehman_yao` value from the matrix (env
2717            // > file > red_config > default) and publish it to the
2718            // storage layer's atomic so the B-tree read / split
2719            // paths can branch without re-reading the config on
2720            // every hot-path call.
2721            let lehman_yao = runtime.config_bool("storage.btree.lehman_yao", true);
2722            crate::storage::engine::btree::lehman_yao::set_enabled(lehman_yao);
2723            if lehman_yao {
2724                tracing::info!(
2725                    "storage.btree.lehman_yao=true — lock-free concurrent descent enabled"
2726                );
2727            }
2728
2729            // Config file overlay — mounted `/etc/reddb/config.json`
2730            // (override path via REDDB_CONFIG_FILE). Writes keys with
2731            // write-if-absent semantics so a later user `SET CONFIG`
2732            // always wins. Missing file = silent no-op.
2733            let overlay_path = crate::runtime::config_overlay::config_file_path();
2734            let _ =
2735                crate::runtime::config_overlay::apply_config_file(store.as_ref(), &overlay_path);
2736        }
2737
2738        // VCS ("Git for Data") — create the `red_*` metadata
2739        // collections on first boot. Idempotent: `get_or_create_collection`
2740        // is a no-op if the collection already exists.
2741        {
2742            let store = runtime.inner.db.store();
2743            for name in crate::application::vcs_collections::ALL {
2744                let _ = store.get_or_create_collection(*name);
2745            }
2746            // Seed VCS config namespace with sensible defaults on first
2747            // boot, matching the pattern used by red.ai / red.storage.
2748            store.set_config_tree(
2749                crate::application::vcs_collections::CONFIG_NAMESPACE,
2750                &crate::json!({
2751                    "default_branch": "main",
2752                    "author": crate::json!({
2753                        "name": "reddb",
2754                        "email": "reddb@localhost"
2755                    }),
2756                    "protected_branches": crate::json!(["main"]),
2757                    "closure": crate::json!({
2758                        "enabled": true,
2759                        "lazy": true
2760                    }),
2761                    "merge": crate::json!({
2762                        "default_strategy": "auto",
2763                        "fast_forward": true
2764                    })
2765                }),
2766            );
2767        }
2768
2769        // Migrations — create the `red_migrations` / `red_migration_deps`
2770        // system collections on first boot. Idempotent.
2771        {
2772            let store = runtime.inner.db.store();
2773            for name in crate::application::migration_collections::ALL {
2774                let _ = store.get_or_create_collection(*name);
2775            }
2776        }
2777
2778        // Topology graph (#803) — ensure the built-in `red.topology.cluster`
2779        // graph collection (declared WITH ANALYTICS) and its metadata sidecar
2780        // exist. Idempotent and survives restarts via the WAL-backed contract.
2781        let _ = crate::application::topology_collections::ensure(&runtime);
2782
2783        // Start background maintenance thread (context index refresh +
2784        // session purge). Held by a WEAK reference to `RuntimeInner`
2785        // so dropping the last `RedDBRuntime` handle actually releases
2786        // the underlying Arc<Pager> (and its file lock). Polling at
2787        // 200ms means shutdown latency is bounded; the real 60-second
2788        // work cadence is tracked independently via a `last_work`
2789        // timestamp.
2790        //
2791        // The previous version captured `rt = runtime.clone()` by
2792        // strong reference and ran an unterminated `loop`, which held
2793        // Arc<RuntimeInner> forever — reopening a persistent database
2794        // in the same process failed with "Database is locked" because
2795        // the pager could never drop. See the regression test
2796        // `finding_1_select_after_bulk_insert_persistent_reopen`.
2797        {
2798            let weak = Arc::downgrade(&runtime.inner);
2799            std::thread::Builder::new()
2800                .name("reddb-maintenance".into())
2801                .spawn(move || {
2802                    let tick = std::time::Duration::from_millis(200);
2803                    let work_interval = std::time::Duration::from_secs(60);
2804                    let mut last_work = std::time::Instant::now();
2805                    loop {
2806                        std::thread::sleep(tick);
2807                        let Some(inner) = weak.upgrade() else {
2808                            // All strong references dropped — the
2809                            // runtime is gone, exit cleanly.
2810                            break;
2811                        };
2812                        if last_work.elapsed() >= work_interval {
2813                            let _stats = inner.db.store().context_index().stats();
2814                            last_work = std::time::Instant::now();
2815                        }
2816                    }
2817                })
2818                .ok();
2819        }
2820
2821        // Start backup scheduler if enabled via red_config
2822        {
2823            let store = runtime.inner.db.store();
2824            let mut backup_enabled = false;
2825            let mut backup_interval = 3600u64;
2826
2827            if let Some(manager) = store.get_collection("red_config") {
2828                manager.for_each_entity(|entity| {
2829                    if let Some(row) = entity.data.as_row() {
2830                        let key = row.get_field("key").and_then(|v| match v {
2831                            crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
2832                            _ => None,
2833                        });
2834                        let val = row.get_field("value");
2835                        if key == Some("red.config.backup.enabled") {
2836                            backup_enabled = match val {
2837                                Some(crate::storage::schema::Value::Boolean(true)) => true,
2838                                Some(crate::storage::schema::Value::Text(s)) => &**s == "true",
2839                                _ => false,
2840                            };
2841                        } else if key == Some("red.config.backup.interval_secs") {
2842                            if let Some(crate::storage::schema::Value::Integer(n)) = val {
2843                                backup_interval = *n as u64;
2844                            }
2845                        }
2846                    }
2847                    true
2848                });
2849            }
2850
2851            if backup_enabled {
2852                runtime.inner.backup_scheduler.set_interval(backup_interval);
2853                let rt = runtime.clone();
2854                runtime
2855                    .inner
2856                    .backup_scheduler
2857                    .start(move || rt.trigger_backup().map_err(|e| format!("{}", e)));
2858            }
2859        }
2860
2861        // Load EC registry from red_config and start worker
2862        {
2863            runtime
2864                .inner
2865                .ec_registry
2866                .load_from_config_store(runtime.inner.db.store().as_ref());
2867            if !runtime.inner.ec_registry.async_configs().is_empty() {
2868                runtime.inner.ec_worker.start(
2869                    Arc::clone(&runtime.inner.ec_registry),
2870                    Arc::clone(&runtime.inner.db.store()),
2871                );
2872            }
2873        }
2874
2875        if let crate::replication::ReplicationRole::Replica { primary_addr } =
2876            runtime.inner.db.options().replication.role.clone()
2877        {
2878            let rt = runtime.clone();
2879            std::thread::Builder::new()
2880                .name("reddb-replica".into())
2881                .spawn(move || rt.run_replica_loop(primary_addr))
2882                .ok();
2883        }
2884
2885        // PLAN.md Phase 1 — Lifecycle Contract. Mark Ready once every
2886        // boot stage above has completed (WAL replay, restore-from-
2887        // remote, replica-loop spawn). Health probes flip from 503 to
2888        // 200 here; shutdown begins from this state.
2889        runtime.inner.lifecycle.mark_ready();
2890
2891        // Issue #583 slice 10 — ContinuousMaterializedView scheduler.
2892        // Low-priority background ticker that drains the cache's
2893        // `claim_due_at` set every ~50ms. Holds only a Weak<RuntimeInner>
2894        // so the thread exits cleanly when the runtime drops (≤50ms
2895        // latency between drop and exit). Materialized views without
2896        // a `REFRESH EVERY` clause stay on the manual-refresh path
2897        // and are skipped by `claim_due_at`, so the loop is a no-op
2898        // when no scheduled views exist.
2899        {
2900            let weak_inner = Arc::downgrade(&runtime.inner);
2901            std::thread::Builder::new()
2902                .name("reddb-mv-scheduler".into())
2903                .spawn(move || loop {
2904                    std::thread::sleep(std::time::Duration::from_millis(50));
2905                    let Some(inner) = weak_inner.upgrade() else {
2906                        break;
2907                    };
2908                    let rt = RedDBRuntime { inner };
2909                    rt.refresh_due_materialized_views();
2910                })
2911                .ok();
2912        }
2913
2914        // Issue #584 slice 12 — DeclarativeRetention background sweeper.
2915        // Low-priority ticker that physically reclaims rows whose
2916        // timestamp has fallen beyond the retention window. Holds a
2917        // `Weak<RuntimeInner>` so the thread exits within one tick of
2918        // the runtime drop (graceful shutdown leaves storage consistent
2919        // because each tick goes through the standard DELETE path —
2920        // there is no half-finished mutation state to clean up). The
2921        // tick interval is intentionally longer than the MV scheduler
2922        // (500ms) because retention is order-of-seconds at minimum.
2923        if !runtime.write_gate().is_read_only() {
2924            let weak_inner = Arc::downgrade(&runtime.inner);
2925            std::thread::Builder::new()
2926                .name("reddb-retention-sweeper".into())
2927                .spawn(move || loop {
2928                    std::thread::sleep(std::time::Duration::from_millis(500));
2929                    let Some(inner) = weak_inner.upgrade() else {
2930                        break;
2931                    };
2932                    let rt = RedDBRuntime { inner };
2933                    rt.sweep_retention_tick(
2934                        crate::runtime::retention_sweeper::DEFAULT_SWEEPER_BATCH,
2935                    );
2936                })
2937                .ok();
2938        }
2939
2940        Ok(runtime)
2941    }
2942
2943    fn rehydrate_snapshot_xid_floor(&self) {
2944        let store = self.inner.db.store();
2945        for collection in store.list_collections() {
2946            let Some(manager) = store.get_collection(&collection) else {
2947                continue;
2948            };
2949            for entity in manager.query_all(|_| true) {
2950                self.inner
2951                    .snapshot_manager
2952                    .observe_committed_xid(entity.xmin);
2953                self.inner
2954                    .snapshot_manager
2955                    .observe_committed_xid(entity.xmax);
2956            }
2957        }
2958    }
2959
2960    /// Provision an empty Table-shaped collection that backs a
2961    /// `CREATE MATERIALIZED VIEW v` (issue #594 slice 9b of #575).
2962    /// `SELECT FROM v` reads this collection directly; the rewriter is
2963    /// configured to skip materialized views so the body is no longer
2964    /// substituted. REFRESH still writes to the cache slot — wiring it
2965    /// into this backing collection is the job of slice 9c.
2966    ///
2967    /// Idempotent: re-running for the same name leaves the existing
2968    /// collection in place (mirrors `CREATE TABLE IF NOT EXISTS`
2969    /// semantics). This keeps `CREATE OR REPLACE MATERIALIZED VIEW v`
2970    /// cheap — the body change does not invalidate already-buffered
2971    /// rows. Until 9c lands the backing is always empty anyway.
2972    pub(crate) fn ensure_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
2973        let store = self.inner.db.store();
2974        let mut changed = false;
2975        if store.get_collection(name).is_none() {
2976            store.get_or_create_collection(name);
2977            changed = true;
2978        }
2979        if self.inner.db.collection_contract(name).is_none() {
2980            self.inner
2981                .db
2982                .save_collection_contract(system_keyed_collection_contract(
2983                    name,
2984                    crate::catalog::CollectionModel::Table,
2985                ))
2986                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2987            changed = true;
2988        }
2989        if changed {
2990            self.inner
2991                .db
2992                .persist_metadata()
2993                .map_err(|err| RedDBError::Internal(err.to_string()))?;
2994        }
2995        Ok(())
2996    }
2997
2998    /// Inverse of [`ensure_materialized_view_backing`] — drops the
2999    /// backing collection on `DROP MATERIALIZED VIEW v`. No-op when
3000    /// the collection was never created (e.g. a `DROP MATERIALIZED
3001    /// VIEW IF EXISTS v` against an unknown name).
3002    pub(crate) fn drop_materialized_view_backing(&self, name: &str) -> RedDBResult<()> {
3003        let store = self.inner.db.store();
3004        if store.get_collection(name).is_none() {
3005            return Ok(());
3006        }
3007        store
3008            .drop_collection(name)
3009            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3010        // The contract may have been dropped already (DROP TABLE path)
3011        // — ignore "not found" errors by checking presence first.
3012        if self.inner.db.collection_contract(name).is_some() {
3013            self.inner
3014                .db
3015                .remove_collection_contract(name)
3016                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3017        }
3018        self.invalidate_result_cache();
3019        self.inner
3020            .db
3021            .persist_metadata()
3022            .map_err(|err| RedDBError::Internal(err.to_string()))?;
3023        Ok(())
3024    }
3025
3026    fn bootstrap_system_keyed_collections(&self) -> RedDBResult<()> {
3027        let mut changed = false;
3028        for (name, model) in [
3029            ("red.config", crate::catalog::CollectionModel::Config),
3030            ("red.vault", crate::catalog::CollectionModel::Vault),
3031            // Issue #593 — materialized-view catalog. One row per
3032            // `CREATE MATERIALIZED VIEW`; rehydrated at boot before
3033            // the API opens.
3034            (
3035                crate::runtime::continuous_materialized_view::CATALOG_COLLECTION,
3036                crate::catalog::CollectionModel::Config,
3037            ),
3038        ] {
3039            if self.inner.db.store().get_collection(name).is_none() {
3040                self.inner.db.store().get_or_create_collection(name);
3041                changed = true;
3042            }
3043            if self.inner.db.collection_contract(name).is_none() {
3044                self.inner
3045                    .db
3046                    .save_collection_contract(system_keyed_collection_contract(name, model))
3047                    .map_err(|err| RedDBError::Internal(err.to_string()))?;
3048                changed = true;
3049            }
3050        }
3051        if changed {
3052            self.inner
3053                .db
3054                .persist_metadata()
3055                .map_err(|err| RedDBError::Internal(err.to_string()))?;
3056        }
3057        Ok(())
3058    }
3059
3060    pub fn db(&self) -> Arc<RedDB> {
3061        Arc::clone(&self.inner.db)
3062    }
3063
3064    /// Direct access to the runtime's secondary-index store.
3065    /// Used by bulk-insert entry points (gRPC binary bulk, HTTP bulk,
3066    /// wire bulk) that need to push new rows through the per-index
3067    /// maintenance hook after `store.bulk_insert` returns.
3068    pub fn index_store_ref(&self) -> &super::index_store::IndexStore {
3069        &self.inner.index_store
3070    }
3071
3072    /// Apply a DDL event to the schema-vocabulary reverse index
3073    /// (issue #120). Called by DDL execution paths after the catalog
3074    /// mutation has succeeded so the index never holds entries for
3075    /// half-applied DDL.
3076    pub(crate) fn schema_vocabulary_apply(
3077        &self,
3078        event: crate::runtime::schema_vocabulary::DdlEvent,
3079    ) {
3080        self.inner.schema_vocabulary.write().on_ddl(event);
3081    }
3082
3083    /// Lookup `token` in the schema-vocabulary reverse index. Returns
3084    /// an owned `Vec<VocabHit>` because the underlying read lock
3085    /// cannot be borrowed across the call boundary; the slice from
3086    /// `SchemaVocabulary::lookup` is cloned per hit.
3087    pub fn schema_vocabulary_lookup(
3088        &self,
3089        token: &str,
3090    ) -> Vec<crate::runtime::schema_vocabulary::VocabHit> {
3091        self.inner.schema_vocabulary.read().lookup(token).to_vec()
3092    }
3093
3094    /// Inject an AuthStore into the runtime. Called by server boot
3095    /// after the vault has been bootstrapped, so that `Value::Secret`
3096    /// auto-encrypt/decrypt can reach the vault AES key.
3097    pub fn set_auth_store(&self, store: Arc<crate::auth::store::AuthStore>) {
3098        *self.inner.auth_store.write() = Some(store);
3099    }
3100
3101    /// Snapshot the current AuthStore (if any). Used by the wire listener
3102    /// to validate bearer tokens issued via HTTP `/auth/login`.
3103    pub fn auth_store(&self) -> Option<Arc<crate::auth::store::AuthStore>> {
3104        self.inner.auth_store.read().clone()
3105    }
3106
3107    /// Read a vault KV secret from the configured AuthStore, if present.
3108    pub fn vault_kv_get(&self, key: &str) -> Option<String> {
3109        self.inner
3110            .auth_store
3111            .read()
3112            .as_ref()
3113            .and_then(|store| store.vault_kv_get(key))
3114    }
3115
3116    /// Write a vault KV secret and fail if the encrypted vault write is
3117    /// unavailable or cannot be made durable.
3118    pub fn vault_kv_try_set(&self, key: String, value: String) -> RedDBResult<()> {
3119        let store = self.inner.auth_store.read().clone().ok_or_else(|| {
3120            RedDBError::Query("secret storage requires an enabled, unsealed vault".to_string())
3121        })?;
3122        store
3123            .vault_kv_try_set(key, value)
3124            .map_err(|err| RedDBError::Query(err.to_string()))
3125    }
3126
3127    /// Inject an `OAuthValidator` into the runtime. When set, HTTP and
3128    /// wire transports try OAuth JWT validation before falling back to
3129    /// the local AuthStore lookup. Pass `None` to disable.
3130    pub fn set_oauth_validator(&self, validator: Option<Arc<crate::auth::oauth::OAuthValidator>>) {
3131        *self.inner.oauth_validator.write() = validator;
3132    }
3133
3134    /// Returns a clone of the configured `OAuthValidator` Arc, if any.
3135    /// Hot path: called per HTTP request when an Authorization header
3136    /// is present, so we hand back a cheap Arc clone.
3137    pub fn oauth_validator(&self) -> Option<Arc<crate::auth::oauth::OAuthValidator>> {
3138        self.inner.oauth_validator.read().clone()
3139    }
3140
3141    /// Inject the browser-token authority (issue #936). When set, the
3142    /// RedWire WS handshake accepts the short-lived access JWT it mints
3143    /// (alongside, and tried before, the federated OAuth validator), and
3144    /// the `/auth/browser/*` HTTP endpoints can issue/rotate the pair.
3145    /// `None` leaves the browser credential flow inert.
3146    pub fn set_browser_token_authority(
3147        &self,
3148        authority: Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>>,
3149    ) {
3150        *self.inner.browser_token_authority.write() = authority;
3151    }
3152
3153    /// Snapshot the browser-token authority, if wired. Read on the WS
3154    /// handshake path and by the `/auth/browser/*` handlers; a cheap Arc
3155    /// clone keeps the lock hold short.
3156    pub fn browser_token_authority(
3157        &self,
3158    ) -> Option<Arc<crate::auth::browser_token::BrowserTokenAuthority>> {
3159        self.inner.browser_token_authority.read().clone()
3160    }
3161
3162    /// Returns the vault AES key (`red.secret.aes_key`) if an auth
3163    /// store is wired and a key has been generated. Used by the
3164    /// `Value::Secret` encrypt/decrypt pipeline.
3165    pub(crate) fn secret_aes_key(&self) -> Option<[u8; 32]> {
3166        let guard = self.inner.auth_store.read();
3167        guard.as_ref().and_then(|s| s.vault_secret_key())
3168    }
3169
3170    /// Resolve a boolean flag from `red_config`. Defaults to `default`
3171    /// when the key is missing or not coercible. If the same key has
3172    /// been written multiple times (SET CONFIG appends new rows), the
3173    /// most recent entity wins. Env-var overrides
3174    /// (`REDDB_<UP_DOTTED>`) take highest precedence.
3175    pub(crate) fn config_bool(&self, key: &str, default: bool) -> bool {
3176        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3177            if let Some(crate::storage::schema::Value::Boolean(b)) =
3178                crate::runtime::config_overlay::coerce_env_value(key, raw)
3179            {
3180                return b;
3181            }
3182        }
3183        let store = self.inner.db.store();
3184        let Some(manager) = store.get_collection("red_config") else {
3185            return default;
3186        };
3187        let mut result = default;
3188        let mut latest_id: u64 = 0;
3189        manager.for_each_entity(|entity| {
3190            if let Some(row) = entity.data.as_row() {
3191                let entry_key = row.get_field("key").and_then(|v| match v {
3192                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3193                    _ => None,
3194                });
3195                if entry_key == Some(key) {
3196                    let id = entity.id.raw();
3197                    if id >= latest_id {
3198                        latest_id = id;
3199                        result = match row.get_field("value") {
3200                            Some(crate::storage::schema::Value::Boolean(b)) => *b,
3201                            Some(crate::storage::schema::Value::Text(s)) => {
3202                                matches!(s.as_ref(), "true" | "TRUE" | "True" | "1")
3203                            }
3204                            Some(crate::storage::schema::Value::Integer(n)) => *n != 0,
3205                            _ => default,
3206                        };
3207                    }
3208                }
3209            }
3210            true
3211        });
3212        result
3213    }
3214
3215    pub(crate) fn config_u64(&self, key: &str, default: u64) -> u64 {
3216        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3217            if let Some(crate::storage::schema::Value::UnsignedInteger(n)) =
3218                crate::runtime::config_overlay::coerce_env_value(key, raw)
3219            {
3220                return n;
3221            }
3222        }
3223        let store = self.inner.db.store();
3224        let Some(manager) = store.get_collection("red_config") else {
3225            return default;
3226        };
3227        let mut result = default;
3228        let mut latest_id: u64 = 0;
3229        manager.for_each_entity(|entity| {
3230            if let Some(row) = entity.data.as_row() {
3231                let entry_key = row.get_field("key").and_then(|v| match v {
3232                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3233                    _ => None,
3234                });
3235                if entry_key == Some(key) {
3236                    let id = entity.id.raw();
3237                    if id >= latest_id {
3238                        latest_id = id;
3239                        result = match row.get_field("value") {
3240                            Some(crate::storage::schema::Value::Integer(n)) => *n as u64,
3241                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n,
3242                            Some(crate::storage::schema::Value::Text(s)) => {
3243                                s.parse::<u64>().unwrap_or(default)
3244                            }
3245                            _ => default,
3246                        };
3247                    }
3248                }
3249            }
3250            true
3251        });
3252        result
3253    }
3254
3255    pub(crate) fn config_f64(&self, key: &str, default: f64) -> f64 {
3256        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3257            if let Ok(n) = raw.parse::<f64>() {
3258                return n;
3259            }
3260        }
3261        let store = self.inner.db.store();
3262        let Some(manager) = store.get_collection("red_config") else {
3263            return default;
3264        };
3265        let mut result = default;
3266        let mut latest_id: u64 = 0;
3267        manager.for_each_entity(|entity| {
3268            if let Some(row) = entity.data.as_row() {
3269                let entry_key = row.get_field("key").and_then(|v| match v {
3270                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3271                    _ => None,
3272                });
3273                if entry_key == Some(key) {
3274                    let id = entity.id.raw();
3275                    if id >= latest_id {
3276                        latest_id = id;
3277                        result = match row.get_field("value") {
3278                            Some(crate::storage::schema::Value::Float(n)) => *n,
3279                            Some(crate::storage::schema::Value::Integer(n)) => *n as f64,
3280                            Some(crate::storage::schema::Value::UnsignedInteger(n)) => *n as f64,
3281                            Some(crate::storage::schema::Value::Text(s)) => {
3282                                s.parse::<f64>().unwrap_or(default)
3283                            }
3284                            _ => default,
3285                        };
3286                    }
3287                }
3288            }
3289            true
3290        });
3291        result
3292    }
3293
3294    pub(crate) fn config_string(&self, key: &str, default: &str) -> String {
3295        if let Some(raw) = self.inner.env_config_overrides.get(key) {
3296            return raw.clone();
3297        }
3298        let store = self.inner.db.store();
3299        let Some(manager) = store.get_collection("red_config") else {
3300            return default.to_string();
3301        };
3302        let mut result = default.to_string();
3303        let mut latest_id: u64 = 0;
3304        manager.for_each_entity(|entity| {
3305            if let Some(row) = entity.data.as_row() {
3306                let entry_key = row.get_field("key").and_then(|v| match v {
3307                    crate::storage::schema::Value::Text(s) => Some(s.as_ref()),
3308                    _ => None,
3309                });
3310                if entry_key == Some(key) {
3311                    let id = entity.id.raw();
3312                    if id >= latest_id {
3313                        latest_id = id;
3314                        if let Some(crate::storage::schema::Value::Text(value)) =
3315                            row.get_field("value")
3316                        {
3317                            result = value.to_string();
3318                        }
3319                    }
3320                }
3321            }
3322            true
3323        });
3324        result
3325    }
3326
3327    /// Whether `SECRET('...')` literals should be encrypted with the
3328    /// vault AES key on INSERT. Default `true`.
3329    pub(crate) fn secret_auto_encrypt(&self) -> bool {
3330        self.config_bool("red.config.secret.auto_encrypt", true)
3331    }
3332
3333    /// Whether `Value::Secret` columns should be decrypted back to
3334    /// plaintext on SELECT when the vault is unsealed. Default `true`.
3335    /// Turning this off keeps secrets masked as `***` even while the
3336    /// vault is open — useful for audit trails or read-only exports.
3337    pub(crate) fn secret_auto_decrypt(&self) -> bool {
3338        self.config_bool("red.config.secret.auto_decrypt", true)
3339    }
3340
3341    /// Walk every record in `result` and swap `Value::Secret(bytes)`
3342    /// for the decrypted plaintext when the runtime has the vault
3343    /// AES key AND `red.config.secret.auto_decrypt = true`. If the
3344    /// key is missing, the vault is sealed, or auto_decrypt is off,
3345    /// secrets are left as `Value::Secret` which every formatter
3346    /// (Display, JSON) already masks as `***`.
3347    pub(crate) fn apply_secret_decryption(&self, result: &mut RuntimeQueryResult) {
3348        if !self.secret_auto_decrypt() {
3349            return;
3350        }
3351        let Some(key) = self.secret_aes_key() else {
3352            return;
3353        };
3354        for record in result.result.records.iter_mut() {
3355            for value in record.values_mut() {
3356                if let Value::Secret(ref bytes) = value {
3357                    if let Some(plain) =
3358                        super::impl_dml::decrypt_secret_payload(&key, bytes.as_slice())
3359                    {
3360                        if let Ok(text) = String::from_utf8(plain) {
3361                            *value = Value::text(text);
3362                        }
3363                    }
3364                }
3365            }
3366        }
3367    }
3368
3369    /// Emit a CDC change event and replicate to WAL buffer.
3370    /// Create a `MutationEngine` bound to this runtime.
3371    ///
3372    /// The engine is cheap to construct (no allocation) and should be
3373    /// dropped after `apply` returns. Use this from application-layer
3374    /// `create_row` / `create_rows_batch` instead of calling
3375    /// `bulk_insert` + `index_entity_insert` + `cdc_emit` separately.
3376    pub(crate) fn mutation_engine(&self) -> crate::runtime::mutation::MutationEngine<'_> {
3377        crate::runtime::mutation::MutationEngine::new(self)
3378    }
3379
3380    /// Public-mutation gate snapshot (PLAN.md W1).
3381    ///
3382    /// Surfaces that accept untrusted client requests (SQL DML/DDL,
3383    /// gRPC mutating RPCs, HTTP/native wire mutations, admin
3384    /// maintenance, serverless lifecycle) call `check_write` before
3385    /// dispatching to storage. Returns `RedDBError::ReadOnly` on any
3386    /// instance running as a replica or with `options.read_only =
3387    /// true`. The replica internal logical-WAL apply path reaches into
3388    /// the store directly and never calls this method, so legitimate
3389    /// replica catch-up still works.
3390    pub fn check_write(&self, kind: crate::runtime::write_gate::WriteKind) -> RedDBResult<()> {
3391        self.inner.write_gate.check(kind)
3392    }
3393
3394    /// Read-only handle to the gate, useful for transports that want
3395    /// to surface the policy in health/status output without taking on
3396    /// a dependency on the concrete enum.
3397    pub fn write_gate(&self) -> &crate::runtime::write_gate::WriteGate {
3398        &self.inner.write_gate
3399    }
3400
3401    /// Process lifecycle handle (PLAN.md Phase 1). Health probes,
3402    /// admin/shutdown, and signal handlers consult this single
3403    /// state machine.
3404    pub fn lifecycle(&self) -> &crate::runtime::lifecycle::Lifecycle {
3405        &self.inner.lifecycle
3406    }
3407
3408    /// Operator-imposed resource limits (PLAN.md Phase 4.1).
3409    pub fn resource_limits(&self) -> &crate::runtime::resource_limits::ResourceLimits {
3410        &self.inner.resource_limits
3411    }
3412
3413    /// Append-only audit log for admin mutations (PLAN.md Phase 6.5).
3414    pub fn audit_log(&self) -> &crate::runtime::audit_log::AuditLogger {
3415        &self.inner.audit_log
3416    }
3417
3418    /// Shared `Arc` to the audit logger — used by collaborators (the
3419    /// lease lifecycle, future request-context plumbing) that need to
3420    /// keep the logger alive past the runtime's stack frame.
3421    pub fn audit_log_arc(&self) -> Arc<crate::runtime::audit_log::AuditLogger> {
3422        Arc::clone(&self.inner.audit_log)
3423    }
3424
3425    pub(crate) fn emit_control_event(
3426        &self,
3427        kind: crate::runtime::control_events::EventKind,
3428        outcome: crate::runtime::control_events::Outcome,
3429        action: &'static str,
3430        resource: Option<String>,
3431        reason: Option<String>,
3432        extra_fields: Vec<(String, crate::runtime::control_events::Sensitivity)>,
3433    ) -> RedDBResult<()> {
3434        use crate::runtime::control_events::{
3435            ActorRef, ControlEvent, ControlEventCtx, ControlEventLedger, Sensitivity,
3436        };
3437
3438        let tenant = current_tenant();
3439        let principal = current_auth_identity();
3440        let actor_user = principal
3441            .as_ref()
3442            .map(|(principal, _)| UserId::from_parts(tenant.as_deref(), principal));
3443        let actor = actor_user
3444            .as_ref()
3445            .map(ActorRef::User)
3446            .unwrap_or(ActorRef::Anonymous);
3447        let ctx = ControlEventCtx {
3448            actor,
3449            scope: tenant
3450                .as_ref()
3451                .map(|scope| std::borrow::Cow::Borrowed(scope.as_str())),
3452            request_id: Some(std::borrow::Cow::Owned(format!(
3453                "conn-{}",
3454                current_connection_id()
3455            ))),
3456            trace_id: None,
3457        };
3458        let mut fields = std::collections::HashMap::new();
3459        fields.insert(
3460            "connection_id".to_string(),
3461            Sensitivity::raw(current_connection_id().to_string()),
3462        );
3463        if let Some((_, role)) = principal {
3464            fields.insert("actor_role".to_string(), Sensitivity::raw(role.as_str()));
3465        }
3466        for (key, value) in extra_fields {
3467            fields.insert(key, value);
3468        }
3469        let event = ControlEvent {
3470            kind,
3471            outcome,
3472            action: std::borrow::Cow::Borrowed(action),
3473            resource,
3474            reason,
3475            matched_policy_id: None,
3476            fields,
3477        };
3478        let ledger = self.inner.control_event_ledger.read();
3479        match ledger.emit(&ctx, event) {
3480            Ok(_) => Ok(()),
3481            Err(err) if self.inner.control_event_config.require_persistence() => {
3482                Err(RedDBError::Internal(err.to_string()))
3483            }
3484            Err(_) => Ok(()),
3485        }
3486    }
3487
3488    fn policy_mutation_control_ctx<'a>(
3489        &self,
3490        actor: &'a crate::auth::UserId,
3491        tenant: Option<&'a str>,
3492    ) -> crate::runtime::control_events::ControlEventCtx<'a> {
3493        crate::runtime::control_events::ControlEventCtx {
3494            actor: crate::runtime::control_events::ActorRef::User(actor),
3495            scope: tenant.map(std::borrow::Cow::Borrowed),
3496            request_id: Some(std::borrow::Cow::Owned(format!(
3497                "conn-{}",
3498                current_connection_id()
3499            ))),
3500            trace_id: None,
3501        }
3502    }
3503
3504    fn emit_query_audit(
3505        &self,
3506        query: &str,
3507        plan: &QueryAuditPlan,
3508        duration_ms: u64,
3509        result: &RuntimeQueryResult,
3510    ) {
3511        if !self.inner.query_audit.has_rules() {
3512            return;
3513        }
3514        let actor = current_auth_identity().map(|(principal, _)| principal);
3515        let tenant = current_tenant();
3516        let row_count = if result.statement_type == "select" {
3517            result.result.records.len() as u64
3518        } else {
3519            result.affected_rows
3520        };
3521        self.inner
3522            .query_audit
3523            .emit(crate::runtime::query_audit::QueryAuditEvent {
3524                actor,
3525                tenant,
3526                statement_kind: plan.statement_kind,
3527                touched_collections: plan.collections.clone(),
3528                duration_ms,
3529                row_count,
3530                request_id: Some(crate::crypto::uuid::Uuid::new_v7().to_string()),
3531                query_hash: Some(blake3::hash(query.as_bytes()).to_hex().to_string()),
3532            });
3533    }
3534
3535    /// Slice 10 of issue #527 — shared queue telemetry counters
3536    /// (delivered/acked/nacked). Cloned by `queue_delivery.rs` on
3537    /// each transition.
3538    pub(crate) fn queue_telemetry(
3539        &self,
3540    ) -> &crate::runtime::queue_telemetry::QueueTelemetryCounters {
3541        &self.inner.queue_telemetry
3542    }
3543
3544    /// Snapshots of the queue telemetry counters in label-deterministic
3545    /// order for `/metrics` rendering and the integration test.
3546    pub fn queue_telemetry_snapshot(
3547        &self,
3548    ) -> crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3549        crate::runtime::queue_telemetry::QueueTelemetrySnapshot {
3550            delivered: self.inner.queue_telemetry.delivered_snapshot(),
3551            acked: self.inner.queue_telemetry.acked_snapshot(),
3552            nacked: self.inner.queue_telemetry.nacked_snapshot(),
3553            wait_started: self.inner.queue_telemetry.wait_started_snapshot(),
3554            wait_woken: self.inner.queue_telemetry.wait_woken_snapshot(),
3555            wait_timed_out: self.inner.queue_telemetry.wait_timed_out_snapshot(),
3556            wait_cancelled: self.inner.queue_telemetry.wait_cancelled_snapshot(),
3557            wait_duration: self.inner.queue_telemetry.wait_duration_snapshot(),
3558        }
3559    }
3560
3561    /// Issue #742 — consumer presence registry. Heartbeats land here
3562    /// from `QUEUE READ` (and, in a follow-up slice, an explicit
3563    /// `QUEUE HEARTBEAT` command); Red UI and `red.queue_consumers`
3564    /// read snapshots through `queue_consumer_presence_snapshot`.
3565    pub(crate) fn queue_presence(
3566        &self,
3567    ) -> &std::sync::Arc<crate::storage::queue::presence::ConsumerPresenceRegistry> {
3568        &self.inner.queue_presence
3569    }
3570
3571    /// Issue #742 — point-in-time presence snapshot, classifying each
3572    /// `(queue, group, consumer)` as active/stale/expired against the
3573    /// supplied TTL. Wall-clock is read once here so the lifecycle
3574    /// flags inside the snapshot are internally consistent.
3575    pub fn queue_consumer_presence_snapshot(
3576        &self,
3577        ttl_ms: u64,
3578    ) -> Vec<crate::storage::queue::presence::ConsumerPresence> {
3579        let now_ns = std::time::SystemTime::now()
3580            .duration_since(std::time::UNIX_EPOCH)
3581            .map(|d| d.as_nanos() as u64)
3582            .unwrap_or(0);
3583        self.inner.queue_presence.snapshot(now_ns, ttl_ms)
3584    }
3585
3586    /// Issue #742 — active-consumer count per `(queue, group)` for the
3587    /// queue-metadata surface. Stale/expired entries are excluded by
3588    /// definition; they are still visible in the per-row snapshot.
3589    pub fn queue_active_consumer_counts(
3590        &self,
3591        ttl_ms: u64,
3592    ) -> std::collections::HashMap<(String, String), u32> {
3593        let now_ns = std::time::SystemTime::now()
3594            .duration_since(std::time::UNIX_EPOCH)
3595            .map(|d| d.as_nanos() as u64)
3596            .unwrap_or(0);
3597        self.inner
3598            .queue_presence
3599            .count_active_by_group(now_ns, ttl_ms)
3600    }
3601
3602    /// Issue #743 — vector + TurboQuant introspection registry. Engine
3603    /// publish points (collection create, artifact build start /
3604    /// finish, fallback toggle, drop) update this; Red UI and
3605    /// `red.*` vector virtual tables read snapshots through
3606    /// `vector_introspection_snapshot` / `vector_introspection_get`.
3607    pub(crate) fn vector_introspection_registry(
3608        &self,
3609    ) -> &std::sync::Arc<crate::storage::vector::introspection::VectorIntrospectionRegistry> {
3610        &self.inner.vector_introspection
3611    }
3612
3613    /// Issue #743 — full snapshot of every tracked vector collection's
3614    /// `(VectorMetadata, ArtifactMetadata)`. Deterministically ordered
3615    /// by collection name so Red UI tables and tests both see a
3616    /// stable shape.
3617    pub fn vector_introspection_snapshot(
3618        &self,
3619    ) -> Vec<crate::storage::vector::introspection::VectorIntrospection> {
3620        self.inner.vector_introspection.snapshot()
3621    }
3622
3623    /// Issue #743 — single-collection lookup, for the per-collection
3624    /// metadata endpoint Red UI hits when an operator opens one
3625    /// vector's toolbar.
3626    pub fn vector_introspection_get(
3627        &self,
3628        collection: &str,
3629    ) -> Option<crate::storage::vector::introspection::VectorIntrospection> {
3630        self.inner.vector_introspection.get(collection)
3631    }
3632
3633    /// Slice 10 of issue #527 — render-time scan of pending entries
3634    /// per (queue, group) for the `queue_pending_gauge` exposition.
3635    /// Walks `red_queue_meta` live so the gauge cannot drift from
3636    /// the source of truth.
3637    pub fn queue_pending_counts(&self) -> Vec<((String, String), u64)> {
3638        let store = self.inner.db.store();
3639        crate::runtime::impl_queue::pending_counts_by_group(store.as_ref())
3640            .into_iter()
3641            .collect()
3642    }
3643
3644    /// Shared `Arc` to the write gate. Same rationale as
3645    /// `audit_log_arc`: collaborators (lease lifecycle, refresh
3646    /// thread) need a clone-cheap handle they can move into a
3647    /// background thread.
3648    pub fn write_gate_arc(&self) -> Arc<crate::runtime::write_gate::WriteGate> {
3649        Arc::clone(&self.inner.write_gate)
3650    }
3651
3652    /// Serverless writer-lease state machine. `None` when the operator
3653    /// did not opt into lease fencing (`RED_LEASE_REQUIRED` unset).
3654    pub fn lease_lifecycle(&self) -> Option<&Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3655        self.inner.lease_lifecycle.get()
3656    }
3657
3658    /// Install the lease lifecycle. Idempotent; subsequent calls
3659    /// return the previously stored value untouched.
3660    pub fn set_lease_lifecycle(
3661        &self,
3662        lifecycle: Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>,
3663    ) -> Result<(), Arc<crate::runtime::lease_lifecycle::LeaseLifecycle>> {
3664        self.inner.lease_lifecycle.set(lifecycle)
3665    }
3666
3667    /// Reject the call when the requested batch size exceeds
3668    /// `RED_MAX_BATCH_SIZE`. Returns `RedDBError::QuotaExceeded`
3669    /// shaped so the HTTP layer can map it to 413 Payload Too
3670    /// Large (PLAN.md Phase 4.1).
3671    pub fn check_batch_size(&self, requested: usize) -> RedDBResult<()> {
3672        if self.inner.resource_limits.batch_size_exceeded(requested) {
3673            let max = self.inner.resource_limits.max_batch_size.unwrap_or(0);
3674            return Err(RedDBError::QuotaExceeded(format!(
3675                "max_batch_size:{requested}:{max}"
3676            )));
3677        }
3678        Ok(())
3679    }
3680
3681    /// Reject the call when the local DB file exceeds
3682    /// `RED_MAX_DB_SIZE_BYTES`. Reads file metadata once per call —
3683    /// the cost is a single `stat()` syscall, negligible against the
3684    /// I/O the caller is about to do. Returns `QuotaExceeded` shaped
3685    /// for HTTP 507 Insufficient Storage.
3686    pub fn check_db_size(&self) -> RedDBResult<()> {
3687        let Some(limit) = self.inner.resource_limits.max_db_size_bytes else {
3688            return Ok(());
3689        };
3690        if limit == 0 {
3691            return Ok(());
3692        }
3693        let Some(path) = self.inner.db.path() else {
3694            return Ok(());
3695        };
3696        let current = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
3697        if current > limit {
3698            return Err(RedDBError::QuotaExceeded(format!(
3699                "max_db_size_bytes:{current}:{limit}"
3700            )));
3701        }
3702        Ok(())
3703    }
3704
3705    /// Graceful shutdown coordinator (PLAN.md Phase 1.1).
3706    ///
3707    /// Steps, in order, all idempotent across re-entrant calls:
3708    ///   1. Move lifecycle into `ShuttingDown` (concurrent callers
3709    ///      observe `Stopped` after first finishes).
3710    ///   2. Flush WAL + run final checkpoint via `db.flush()` so
3711    ///      every acked write is durable on disk.
3712    ///   3. If `backup_on_shutdown == true` and a remote backend is
3713    ///      configured, run a synchronous `trigger_backup()` so the
3714    ///      remote head reflects the final state.
3715    ///   4. Stamp the report and move to `Stopped`. Subsequent calls
3716    ///      return the cached report without re-running anything.
3717    ///
3718    /// On any error, the runtime is still marked `Stopped` so the
3719    /// process can exit; the caller logs the error context but does
3720    /// not retry the same shutdown — the operator can inspect the
3721    /// report fields to see which step failed.
3722    pub fn graceful_shutdown(
3723        &self,
3724        backup_on_shutdown: bool,
3725    ) -> RedDBResult<crate::runtime::lifecycle::ShutdownReport> {
3726        if !self.inner.lifecycle.begin_shutdown() {
3727            // Someone else already shut down (or is in flight). Return
3728            // the cached report so the HTTP caller and SIGTERM handler
3729            // get the same idempotent answer.
3730            return Ok(self.inner.lifecycle.shutdown_report().unwrap_or_default());
3731        }
3732
3733        let started_ms = std::time::SystemTime::now()
3734            .duration_since(std::time::UNIX_EPOCH)
3735            .map(|d| d.as_millis() as u64)
3736            .unwrap_or(0);
3737        let mut report = crate::runtime::lifecycle::ShutdownReport {
3738            started_at_ms: started_ms,
3739            ..Default::default()
3740        };
3741
3742        // Flush WAL + run any pending checkpoint. Local fsync is
3743        // unconditional — even a lease-lost replica needs its WAL on
3744        // disk before exit so a future restore has the latest tail.
3745        // The remote upload is gated separately so a lost-lease writer
3746        // doesn't clobber the new holder's state on its way out.
3747        let flush_res = self.inner.db.flush_local_only();
3748        report.flushed_wal = flush_res.is_ok();
3749        report.final_checkpoint = flush_res.is_ok();
3750        if let Err(err) = &flush_res {
3751            tracing::error!(
3752                target: "reddb::lifecycle",
3753                error = %err,
3754                "graceful_shutdown: local flush failed"
3755            );
3756        } else if let Err(lease_err) =
3757            self.assert_remote_write_allowed("shutdown/checkpoint_upload")
3758        {
3759            tracing::warn!(
3760                target: "reddb::serverless::lease",
3761                error = %lease_err,
3762                "graceful_shutdown: remote upload skipped — lease not held"
3763            );
3764        } else if let Err(err) = self.inner.db.upload_to_remote_backend() {
3765            tracing::error!(
3766                target: "reddb::lifecycle",
3767                error = %err,
3768                "graceful_shutdown: remote upload failed"
3769            );
3770        }
3771
3772        // Optional final backup. Skipped silently when no remote
3773        // backend is configured — `trigger_backup()` returns Err
3774        // anyway in that case, but logging it as a shutdown failure
3775        // would be misleading on a standalone (no-backend) runtime.
3776        if backup_on_shutdown && self.inner.db.remote_backend.is_some() {
3777            // The trigger_backup gate now reads `WriteKind::Backup`,
3778            // which a replica/read_only instance refuses. That's
3779            // intentional — replicas don't drive backups; only the
3780            // primary does. We still want shutdown to flush its WAL
3781            // even if the backup branch is gated off.
3782            match self.trigger_backup() {
3783                Ok(result) => {
3784                    report.backup_uploaded = result.uploaded;
3785                }
3786                Err(err) => {
3787                    tracing::warn!(
3788                        target: "reddb::lifecycle",
3789                        error = %err,
3790                        "graceful_shutdown: final backup skipped"
3791                    );
3792                }
3793            }
3794        }
3795
3796        let completed_ms = std::time::SystemTime::now()
3797            .duration_since(std::time::UNIX_EPOCH)
3798            .map(|d| d.as_millis() as u64)
3799            .unwrap_or(started_ms);
3800        report.completed_at_ms = completed_ms;
3801        report.duration_ms = completed_ms.saturating_sub(started_ms);
3802
3803        self.inner.lifecycle.finish_shutdown(report.clone());
3804        Ok(report)
3805    }
3806
3807    /// PLAN.md Phase 4.4 — per-caller quota bucket. Always
3808    /// returned; `is_configured()` lets callers short-circuit.
3809    pub fn quota_bucket(&self) -> &crate::runtime::quota_bucket::QuotaBucket {
3810        &self.inner.quota_bucket
3811    }
3812
3813    /// PLAN.md Phase 6.3 — whether at-rest encryption is configured.
3814    /// Reads `RED_ENCRYPTION_KEY` / `RED_ENCRYPTION_KEY_FILE` lazily;
3815    /// returns `("enabled", None)` when a key is loadable, `("error", Some(msg))`
3816    /// when the operator set the env but it doesn't parse, and
3817    /// `("disabled", None)` when no key is configured. The pager
3818    /// hookup is deferred — this accessor surfaces the operator's
3819    /// intent for /admin/status without yet using the key in writes.
3820    pub fn encryption_at_rest_status(&self) -> (&'static str, Option<String>) {
3821        match crate::crypto::page_encryption::key_from_env() {
3822            Ok(Some(_)) => ("enabled", None),
3823            Ok(None) => ("disabled", None),
3824            Err(err) => ("error", Some(err)),
3825        }
3826    }
3827
3828    /// PLAN.md Phase 11.5 — current replica apply health label
3829    /// (`ok`, `gap`, `divergence`, `apply_error`, `connecting`,
3830    /// `stalled_gap`). Read from the persisted `red.replication.state`
3831    /// config key updated by the replica loop. Returns `None` on
3832    /// non-replica instances or when no apply has run yet.
3833    pub fn replica_apply_health(&self) -> Option<String> {
3834        let state = self.config_string("red.replication.state", "");
3835        if state.is_empty() {
3836            None
3837        } else {
3838            Some(state)
3839        }
3840    }
3841
3842    pub fn acquire(&self) -> RedDBResult<RuntimeConnection> {
3843        let mut pool = self
3844            .inner
3845            .pool
3846            .lock()
3847            .map_err(|e| RedDBError::Internal(format!("connection pool lock poisoned: {e}")))?;
3848        if pool.active >= self.inner.pool_config.max_connections {
3849            return Err(RedDBError::Internal(
3850                "connection pool exhausted".to_string(),
3851            ));
3852        }
3853
3854        let id = if let Some(id) = pool.idle.pop() {
3855            id
3856        } else {
3857            let id = pool.next_id;
3858            pool.next_id += 1;
3859            id
3860        };
3861        pool.active += 1;
3862        pool.total_checkouts += 1;
3863        drop(pool);
3864
3865        Ok(RuntimeConnection {
3866            id,
3867            inner: Arc::clone(&self.inner),
3868        })
3869    }
3870
3871    pub fn checkpoint(&self) -> RedDBResult<()> {
3872        // Local fsync always allowed — losing the lease shouldn't
3873        // prevent us from durably persisting what's already in memory.
3874        // The remote upload is the side-effect that risks clobbering a
3875        // peer's state, so it's behind the lease gate.
3876        self.inner.db.flush_local_only().map_err(|err| {
3877            // Issue #205 — local flush failure is a CheckpointFailed
3878            // operator-grade event. The local-flush path also covers
3879            // the WAL fsync we depend on, so a failure here doubles as
3880            // the WalFsyncFailed signal for the runtime entry point.
3881            let msg = err.to_string();
3882            crate::telemetry::operator_event::OperatorEvent::CheckpointFailed {
3883                lsn: 0,
3884                error: msg.clone(),
3885            }
3886            .emit_global();
3887            crate::telemetry::operator_event::OperatorEvent::WalFsyncFailed {
3888                path: "<flush_local_only>".to_string(),
3889                error: msg.clone(),
3890            }
3891            .emit_global();
3892            RedDBError::Engine(msg)
3893        })?;
3894        if let Err(err) = self.assert_remote_write_allowed("checkpoint") {
3895            tracing::warn!(
3896                target: "reddb::serverless::lease",
3897                error = %err,
3898                "checkpoint: skipping remote upload — lease not held"
3899            );
3900            return Ok(());
3901        }
3902        self.inner
3903            .db
3904            .upload_to_remote_backend()
3905            .map_err(|err| RedDBError::Engine(err.to_string()))
3906    }
3907
3908    /// Guard remote-mutating operations on the writer lease.
3909    /// Returns `Ok(())` when no remote backend is configured (the
3910    /// lease is irrelevant) or the lease state is `NotRequired` /
3911    /// `Held`. Returns `RedDBError::ReadOnly` when the lease is
3912    /// `NotHeld`, with an audit-friendly action label so the caller
3913    /// can record the rejection.
3914    pub(crate) fn assert_remote_write_allowed(&self, action: &str) -> RedDBResult<()> {
3915        if self.inner.db.remote_backend.is_none() {
3916            return Ok(());
3917        }
3918        match self.inner.write_gate.lease_state() {
3919            crate::runtime::write_gate::LeaseGateState::NotHeld => {
3920                self.inner.audit_log.record(
3921                    action,
3922                    "system",
3923                    "remote_backend",
3924                    "err: writer lease not held",
3925                    crate::json::Value::Null,
3926                );
3927                Err(RedDBError::ReadOnly(format!(
3928                    "writer lease not held — {action} blocked (serverless fence)"
3929                )))
3930            }
3931            _ => Ok(()),
3932        }
3933    }
3934
3935    pub fn run_maintenance(&self) -> RedDBResult<()> {
3936        self.inner
3937            .db
3938            .run_maintenance()
3939            .map_err(|err| RedDBError::Internal(err.to_string()))
3940    }
3941
3942    pub fn scan_collection(
3943        &self,
3944        collection: &str,
3945        cursor: Option<ScanCursor>,
3946        limit: usize,
3947    ) -> RedDBResult<ScanPage> {
3948        let store = self.inner.db.store();
3949        let manager = store
3950            .get_collection(collection)
3951            .ok_or_else(|| RedDBError::NotFound(collection.to_string()))?;
3952
3953        let mut entities = manager.query_all(|_| true);
3954        entities.sort_by_key(|entity| entity.id.raw());
3955
3956        let offset = cursor.map(|cursor| cursor.offset).unwrap_or(0);
3957        let total = entities.len();
3958        let end = total.min(offset.saturating_add(limit.max(1)));
3959        let items = if offset >= total {
3960            Vec::new()
3961        } else {
3962            entities[offset..end].to_vec()
3963        };
3964        let next = (end < total).then_some(ScanCursor { offset: end });
3965
3966        Ok(ScanPage {
3967            collection: collection.to_string(),
3968            items,
3969            next,
3970            total,
3971        })
3972    }
3973
3974    pub fn catalog(&self) -> CatalogModelSnapshot {
3975        self.inner.db.catalog_model_snapshot()
3976    }
3977
3978    pub fn catalog_consistency_report(&self) -> crate::catalog::CatalogConsistencyReport {
3979        self.inner.db.catalog_consistency_report()
3980    }
3981
3982    pub fn catalog_attention_summary(&self) -> CatalogAttentionSummary {
3983        crate::catalog::attention_summary(&self.catalog())
3984    }
3985
3986    pub fn collection_attention(&self) -> Vec<CollectionDescriptor> {
3987        crate::catalog::collection_attention(&self.catalog())
3988    }
3989
3990    pub fn index_attention(&self) -> Vec<CatalogIndexStatus> {
3991        crate::catalog::index_attention(&self.catalog())
3992    }
3993
3994    pub fn graph_projection_attention(&self) -> Vec<CatalogGraphProjectionStatus> {
3995        crate::catalog::graph_projection_attention(&self.catalog())
3996    }
3997
3998    pub fn analytics_job_attention(&self) -> Vec<CatalogAnalyticsJobStatus> {
3999        crate::catalog::analytics_job_attention(&self.catalog())
4000    }
4001
4002    pub fn stats(&self) -> RuntimeStats {
4003        let pool = runtime_pool_lock(self);
4004        RuntimeStats {
4005            active_connections: pool.active,
4006            idle_connections: pool.idle.len(),
4007            total_checkouts: pool.total_checkouts,
4008            paged_mode: self.inner.db.is_paged(),
4009            started_at_unix_ms: self.inner.started_at_unix_ms,
4010            store: self.inner.db.stats(),
4011            system: SystemInfo::collect(),
4012            result_blob_cache: self.inner.result_blob_cache.stats(),
4013            kv: self.inner.kv_stats.snapshot(),
4014            metrics_ingest: self.inner.metrics_ingest_stats.snapshot(),
4015        }
4016    }
4017
4018    pub(crate) fn record_metrics_ingest(
4019        &self,
4020        accepted_samples: u64,
4021        accepted_series: u64,
4022        rejected_samples: u64,
4023        rejected_series: u64,
4024    ) {
4025        self.inner.metrics_ingest_stats.record(
4026            accepted_samples,
4027            accepted_series,
4028            rejected_samples,
4029            rejected_series,
4030        );
4031    }
4032
4033    pub(crate) fn record_metrics_cardinality_budget_rejections(&self, rejected_series: u64) {
4034        self.inner
4035            .metrics_ingest_stats
4036            .record_cardinality_budget_rejections(rejected_series);
4037    }
4038
4039    pub(crate) fn record_metrics_tenant_activity(
4040        &self,
4041        tenant: &str,
4042        namespace: &str,
4043        operation: &str,
4044    ) {
4045        self.inner
4046            .metrics_tenant_activity_stats
4047            .record(tenant, namespace, operation);
4048    }
4049
4050    pub(crate) fn metrics_tenant_activity_snapshot(
4051        &self,
4052    ) -> Vec<crate::runtime::MetricsTenantActivityStats> {
4053        self.inner.metrics_tenant_activity_stats.snapshot()
4054    }
4055
4056    /// Execute a query under a typed scope override without embedding
4057    /// the tenant / user / role values into the SQL string. Use this
4058    /// from transport middleware (HTTP / gRPC / worker loops) where the
4059    /// scope is resolved from auth claims and the SQL is a parameterised
4060    /// template — avoids the string-concat injection risk of building
4061    /// `WITHIN TENANT '<id>' …` manually, and is drop-in compatible with
4062    /// prepared statements that didn't know about tenancy.
4063    ///
4064    /// Precedence matches the `WITHIN` clause: the passed `scope`
4065    /// overrides `SET LOCAL TENANT`, which overrides `SET TENANT`.
4066    /// The override is pushed on the thread-local scope stack for the
4067    /// duration of the call and popped on return — pool-shared
4068    /// connections cannot leak it across requests.
4069    pub fn execute_query_with_scope(
4070        &self,
4071        query: &str,
4072        scope: crate::runtime::within_clause::ScopeOverride,
4073    ) -> RedDBResult<RuntimeQueryResult> {
4074        if scope.is_empty() {
4075            return self.execute_query(query);
4076        }
4077        let _scope_guard = ScopeOverrideGuard::install(scope);
4078        self.execute_query(query)
4079    }
4080
4081    /// Issue #205 — single lifecycle exit for slow-query logging.
4082    ///
4083    /// `execute_query_inner` does the real work; this wrapper times it
4084    /// and, if elapsed exceeds the configured threshold, hands the
4085    /// triple `(QueryKind, elapsed_ms, sql_redacted, scope)` to the
4086    /// SlowQueryLogger. The threshold + sample_pct were captured at
4087    /// SlowQueryLogger construction (runtime startup), so the per-call
4088    /// cost on below-threshold paths is one relaxed atomic load.
4089    pub fn execute_query(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4090        let started = std::time::Instant::now();
4091        let mut result = self.execute_query_inner(query);
4092        // Issue #765 / S6 — filter integrity-tombstoned rows out of SELECT
4093        // results before they reach any consumer. Fast no-op (one relaxed
4094        // atomic load) unless an input-stream digest mismatch has tombstoned
4095        // a RID range on this store.
4096        if let Ok(ref mut query_result) = result {
4097            if query_result.statement_type == "select" {
4098                self.filter_integrity_tombstoned(&mut query_result.result);
4099            }
4100        }
4101        let elapsed_ms = started.elapsed().as_millis() as u64;
4102
4103        // Build EffectiveScope from the same thread-locals frame-build
4104        // consults — keeps the slow-log row consistent with the audit /
4105        // RLS view of "this statement". `ai_scope()` is the canonical
4106        // builder.
4107        let scope = self.ai_scope();
4108        let kind = match result
4109            .as_ref()
4110            .map(|r| r.statement_type)
4111            .unwrap_or("select")
4112        {
4113            "select" => crate::telemetry::slow_query_logger::QueryKind::Select,
4114            "insert" => crate::telemetry::slow_query_logger::QueryKind::Insert,
4115            "update" => crate::telemetry::slow_query_logger::QueryKind::Update,
4116            "delete" => crate::telemetry::slow_query_logger::QueryKind::Delete,
4117            _ => crate::telemetry::slow_query_logger::QueryKind::Internal,
4118        };
4119        // SQL redaction: pass the raw query through. The slow-query
4120        // logger writes structured JSON so embedded literals stay
4121        // escape-safe at the JSON boundary (proven by
4122        // `adversarial_sql_is_escape_safe` in slow_query_logger.rs).
4123        // PII redaction (e.g. literal masking) is a follow-up.
4124        self.inner
4125            .slow_query_logger
4126            .record(kind, elapsed_ms, query.to_string(), &scope);
4127
4128        if let Ok(ref mut query_result) = result {
4129            if matches!(query_result.statement_type, "insert" | "update" | "delete") {
4130                let bookmark = crate::replication::CausalBookmark::new(
4131                    self.current_replication_term(),
4132                    self.cdc_current_lsn(),
4133                );
4134                query_result.bookmark = Some(bookmark.encode());
4135            }
4136        }
4137
4138        result
4139    }
4140
4141    pub fn causal_session(&self) -> crate::runtime::CausalSession {
4142        crate::runtime::CausalSession {
4143            runtime: self.clone(),
4144            bookmark: None,
4145            wait_timeout: std::time::Duration::from_secs(5),
4146        }
4147    }
4148
4149    pub fn wait_for_bookmark(
4150        &self,
4151        bookmark: &crate::replication::CausalBookmark,
4152        timeout: std::time::Duration,
4153    ) -> RedDBResult<()> {
4154        let deadline = std::time::Instant::now() + timeout;
4155        loop {
4156            let applied_lsn = self.local_contiguous_applied_lsn();
4157            if applied_lsn >= bookmark.commit_lsn() {
4158                return Ok(());
4159            }
4160            let now = std::time::Instant::now();
4161            if now >= deadline {
4162                return Err(RedDBError::InvalidOperation(format!(
4163                    "timed out waiting for causal bookmark lsn {}; applied={}",
4164                    bookmark.commit_lsn(),
4165                    applied_lsn
4166                )));
4167            }
4168            let remaining = deadline.saturating_duration_since(now);
4169            std::thread::sleep(remaining.min(std::time::Duration::from_millis(5)));
4170        }
4171    }
4172
4173    fn local_contiguous_applied_lsn(&self) -> u64 {
4174        match self.inner.db.options().replication.role {
4175            crate::replication::ReplicationRole::Replica { .. } => {
4176                self.config_u64("red.replication.last_applied_lsn", 0)
4177            }
4178            _ => self.cdc_current_lsn(),
4179        }
4180    }
4181
4182    #[inline(never)]
4183    fn execute_query_inner(&self, query: &str) -> RedDBResult<RuntimeQueryResult> {
4184        // ── ULTRA-TURBO: autocommit `SELECT * FROM t WHERE _entity_id = N` ──
4185        //
4186        // Moved above every boot-cost the normal path pays (WITHIN
4187        // strip, SET LOCAL parse, tx_local_tenants read, snapshot
4188        // guard, tracing span, tx_contexts read) because the bench's
4189        // `select_point` scenario was observed at 28× vs PostgreSQL —
4190        // the dominant cost wasn't the entity fetch but the ceremony
4191        // before it. Only fires when there's no ambient transaction
4192        // context or WITHIN override, so the snapshot install we skip
4193        // truly is a no-op for this query.
4194        if !has_scope_override_active()
4195            && !query.trim_start().starts_with("WITHIN")
4196            && !query.trim_start().starts_with("within")
4197            && !self.inner.query_audit.has_rules()
4198            && !self
4199                .inner
4200                .tx_contexts
4201                .read()
4202                .contains_key(&current_connection_id())
4203        {
4204            if let Some(result) = self.try_fast_entity_lookup(query) {
4205                return result;
4206            }
4207        }
4208
4209        // `WITHIN TENANT '<id>' [USER '<u>'] [AS ROLE '<r>'] <stmt>` —
4210        // strip the prefix, push a stack-scoped override, recurse on
4211        // the inner statement, pop on return. Stack lives in a
4212        // thread-local but is balanced by the RAII guard, so a
4213        // pool-shared connection cannot leak the override across
4214        // requests and an early `?` return still pops cleanly.
4215        match crate::runtime::within_clause::try_strip_within_prefix(query) {
4216            Ok(Some((scope, inner))) => {
4217                let _scope_guard = ScopeOverrideGuard::install(scope);
4218                // Re-enter the inner path, NOT `execute_query`, so the
4219                // slow-query lifecycle hook records exactly one row per
4220                // top-level statement (the WITHIN-stripped form would
4221                // double-record).
4222                return self.execute_query_inner(inner);
4223            }
4224            Ok(None) => {}
4225            Err(msg) => return Err(RedDBError::Query(msg)),
4226        }
4227
4228        // `EXPLAIN <stmt>` — introspection. Runs the planner on the
4229        // inner statement (WITHOUT executing it) and returns the
4230        // CanonicalLogicalNode tree as rows so the caller can see the
4231        // operator shape and estimated cost. `EXPLAIN ALTER FOR ...`
4232        // is a distinct schema-diff command and continues down the
4233        // regular SQL path.
4234        if let Some(inner) = strip_explain_prefix(query) {
4235            return self.explain_as_rows(query, inner);
4236        }
4237
4238        // `SET LOCAL TENANT '<id>'` — write the per-transaction tenant
4239        // override and return. Outside a transaction the statement is
4240        // an error (matches PG semantics: SET LOCAL only takes effect
4241        // within an active transaction).
4242        if let Some(value) = parse_set_local_tenant(query)? {
4243            let conn_id = current_connection_id();
4244            if !self.inner.tx_contexts.read().contains_key(&conn_id) {
4245                return Err(RedDBError::Query(
4246                    "SET LOCAL TENANT requires an active transaction".to_string(),
4247                ));
4248            }
4249            self.inner
4250                .tx_local_tenants
4251                .write()
4252                .insert(conn_id, value.clone());
4253            return Ok(RuntimeQueryResult::ok_message(
4254                query.to_string(),
4255                &match &value {
4256                    Some(id) => format!("local tenant set: {id}"),
4257                    None => "local tenant cleared".to_string(),
4258                },
4259                "set_local_tenant",
4260            ));
4261        }
4262
4263        if super::red_schema::is_system_schema_write(query) {
4264            return Err(RedDBError::Query(
4265                super::red_schema::READ_ONLY_ERROR.to_string(),
4266            ));
4267        }
4268
4269        if let Some(create_source) = super::analytics_source_catalog::parse_create_statement(query)?
4270        {
4271            return self.execute_create_analytics_source(query, create_source);
4272        }
4273
4274        // Issue #790 — `READ METRIC <path>` is intentionally rejected at
4275        // v0. The descriptor itself is readable through
4276        // `red.analytics.metrics`; the *output* read returns a
4277        // structured error so callers can tell "execution engine not yet
4278        // built" apart from "metric does not exist".
4279        if let Some(path) = super::metric_descriptor_catalog::parse_read_metric_statement(query) {
4280            return Err(super::metric_descriptor_catalog::read_output_unsupported(
4281                &path,
4282            ));
4283        }
4284
4285        // Issue #918 / ADR 0035 — leaderboard rank capability catalog
4286        // declarations are still recognised before the general parser.
4287        // Rank reads themselves are parser AST nodes, including Redis-flavor
4288        // Z* sugar that desugars to the same canonical rank shapes.
4289        if let Some(parsed) = super::ranking_descriptor_catalog::parse_create_ranking(query) {
4290            return self.execute_create_ranking(query, parsed?);
4291        }
4292        if super::ranking_descriptor_catalog::parse_show_rankings(query) {
4293            return self.execute_show_rankings(query);
4294        }
4295
4296        let rewritten_query = super::red_schema::rewrite_virtual_names(query);
4297        let execution_query = rewritten_query.as_deref().unwrap_or(query);
4298
4299        let frame = super::statement_frame::StatementExecutionFrame::build(self, execution_query)?;
4300        let _frame_guards = frame.install(self);
4301
4302        // Phase 6 logging: enter a span stamped with conn_id / tenant
4303        // / query_len. Every downstream tracing::info!/warn!/error!
4304        // inherits these fields — no need to thread them manually
4305        // through storage/scan layers. Entered AFTER the WITHIN /
4306        // SET LOCAL TENANT resolution above so the span reflects the
4307        // effective scope for this statement.
4308        let _log_span = crate::telemetry::span::query_span(query).entered();
4309
4310        // ── CTE prelude (#41) — `WITH x AS (...) SELECT ... FROM x` ──
4311        if let Some(rewritten) = frame.prepare_cte(execution_query)? {
4312            return self.execute_query_expr(rewritten);
4313        }
4314
4315        // ── TURBO: bypass SQL parse for SELECT * FROM x WHERE _entity_id = N ──
4316        if !self.inner.query_audit.has_rules() {
4317            if let Some(result) = self.try_fast_entity_lookup(execution_query) {
4318                return result;
4319            }
4320        }
4321
4322        // ── Result cache: return cached result if still fresh (30s TTL) ──
4323        if !self.inner.query_audit.has_rules() {
4324            if let Some(result) = frame.read_result_cache(self) {
4325                return Ok(result);
4326            }
4327        }
4328
4329        let prepared = frame.prepare_statement(self, execution_query)?;
4330        let mode = prepared.mode;
4331        let expr = prepared.expr;
4332
4333        let statement = query_expr_name(&expr);
4334        let result_cache_scopes = query_expr_result_cache_scopes(&expr);
4335        let control_event_specs = query_control_event_specs(&expr);
4336        let query_audit_plan = query_audit_plan(&expr);
4337
4338        let _lock_guard = match frame.prepare_dispatch(self, &expr) {
4339            Ok(guard) => guard,
4340            Err(err) => {
4341                let outcome = control_event_outcome_for_error(&err);
4342                for spec in &control_event_specs {
4343                    self.emit_control_event(
4344                        spec.kind,
4345                        outcome,
4346                        spec.action,
4347                        spec.resource.clone(),
4348                        Some(err.to_string()),
4349                        spec.fields.clone(),
4350                    )?;
4351                }
4352                return Err(err);
4353            }
4354        };
4355        let frame_iface: &dyn super::statement_frame::ReadFrame = &frame;
4356        let query_audit_started = std::time::Instant::now();
4357
4358        let query_result = match expr {
4359            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
4360                // Apply MVCC visibility + RLS gate while materialising the
4361                // graph: every node entity is screened against the source
4362                // collection's policy chain (basic and `Nodes`-targeted)
4363                // and dropped when the caller's tenant / role doesn't
4364                // admit it. Edges are pruned automatically because the
4365                // graph builder skips edges whose endpoints aren't in
4366                // `allowed_nodes`.
4367                let (graph, node_properties, edge_properties) =
4368                    self.materialize_graph_with_rls()?;
4369                let result =
4370                    crate::storage::query::unified::UnifiedExecutor::execute_on_with_graph_properties(
4371                        &graph,
4372                        &expr,
4373                        node_properties,
4374                        edge_properties,
4375                    )
4376                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4377
4378                Ok(RuntimeQueryResult {
4379                    query: query.to_string(),
4380                    mode,
4381                    statement,
4382                    engine: "materialized-graph",
4383                    result,
4384                    affected_rows: 0,
4385                    statement_type: "select",
4386                    bookmark: None,
4387                })
4388            }
4389            QueryExpr::Table(table) => {
4390                let table = self.resolve_table_expr_subqueries(
4391                    table,
4392                    &frame as &dyn super::statement_frame::ReadFrame,
4393                )?;
4394                // Table-valued functions (e.g. components(g)) dispatch to a
4395                // read-only executor before any catalog/virtual-table routing
4396                // (issue #795).
4397                if let Some(TableSource::Function {
4398                    name,
4399                    args,
4400                    named_args,
4401                }) = table.source.clone()
4402                {
4403                    // The graph-collection form is cacheable (issue #802): the
4404                    // result-cache read at the top of this function keys on the
4405                    // query string, and `result_cache_scopes` carries the graph
4406                    // collection (see `collect_table_source_scopes`) so a write
4407                    // to it invalidates the entry. Deterministic algorithm
4408                    // output is worth caching at any row count, so the write
4409                    // bypasses the generic ≤5-row payload heuristic.
4410                    let tvf_result = RuntimeQueryResult {
4411                        query: query.to_string(),
4412                        mode,
4413                        statement,
4414                        engine: "runtime-graph-tvf",
4415                        result: self.execute_table_function(&name, &args, &named_args)?,
4416                        affected_rows: 0,
4417                        statement_type: "select",
4418                        bookmark: None,
4419                    };
4420                    frame.write_result_cache(self, &tvf_result, result_cache_scopes.clone());
4421                    return Ok(tvf_result);
4422                }
4423                // Inline-graph TVF (issue #799): the graph is supplied by two
                // subqueries instead of a collection reference. Like the
                // graph-collection form, the result is cacheable — its cache
4426                // key is the query string (the result-cache read at the top of
4427                // `execute_query_inner` keys on it) and `result_cache_scopes`
4428                // already carries the `nodes`/`edges` source collections, so a
4429                // write to any of them invalidates the entry.
4430                if let Some(TableSource::InlineGraphFunction {
4431                    name,
4432                    nodes,
4433                    edges,
4434                    named_args,
4435                }) = table.source.clone()
4436                {
4437                    let inline_result = RuntimeQueryResult {
4438                        query: query.to_string(),
4439                        mode,
4440                        statement,
4441                        engine: "runtime-graph-tvf-inline",
4442                        result: self.execute_inline_graph_function(
4443                            &name,
4444                            &nodes,
4445                            &edges,
4446                            &named_args,
4447                        )?,
4448                        affected_rows: 0,
4449                        statement_type: "select",
4450                        bookmark: None,
4451                    };
4452                    frame.write_result_cache(self, &inline_result, result_cache_scopes);
4453                    return Ok(inline_result);
4454                }
4455                if super::red_schema::is_virtual_table(&table.table) {
4456                    return Ok(RuntimeQueryResult {
4457                        query: query.to_string(),
4458                        mode,
4459                        statement,
4460                        engine: "runtime-red-schema",
4461                        result: super::red_schema::red_query(
4462                            self,
4463                            &table.table,
4464                            &table,
4465                            &frame as &dyn super::statement_frame::ReadFrame,
4466                        )?,
4467                        affected_rows: 0,
4468                        statement_type: "select",
4469                        bookmark: None,
4470                    });
4471                }
4472
4473                // `<graph>.<output>` analytics virtual view (issue #800).
4474                // Recomputed on demand — intentionally not result-cached, so it
4475                // always reflects the current graph data.
4476                if let Some(view_result) = self.try_resolve_analytics_view(
4477                    &table,
4478                    &frame as &dyn super::statement_frame::ReadFrame,
4479                )? {
4480                    return Ok(RuntimeQueryResult {
4481                        query: query.to_string(),
4482                        mode,
4483                        statement,
4484                        engine: "runtime-graph-analytics-view",
4485                        result: view_result,
4486                        affected_rows: 0,
4487                        statement_type: "select",
4488                        bookmark: None,
4489                    });
4490                }
4491
4492                if let Some(result) = self.execute_probabilistic_select(&table)? {
4493                    return Ok(RuntimeQueryResult {
4494                        query: query.to_string(),
4495                        mode,
4496                        statement,
4497                        engine: "runtime-probabilistic",
4498                        result,
4499                        affected_rows: 0,
4500                        statement_type: "select",
4501                        bookmark: None,
4502                    });
4503                }
4504
4505                // Foreign-table intercept (Phase 3.2.2 PG parity).
4506                //
4507                // When the referenced table matches a `CREATE FOREIGN TABLE`
4508                // registration, short-circuit into the FDW scan. Phase 3.2
4509                // wrappers don't yet support pushdown, so filters/projections
4510                // apply post-scan via `apply_foreign_table_filters` — good
4511                // enough for correctness; perf work lands in 3.2.3.
4512                if self.inner.foreign_tables.is_foreign_table(&table.table) {
4513                    let records = self
4514                        .inner
4515                        .foreign_tables
4516                        .scan(&table.table)
4517                        .map_err(|e| RedDBError::Internal(e.to_string()))?;
4518                    let result = apply_foreign_table_filters(records, &table);
4519                    return Ok(RuntimeQueryResult {
4520                        query: query.to_string(),
4521                        mode,
4522                        statement,
4523                        engine: "runtime-fdw",
4524                        result,
4525                        affected_rows: 0,
4526                        statement_type: "select",
4527                        bookmark: None,
4528                    });
4529                }
4530
4531                // Row-Level Security enforcement (Phase 2.5.2 PG parity).
4532                //
4533                // When RLS is enabled on this table, fetch every policy
4534                // that applies to the current (role, SELECT) pair and
4535                // fold them into the query's WHERE clause: policies
4536                // OR-combine (any of them admitting the row is enough),
4537                // then AND into the caller's existing filter.
4538                //
4539                // Anonymous callers (no thread-local identity) pass
4540                // `role = None`; policies with a specific `TO role`
4541                // clause skip, but `TO PUBLIC` policies still apply.
4542                //
4543                // When `inject_rls_filters` returns `None` the table has
4544                // RLS enabled but no policy admits the caller's role —
4545                // short-circuit with an empty result set instead of
4546                // synthesising a contradiction filter.
4547                let Some(table_with_rls) = self.authorize_relational_table_select(
4548                    table,
4549                    &frame as &dyn super::statement_frame::ReadFrame,
4550                )?
4551                else {
4552                    let empty = crate::storage::query::unified::UnifiedResult::empty();
4553                    return Ok(RuntimeQueryResult {
4554                        query: query.to_string(),
4555                        mode,
4556                        statement,
4557                        engine: "runtime-table-rls",
4558                        result: empty,
4559                        affected_rows: 0,
4560                        statement_type: "select",
4561                        bookmark: None,
4562                    });
4563                };
4564                Ok(RuntimeQueryResult {
4565                    query: query.to_string(),
4566                    mode,
4567                    statement,
4568                    engine: "runtime-table",
4569                    // #885: lend the frame-owned row-buffer arena to the
4570                    // streaming path so chunk buffers are reused across
4571                    // this statement's chunk-fetches instead of allocated
4572                    // fresh per chunk. This is the table-query dispatch
4573                    // that runs under a `StatementExecutionFrame`; the
4574                    // frameless prepared/subquery paths keep `None`.
4575                    result: execute_runtime_table_query_in(
4576                        &self.inner.db,
4577                        &table_with_rls,
4578                        Some(&self.inner.index_store),
4579                        Some(frame.row_arena()),
4580                    )?,
4581                    affected_rows: 0,
4582                    statement_type: "select",
4583                    bookmark: None,
4584                })
4585            }
4586            QueryExpr::Join(join) => {
4587                // Fold per-table RLS filters into each `QueryExpr::Table`
4588                // leaf of the join tree before executing. Without this
4589                // the join executor scans both tables raw and ignores
4590                // policies — a `WITHIN TENANT 'x'` against a join of
4591                // two tenant-scoped tables would leak cross-tenant rows.
4592                // When any leaf has RLS enabled and zero matching policy,
4593                // short-circuit to an empty join result instead of
4594                // emitting a contradiction filter.
4595                let join_with_rls = match self.authorize_relational_join_select(
4596                    join,
4597                    &frame as &dyn super::statement_frame::ReadFrame,
4598                )? {
4599                    Some(j) => j,
4600                    None => {
4601                        return Ok(RuntimeQueryResult {
4602                            query: query.to_string(),
4603                            mode,
4604                            statement,
4605                            engine: "runtime-join-rls",
4606                            result: crate::storage::query::unified::UnifiedResult::empty(),
4607                            affected_rows: 0,
4608                            statement_type: "select",
4609                            bookmark: None,
4610                        });
4611                    }
4612                };
4613                Ok(RuntimeQueryResult {
4614                    query: query.to_string(),
4615                    mode,
4616                    statement,
4617                    engine: "runtime-join",
4618                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
4619                    affected_rows: 0,
4620                    statement_type: "select",
4621                    bookmark: None,
4622                })
4623            }
4624            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
4625                query: query.to_string(),
4626                mode,
4627                statement,
4628                engine: "runtime-vector",
4629                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
4630                affected_rows: 0,
4631                statement_type: "select",
4632                bookmark: None,
4633            }),
4634            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
4635                query: query.to_string(),
4636                mode,
4637                statement,
4638                engine: "runtime-hybrid",
4639                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
4640                affected_rows: 0,
4641                statement_type: "select",
4642                bookmark: None,
4643            }),
4644            QueryExpr::RankOf(ref rank) => self.execute_rank_of(query, rank),
4645            QueryExpr::ApproxRankOf(ref rank) => self.execute_approx_rank_of(query, rank),
4646            QueryExpr::RankRange(ref range) => self.execute_rank_range(query, range),
4647            // DML execution
4648            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
4649                Err(RedDBError::Query(
4650                    super::red_schema::READ_ONLY_ERROR.to_string(),
4651                ))
4652            }
4653            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
4654                Err(RedDBError::Query(
4655                    super::red_schema::READ_ONLY_ERROR.to_string(),
4656                ))
4657            }
4658            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
4659                Err(RedDBError::Query(
4660                    super::red_schema::READ_ONLY_ERROR.to_string(),
4661                ))
4662            }
4663            QueryExpr::Insert(ref insert) => self
4664                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
4665                    self.execute_insert(query, insert)
4666                }),
4667            QueryExpr::Update(ref update) => self
4668                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
4669                    self.execute_update(query, update)
4670                }),
4671            QueryExpr::Delete(ref delete) => self
4672                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
4673                    self.execute_delete(query, delete)
4674                }),
4675            // DDL execution
4676            QueryExpr::CreateTable(ref create) => self.execute_create_table(query, create),
4677            QueryExpr::CreateCollection(ref create) => {
4678                self.execute_create_collection(query, create)
4679            }
4680            QueryExpr::CreateVector(ref create) => self.execute_create_vector(query, create),
4681            QueryExpr::DropTable(ref drop_tbl) => self.execute_drop_table(query, drop_tbl),
4682            QueryExpr::DropGraph(ref drop_graph) => self.execute_drop_graph(query, drop_graph),
4683            QueryExpr::DropVector(ref drop_vector) => self.execute_drop_vector(query, drop_vector),
4684            QueryExpr::DropDocument(ref drop_document) => {
4685                self.execute_drop_document(query, drop_document)
4686            }
4687            QueryExpr::DropKv(ref drop_kv) => self.execute_drop_kv(query, drop_kv),
4688            QueryExpr::DropCollection(ref drop_collection) => {
4689                self.execute_drop_collection(query, drop_collection)
4690            }
4691            QueryExpr::Truncate(ref truncate) => self.execute_truncate(query, truncate),
4692            QueryExpr::AlterTable(ref alter) => self.execute_alter_table(query, alter),
4693            QueryExpr::ExplainAlter(ref explain) => self.execute_explain_alter(query, explain),
4694            // Graph analytics commands
4695            QueryExpr::GraphCommand(ref cmd) => self.execute_graph_command(query, cmd),
4696            // Search commands
4697            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query, cmd),
4698            // ASK: RAG query with LLM synthesis
4699            QueryExpr::Ask(ref ask) => self.execute_ask(query, ask),
4700            QueryExpr::CreateIndex(ref create_idx) => self.execute_create_index(query, create_idx),
4701            QueryExpr::DropIndex(ref drop_idx) => self.execute_drop_index(query, drop_idx),
4702            QueryExpr::ProbabilisticCommand(ref cmd) => {
4703                self.execute_probabilistic_command(query, cmd)
4704            }
4705            // Time-series DDL
4706            QueryExpr::CreateTimeSeries(ref ts) => self.execute_create_timeseries(query, ts),
4707            QueryExpr::CreateMetric(ref metric) => self.execute_create_metric(query, metric),
4708            QueryExpr::AlterMetric(ref alter) => self.execute_alter_metric(query, alter),
4709            QueryExpr::CreateSlo(ref slo) => self.execute_create_slo(query, slo),
4710            QueryExpr::DropTimeSeries(ref ts) => self.execute_drop_timeseries(query, ts),
4711            // Queue DDL and commands
4712            QueryExpr::CreateQueue(ref q) => self.execute_create_queue(query, q),
4713            QueryExpr::AlterQueue(ref q) => self.execute_alter_queue(query, q),
4714            QueryExpr::DropQueue(ref q) => self.execute_drop_queue(query, q),
4715            QueryExpr::QueueSelect(ref q) => self.execute_queue_select(query, q),
4716            QueryExpr::QueueCommand(ref cmd) => self.execute_queue_command(query, cmd),
4717            QueryExpr::EventsBackfill(ref backfill) => {
4718                self.execute_events_backfill(query, backfill)
4719            }
4720            QueryExpr::EventsBackfillStatus { ref collection } => Err(RedDBError::Query(format!(
4721                "EVENTS BACKFILL STATUS for '{collection}' is not implemented in this slice"
4722            ))),
4723            QueryExpr::KvCommand(ref cmd) => self.execute_kv_command(query, cmd),
4724            QueryExpr::ConfigCommand(ref cmd) => self.execute_config_command(query, cmd),
4725            QueryExpr::CreateTree(ref tree) => self.execute_create_tree(query, tree),
4726            QueryExpr::DropTree(ref tree) => self.execute_drop_tree(query, tree),
4727            QueryExpr::TreeCommand(ref cmd) => self.execute_tree_command(query, cmd),
4728            // SET CONFIG key = value
4729            QueryExpr::SetConfig { ref key, ref value } => {
4730                if key.starts_with("red.secret.") {
4731                    return Err(RedDBError::Query(
4732                        "red.secret.* is reserved for vault secrets; use SET SECRET".to_string(),
4733                    ));
4734                }
4735                match self.check_managed_config_write_for_set_config(key) {
4736                    Err(err) => Err(err),
4737                    Ok(()) => {
4738                        let store = self.inner.db.store();
4739                        let json_val = match value {
4740                            Value::Text(s) => crate::serde_json::Value::String(s.to_string()),
4741                            Value::Integer(n) => crate::serde_json::Value::Number(*n as f64),
4742                            Value::Float(n) => crate::serde_json::Value::Number(*n),
4743                            Value::Boolean(b) => crate::serde_json::Value::Bool(*b),
4744                            _ => crate::serde_json::Value::String(value.to_string()),
4745                        };
4746                        store.set_config_tree(key, &json_val);
4747                        update_current_config_value(key, value.clone());
4748                        // Config changes can flip runtime behavior mid-session
4749                        // (auto_decrypt, auto_encrypt, etc.) — invalidate the
4750                        // result cache so subsequent reads re-execute against
4751                        // the new config.
4752                        self.invalidate_result_cache();
4753                        Ok(RuntimeQueryResult::ok_message(
4754                            query.to_string(),
4755                            &format!("config set: {key}"),
4756                            "set",
4757                        ))
4758                    }
4759                }
4760            }
4761            // SET SECRET key = value
4762            QueryExpr::SetSecret { ref key, ref value } => {
4763                if key.starts_with("red.config.") {
4764                    return Err(RedDBError::Query(
4765                        "red.config.* is reserved for config; use SET CONFIG".to_string(),
4766                    ));
4767                }
4768                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4769                    RedDBError::Query("SET SECRET requires an enabled, unsealed vault".to_string())
4770                })?;
4771                if matches!(value, Value::Null) {
4772                    auth_store
4773                        .vault_kv_try_delete(key)
4774                        .map_err(|err| RedDBError::Query(err.to_string()))?;
4775                    update_current_secret_value(key, None);
4776                    self.invalidate_result_cache();
4777                    return Ok(RuntimeQueryResult::ok_message(
4778                        query.to_string(),
4779                        &format!("secret deleted: {key}"),
4780                        "delete_secret",
4781                    ));
4782                }
4783                let value = secret_sql_value_to_string(value)?;
4784                auth_store
4785                    .vault_kv_try_set(key.clone(), value.clone())
4786                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4787                update_current_secret_value(key, Some(value));
4788                self.invalidate_result_cache();
4789                Ok(RuntimeQueryResult::ok_message(
4790                    query.to_string(),
4791                    &format!("secret set: {key}"),
4792                    "set_secret",
4793                ))
4794            }
4795            // DELETE SECRET key
4796            QueryExpr::DeleteSecret { ref key } => {
4797                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4798                    RedDBError::Query(
4799                        "DELETE SECRET requires an enabled, unsealed vault".to_string(),
4800                    )
4801                })?;
4802                let deleted = auth_store
4803                    .vault_kv_try_delete(key)
4804                    .map_err(|err| RedDBError::Query(err.to_string()))?;
4805                if deleted {
4806                    update_current_secret_value(key, None);
4807                }
4808                self.invalidate_result_cache();
4809                Ok(RuntimeQueryResult::ok_message(
4810                    query.to_string(),
4811                    &format!("secret deleted: {key}"),
4812                    if deleted {
4813                        "delete_secret"
4814                    } else {
4815                        "delete_secret_not_found"
4816                    },
4817                ))
4818            }
4819            // SHOW SECRET[S] [prefix]
4820            QueryExpr::ShowSecrets { ref prefix } => {
4821                let auth_store = self.inner.auth_store.read().clone().ok_or_else(|| {
4822                    RedDBError::Query("SHOW SECRET requires an enabled, unsealed vault".to_string())
4823                })?;
4824                if !auth_store.is_vault_backed() {
4825                    return Err(RedDBError::Query(
4826                        "SHOW SECRET requires an enabled, unsealed vault".to_string(),
4827                    ));
4828                }
4829                let mut keys = auth_store.vault_kv_keys();
4830                keys.sort();
4831                let mut result = UnifiedResult::with_columns(vec![
4832                    "key".into(),
4833                    "value".into(),
4834                    "status".into(),
4835                ]);
4836                for key in keys {
4837                    if let Some(ref pfx) = prefix {
4838                        if !key.starts_with(pfx) {
4839                            continue;
4840                        }
4841                    }
4842                    let mut record = UnifiedRecord::new();
4843                    record.set("key", Value::text(key));
4844                    record.set("value", Value::text("***"));
4845                    record.set("status", Value::text("active"));
4846                    result.push(record);
4847                }
4848                Ok(RuntimeQueryResult {
4849                    query: query.to_string(),
4850                    mode,
4851                    statement: "show_secrets",
4852                    engine: "runtime-secret",
4853                    result,
4854                    affected_rows: 0,
4855                    statement_type: "select",
4856                    bookmark: None,
4857                })
4858            }
4859            // SHOW CONFIG [prefix]
4860            QueryExpr::ShowConfig { ref prefix } => {
4861                let store = self.inner.db.store();
4862                let all_collections = store.list_collections();
4863                if !all_collections.contains(&"red_config".to_string()) {
4864                    let result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
4865                    return Ok(RuntimeQueryResult {
4866                        query: query.to_string(),
4867                        mode,
4868                        statement: "show_config",
4869                        engine: "runtime-config",
4870                        result,
4871                        affected_rows: 0,
4872                        statement_type: "select",
4873                        bookmark: None,
4874                    });
4875                }
4876                let manager = store
4877                    .get_collection("red_config")
4878                    .ok_or_else(|| RedDBError::NotFound("red_config".to_string()))?;
4879                let entities = manager.query_all(|_| true);
4880                let mut latest = std::collections::BTreeMap::<String, (u64, Value, Value)>::new();
4881                for entity in entities {
4882                    if let EntityData::Row(ref row) = entity.data {
4883                        if let Some(ref named) = row.named {
4884                            let key_val = named.get("key").cloned().unwrap_or(Value::Null);
4885                            let val = named.get("value").cloned().unwrap_or(Value::Null);
4886                            let key_str = match &key_val {
4887                                Value::Text(s) => s.as_ref(),
4888                                _ => continue,
4889                            };
4890                            if let Some(ref pfx) = prefix {
4891                                if !key_str.starts_with(pfx.as_str()) {
4892                                    continue;
4893                                }
4894                            }
4895                            let entity_id = entity.id.raw();
4896                            match latest.get(key_str) {
4897                                Some((prev_id, _, _)) if *prev_id > entity_id => {}
4898                                _ => {
4899                                    latest.insert(key_str.to_string(), (entity_id, key_val, val));
4900                                }
4901                            }
4902                        }
4903                    }
4904                }
4905                let mut result = UnifiedResult::with_columns(vec!["key".into(), "value".into()]);
4906                for (_, key_val, val) in latest.into_values() {
4907                    let mut record = UnifiedRecord::new();
4908                    record.set("key", key_val);
4909                    record.set("value", val);
4910                    result.push(record);
4911                }
4912                Ok(RuntimeQueryResult {
4913                    query: query.to_string(),
4914                    mode,
4915                    statement: "show_config",
4916                    engine: "runtime-config",
4917                    result,
4918                    affected_rows: 0,
4919                    statement_type: "select",
4920                    bookmark: None,
4921                })
4922            }
4923            // Session-local multi-tenancy handle (Phase 2.5.3).
4924            //
4925            // SET TENANT 'id' / SET TENANT NULL / RESET TENANT — writes
4926            // the thread-local; SHOW TENANT returns it. Paired with the
4927            // CURRENT_TENANT() scalar for use in RLS policies.
4928            QueryExpr::SetTenant(ref value) => {
4929                match value {
4930                    Some(id) => set_current_tenant(id.clone()),
4931                    None => clear_current_tenant(),
4932                }
4933                Ok(RuntimeQueryResult::ok_message(
4934                    query.to_string(),
4935                    &match value {
4936                        Some(id) => format!("tenant set: {id}"),
4937                        None => "tenant cleared".to_string(),
4938                    },
4939                    "set_tenant",
4940                ))
4941            }
4942            QueryExpr::ShowTenant => {
4943                let mut result = UnifiedResult::with_columns(vec!["tenant".into()]);
4944                let mut record = UnifiedRecord::new();
4945                record.set(
4946                    "tenant",
4947                    current_tenant().map(Value::text).unwrap_or(Value::Null),
4948                );
4949                result.push(record);
4950                Ok(RuntimeQueryResult {
4951                    query: query.to_string(),
4952                    mode,
4953                    statement: "show_tenant",
4954                    engine: "runtime-tenant",
4955                    result,
4956                    affected_rows: 0,
4957                    statement_type: "select",
4958                    bookmark: None,
4959                })
4960            }
4961            // Transaction control (Phase 2.3 PG parity).
4962            //
4963            // BEGIN allocates a real `Xid` and stores a `TxnContext` keyed by
4964            // the current connection's id. COMMIT/ROLLBACK release it through
4965            // the `SnapshotManager` so future snapshots see the correct set of
4966            // active/aborted transactions.
4967            //
4968            // Tuple stamping (xmin/xmax) and read-path visibility filtering
4969            // land in Phase 2.3.2 — this dispatch only manages the snapshot
4970            // registry. Statements running outside a TxnContext still behave
4971            // as autocommit (xid=0 → visible to every snapshot).
4972            QueryExpr::TransactionControl(ref ctl) => {
4973                use crate::storage::query::ast::TxnControl;
4974                use crate::storage::transaction::snapshot::{TxnContext, Xid};
4975                use crate::storage::transaction::IsolationLevel;
4976
4977                // Phase 2.3 keys transactions by a thread-local connection id.
4978                // The stdio/gRPC paths wire a real per-connection id later;
4979                // for embedded use (one RedDBRuntime per process-ish caller)
4980                // we fall back to a deterministic placeholder.
4981                let conn_id = current_connection_id();
4982
4983                let (kind, msg) = match ctl {
4984                    TxnControl::Begin => {
4985                        let mgr = Arc::clone(&self.inner.snapshot_manager);
4986                        let xid = mgr.begin();
4987                        let snapshot = mgr.snapshot(xid);
4988                        let ctx = TxnContext {
4989                            xid,
4990                            isolation: IsolationLevel::SnapshotIsolation,
4991                            snapshot,
4992                            savepoints: Vec::new(),
4993                            released_sub_xids: Vec::new(),
4994                        };
4995                        self.inner.tx_contexts.write().insert(conn_id, ctx);
4996                        ("begin", format!("BEGIN — xid={xid} (snapshot isolation)"))
4997                    }
4998                    TxnControl::Commit => {
4999                        // SET LOCAL TENANT ends with the transaction.
5000                        self.inner.tx_local_tenants.write().remove(&conn_id);
5001                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5002                        match ctx {
5003                            Some(ctx) => {
5004                                let mut own_xids = std::collections::HashSet::new();
5005                                own_xids.insert(ctx.xid);
5006                                for (_, sub) in &ctx.savepoints {
5007                                    own_xids.insert(*sub);
5008                                }
5009                                for sub in &ctx.released_sub_xids {
5010                                    own_xids.insert(*sub);
5011                                }
5012                                if let Err(err) = self.check_table_row_write_conflicts(
5013                                    conn_id,
5014                                    &ctx.snapshot,
5015                                    &own_xids,
5016                                ) {
5017                                    for (_, sub) in &ctx.savepoints {
5018                                        self.inner.snapshot_manager.rollback(*sub);
5019                                    }
5020                                    for sub in &ctx.released_sub_xids {
5021                                        self.inner.snapshot_manager.rollback(*sub);
5022                                    }
5023                                    self.inner.snapshot_manager.rollback(ctx.xid);
5024                                    self.revive_pending_versioned_updates(conn_id);
5025                                    self.revive_pending_tombstones(conn_id);
5026                                    self.discard_pending_kv_watch_events(conn_id);
5027                                    self.discard_pending_queue_wakes(conn_id);
5028                                    self.discard_pending_store_wal_actions(conn_id);
5029                                    return Err(err);
5030                                }
5031                                self.restore_pending_write_stamps(conn_id);
5032                                if let Err(err) = self.flush_pending_store_wal_actions(conn_id) {
5033                                    for (_, sub) in &ctx.savepoints {
5034                                        self.inner.snapshot_manager.rollback(*sub);
5035                                    }
5036                                    for sub in &ctx.released_sub_xids {
5037                                        self.inner.snapshot_manager.rollback(*sub);
5038                                    }
5039                                    self.inner.snapshot_manager.rollback(ctx.xid);
5040                                    self.revive_pending_versioned_updates(conn_id);
5041                                    self.revive_pending_tombstones(conn_id);
5042                                    self.discard_pending_kv_watch_events(conn_id);
5043                                    return Err(err);
5044                                }
5045                                // Phase 2.3.2e: commit every open sub-xid
5046                                // so they also become visible. Their
5047                                // work is promoted to the parent txn's
5048                                // result exactly like a RELEASE would
5049                                // have done.
5050                                for (_, sub) in &ctx.savepoints {
5051                                    self.inner.snapshot_manager.commit(*sub);
5052                                }
5053                                for sub in &ctx.released_sub_xids {
5054                                    self.inner.snapshot_manager.commit(*sub);
5055                                }
5056                                self.inner.snapshot_manager.commit(ctx.xid);
5057                                self.finalize_pending_versioned_updates(conn_id);
5058                                self.finalize_pending_tombstones(conn_id);
5059                                self.finalize_pending_kv_watch_events(conn_id);
5060                                self.finalize_pending_queue_wakes(conn_id);
5061                                ("commit", format!("COMMIT — xid={} committed", ctx.xid))
5062                            }
5063                            None => (
5064                                "commit",
5065                                "COMMIT outside transaction — no-op (autocommit)".to_string(),
5066                            ),
5067                        }
5068                    }
5069                    TxnControl::Rollback => {
5070                        self.inner.tx_local_tenants.write().remove(&conn_id);
5071                        let ctx = self.inner.tx_contexts.write().remove(&conn_id);
5072                        match ctx {
5073                            Some(ctx) => {
5074                                // Phase 2.3.2e: abort every open sub-xid
5075                                // too so their writes stay hidden.
5076                                for (_, sub) in &ctx.savepoints {
5077                                    self.inner.snapshot_manager.rollback(*sub);
5078                                }
5079                                for sub in &ctx.released_sub_xids {
5080                                    self.inner.snapshot_manager.rollback(*sub);
5081                                }
5082                                self.inner.snapshot_manager.rollback(ctx.xid);
5083                                // Phase 2.3.2b: tuples that the txn had
5084                                // xmax-stamped become live again — wipe xmax
5085                                // back to 0 so later snapshots see them.
5086                                self.revive_pending_versioned_updates(conn_id);
5087                                self.revive_pending_tombstones(conn_id);
5088                                self.discard_pending_kv_watch_events(conn_id);
5089                                self.discard_pending_queue_wakes(conn_id);
5090                                self.discard_pending_store_wal_actions(conn_id);
5091                                ("rollback", format!("ROLLBACK — xid={} aborted", ctx.xid))
5092                            }
5093                            None => (
5094                                "rollback",
5095                                "ROLLBACK outside transaction — no-op (autocommit)".to_string(),
5096                            ),
5097                        }
5098                    }
5099                    // Phase 2.3.2e: savepoints map onto sub-xids. Each
5100                    // SAVEPOINT allocates a fresh xid and pushes it
5101                    // onto the per-txn stack so subsequent writes can
5102                    // be selectively rolled back. RELEASE pops without
5103                    // aborting; ROLLBACK TO aborts the sub-xid (and
5104                    // any nested ones) + revives their tombstones.
5105                    TxnControl::Savepoint(name) => {
5106                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5107                        let mut guard = self.inner.tx_contexts.write();
5108                        match guard.get_mut(&conn_id) {
5109                            Some(ctx) => {
5110                                let sub = mgr.begin();
5111                                ctx.savepoints.push((name.clone(), sub));
5112                                ("savepoint", format!("SAVEPOINT {name} — sub_xid={sub}"))
5113                            }
5114                            None => (
5115                                "savepoint",
5116                                "SAVEPOINT outside transaction — no-op".to_string(),
5117                            ),
5118                        }
5119                    }
5120                    TxnControl::ReleaseSavepoint(name) => {
5121                        let mut guard = self.inner.tx_contexts.write();
5122                        match guard.get_mut(&conn_id) {
5123                            Some(ctx) => {
5124                                let pos = ctx
5125                                    .savepoints
5126                                    .iter()
5127                                    .position(|(n, _)| n == name)
5128                                    .ok_or_else(|| {
5129                                        RedDBError::Internal(format!(
5130                                            "savepoint {name} does not exist"
5131                                        ))
5132                                    })?;
5133                                // RELEASE pops the named savepoint and
5134                                // any nested ones. Their sub-xids move
5135                                // to `released_sub_xids` so they commit
5136                                // (or roll back) alongside the parent
5137                                // xid — PG semantics: released
5138                                // savepoints still contribute their
5139                                // work, but their names are gone.
5140                                let released = ctx.savepoints.len() - pos;
5141                                let popped: Vec<Xid> = ctx
5142                                    .savepoints
5143                                    .split_off(pos)
5144                                    .into_iter()
5145                                    .map(|(_, x)| x)
5146                                    .collect();
5147                                ctx.released_sub_xids.extend(popped);
5148                                (
5149                                    "release_savepoint",
5150                                    format!("RELEASE SAVEPOINT {name} — {released} level(s)"),
5151                                )
5152                            }
5153                            None => (
5154                                "release_savepoint",
5155                                "RELEASE outside transaction — no-op".to_string(),
5156                            ),
5157                        }
5158                    }
5159                    TxnControl::RollbackToSavepoint(name) => {
5160                        let mgr = Arc::clone(&self.inner.snapshot_manager);
5161                        // Splice out the savepoint + nested ones under
5162                        // a narrow lock, then run the snapshot-manager
5163                        // + tombstone side-effects without the tx map
5164                        // held so nothing re-enters.
5165                        let drop_result: Option<(Xid, Vec<Xid>)> = {
5166                            let mut guard = self.inner.tx_contexts.write();
5167                            if let Some(ctx) = guard.get_mut(&conn_id) {
5168                                let pos = ctx
5169                                    .savepoints
5170                                    .iter()
5171                                    .position(|(n, _)| n == name)
5172                                    .ok_or_else(|| {
5173                                        RedDBError::Internal(format!(
5174                                            "savepoint {name} does not exist"
5175                                        ))
5176                                    })?;
5177                                let savepoint_xid = ctx.savepoints[pos].1;
5178                                let aborted: Vec<Xid> = ctx
5179                                    .savepoints
5180                                    .split_off(pos)
5181                                    .into_iter()
5182                                    .map(|(_, x)| x)
5183                                    .collect();
5184                                Some((savepoint_xid, aborted))
5185                            } else {
5186                                None
5187                            }
5188                        };
5189
5190                        match drop_result {
5191                            Some((savepoint_xid, aborted)) => {
5192                                for x in &aborted {
5193                                    mgr.rollback(*x);
5194                                }
5195                                let reverted_updates =
5196                                    self.revive_versioned_updates_since(conn_id, savepoint_xid);
5197                                let revived = self.revive_tombstones_since(conn_id, savepoint_xid);
5198                                (
5199                                    "rollback_to_savepoint",
5200                                    format!(
5201                                        "ROLLBACK TO SAVEPOINT {name} — aborted {} sub_xid(s), reverted {reverted_updates} update(s), revived {revived} tombstone(s)",
5202                                        aborted.len(),
5203                                    ),
5204                                )
5205                            }
5206                            None => (
5207                                "rollback_to_savepoint",
5208                                "ROLLBACK TO outside transaction — no-op".to_string(),
5209                            ),
5210                        }
5211                    }
5212                };
5213                Ok(RuntimeQueryResult::ok_message(
5214                    query.to_string(),
5215                    &msg,
5216                    kind,
5217                ))
5218            }
5219            // Schema + Sequence DDL (Phase 1.3 PG parity).
5220            //
5221            // Schemas are lightweight logical namespaces: a CREATE SCHEMA call
5222            // just registers the name in `red_config` under `schema.{name}`.
5223            // Table lookups still happen by collection name; clients using
5224            // `schema.table` qualified names collapse to collection `schema.table`.
5225            //
5226            // Sequences persist a 64-bit counter + metadata (start, increment)
5227            // in `red_config` under `sequence.{name}.*`. Scalar callers
5228            // `nextval('name')` / `currval('name')` arrive with the MVCC phase
5229            // once we have a proper mutating-function dispatch path; for now the
5230            // DDL just establishes the catalog entry so clients don't error.
5231            QueryExpr::CreateSchema(ref q) => {
5232                let store = self.inner.db.store();
5233                let key = format!("schema.{}", q.name);
5234                if store.get_config(&key).is_some() {
5235                    if q.if_not_exists {
5236                        return Ok(RuntimeQueryResult::ok_message(
5237                            query.to_string(),
5238                            &format!("schema {} already exists — skipped", q.name),
5239                            "create_schema",
5240                        ));
5241                    }
5242                    return Err(RedDBError::Internal(format!(
5243                        "schema {} already exists",
5244                        q.name
5245                    )));
5246                }
5247                store.set_config_tree(&key, &crate::serde_json::Value::Bool(true));
5248                Ok(RuntimeQueryResult::ok_message(
5249                    query.to_string(),
5250                    &format!("schema {} created", q.name),
5251                    "create_schema",
5252                ))
5253            }
5254            QueryExpr::DropSchema(ref q) => {
5255                let store = self.inner.db.store();
5256                let key = format!("schema.{}", q.name);
5257                let existed = store.get_config(&key).is_some();
5258                if !existed && !q.if_exists {
5259                    return Err(RedDBError::Internal(format!(
5260                        "schema {} does not exist",
5261                        q.name
5262                    )));
5263                }
5264                // Remove marker from red_config via set to null.
5265                store.set_config_tree(&key, &crate::serde_json::Value::Null);
5266                let suffix = if q.cascade {
5267                    " (CASCADE accepted — tables untouched)"
5268                } else {
5269                    ""
5270                };
5271                Ok(RuntimeQueryResult::ok_message(
5272                    query.to_string(),
5273                    &format!("schema {} dropped{}", q.name, suffix),
5274                    "drop_schema",
5275                ))
5276            }
5277            QueryExpr::CreateSequence(ref q) => {
5278                let store = self.inner.db.store();
5279                let base = format!("sequence.{}", q.name);
5280                let start_key = format!("{base}.start");
5281                let incr_key = format!("{base}.increment");
5282                let curr_key = format!("{base}.current");
5283                if store.get_config(&start_key).is_some() {
5284                    if q.if_not_exists {
5285                        return Ok(RuntimeQueryResult::ok_message(
5286                            query.to_string(),
5287                            &format!("sequence {} already exists — skipped", q.name),
5288                            "create_sequence",
5289                        ));
5290                    }
5291                    return Err(RedDBError::Internal(format!(
5292                        "sequence {} already exists",
5293                        q.name
5294                    )));
5295                }
5296                // Persist start + increment, and set current so the first
5297                // nextval returns `start`.
5298                let initial_current = q.start - q.increment;
5299                store.set_config_tree(
5300                    &start_key,
5301                    &crate::serde_json::Value::Number(q.start as f64),
5302                );
5303                store.set_config_tree(
5304                    &incr_key,
5305                    &crate::serde_json::Value::Number(q.increment as f64),
5306                );
5307                store.set_config_tree(
5308                    &curr_key,
5309                    &crate::serde_json::Value::Number(initial_current as f64),
5310                );
5311                Ok(RuntimeQueryResult::ok_message(
5312                    query.to_string(),
5313                    &format!(
5314                        "sequence {} created (start={}, increment={})",
5315                        q.name, q.start, q.increment
5316                    ),
5317                    "create_sequence",
5318                ))
5319            }
5320            QueryExpr::DropSequence(ref q) => {
5321                let store = self.inner.db.store();
5322                let base = format!("sequence.{}", q.name);
5323                let existed = store.get_config(&format!("{base}.start")).is_some();
5324                if !existed && !q.if_exists {
5325                    return Err(RedDBError::Internal(format!(
5326                        "sequence {} does not exist",
5327                        q.name
5328                    )));
5329                }
5330                for k in ["start", "increment", "current"] {
5331                    store.set_config_tree(&format!("{base}.{k}"), &crate::serde_json::Value::Null);
5332                }
5333                Ok(RuntimeQueryResult::ok_message(
5334                    query.to_string(),
5335                    &format!("sequence {} dropped", q.name),
5336                    "drop_sequence",
5337                ))
5338            }
5339            // Views — CREATE [MATERIALIZED] VIEW (Phase 2.1 PG parity).
5340            //
5341            // The view definition is stored in-memory on RuntimeInner (not
5342            // persisted). SELECTs that reference the view name will substitute
5343            // the stored `QueryExpr` via `resolve_view_reference` during
5344            // planning (same entry point used by table-name resolution).
5345            //
5346            // Materialized views additionally allocate a slot in
5347            // `MaterializedViewCache`; a REFRESH repopulates that slot.
5348            QueryExpr::CreateView(ref q) => {
5349                let mut views = self.inner.views.write();
5350                if views.contains_key(&q.name) && !q.or_replace {
5351                    if q.if_not_exists {
5352                        return Ok(RuntimeQueryResult::ok_message(
5353                            query.to_string(),
5354                            &format!("view {} already exists — skipped", q.name),
5355                            "create_view",
5356                        ));
5357                    }
5358                    return Err(RedDBError::Internal(format!(
5359                        "view {} already exists",
5360                        q.name
5361                    )));
5362                }
5363                views.insert(q.name.clone(), Arc::new(q.clone()));
5364                drop(views);
5365
5366                // Materialized view: register cache slot (data is empty until REFRESH).
5367                if q.materialized {
5368                    use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
5369                    let refresh = match q.refresh_every_ms {
5370                        Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
5371                        None => RefreshPolicy::Manual,
5372                    };
5373                    let dependencies = collect_table_refs(&q.query);
5374                    let def = MaterializedViewDef {
5375                        name: q.name.clone(),
5376                        query: format!("<parsed view {}>", q.name),
5377                        dependencies: dependencies.clone(),
5378                        refresh,
5379                        retention_duration_ms: q.retention_duration_ms,
5380                    };
5381                    self.inner.materialized_views.write().register(def);
5382
5383                    // Issue #593 slice 9a — persist the descriptor to
5384                    // the system catalog so the definition survives a
5385                    // restart. Upsert semantics (delete-then-insert by
5386                    // name) keep the catalog free of duplicate rows
5387                    // across `CREATE OR REPLACE` churn.
5388                    let descriptor =
5389                        crate::runtime::continuous_materialized_view::MaterializedViewDescriptor {
5390                            name: q.name.clone(),
5391                            source_sql: query.to_string(),
5392                            source_collections: dependencies,
5393                            refresh_every_ms: q.refresh_every_ms,
5394                            retention_duration_ms: q.retention_duration_ms,
5395                        };
5396                    let store = self.inner.db.store();
5397                    crate::runtime::continuous_materialized_view::persist_descriptor(
5398                        store.as_ref(),
5399                        &descriptor,
5400                    )?;
5401
5402                    // Issue #594 slice 9b — provision a Table-shaped
5403                    // backing collection named after the view. The
5404                    // rewriter skips materialized views (see
5405                    // `rewrite_view_refs_inner`) so `SELECT FROM v`
5406                    // resolves to this collection directly. Empty
5407                    // until REFRESH wires through it in 9c.
5408                    self.ensure_materialized_view_backing(&q.name)?;
5409                }
5410                // Plan cache may have cached a plan that didn't know about this
5411                // view — invalidate so future references pick up the new binding.
5412                // Result cache gets flushed too: OR REPLACE must not serve a
5413                // prior execution of the obsolete body.
5414                self.invalidate_plan_cache();
5415                self.invalidate_result_cache();
5416
5417                Ok(RuntimeQueryResult::ok_message(
5418                    query.to_string(),
5419                    &format!(
5420                        "{}view {} created",
5421                        if q.materialized { "materialized " } else { "" },
5422                        q.name
5423                    ),
5424                    "create_view",
5425                ))
5426            }
5427            QueryExpr::DropView(ref q) => {
5428                let mut views = self.inner.views.write();
5429                let removed = views.remove(&q.name);
5430                let existed = removed.is_some();
5431                let removed_materialized =
5432                    removed.as_ref().map(|v| v.materialized).unwrap_or(false);
5433                drop(views);
5434                if q.materialized || existed {
5435                    // Try the materialised cache too — silent if absent.
5436                    self.inner.materialized_views.write().remove(&q.name);
5437                    // Issue #593 slice 9a — remove any persisted
5438                    // catalog row. Idempotent: a no-op when the view
5439                    // was never materialized (no row was ever written).
5440                    let store = self.inner.db.store();
5441                    crate::runtime::continuous_materialized_view::remove_by_name(
5442                        store.as_ref(),
5443                        &q.name,
5444                    )?;
5445                }
5446                // Issue #594 slice 9b — drop the backing collection
5447                // that was provisioned at CREATE time. Only mat views
5448                // ever had one; regular views never did.
5449                if removed_materialized || q.materialized {
5450                    self.drop_materialized_view_backing(&q.name)?;
5451                }
5452                // Drop any plan / result cache entries that baked the
5453                // view body into their QueryExpr.
5454                self.invalidate_plan_cache();
5455                self.invalidate_result_cache();
5456                if !existed && !q.if_exists {
5457                    return Err(RedDBError::Internal(format!(
5458                        "view {} does not exist",
5459                        q.name
5460                    )));
5461                }
5462                self.invalidate_plan_cache();
5463                Ok(RuntimeQueryResult::ok_message(
5464                    query.to_string(),
5465                    &format!("view {} dropped", q.name),
5466                    "drop_view",
5467                ))
5468            }
5469            QueryExpr::RefreshMaterializedView(ref q) => {
5470                // Look up the view definition, execute its underlying query,
5471                // and stash the serialized result in the materialised cache.
5472                let view = {
5473                    let views = self.inner.views.read();
5474                    views.get(&q.name).cloned()
5475                };
5476                let view = match view {
5477                    Some(v) => v,
5478                    None => {
5479                        return Err(RedDBError::Internal(format!(
5480                            "view {} does not exist",
5481                            q.name
5482                        )))
5483                    }
5484                };
5485                if !view.materialized {
5486                    return Err(RedDBError::Internal(format!(
5487                        "view {} is not materialized — REFRESH requires \
5488                         CREATE MATERIALIZED VIEW",
5489                        q.name
5490                    )));
5491                }
5492                // Execute the underlying query fresh.
5493                let started = std::time::Instant::now();
5494                let now_ms = std::time::SystemTime::now()
5495                    .duration_since(std::time::UNIX_EPOCH)
5496                    .map(|d| d.as_millis() as u64)
5497                    .unwrap_or(0);
5498                match self.execute_query_expr((*view.query).clone()) {
5499                    Ok(inner_result) => {
5500                        // Issue #595 slice 9c — atomically replace the
5501                        // backing collection's contents under a single
5502                        // WAL group. Concurrent SELECT from the view
5503                        // sees either the prior or new contents, never
5504                        // partial. A crash before the WAL commit lands
5505                        // leaves the prior contents intact on recovery.
5506                        let entities =
5507                            view_records_to_entities(&q.name, &inner_result.result.records);
5508                        let row_count = entities.len() as u64;
5509                        let store = self.inner.db.store();
5510                        let serialized_records = match store.refresh_collection(&q.name, entities) {
5511                            Ok(records) => records,
5512                            Err(err) => {
5513                                let duration_ms = started.elapsed().as_millis() as u64;
5514                                let msg = err.to_string();
5515                                self.inner
5516                                    .materialized_views
5517                                    .write()
5518                                    .record_refresh_failure(
5519                                        &q.name,
5520                                        msg.clone(),
5521                                        duration_ms,
5522                                        now_ms,
5523                                    );
5524                                return Err(RedDBError::Internal(format!(
5525                                    "REFRESH MATERIALIZED VIEW {}: {msg}",
5526                                    q.name
5527                                )));
5528                            }
5529                        };
5530
5531                        // Issue #596 slice 9d — emit a Refresh
5532                        // ChangeRecord into the logical-WAL spool so
5533                        // replicas deterministically replay the same
5534                        // backing-collection contents via
5535                        // `LogicalChangeApplier::apply_record`.
5536                        if let Some(ref primary) = self.inner.db.replication {
5537                            let lsn = self.inner.cdc.emit(
5538                                crate::replication::cdc::ChangeOperation::Refresh,
5539                                &q.name,
5540                                0,
5541                                "refresh",
5542                            );
5543                            self.invalidate_result_cache_for_table(&q.name);
5544                            let timestamp = std::time::SystemTime::now()
5545                                .duration_since(std::time::UNIX_EPOCH)
5546                                .unwrap_or_default()
5547                                .as_millis() as u64;
5548                            let record = ChangeRecord::for_refresh(
5549                                lsn,
5550                                timestamp,
5551                                q.name.clone(),
5552                                serialized_records,
5553                            )
5554                            .with_term(self.current_replication_term());
5555                            let encoded = record.encode();
5556                            primary.append_logical_record(record.lsn, encoded);
5557                        }
5558
5559                        let duration_ms = started.elapsed().as_millis() as u64;
5560                        let serialized = format!("{:?}", inner_result.result);
5561                        self.inner
5562                            .materialized_views
5563                            .write()
5564                            .record_refresh_success(
5565                                &q.name,
5566                                serialized.into_bytes(),
5567                                row_count,
5568                                duration_ms,
5569                                now_ms,
5570                            );
5571                        // SELECT FROM v now reads through the rewriter
5572                        // skip into the backing collection — drop the
5573                        // result cache so prior empty-backing reads
5574                        // don't shadow the new contents.
5575                        self.invalidate_result_cache();
5576                        Ok(RuntimeQueryResult::ok_message(
5577                            query.to_string(),
5578                            &format!("materialized view {} refreshed", q.name),
5579                            "refresh_materialized_view",
5580                        ))
5581                    }
5582                    Err(err) => {
5583                        let duration_ms = started.elapsed().as_millis() as u64;
5584                        let msg = err.to_string();
5585                        self.inner
5586                            .materialized_views
5587                            .write()
5588                            .record_refresh_failure(&q.name, msg.clone(), duration_ms, now_ms);
5589                        Err(err)
5590                    }
5591                }
5592            }
5593            // Row Level Security (Phase 2.5 PG parity).
5594            //
5595            // Policies live in an in-memory registry keyed by (table, name).
5596            // Enforcement (AND-ing the policy's USING clause into every
5597            // query's WHERE for the table) arrives in Phase 2.5.2 via the
5598            // filter compiler; this dispatch only manages the catalog.
5599            QueryExpr::CreatePolicy(ref q) => {
5600                let key = (q.table.clone(), q.name.clone());
5601                self.inner
5602                    .rls_policies
5603                    .write()
5604                    .insert(key, Arc::new(q.clone()));
5605                self.invalidate_plan_cache();
5606                // Issue #120 — surface policy names in the
5607                // schema-vocabulary so AskPipeline (#121) can resolve
5608                // a policy reference back to its table.
5609                self.schema_vocabulary_apply(
5610                    crate::runtime::schema_vocabulary::DdlEvent::CreatePolicy {
5611                        collection: q.table.clone(),
5612                        policy: q.name.clone(),
5613                    },
5614                );
5615                Ok(RuntimeQueryResult::ok_message(
5616                    query.to_string(),
5617                    &format!("policy {} on {} created", q.name, q.table),
5618                    "create_policy",
5619                ))
5620            }
5621            QueryExpr::DropPolicy(ref q) => {
5622                let removed = self
5623                    .inner
5624                    .rls_policies
5625                    .write()
5626                    .remove(&(q.table.clone(), q.name.clone()))
5627                    .is_some();
5628                if !removed && !q.if_exists {
5629                    return Err(RedDBError::Internal(format!(
5630                        "policy {} on {} does not exist",
5631                        q.name, q.table
5632                    )));
5633                }
5634                self.invalidate_plan_cache();
5635                // Issue #120 — keep the schema-vocabulary policy
5636                // entry in sync.
5637                self.schema_vocabulary_apply(
5638                    crate::runtime::schema_vocabulary::DdlEvent::DropPolicy {
5639                        collection: q.table.clone(),
5640                        policy: q.name.clone(),
5641                    },
5642                );
5643                Ok(RuntimeQueryResult::ok_message(
5644                    query.to_string(),
5645                    &format!("policy {} on {} dropped", q.name, q.table),
5646                    "drop_policy",
5647                ))
5648            }
5649            // Foreign Data Wrappers (Phase 3.2 PG parity).
5650            //
5651            // CREATE SERVER / CREATE FOREIGN TABLE register into the shared
5652            // `ForeignTableRegistry`. The read path consults that registry
5653            // before dispatching a SELECT — when the table name matches a
5654            // registered foreign table, we forward the scan to the wrapper
5655            // and skip the normal collection lookup.
5656            //
5657            // Phase 3.2 is in-memory only; persistence across restarts is a
5658            // 3.2.2 follow-up that mirrors the view registry pattern.
5659            QueryExpr::CreateServer(ref q) => {
5660                use crate::storage::fdw::FdwOptions;
5661                let registry = Arc::clone(&self.inner.foreign_tables);
5662                if registry.server(&q.name).is_some() {
5663                    if q.if_not_exists {
5664                        return Ok(RuntimeQueryResult::ok_message(
5665                            query.to_string(),
5666                            &format!("server {} already exists — skipped", q.name),
5667                            "create_server",
5668                        ));
5669                    }
5670                    return Err(RedDBError::Internal(format!(
5671                        "server {} already exists",
5672                        q.name
5673                    )));
5674                }
5675                let mut opts = FdwOptions::new();
5676                for (k, v) in &q.options {
5677                    opts.values.insert(k.clone(), v.clone());
5678                }
5679                registry
5680                    .create_server(&q.name, &q.wrapper, opts)
5681                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5682                Ok(RuntimeQueryResult::ok_message(
5683                    query.to_string(),
5684                    &format!("server {} created (wrapper {})", q.name, q.wrapper),
5685                    "create_server",
5686                ))
5687            }
5688            QueryExpr::DropServer(ref q) => {
5689                let existed = self.inner.foreign_tables.drop_server(&q.name);
5690                if !existed && !q.if_exists {
5691                    return Err(RedDBError::Internal(format!(
5692                        "server {} does not exist",
5693                        q.name
5694                    )));
5695                }
5696                Ok(RuntimeQueryResult::ok_message(
5697                    query.to_string(),
5698                    &format!(
5699                        "server {} dropped{}",
5700                        q.name,
5701                        if q.cascade { " (cascade)" } else { "" }
5702                    ),
5703                    "drop_server",
5704                ))
5705            }
5706            QueryExpr::CreateForeignTable(ref q) => {
5707                use crate::storage::fdw::{FdwOptions, ForeignColumn, ForeignTable};
5708                let registry = Arc::clone(&self.inner.foreign_tables);
5709                if registry.foreign_table(&q.name).is_some() {
5710                    if q.if_not_exists {
5711                        return Ok(RuntimeQueryResult::ok_message(
5712                            query.to_string(),
5713                            &format!("foreign table {} already exists — skipped", q.name),
5714                            "create_foreign_table",
5715                        ));
5716                    }
5717                    return Err(RedDBError::Internal(format!(
5718                        "foreign table {} already exists",
5719                        q.name
5720                    )));
5721                }
5722                let mut opts = FdwOptions::new();
5723                for (k, v) in &q.options {
5724                    opts.values.insert(k.clone(), v.clone());
5725                }
5726                let columns: Vec<ForeignColumn> = q
5727                    .columns
5728                    .iter()
5729                    .map(|c| ForeignColumn {
5730                        name: c.name.clone(),
5731                        data_type: c.data_type.clone(),
5732                        not_null: c.not_null,
5733                    })
5734                    .collect();
5735                registry
5736                    .create_foreign_table(ForeignTable {
5737                        name: q.name.clone(),
5738                        server_name: q.server.clone(),
5739                        columns,
5740                        options: opts,
5741                    })
5742                    .map_err(|e| RedDBError::Internal(e.to_string()))?;
5743                self.invalidate_plan_cache();
5744                Ok(RuntimeQueryResult::ok_message(
5745                    query.to_string(),
5746                    &format!("foreign table {} created (server {})", q.name, q.server),
5747                    "create_foreign_table",
5748                ))
5749            }
5750            QueryExpr::DropForeignTable(ref q) => {
5751                let existed = self.inner.foreign_tables.drop_foreign_table(&q.name);
5752                if !existed && !q.if_exists {
5753                    return Err(RedDBError::Internal(format!(
5754                        "foreign table {} does not exist",
5755                        q.name
5756                    )));
5757                }
5758                self.invalidate_plan_cache();
5759                Ok(RuntimeQueryResult::ok_message(
5760                    query.to_string(),
5761                    &format!("foreign table {} dropped", q.name),
5762                    "drop_foreign_table",
5763                ))
5764            }
5765            // COPY table FROM 'path' (Phase 1.5 PG parity).
5766            //
5767            // Stream CSV rows through the shared `CsvImporter`. The collection
5768            // is auto-created on first insert (via `insert_auto`-style path);
5769            // VACUUM/ANALYZE afterwards is up to the caller.
5770            QueryExpr::CopyFrom(ref q) => {
5771                use crate::storage::import::{CsvConfig, CsvImporter};
5772                let store = self.inner.db.store();
5773                let cfg = CsvConfig {
5774                    collection: q.table.clone(),
5775                    has_header: q.has_header,
5776                    delimiter: q.delimiter.map(|c| c as u8).unwrap_or(b','),
5777                    ..CsvConfig::default()
5778                };
5779                let importer = CsvImporter::new(cfg);
5780                let stats = importer
5781                    .import_file(&q.path, store.as_ref())
5782                    .map_err(|e| RedDBError::Internal(format!("COPY failed: {e}")))?;
5783                // Tables are written → invalidate cached plans / result cache.
5784                self.note_table_write(&q.table);
5785                Ok(RuntimeQueryResult::ok_message(
5786                    query.to_string(),
5787                    &format!(
5788                        "COPY imported {} rows into {} ({} errors skipped, {}ms)",
5789                        stats.records_imported, q.table, stats.errors_skipped, stats.duration_ms
5790                    ),
5791                    "copy_from",
5792                ))
5793            }
5794            // Maintenance commands (Phase 1.2 PG parity).
5795            //
5796            // - VACUUM [FULL] [table]: refreshes planner stats for the target
5797            //   collection(s) and — when FULL — triggers a full pager persist
5798            //   (flushes dirty pages + fsync). Also invalidates the result cache
5799            //   so subsequent reads re-execute against the freshly compacted
5800            //   storage. RedDB's segment/btree GC runs continuously via the
5801            //   background lifecycle; explicit space reclamation for sealed
5802            //   segments arrives with Phase 2.3 (MVCC + dead-tuple reclamation).
5803            // - ANALYZE [table]: reruns `analyze_collection` +
5804            //   `persist_table_stats` via `refresh_table_planner_stats` so the
5805            //   planner has fresh histograms, distinct estimates, null counts.
5806            //
5807            // Both commands accept an optional target; omitting the target
5808            // iterates every collection in the store.
5809            QueryExpr::MaintenanceCommand(ref cmd) => {
5810                use crate::storage::query::ast::MaintenanceCommand as Mc;
5811                let store = self.inner.db.store();
5812                let (kind, msg) = match cmd {
5813                    Mc::Analyze { target } => {
5814                        let targets: Vec<String> = match target {
5815                            Some(t) => vec![t.clone()],
5816                            None => store.list_collections(),
5817                        };
5818                        for t in &targets {
5819                            self.refresh_table_planner_stats(t);
5820                        }
5821                        (
5822                            "analyze",
5823                            format!("ANALYZE refreshed stats for {} table(s)", targets.len()),
5824                        )
5825                    }
5826                    Mc::Vacuum { target, full } => {
5827                        let targets: Vec<String> = match target {
5828                            Some(t) => vec![t.clone()],
5829                            None => store.list_collections(),
5830                        };
5831                        let cutoff_xid = self.mvcc_vacuum_cutoff_xid();
5832                        let mut vacuum_stats =
5833                            crate::storage::unified::store::MvccVacuumStats::default();
5834                        for t in &targets {
5835                            let stats = store.vacuum_mvcc_history(t, cutoff_xid).map_err(|e| {
5836                                RedDBError::Internal(format!(
5837                                    "VACUUM MVCC history failed for {t}: {e}"
5838                                ))
5839                            })?;
5840                            if stats.reclaimed_versions > 0 {
5841                                self.rebuild_runtime_indexes_for_table(t)?;
5842                            }
5843                            vacuum_stats.add(&stats);
5844                        }
5845                        self.inner.snapshot_manager.prune_aborted(cutoff_xid);
5846                        // Stats refresh covers every target (same as ANALYZE).
5847                        for t in &targets {
5848                            self.refresh_table_planner_stats(t);
5849                        }
5850                        // FULL forces a pager persist (dirty-page flush + fsync).
5851                        // Regular VACUUM relies on the background writer / segment
5852                        // lifecycle so the command is non-blocking.
5853                        let persisted = if *full {
5854                            match store.persist() {
5855                                Ok(()) => true,
5856                                Err(e) => {
5857                                    return Err(RedDBError::Internal(format!(
5858                                        "VACUUM FULL persist failed: {e:?}"
5859                                    )));
5860                                }
5861                            }
5862                        } else {
5863                            false
5864                        };
5865                        // Result cache depended on pre-vacuum state.
5866                        self.invalidate_result_cache();
5867                        (
5868                            "vacuum",
5869                            format!(
5870                                "VACUUM{} processed {} table(s): scanned_versions={}, retained_versions={}, reclaimed_versions={}, retained_history_versions={}, reclaimed_history_versions={}, retained_tombstones={}, reclaimed_tombstones={}{}",
5871                                if *full { " FULL" } else { "" },
5872                                targets.len(),
5873                                vacuum_stats.scanned_versions,
5874                                vacuum_stats.retained_versions,
5875                                vacuum_stats.reclaimed_versions,
5876                                vacuum_stats.retained_history_versions,
5877                                vacuum_stats.reclaimed_history_versions,
5878                                vacuum_stats.retained_tombstones,
5879                                vacuum_stats.reclaimed_tombstones,
5880                                if persisted {
5881                                    " (pages flushed to disk)"
5882                                } else {
5883                                    ""
5884                                }
5885                            ),
5886                        )
5887                    }
5888                };
5889                Ok(RuntimeQueryResult::ok_message(
5890                    query.to_string(),
5891                    &msg,
5892                    kind,
5893                ))
5894            }
5895            // GRANT / REVOKE / ALTER USER (RBAC milestone).
5896            //
5897            // These hit the AuthStore directly. The privilege-check
5898            // gate at the top of `execute_query_expr` already decided
5899            // whether the caller may even run the statement; here we
5900            // just translate the AST into AuthStore calls.
5901            QueryExpr::Grant(ref g) => self.execute_grant_statement(query, g),
5902            QueryExpr::Revoke(ref r) => self.execute_revoke_statement(query, r),
5903            QueryExpr::AlterUser(ref a) => self.execute_alter_user_statement(query, a),
5904            QueryExpr::CreateIamPolicy { ref id, ref json } => {
5905                self.execute_create_iam_policy(query, id, json)
5906            }
5907            QueryExpr::DropIamPolicy { ref id } => self.execute_drop_iam_policy(query, id),
5908            QueryExpr::AttachPolicy {
5909                ref policy_id,
5910                ref principal,
5911            } => self.execute_attach_policy(query, policy_id, principal),
5912            QueryExpr::DetachPolicy {
5913                ref policy_id,
5914                ref principal,
5915            } => self.execute_detach_policy(query, policy_id, principal),
5916            QueryExpr::ShowPolicies { ref filter } => {
5917                self.execute_show_policies(query, filter.as_ref())
5918            }
5919            QueryExpr::ShowEffectivePermissions {
5920                ref user,
5921                ref resource,
5922            } => self.execute_show_effective_permissions(query, user, resource.as_ref()),
5923            QueryExpr::SimulatePolicy {
5924                ref user,
5925                ref action,
5926                ref resource,
5927            } => self.execute_simulate_policy(query, user, action, resource),
5928            QueryExpr::LintPolicy { ref source } => self.execute_lint_policy(query, source),
5929            QueryExpr::MigratePolicyMode {
5930                ref target,
5931                dry_run,
5932            } => self.execute_migrate_policy_mode(query, target, dry_run),
5933            QueryExpr::CreateMigration(ref q) => self.execute_create_migration(query, q),
5934            QueryExpr::ApplyMigration(ref q) => self.execute_apply_migration(query, q),
5935            QueryExpr::RollbackMigration(ref q) => self.execute_rollback_migration(query, q),
5936            QueryExpr::ExplainMigration(ref q) => self.execute_explain_migration(query, q),
5937        };
5938
5939        if !control_event_specs.is_empty() {
5940            let (outcome, reason) = match &query_result {
5941                Ok(_) => (crate::runtime::control_events::Outcome::Allowed, None),
5942                Err(err) => (control_event_outcome_for_error(err), Some(err.to_string())),
5943            };
5944            for spec in &control_event_specs {
5945                self.emit_control_event(
5946                    spec.kind,
5947                    outcome,
5948                    spec.action,
5949                    spec.resource.clone(),
5950                    reason.clone(),
5951                    spec.fields.clone(),
5952                )?;
5953            }
5954        }
5955
5956        if let (Some(plan), Ok(result)) = (&query_audit_plan, &query_result) {
5957            self.emit_query_audit(
5958                query,
5959                plan,
5960                query_audit_started.elapsed().as_millis() as u64,
5961                result,
5962            );
5963        }
5964
5965        // Decrypt Value::Secret columns in-place before caching, so
5966        // cached results match the post-decrypt shape and repeat
5967        // queries skip the per-row AES-GCM pass.
5968        let mut query_result = query_result;
5969        if let Ok(ref mut result) = query_result {
5970            if result.statement_type == "select" {
5971                self.apply_secret_decryption(result);
5972            }
5973        }
5974
5975        // Cache SELECT results for 30s.
5976        // Skip: pre-serialized JSON (large clone), and result sets > 5 rows.
5977        // Large multi-row results (range scans, filtered scans) are rarely
5978        // repeated with the same literal values so the cache hit rate is near
5979        // zero while the clone cost (100 records × ~16 fields each) is high.
5980        // Aggregations (1 row) and point lookups (1 row) still benefit.
5981        if let Ok(ref result) = query_result {
5982            frame.write_result_cache(self, result, result_cache_scopes);
5983        }
5984
5985        query_result
5986    }
5987
5988    /// Snapshot of every registered materialized view's runtime
5989    /// state — feeds the `red.materialized_views` virtual table.
5990    /// Issue #583 slice 10.
5991    pub fn materialized_view_metadata(
5992        &self,
5993    ) -> Vec<crate::storage::cache::result::MaterializedViewMetadata> {
5994        // Issue #595 slice 9c — `current_row_count` is now scraped
5995        // live from the backing collection rather than read from the
5996        // cache slot. Mirrors the slice-10 invariant on
5997        // `queue_pending_gauge` in #527: the live store is the source
5998        // of truth, the cache slot only carries last-refresh telemetry
5999        // (timing, error, refresh cadence).
6000        let store = self.inner.db.store();
6001        let mut entries = self.inner.materialized_views.read().metadata();
6002        for entry in &mut entries {
6003            if let Some(manager) = store.get_collection(&entry.name) {
6004                entry.current_row_count = manager.count() as u64;
6005            }
6006        }
6007        entries
6008    }
6009
    // NOTE: this paragraph documents `refresh_due_materialized_views`
    // (defined further down in this impl). It is kept here as a plain
    // comment so rustdoc does not attach it to the unrelated item below.
    //
    // Drives scheduled refreshes for materialized views with a
    // `REFRESH EVERY <duration>` clause. Called from the background
    // scheduler thread (and from unit tests with a fake clock via
    // `claim_due_at`). Each invocation atomically claims the set of
    // due views (so two concurrent ticks never double-fire the same
    // view) and runs each refresh through the standard execution
    // path — failures are captured in `last_error` and the prior
    // content stays intact. Issue #583 slice 10.
6018    /// Snapshot of every tracked retention sweeper state — feeds the
6019    /// three extra columns on `red.retention`. Issue #584 slice 12.
6020    pub(crate) fn retention_sweeper_snapshot(
6021        &self,
6022    ) -> Vec<(String, crate::runtime::retention_sweeper::SweeperState)> {
6023        self.inner.retention_sweeper.read().snapshot()
6024    }
6025
6026    /// Drive one tick of the retention sweeper. Iterates collections
6027    /// with a retention policy set, physically deletes at most
6028    /// `batch_size` expired rows per collection, and records the
6029    /// `last_sweep_at_ms` / `rows_swept_total` / pending estimate that
6030    /// `red.retention` exposes. Called from the background sweeper
6031    /// thread; safe to invoke directly from tests with a small batch
6032    /// size to drain rows deterministically. Issue #584 slice 12.
6033    ///
6034    /// Deletes are issued as `DELETE FROM <collection> WHERE
6035    /// <ts_column> < <cutoff>` through the standard `execute_query`
6036    /// chokepoint so WAL participation and snapshot guards apply
6037    /// exactly as for a user-issued DELETE — replicas replay the
6038    /// sweeper's deletes via the same WAL stream with no special
6039    /// handling on the replication side.
6040    ///
6041    /// Batching is enforced by tightening the cutoff: if more than
6042    /// `batch_size` rows are expired, the cutoff is dropped to the
6043    /// `batch_size`-th oldest expired timestamp + 1 so the predicate
6044    /// matches roughly `batch_size` rows; the remainder is reported
6045    /// as `current_rows_pending_sweep_estimate` and drained on the
6046    /// next tick.
6047    pub fn sweep_retention_tick(&self, batch_size: usize) {
6048        if batch_size == 0 {
6049            return;
6050        }
6051        let now_ms = std::time::SystemTime::now()
6052            .duration_since(std::time::UNIX_EPOCH)
6053            .map(|d| d.as_millis() as u64)
6054            .unwrap_or(0);
6055
6056        let store = self.inner.db.store();
6057        let collections = store.list_collections();
6058        for name in collections {
6059            let Some(contract) = self.inner.db.collection_contract(&name) else {
6060                continue;
6061            };
6062            let Some(retention_ms) = contract.retention_duration_ms else {
6063                continue;
6064            };
6065            let Some(ts_column) =
6066                crate::runtime::retention_filter::resolve_timestamp_column(&contract)
6067            else {
6068                continue;
6069            };
6070            let Some(manager) = store.get_collection(&name) else {
6071                continue;
6072            };
6073            let cutoff = (now_ms as i64).saturating_sub(retention_ms as i64);
6074
6075            // Single pass: collect expired timestamps. We keep the
6076            // full Vec rather than a bounded heap because the partial
6077            // sort below is the simplest correct way to find the
6078            // batch-th oldest; for the slice's "1000-row default
6079            // batch" target this is bounded enough for production
6080            // operation, and the alternative (in-place heap of size
6081            // batch+1) is a follow-up optimisation.
6082            let mut expired_ts: Vec<i64> = Vec::new();
6083            manager.for_each_entity(|entity| {
6084                let ts = match ts_column.as_str() {
6085                    "created_at" => Some(entity.created_at as i64),
6086                    "updated_at" => Some(entity.updated_at as i64),
6087                    other => entity
6088                        .data
6089                        .as_row()
6090                        .and_then(|row| row.get_field(other))
6091                        .and_then(|v| match v {
6092                            crate::storage::schema::Value::TimestampMs(t) => Some(*t),
6093                            crate::storage::schema::Value::Timestamp(t) => {
6094                                Some(t.saturating_mul(1_000))
6095                            }
6096                            crate::storage::schema::Value::BigInt(t) => Some(*t),
6097                            crate::storage::schema::Value::UnsignedInteger(t) => {
6098                                i64::try_from(*t).ok()
6099                            }
6100                            crate::storage::schema::Value::Integer(t) => Some(*t),
6101                            _ => None,
6102                        }),
6103                };
6104                if let Some(t) = ts {
6105                    if t < cutoff {
6106                        expired_ts.push(t);
6107                    }
6108                }
6109                true
6110            });
6111
6112            let total_expired = expired_ts.len() as u64;
6113            if total_expired == 0 {
6114                self.inner
6115                    .retention_sweeper
6116                    .write()
6117                    .record_tick(&name, 0, 0, now_ms);
6118                continue;
6119            }
6120
6121            let (effective_cutoff, pending) = if (total_expired as usize) <= batch_size {
6122                (cutoff, 0u64)
6123            } else {
6124                // Tighten the cutoff to the (batch_size)-th oldest
6125                // expired timestamp + 1 so DELETE matches roughly
6126                // `batch_size` rows.
6127                expired_ts.sort_unstable();
6128                let nth = expired_ts[batch_size - 1];
6129                (
6130                    nth.saturating_add(1),
6131                    total_expired.saturating_sub(batch_size as u64),
6132                )
6133            };
6134
6135            let stmt = format!(
6136                "DELETE FROM {} WHERE {} < {}",
6137                name, ts_column, effective_cutoff
6138            );
6139            let deleted = match self.execute_query(&stmt) {
6140                Ok(r) => r.affected_rows,
6141                Err(_) => 0,
6142            };
6143
6144            self.inner
6145                .retention_sweeper
6146                .write()
6147                .record_tick(&name, deleted, pending, now_ms);
6148        }
6149    }
6150
6151    pub fn refresh_due_materialized_views(&self) {
6152        let due = {
6153            let mut cache = self.inner.materialized_views.write();
6154            cache.claim_due_at(std::time::Instant::now())
6155        };
6156        for name in due {
6157            // Round-trip through `execute_query` (rather than the
6158            // prepared-statement `execute_query_expr` fast path, which
6159            // explicitly rejects DDL/maintenance statements). Failures
6160            // are captured inside the RefreshMaterializedView handler
6161            // via `record_refresh_failure`; the scheduler ignores the
6162            // Result so one bad view doesn't halt the loop.
6163            let stmt = format!("REFRESH MATERIALIZED VIEW {}", name);
6164            let _ = self.execute_query(&stmt);
6165        }
6166    }
6167
6168    /// Execute a pre-parsed `QueryExpr` directly, bypassing SQL parsing and the
6169    /// plan cache. Used by the prepared-statement fast path so that `execute_prepared`
6170    /// calls pay zero parse + cache overhead.
6171    ///
6172    /// Applies secret decryption on SELECT results, identical to `execute_query`.
6173    pub fn execute_query_expr(&self, expr: QueryExpr) -> RedDBResult<RuntimeQueryResult> {
6174        let _config_snapshot_guard = ConfigSnapshotGuard::install(Arc::clone(&self.inner.db));
6175        let _secret_store_guard = SecretStoreGuard::install(self.inner.auth_store.read().clone());
6176        // View rewrite (Phase 2.1): substitute any `QueryExpr::Table(tq)`
6177        // whose `tq.table` matches a registered view with the view's
6178        // underlying query. Safe to call even when no views are registered.
6179        let expr = self.rewrite_view_refs(expr);
6180
6181        self.validate_model_operations_before_auth(&expr)?;
6182        // Granular RBAC privilege check. Runs before dispatch so a
6183        // denied caller never reaches storage. Fail-closed: any error
6184        // resolving the action / resource produces PermissionDenied.
6185        if let Err(err) = self.check_query_privilege(&expr) {
6186            return Err(RedDBError::Query(format!("permission denied: {err}")));
6187        }
6188
6189        let statement = query_expr_name(&expr);
6190        let mode = detect_mode(statement);
6191        let query_str = statement;
6192
6193        let result = self.dispatch_expr(expr, query_str, mode)?;
6194        let mut r = result;
6195        if r.statement_type == "select" {
6196            self.apply_secret_decryption(&mut r);
6197        }
6198        Ok(r)
6199    }
6200
6201    pub(super) fn validate_model_operations_before_auth(
6202        &self,
6203        expr: &QueryExpr,
6204    ) -> RedDBResult<()> {
6205        use crate::catalog::CollectionModel;
6206        use crate::runtime::ddl::polymorphic_resolver;
6207        use crate::storage::query::ast::KvCommand;
6208
6209        let system_schema_target = match expr {
6210            QueryExpr::DropTable(q) => Some(q.name.as_str()),
6211            QueryExpr::DropGraph(q) => Some(q.name.as_str()),
6212            QueryExpr::DropVector(q) => Some(q.name.as_str()),
6213            QueryExpr::DropDocument(q) => Some(q.name.as_str()),
6214            QueryExpr::DropKv(q) => Some(q.name.as_str()),
6215            QueryExpr::DropCollection(q) => Some(q.name.as_str()),
6216            QueryExpr::Truncate(q) => Some(q.name.as_str()),
6217            _ => None,
6218        };
6219        if system_schema_target.is_some_and(crate::runtime::impl_ddl::is_system_schema_name) {
6220            return Err(RedDBError::Query("system schema is read-only".to_string()));
6221        }
6222
6223        let expected = match expr {
6224            QueryExpr::DropTable(q) => Some((q.name.as_str(), CollectionModel::Table)),
6225            QueryExpr::DropGraph(q) => Some((q.name.as_str(), CollectionModel::Graph)),
6226            QueryExpr::DropVector(q) => Some((q.name.as_str(), CollectionModel::Vector)),
6227            QueryExpr::DropDocument(q) => Some((q.name.as_str(), CollectionModel::Document)),
6228            QueryExpr::DropKv(q) => Some((q.name.as_str(), q.model)),
6229            QueryExpr::DropCollection(q) => q.model.map(|model| (q.name.as_str(), model)),
6230            QueryExpr::Truncate(q) => q.model.map(|model| (q.name.as_str(), model)),
6231            QueryExpr::KvCommand(cmd) => {
6232                let (collection, model) = match cmd {
6233                    KvCommand::Put {
6234                        collection, model, ..
6235                    }
6236                    | KvCommand::Get {
6237                        collection, model, ..
6238                    }
6239                    | KvCommand::Incr {
6240                        collection, model, ..
6241                    }
6242                    | KvCommand::Cas {
6243                        collection, model, ..
6244                    }
6245                    | KvCommand::Delete {
6246                        collection, model, ..
6247                    } => (collection.as_str(), *model),
6248                    KvCommand::Rotate { collection, .. }
6249                    | KvCommand::History { collection, .. }
6250                    | KvCommand::List { collection, .. }
6251                    | KvCommand::Purge { collection, .. } => {
6252                        (collection.as_str(), CollectionModel::Vault)
6253                    }
6254                    KvCommand::InvalidateTags { collection, .. } => {
6255                        (collection.as_str(), CollectionModel::Kv)
6256                    }
6257                    KvCommand::Watch {
6258                        collection, model, ..
6259                    } => (collection.as_str(), *model),
6260                    KvCommand::Unseal { collection, .. } => {
6261                        (collection.as_str(), CollectionModel::Vault)
6262                    }
6263                };
6264                Some((collection, model))
6265            }
6266            QueryExpr::ConfigCommand(cmd) => {
6267                self.validate_config_command_before_auth(cmd)?;
6268                None
6269            }
6270            _ => None,
6271        };
6272
6273        let Some((name, expected_model)) = expected else {
6274            return Ok(());
6275        };
6276        let snapshot = self.inner.db.catalog_model_snapshot();
6277        let Some(actual_model) = snapshot
6278            .collections
6279            .iter()
6280            .find(|collection| collection.name == name)
6281            .map(|collection| collection.declared_model.unwrap_or(collection.model))
6282        else {
6283            return Ok(());
6284        };
6285        polymorphic_resolver::ensure_model_match(expected_model, actual_model)
6286    }
6287
6288    /// Walk a `QueryExpr` and replace `QueryExpr::Table(tq)` nodes whose
6289    /// `tq.table` matches a registered view name with the view's stored
6290    /// body. Recurses through joins so `SELECT ... FROM t JOIN myview ...`
6291    /// resolves correctly. Pure operation — no side effects.
6292    pub(super) fn rewrite_view_refs(&self, expr: QueryExpr) -> QueryExpr {
6293        // Fast path: no views registered → return original expression.
6294        if self.inner.views.read().is_empty() {
6295            return expr;
6296        }
6297        self.rewrite_view_refs_inner(expr)
6298    }
6299
6300    fn rewrite_view_refs_inner(&self, expr: QueryExpr) -> QueryExpr {
6301        use crate::storage::query::ast::{Filter, TableSource};
6302        match expr {
6303            QueryExpr::Table(mut tq) => {
6304                // 1. If the TableSource is a subquery, recurse into it so
6305                //    `SELECT ... FROM (SELECT ... FROM myview) t` expands.
6306                //    The legacy `table` field (set to a synthetic
6307                //    "__subq_NNNN" sentinel) stays as-is so callers that
6308                //    read it keep compiling.
6309                if let Some(TableSource::Subquery(body)) = tq.source.take() {
6310                    tq.source = Some(TableSource::Subquery(Box::new(
6311                        self.rewrite_view_refs_inner(*body),
6312                    )));
6313                    return QueryExpr::Table(tq);
6314                }
6315
6316                // 2. Restore the source field (took it above for match).
6317                // When the source was `None` or `TableSource::Name(_)`, the
6318                // real lookup key is `tq.table` — check the view registry.
6319                let maybe_view = {
6320                    let views = self.inner.views.read();
6321                    views.get(&tq.table).cloned()
6322                };
6323                let Some(view) = maybe_view else {
6324                    return QueryExpr::Table(tq);
6325                };
6326
6327                // Issue #594 slice 9b — materialized views are read
6328                // from their backing collection, not by substituting
6329                // the body. Returning the TableQuery as-is lets the
6330                // normal table-read path resolve `SELECT FROM v`
6331                // against the collection provisioned at CREATE time.
6332                if view.materialized {
6333                    return QueryExpr::Table(tq);
6334                }
6335
6336                // Recurse into the view body — views may reference other
6337                // views. The recursion yields the final QueryExpr we need
6338                // to merge the outer's filter / limit / offset into.
6339                let inner_expr = self.rewrite_view_refs_inner((*view.query).clone());
6340
6341                // Phase 5: when the body is a Table we merge the outer
6342                // TableQuery's WHERE / LIMIT / OFFSET into it so stacked
6343                // views filter recursively. Non-table bodies (Search,
6344                // Ask, Vector, Graph, Hybrid) can't meaningfully combine
6345                // with an outer Table query today — return the body
6346                // verbatim; outer predicates are lost. Full projection
6347                // merge lands in Phase 5.2.
6348                match inner_expr {
6349                    QueryExpr::Table(mut inner_tq) => {
6350                        if let Some(outer_filter) = tq.filter.take() {
6351                            inner_tq.filter = Some(match inner_tq.filter.take() {
6352                                Some(existing) => {
6353                                    Filter::And(Box::new(existing), Box::new(outer_filter))
6354                                }
6355                                None => outer_filter,
6356                            });
6357                            // Keep the `Expr` form in lock-step with the
6358                            // merged `Filter`. The executor prefers
6359                            // `where_expr` and nulls `filter` when it is
6360                            // present (see `execute_query_inner`), so a
6361                            // stacked view whose outer predicate was only
6362                            // merged into `filter` would silently drop that
6363                            // predicate at eval time (#635).
6364                            inner_tq.where_expr = inner_tq
6365                                .filter
6366                                .as_ref()
6367                                .map(crate::storage::query::sql_lowering::filter_to_expr);
6368                        }
6369                        if let Some(outer_limit) = tq.limit {
6370                            inner_tq.limit = Some(match inner_tq.limit {
6371                                Some(existing) => existing.min(outer_limit),
6372                                None => outer_limit,
6373                            });
6374                        }
6375                        if let Some(outer_offset) = tq.offset {
6376                            inner_tq.offset = Some(match inner_tq.offset {
6377                                Some(existing) => existing + outer_offset,
6378                                None => outer_offset,
6379                            });
6380                        }
6381                        QueryExpr::Table(inner_tq)
6382                    }
6383                    other => other,
6384                }
6385            }
6386            QueryExpr::Join(mut jq) => {
6387                jq.left = Box::new(self.rewrite_view_refs_inner(*jq.left));
6388                jq.right = Box::new(self.rewrite_view_refs_inner(*jq.right));
6389                QueryExpr::Join(jq)
6390            }
6391            // Other variants don't carry nested QueryExpr that can reference
6392            // a view by table name. Return as-is.
6393            other => other,
6394        }
6395    }
6396
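    // NOTE: this paragraph describes the internal dispatch routine
    // (presumably `dispatch_expr`), not the authorization helper that
    // follows. It is kept as a plain comment so rustdoc does not attach
    // it to the wrong item.
    //
    // Internal dispatch: route a `QueryExpr` to the appropriate executor.
    // Shared by `execute_query` (after parse/cache) and `execute_query_expr`
    // (direct call from prepared-statement handler).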
6400    fn authorize_relational_table_select(
6401        &self,
6402        mut table: TableQuery,
6403        frame: &dyn super::statement_frame::ReadFrame,
6404    ) -> RedDBResult<Option<TableQuery>> {
6405        if let Some(TableSource::Subquery(inner)) = table.source.take() {
6406            let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6407            table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6408            return Ok(Some(table));
6409        }
6410
6411        self.check_table_column_projection_authz(&table, frame)?;
6412
6413        if self.inner.rls_enabled_tables.read().contains(&table.table) {
6414            return Ok(inject_rls_filters(self, frame, table));
6415        }
6416
6417        Ok(Some(table))
6418    }
6419
6420    fn authorize_relational_join_select(
6421        &self,
6422        mut join: JoinQuery,
6423        frame: &dyn super::statement_frame::ReadFrame,
6424    ) -> RedDBResult<Option<JoinQuery>> {
6425        self.check_join_column_projection_authz(&join, frame)?;
6426        join.left = Box::new(self.authorize_relational_join_child(*join.left, frame)?);
6427        join.right = Box::new(self.authorize_relational_join_child(*join.right, frame)?);
6428        Ok(inject_rls_into_join(self, frame, join))
6429    }
6430
6431    fn authorize_relational_join_child(
6432        &self,
6433        expr: QueryExpr,
6434        frame: &dyn super::statement_frame::ReadFrame,
6435    ) -> RedDBResult<QueryExpr> {
6436        match expr {
6437            QueryExpr::Table(mut table) => {
6438                if let Some(TableSource::Subquery(inner)) = table.source.take() {
6439                    let authorized_inner = self.authorize_relational_select_expr(*inner, frame)?;
6440                    table.source = Some(TableSource::Subquery(Box::new(authorized_inner)));
6441                }
6442                Ok(QueryExpr::Table(table))
6443            }
6444            QueryExpr::Join(join) => self
6445                .authorize_relational_join_select(join, frame)?
6446                .map(QueryExpr::Join)
6447                .ok_or_else(|| {
6448                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6449                }),
6450            other => Ok(other),
6451        }
6452    }
6453
6454    fn authorize_relational_select_expr(
6455        &self,
6456        expr: QueryExpr,
6457        frame: &dyn super::statement_frame::ReadFrame,
6458    ) -> RedDBResult<QueryExpr> {
6459        match expr {
6460            QueryExpr::Table(table) => self
6461                .authorize_relational_table_select(table, frame)?
6462                .map(QueryExpr::Table)
6463                .ok_or_else(|| {
6464                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6465                }),
6466            QueryExpr::Join(join) => self
6467                .authorize_relational_join_select(join, frame)?
6468                .map(QueryExpr::Join)
6469                .ok_or_else(|| {
6470                    RedDBError::Query("permission denied: RLS denied relational subquery".into())
6471                }),
6472            other => Ok(other),
6473        }
6474    }
6475
6476    fn check_table_column_projection_authz(
6477        &self,
6478        table: &TableQuery,
6479        frame: &dyn super::statement_frame::ReadFrame,
6480    ) -> RedDBResult<()> {
6481        let Some((username, role)) = frame.identity() else {
6482            return Ok(());
6483        };
6484        let Some(auth_store) = self.inner.auth_store.read().clone() else {
6485            return Ok(());
6486        };
6487
6488        let columns = self.resolved_table_projection_columns(table)?;
6489        let request = ColumnAccessRequest::select(table.table.clone(), columns);
6490        let principal = UserId::from_parts(frame.effective_scope(), username);
6491        let ctx = runtime_iam_context(
6492            role,
6493            frame.effective_scope(),
6494            auth_store.principal_is_system_owned(&principal),
6495        );
6496        let outcome = auth_store.check_column_projection_authz(&principal, &request, &ctx);
6497        if outcome.allowed() {
6498            return Ok(());
6499        }
6500
6501        if let Some(denied) = outcome.first_denied_column() {
6502            return Err(RedDBError::Query(format!(
6503                "permission denied: principal=`{username}` cannot select column `{}`",
6504                denied.resource.name
6505            )));
6506        }
6507        Err(RedDBError::Query(format!(
6508            "permission denied: principal=`{username}` cannot select table `{}`",
6509            table.table
6510        )))
6511    }
6512
6513    fn check_join_column_projection_authz(
6514        &self,
6515        join: &JoinQuery,
6516        frame: &dyn super::statement_frame::ReadFrame,
6517    ) -> RedDBResult<()> {
6518        let mut by_table: HashMap<String, BTreeSet<String>> = HashMap::new();
6519        let projections = crate::storage::query::sql_lowering::effective_join_projections(join);
6520        self.collect_join_projection_columns(join, &projections, &mut by_table)?;
6521
6522        for (table, columns) in by_table {
6523            let query = TableQuery {
6524                table,
6525                source: None,
6526                alias: None,
6527                select_items: Vec::new(),
6528                columns: columns.into_iter().map(Projection::Column).collect(),
6529                where_expr: None,
6530                filter: None,
6531                group_by_exprs: Vec::new(),
6532                group_by: Vec::new(),
6533                having_expr: None,
6534                having: None,
6535                order_by: Vec::new(),
6536                limit: None,
6537                limit_param: None,
6538                offset: None,
6539                offset_param: None,
6540                expand: None,
6541                as_of: None,
6542                sessionize: None,
6543            };
6544            self.check_table_column_projection_authz(&query, frame)?;
6545        }
6546        Ok(())
6547    }
6548
6549    fn collect_join_projection_columns(
6550        &self,
6551        join: &JoinQuery,
6552        projections: &[Projection],
6553        out: &mut HashMap<String, BTreeSet<String>>,
6554    ) -> RedDBResult<()> {
6555        let left = table_side_context(join.left.as_ref());
6556        let right = table_side_context(join.right.as_ref());
6557
6558        if projections
6559            .iter()
6560            .any(|projection| matches!(projection, Projection::All))
6561        {
6562            for side in [left.as_ref(), right.as_ref()].into_iter().flatten() {
6563                out.entry(side.table.clone())
6564                    .or_default()
6565                    .extend(self.table_all_projection_columns(&side.table)?);
6566            }
6567            return Ok(());
6568        }
6569
6570        for projection in projections {
6571            collect_projection_columns_for_join_side(
6572                projection,
6573                left.as_ref(),
6574                right.as_ref(),
6575                out,
6576            )?;
6577        }
6578        Ok(())
6579    }
6580
6581    fn resolved_table_projection_columns(&self, table: &TableQuery) -> RedDBResult<Vec<String>> {
6582        let projections = crate::storage::query::sql_lowering::effective_table_projections(table);
6583        if projections
6584            .iter()
6585            .any(|projection| matches!(projection, Projection::All))
6586        {
6587            return self.table_all_projection_columns(&table.table);
6588        }
6589
6590        let mut columns = BTreeSet::new();
6591        for projection in &projections {
6592            collect_projection_columns_for_table(
6593                projection,
6594                &table.table,
6595                table.alias.as_deref(),
6596                &mut columns,
6597            );
6598        }
6599        Ok(columns.into_iter().collect())
6600    }
6601
6602    fn table_all_projection_columns(&self, table: &str) -> RedDBResult<Vec<String>> {
6603        if let Some(contract) = self.inner.db.collection_contract_arc(table) {
6604            let columns: Vec<String> = contract
6605                .declared_columns
6606                .iter()
6607                .map(|column| column.name.clone())
6608                .collect();
6609            if !columns.is_empty() {
6610                return Ok(columns);
6611            }
6612        }
6613
6614        let records = scan_runtime_table_source_records_limited(&self.inner.db, table, Some(1))?;
6615        Ok(records
6616            .first()
6617            .map(|record| {
6618                record
6619                    .column_names()
6620                    .into_iter()
6621                    .map(|column| column.to_string())
6622                    .collect()
6623            })
6624            .unwrap_or_default())
6625    }
6626
6627    fn resolve_table_expr_subqueries(
6628        &self,
6629        mut table: TableQuery,
6630        frame: &dyn super::statement_frame::ReadFrame,
6631    ) -> RedDBResult<TableQuery> {
6632        // Only a `Subquery` source needs recursive resolution. `.take()`
6633        // would otherwise drop a `Name` / `Function` source on the floor
6634        // (the `if let` skips the body but the take already cleared it),
6635        // which silently broke `SELECT * FROM components(g)` — the TVF
6636        // dispatch downstream keys off `TableSource::Function` and never
6637        // fired. Restore any non-subquery source unchanged (issue #795).
6638        match table.source.take() {
6639            Some(TableSource::Subquery(inner)) => {
6640                let inner = self.resolve_select_expr_subqueries(*inner, frame)?;
6641                table.source = Some(TableSource::Subquery(Box::new(inner)));
6642            }
6643            other => table.source = other,
6644        }
6645
6646        let outer_scopes = relation_scopes_for_query(&QueryExpr::Table(table.clone()));
6647        for item in &mut table.select_items {
6648            if let crate::storage::query::ast::SelectItem::Expr { expr, .. } = item {
6649                *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6650            }
6651        }
6652        if let Some(where_expr) = table.where_expr.take() {
6653            table.where_expr =
6654                Some(self.resolve_expr_subqueries(where_expr, &outer_scopes, frame)?);
6655            table.filter = None;
6656        }
6657        if let Some(having_expr) = table.having_expr.take() {
6658            table.having_expr =
6659                Some(self.resolve_expr_subqueries(having_expr, &outer_scopes, frame)?);
6660            table.having = None;
6661        }
6662        for expr in &mut table.group_by_exprs {
6663            *expr = self.resolve_expr_subqueries(expr.clone(), &outer_scopes, frame)?;
6664        }
6665        for clause in &mut table.order_by {
6666            if let Some(expr) = clause.expr.take() {
6667                clause.expr = Some(self.resolve_expr_subqueries(expr, &outer_scopes, frame)?);
6668            }
6669        }
6670        Ok(table)
6671    }
6672
6673    fn resolve_select_expr_subqueries(
6674        &self,
6675        expr: QueryExpr,
6676        frame: &dyn super::statement_frame::ReadFrame,
6677    ) -> RedDBResult<QueryExpr> {
6678        match expr {
6679            QueryExpr::Table(table) => self
6680                .resolve_table_expr_subqueries(table, frame)
6681                .map(QueryExpr::Table),
6682            QueryExpr::Join(mut join) => {
6683                join.left = Box::new(self.resolve_select_expr_subqueries(*join.left, frame)?);
6684                join.right = Box::new(self.resolve_select_expr_subqueries(*join.right, frame)?);
6685                Ok(QueryExpr::Join(join))
6686            }
6687            other => Ok(other),
6688        }
6689    }
6690
6691    fn resolve_expr_subqueries(
6692        &self,
6693        expr: crate::storage::query::ast::Expr,
6694        outer_scopes: &[String],
6695        frame: &dyn super::statement_frame::ReadFrame,
6696    ) -> RedDBResult<crate::storage::query::ast::Expr> {
6697        use crate::storage::query::ast::Expr;
6698
6699        match expr {
6700            Expr::Subquery { query, span } => {
6701                let values = self.execute_expr_subquery_values(query, outer_scopes, frame)?;
6702                if values.len() > 1 {
6703                    return Err(RedDBError::Query(
6704                        "scalar subquery returned more than one row".to_string(),
6705                    ));
6706                }
6707                Ok(Expr::Literal {
6708                    value: values.into_iter().next().unwrap_or(Value::Null),
6709                    span,
6710                })
6711            }
6712            Expr::BinaryOp { op, lhs, rhs, span } => Ok(Expr::BinaryOp {
6713                op,
6714                lhs: Box::new(self.resolve_expr_subqueries(*lhs, outer_scopes, frame)?),
6715                rhs: Box::new(self.resolve_expr_subqueries(*rhs, outer_scopes, frame)?),
6716                span,
6717            }),
6718            Expr::UnaryOp { op, operand, span } => Ok(Expr::UnaryOp {
6719                op,
6720                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6721                span,
6722            }),
6723            Expr::Cast {
6724                inner,
6725                target,
6726                span,
6727            } => Ok(Expr::Cast {
6728                inner: Box::new(self.resolve_expr_subqueries(*inner, outer_scopes, frame)?),
6729                target,
6730                span,
6731            }),
6732            Expr::FunctionCall { name, args, span } => {
6733                let args = args
6734                    .into_iter()
6735                    .map(|arg| self.resolve_expr_subqueries(arg, outer_scopes, frame))
6736                    .collect::<RedDBResult<Vec<_>>>()?;
6737                Ok(Expr::FunctionCall { name, args, span })
6738            }
6739            Expr::Case {
6740                branches,
6741                else_,
6742                span,
6743            } => {
6744                let branches = branches
6745                    .into_iter()
6746                    .map(|(cond, value)| {
6747                        Ok((
6748                            self.resolve_expr_subqueries(cond, outer_scopes, frame)?,
6749                            self.resolve_expr_subqueries(value, outer_scopes, frame)?,
6750                        ))
6751                    })
6752                    .collect::<RedDBResult<Vec<_>>>()?;
6753                let else_ = else_
6754                    .map(|expr| self.resolve_expr_subqueries(*expr, outer_scopes, frame))
6755                    .transpose()?
6756                    .map(Box::new);
6757                Ok(Expr::Case {
6758                    branches,
6759                    else_,
6760                    span,
6761                })
6762            }
6763            Expr::IsNull {
6764                operand,
6765                negated,
6766                span,
6767            } => Ok(Expr::IsNull {
6768                operand: Box::new(self.resolve_expr_subqueries(*operand, outer_scopes, frame)?),
6769                negated,
6770                span,
6771            }),
6772            Expr::InList {
6773                target,
6774                values,
6775                negated,
6776                span,
6777            } => {
6778                let target =
6779                    Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?);
6780                let mut resolved = Vec::new();
6781                for value in values {
6782                    if let Expr::Subquery { query, .. } = value {
6783                        resolved.extend(
6784                            self.execute_expr_subquery_values(query, outer_scopes, frame)?
6785                                .into_iter()
6786                                .map(Expr::lit),
6787                        );
6788                    } else {
6789                        resolved.push(self.resolve_expr_subqueries(value, outer_scopes, frame)?);
6790                    }
6791                }
6792                Ok(Expr::InList {
6793                    target,
6794                    values: resolved,
6795                    negated,
6796                    span,
6797                })
6798            }
6799            Expr::Between {
6800                target,
6801                low,
6802                high,
6803                negated,
6804                span,
6805            } => Ok(Expr::Between {
6806                target: Box::new(self.resolve_expr_subqueries(*target, outer_scopes, frame)?),
6807                low: Box::new(self.resolve_expr_subqueries(*low, outer_scopes, frame)?),
6808                high: Box::new(self.resolve_expr_subqueries(*high, outer_scopes, frame)?),
6809                negated,
6810                span,
6811            }),
6812            other => Ok(other),
6813        }
6814    }
6815
6816    fn execute_expr_subquery_values(
6817        &self,
6818        subquery: crate::storage::query::ast::ExprSubquery,
6819        outer_scopes: &[String],
6820        frame: &dyn super::statement_frame::ReadFrame,
6821    ) -> RedDBResult<Vec<Value>> {
6822        let query = *subquery.query;
6823        if query_references_outer_scope(&query, outer_scopes) {
6824            return Err(RedDBError::Query(
6825                "NOT_YET_SUPPORTED: correlated subqueries are not supported yet; track follow-up issue #470-correlated-subqueries".to_string(),
6826            ));
6827        }
6828        let query = self.rewrite_view_refs(query);
6829        let query = self.resolve_select_expr_subqueries(query, frame)?;
6830        let query = self.authorize_relational_select_expr(query, frame)?;
6831        let result = match query {
6832            QueryExpr::Table(table) => {
6833                execute_runtime_table_query(&self.inner.db, &table, Some(&self.inner.index_store))?
6834            }
6835            QueryExpr::Join(join) => execute_runtime_join_query(&self.inner.db, &join)?,
6836            other => {
6837                return Err(RedDBError::Query(format!(
6838                    "expression subquery must be a SELECT query, got {}",
6839                    query_expr_name(&other)
6840                )))
6841            }
6842        };
6843        first_column_values(result)
6844    }
6845
6846    fn dispatch_expr(
6847        &self,
6848        expr: QueryExpr,
6849        query_str: &str,
6850        mode: QueryMode,
6851    ) -> RedDBResult<RuntimeQueryResult> {
6852        let statement = query_expr_name(&expr);
6853        match expr {
6854            QueryExpr::Graph(_) | QueryExpr::Path(_) => {
6855                // Graph queries are not cacheable as prepared statements.
6856                Err(RedDBError::Query(
6857                    "graph queries cannot be used as prepared statements".to_string(),
6858                ))
6859            }
6860            QueryExpr::Table(table) => {
6861                let scope = self.ai_scope();
6862                let table = self.resolve_table_expr_subqueries(
6863                    table,
6864                    &scope as &dyn super::statement_frame::ReadFrame,
6865                )?;
6866                // Table-valued functions (e.g. components(g)) dispatch to a
6867                // read-only executor before any catalog/virtual-table routing
6868                // (issue #795).
6869                if let Some(TableSource::Function {
6870                    name,
6871                    args,
6872                    named_args,
6873                }) = table.source.clone()
6874                {
6875                    return Ok(RuntimeQueryResult {
6876                        query: query_str.to_string(),
6877                        mode,
6878                        statement,
6879                        engine: "runtime-graph-tvf",
6880                        result: self.execute_table_function(&name, &args, &named_args)?,
6881                        affected_rows: 0,
6882                        statement_type: "select",
6883                        bookmark: None,
6884                    });
6885                }
6886                // Inline-graph TVF (issue #799) on the prepared-statement /
6887                // direct-expr path. Result caching is wired on the
6888                // `execute_query_inner` path; here we just compute and return.
6889                if let Some(TableSource::InlineGraphFunction {
6890                    name,
6891                    nodes,
6892                    edges,
6893                    named_args,
6894                }) = table.source.clone()
6895                {
6896                    return Ok(RuntimeQueryResult {
6897                        query: query_str.to_string(),
6898                        mode,
6899                        statement,
6900                        engine: "runtime-graph-tvf-inline",
6901                        result: self.execute_inline_graph_function(
6902                            &name,
6903                            &nodes,
6904                            &edges,
6905                            &named_args,
6906                        )?,
6907                        affected_rows: 0,
6908                        statement_type: "select",
6909                        bookmark: None,
6910                    });
6911                }
6912                if super::red_schema::is_virtual_table(&table.table) {
6913                    return Ok(RuntimeQueryResult {
6914                        query: query_str.to_string(),
6915                        mode,
6916                        statement,
6917                        engine: "runtime-red-schema",
6918                        result: super::red_schema::red_query(
6919                            self,
6920                            &table.table,
6921                            &table,
6922                            &scope as &dyn super::statement_frame::ReadFrame,
6923                        )?,
6924                        affected_rows: 0,
6925                        statement_type: "select",
6926                        bookmark: None,
6927                    });
6928                }
6929                // `<graph>.<output>` analytics virtual view (issue #800).
6930                if let Some(view_result) = self.try_resolve_analytics_view(
6931                    &table,
6932                    &scope as &dyn super::statement_frame::ReadFrame,
6933                )? {
6934                    return Ok(RuntimeQueryResult {
6935                        query: query_str.to_string(),
6936                        mode,
6937                        statement,
6938                        engine: "runtime-graph-analytics-view",
6939                        result: view_result,
6940                        affected_rows: 0,
6941                        statement_type: "select",
6942                        bookmark: None,
6943                    });
6944                }
6945                let Some(table_with_rls) = self.authorize_relational_table_select(
6946                    table,
6947                    &scope as &dyn super::statement_frame::ReadFrame,
6948                )?
6949                else {
6950                    return Ok(RuntimeQueryResult {
6951                        query: query_str.to_string(),
6952                        mode,
6953                        statement,
6954                        engine: "runtime-table-rls",
6955                        result: crate::storage::query::unified::UnifiedResult::empty(),
6956                        affected_rows: 0,
6957                        statement_type: "select",
6958                        bookmark: None,
6959                    });
6960                };
6961                Ok(RuntimeQueryResult {
6962                    query: query_str.to_string(),
6963                    mode,
6964                    statement,
6965                    engine: "runtime-table",
6966                    result: execute_runtime_table_query(
6967                        &self.inner.db,
6968                        &table_with_rls,
6969                        Some(&self.inner.index_store),
6970                    )?,
6971                    affected_rows: 0,
6972                    statement_type: "select",
6973                    bookmark: None,
6974                })
6975            }
6976            QueryExpr::Join(join) => {
6977                let scope = self.ai_scope();
6978                let Some(join_with_rls) = self.authorize_relational_join_select(
6979                    join,
6980                    &scope as &dyn super::statement_frame::ReadFrame,
6981                )?
6982                else {
6983                    return Ok(RuntimeQueryResult {
6984                        query: query_str.to_string(),
6985                        mode,
6986                        statement,
6987                        engine: "runtime-join-rls",
6988                        result: crate::storage::query::unified::UnifiedResult::empty(),
6989                        affected_rows: 0,
6990                        statement_type: "select",
6991                        bookmark: None,
6992                    });
6993                };
6994                Ok(RuntimeQueryResult {
6995                    query: query_str.to_string(),
6996                    mode,
6997                    statement,
6998                    engine: "runtime-join",
6999                    result: execute_runtime_join_query(&self.inner.db, &join_with_rls)?,
7000                    affected_rows: 0,
7001                    statement_type: "select",
7002                    bookmark: None,
7003                })
7004            }
7005            QueryExpr::Vector(vector) => Ok(RuntimeQueryResult {
7006                query: query_str.to_string(),
7007                mode,
7008                statement,
7009                engine: "runtime-vector",
7010                result: execute_runtime_vector_query(&self.inner.db, &vector)?,
7011                affected_rows: 0,
7012                statement_type: "select",
7013                bookmark: None,
7014            }),
7015            QueryExpr::Hybrid(hybrid) => Ok(RuntimeQueryResult {
7016                query: query_str.to_string(),
7017                mode,
7018                statement,
7019                engine: "runtime-hybrid",
7020                result: execute_runtime_hybrid_query(&self.inner.db, &hybrid)?,
7021                affected_rows: 0,
7022                statement_type: "select",
7023                bookmark: None,
7024            }),
7025            QueryExpr::Insert(ref insert) if super::red_schema::is_virtual_table(&insert.table) => {
7026                Err(RedDBError::Query(
7027                    super::red_schema::READ_ONLY_ERROR.to_string(),
7028                ))
7029            }
7030            QueryExpr::Update(ref update) if super::red_schema::is_virtual_table(&update.table) => {
7031                Err(RedDBError::Query(
7032                    super::red_schema::READ_ONLY_ERROR.to_string(),
7033                ))
7034            }
7035            QueryExpr::Delete(ref delete) if super::red_schema::is_virtual_table(&delete.table) => {
7036                Err(RedDBError::Query(
7037                    super::red_schema::READ_ONLY_ERROR.to_string(),
7038                ))
7039            }
7040            QueryExpr::Insert(ref insert) => self
7041                .with_deferred_store_wal_for_dml(self.insert_may_emit_events(insert), || {
7042                    self.execute_insert(query_str, insert)
7043                }),
7044            QueryExpr::Update(ref update) => self
7045                .with_deferred_store_wal_for_dml(self.update_may_emit_events(update), || {
7046                    self.execute_update(query_str, update)
7047                }),
7048            QueryExpr::Delete(ref delete) => self
7049                .with_deferred_store_wal_for_dml(self.delete_may_emit_events(delete), || {
7050                    self.execute_delete(query_str, delete)
7051                }),
7052            QueryExpr::SearchCommand(ref cmd) => self.execute_search_command(query_str, cmd),
7053            QueryExpr::Ask(ref ask) => self.execute_ask(query_str, ask),
7054            _ => Err(RedDBError::Query(format!(
7055                "prepared-statement execution does not support {statement} statements"
7056            ))),
7057        }
7058    }
7059
7060    /// Dispatch a graph-collection table-valued function call in FROM
7061    /// position (e.g. `SELECT * FROM components(g)`).
7062    ///
7063    /// Validates the function name and arity here, materializes the whole
7064    /// active graph read-only, then runs the algorithm via the shared
7065    /// `dispatch_graph_algorithm` path. Never mutates the catalog or store.
7066    fn execute_table_function(
7067        &self,
7068        name: &str,
7069        args: &[String],
7070        named_args: &[(String, f64)],
7071    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7072        if !is_graph_tvf_name(name) {
7073            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7074        }
7075        // Every graph-collection TVF takes exactly one graph argument.
7076        if args.len() != 1 {
7077            return Err(RedDBError::Query(format!(
7078                "table function '{name}' takes exactly 1 graph argument, got {}",
7079                args.len()
7080            )));
7081        }
7082
7083        // Read-only materialization of the full active graph. Passing `None`
7084        // for the projection uses the full graph store. Like #795/#796, the
7085        // v0 form runs over the whole graph store regardless of the collection
7086        // argument value. Materialization never mutates any store.
7087        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7088        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7089    }
7090
7091    /// Dispatch an inline-graph table-valued function call in FROM position
7092    /// (e.g. `SELECT * FROM components(nodes => (…), edges => (…))`, issue
7093    /// #799).
7094    ///
7095    /// Materializes the two subqueries through the normal read path (so RLS,
7096    /// column authz, and MVCC visibility all apply), constructs the abstract
7097    /// graph — the first column of `nodes` is the node id; the first two-or-
7098    /// three columns of `edges` are `(source, target [, weight])` — then runs
7099    /// the same algorithm path used by the graph-collection form. Read-only.
7100    fn execute_inline_graph_function(
7101        &self,
7102        name: &str,
7103        nodes_query: &QueryExpr,
7104        edges_query: &QueryExpr,
7105        named_args: &[(String, f64)],
7106    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7107        if !is_graph_tvf_name(name) {
7108            return Err(RedDBError::Query(format!("unknown table function: {name}")));
7109        }
7110
7111        let node_result = self.execute_query_expr(nodes_query.clone())?.result;
7112        let nodes = inline_node_ids(name, &node_result)?;
7113
7114        let edge_result = self.execute_query_expr(edges_query.clone())?.result;
7115        let edges = inline_edges(name, &edge_result)?;
7116
7117        self.dispatch_graph_algorithm(name, nodes, edges, named_args)
7118    }
7119
7120    /// Materialize the whole active graph read-only into the abstract
7121    /// `(nodes, edges)` inputs the pure graph algorithms consume.
7122    fn materialize_whole_graph_abstract(
7123        &self,
7124    ) -> RedDBResult<(
7125        Vec<String>,
7126        Vec<(
7127            String,
7128            String,
7129            crate::storage::engine::graph_algorithms::Weight,
7130        )>,
7131    )> {
7132        use crate::storage::engine::graph_algorithms;
7133
7134        let graph = super::graph_dsl::materialize_graph_with_projection(
7135            self.inner.db.store().as_ref(),
7136            None,
7137        )?;
7138        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7139        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7140            .iter_all_edges()
7141            .into_iter()
7142            .map(|e| (e.source_id, e.target_id, e.weight))
7143            .collect();
7144        Ok((nodes, edges))
7145    }
7146
7147    /// Resolve a `<graph>.<output>` analytics virtual view (issue #800).
7148    ///
7149    /// Returns `Ok(None)` when `table` is not an analytics view — either the
7150    /// name is not dotted, a real collection of that exact name exists (a real
7151    /// collection always wins; no shadowing), the suffix is not a recognised
7152    /// analytics output, or the parent is not a graph. Returns `Ok(Some(_))`
7153    /// with the freshly computed result when it does resolve, and an error when
7154    /// the parent graph exists but the output is not enabled, a declared
7155    /// algorithm is unsupported, or the parent collection's policy denies the
7156    /// read.
7157    ///
7158    /// The view is recomputed on every call (no result-cache write) so it
7159    /// always reflects the current graph data, satisfying the on-demand
7160    /// recompute contract for this slice.
7161    fn try_resolve_analytics_view(
7162        &self,
7163        table: &TableQuery,
7164        frame: &dyn super::statement_frame::ReadFrame,
7165    ) -> RedDBResult<Option<crate::storage::query::unified::UnifiedResult>> {
7166        let full = table.table.as_str();
7167        let Some(dot) = full.rfind('.') else {
7168            return Ok(None);
7169        };
7170        // A real collection literally named `g.communities` always wins.
7171        if self.inner.db.store().get_collection(full).is_some() {
7172            return Ok(None);
7173        }
7174        let graph_name = &full[..dot];
7175        let output_name = &full[dot + 1..];
7176        let Some(output) = crate::catalog::AnalyticsOutput::from_str(output_name) else {
7177            return Ok(None);
7178        };
7179
7180        let contracts = self.inner.db.collection_contracts();
7181        let Some(contract) = contracts.iter().find(|c| c.name == graph_name) else {
7182            return Ok(None);
7183        };
7184        if contract.declared_model != crate::catalog::CollectionModel::Graph {
7185            return Ok(None);
7186        }
7187        let Some(view) = contract
7188            .analytics_config
7189            .iter()
7190            .find(|view| view.output == output)
7191        else {
7192            // The parent graph exists but this output was not declared — a
7193            // clear error beats the misleading "collection not found".
7194            return Err(RedDBError::Query(format!(
7195                "analytics output '{output_name}' is not enabled on graph '{graph_name}'; declare it with WITH ANALYTICS (...)"
7196            )));
7197        };
7198
7199        // Policy inheritance (AC5): route through the parent graph collection's
7200        // read authorization. A policy or RLS rule that denies the parent
7201        // denies its analytics views transitively.
7202        let parent_query = TableQuery::new(graph_name);
7203        if self
7204            .authorize_relational_table_select(parent_query, frame)?
7205            .is_none()
7206        {
7207            return Err(RedDBError::Query(format!(
7208                "permission denied: policy on graph '{graph_name}' denies analytics view '{output_name}'"
7209            )));
7210        }
7211
7212        let (algorithm, named_args) = analytics_view_algorithm(graph_name, view)?;
7213        let (nodes, edges) = self.materialize_whole_graph_abstract()?;
7214        let result = self.dispatch_graph_algorithm(&algorithm, nodes, edges, &named_args)?;
7215        Ok(Some(result))
7216    }
7217
7218    /// Shared algorithm dispatch over abstract `(nodes, edges)` inputs.
7219    ///
7220    /// Both the graph-collection form and the inline-graph form route here so
7221    /// named-argument validation and the projected row shape stay identical
7222    /// across the two signatures (issue #799). Projects each algorithm's
7223    /// native output shape.
7224    fn dispatch_graph_algorithm(
7225        &self,
7226        name: &str,
7227        nodes: Vec<String>,
7228        edges: Vec<(
7229            String,
7230            String,
7231            crate::storage::engine::graph_algorithms::Weight,
7232        )>,
7233        named_args: &[(String, f64)],
7234    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7235        use crate::storage::engine::graph_algorithms;
7236        use crate::storage::query::unified::UnifiedResult;
7237        use crate::storage::schema::Value;
7238
7239        if name.eq_ignore_ascii_case("components") {
7240            reject_named_args(name, named_args)?;
7241            let assignment = graph_algorithms::connected_components(&nodes, &edges);
7242            let mut result =
7243                UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7244            for (node_id, island_id) in assignment {
7245                let mut record = UnifiedRecord::new();
7246                record.set("node_id", Value::text(node_id));
7247                record.set("island_id", Value::Integer(island_id as i64));
7248                result.push(record);
7249            }
7250            return Ok(result);
7251        }
7252
7253        if name.eq_ignore_ascii_case("louvain") {
7254            // The only supported named argument is `resolution` (γ). It
7255            // defaults to 1.0 (classic modularity) and must be a finite,
7256            // strictly positive number — a non-positive (or NaN/inf)
7257            // resolution has no sensible meaning.
7258            let resolution = louvain_resolution(named_args)?;
7259            let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7260            let mut result =
7261                UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7262            for (node_id, community_id) in assignment {
7263                let mut record = UnifiedRecord::new();
7264                record.set("node_id", Value::text(node_id));
7265                record.set("community_id", Value::Integer(community_id as i64));
7266                result.push(record);
7267            }
7268            return Ok(result);
7269        }
7270
7271        if name.eq_ignore_ascii_case("degree_centrality") {
7272            reject_named_args(name, named_args)?;
7273            let assignment = abstract_degree_centrality(&nodes, &edges);
7274            let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "degree".into()]);
7275            for (node_id, degree) in assignment {
7276                let mut record = UnifiedRecord::new();
7277                record.set("node_id", Value::text(node_id));
7278                record.set("degree", Value::Integer(degree as i64));
7279                result.push(record);
7280            }
7281            return Ok(result);
7282        }
7283
7284        if name.eq_ignore_ascii_case("shortest_path") {
7285            // Scalar named arguments: `src` and `dst` are required node ids,
7286            // `max_hops` is an optional non-negative edge-count cap. Node ids
7287            // in the graph store are integer entity ids rendered as strings, so
7288            // each id arg must be a non-negative whole number; reject anything
7289            // else (fractional, negative, NaN/inf) with a clear message.
7290            let mut src: Option<String> = None;
7291            let mut dst: Option<String> = None;
7292            let mut max_hops: Option<usize> = None;
7293            let as_node_id = |key: &str, value: f64| -> RedDBResult<String> {
7294                if !value.is_finite() || value < 0.0 || value.fract() != 0.0 {
7295                    return Err(RedDBError::Query(format!(
7296                        "table function 'shortest_path' argument '{key}' must be a non-negative integer node id, got {value}"
7297                    )));
7298                }
7299                Ok((value as i64).to_string())
7300            };
7301            for (key, value) in named_args {
7302                if key.eq_ignore_ascii_case("src") {
7303                    src = Some(as_node_id("src", *value)?);
7304                } else if key.eq_ignore_ascii_case("dst") {
7305                    dst = Some(as_node_id("dst", *value)?);
7306                } else if key.eq_ignore_ascii_case("max_hops") {
7307                    if !value.is_finite() || *value < 0.0 || value.fract() != 0.0 {
7308                        return Err(RedDBError::Query(format!(
7309                            "table function 'shortest_path' max_hops must be a non-negative integer, got {value}"
7310                        )));
7311                    }
7312                    max_hops = Some(*value as usize);
7313                } else {
7314                    return Err(RedDBError::Query(format!(
7315                        "table function 'shortest_path' has no named argument '{key}' (expected 'src', 'dst', 'max_hops')"
7316                    )));
7317                }
7318            }
7319            let src = src.ok_or_else(|| {
7320                RedDBError::Query(
7321                    "table function 'shortest_path' requires named argument 'src'".to_string(),
7322                )
7323            })?;
7324            let dst = dst.ok_or_else(|| {
7325                RedDBError::Query(
7326                    "table function 'shortest_path' requires named argument 'dst'".to_string(),
7327                )
7328            })?;
7329
7330            // Columns are always present; an unreachable pair (within the
7331            // optional `max_hops` budget) simply yields zero rows — never an
7332            // error. `hop` is the 0-based index from the source;
7333            // `cumulative_weight` is the running path weight (0 at the source,
7334            // the total at the destination). Edges are treated as undirected,
7335            // consistent with `components` / `louvain`.
7336            let mut result = UnifiedResult::with_columns(vec![
7337                "hop".into(),
7338                "node_id".into(),
7339                "cumulative_weight".into(),
7340            ]);
7341            if let Some(path) =
7342                graph_algorithms::shortest_path(&nodes, &edges, &src, &dst, max_hops)
7343            {
7344                for (hop, (node_id, cumulative_weight)) in path.into_iter().enumerate() {
7345                    let mut record = UnifiedRecord::new();
7346                    record.set("hop", Value::Integer(hop as i64));
7347                    record.set("node_id", Value::text(node_id));
7348                    record.set("cumulative_weight", Value::Float(cumulative_weight));
7349                    result.push(record);
7350                }
7351            }
7352            return Ok(result);
7353        }
7354        // ── Centrality family (issue #797): each returns rows `(node_id,
7355        // score)` over the abstract `(nodes, edges)` graph. Like the other
7356        // graph TVFs the graph is treated as undirected and scores are
7357        // deterministic; the inline-graph form shares this dispatch. ──
7358        if name.eq_ignore_ascii_case("betweenness") {
7359            reject_named_args(name, named_args)?;
7360            return Ok(Self::centrality_result(graph_algorithms::betweenness(
7361                &nodes, &edges,
7362            )));
7363        }
7364        if name.eq_ignore_ascii_case("eigenvector") {
7365            // Optional `max_iterations` (positive integer, default 100) and
7366            // `tolerance` (finite, strictly positive, default 1e-6).
7367            let mut max_iterations = 100_usize;
7368            let mut tolerance = 1e-6_f64;
7369            for (key, value) in named_args {
7370                if key.eq_ignore_ascii_case("max_iterations") {
7371                    max_iterations = parse_positive_iterations("eigenvector", value)?;
7372                } else if key.eq_ignore_ascii_case("tolerance") {
7373                    if !value.is_finite() || *value <= 0.0 {
7374                        return Err(RedDBError::Query(format!(
7375                            "table function 'eigenvector' tolerance must be > 0, got {value}"
7376                        )));
7377                    }
7378                    tolerance = *value;
7379                } else {
7380                    return Err(RedDBError::Query(format!(
7381                        "table function 'eigenvector' has no named argument '{key}' (expected 'max_iterations' or 'tolerance')"
7382                    )));
7383                }
7384            }
7385            return Ok(Self::centrality_result(graph_algorithms::eigenvector(
7386                &nodes,
7387                &edges,
7388                max_iterations,
7389                tolerance,
7390            )));
7391        }
7392        if name.eq_ignore_ascii_case("pagerank") {
7393            // Optional `damping` (in (0, 1), default 0.85) and `max_iterations`
7394            // (positive integer, default 100).
7395            let mut damping = 0.85_f64;
7396            let mut max_iterations = 100_usize;
7397            for (key, value) in named_args {
7398                if key.eq_ignore_ascii_case("damping") {
7399                    if !value.is_finite() || *value <= 0.0 || *value >= 1.0 {
7400                        return Err(RedDBError::Query(format!(
7401                            "table function 'pagerank' damping must be in (0, 1), got {value}"
7402                        )));
7403                    }
7404                    damping = *value;
7405                } else if key.eq_ignore_ascii_case("max_iterations") {
7406                    max_iterations = parse_positive_iterations("pagerank", value)?;
7407                } else {
7408                    return Err(RedDBError::Query(format!(
7409                        "table function 'pagerank' has no named argument '{key}' (expected 'damping' or 'max_iterations')"
7410                    )));
7411                }
7412            }
7413            return Ok(Self::centrality_result(graph_algorithms::pagerank(
7414                &nodes,
7415                &edges,
7416                damping,
7417                max_iterations,
7418            )));
7419        }
7420        Err(RedDBError::Query(format!("unknown table function: {name}")))
7421    }
7422
7423    /// `components(<graph_collection>)` — returns rows `(node_id, island_id)`.
7424    ///
7425    /// Materializes the active graph (nodes + weighted edges) read-only and
7426    /// runs the pure `graph_algorithms::connected_components`. Edges are
7427    /// treated as undirected; island ids are deterministic (ascending order of
7428    /// each component's smallest node).
7429    fn execute_components_tvf(
7430        &self,
7431        _collection: &str,
7432    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7433        use crate::storage::engine::graph_algorithms;
7434        use crate::storage::query::unified::UnifiedResult;
7435        use crate::storage::schema::Value;
7436
7437        // Read-only materialization of the full active graph. The named
7438        // collection identifies the active graph scope; passing `None` for the
7439        // projection uses the full graph store (the same result
7440        // `active_graph_projection` yields when no projection is registered).
7441        // Materialization never mutates any store.
7442        let graph = super::graph_dsl::materialize_graph_with_projection(
7443            self.inner.db.store().as_ref(),
7444            None,
7445        )?;
7446
7447        // Materialize abstract inputs for the pure algorithm.
7448        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7449        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7450            .iter_all_edges()
7451            .into_iter()
7452            .map(|e| (e.source_id, e.target_id, e.weight))
7453            .collect();
7454
7455        let assignment = graph_algorithms::connected_components(&nodes, &edges);
7456
7457        // Project into a UnifiedResult with columns ["node_id", "island_id"].
7458        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "island_id".into()]);
7459        for (node_id, island_id) in assignment {
7460            let mut record = UnifiedRecord::new();
7461            record.set("node_id", Value::text(node_id));
7462            record.set("island_id", Value::Integer(island_id as i64));
7463            result.push(record);
7464        }
7465        Ok(result)
7466    }
7467
7468    /// `louvain(<graph> [, resolution => <f64>])` — returns rows
7469    /// `(node_id, community_id)` (issue #796).
7470    ///
7471    /// Materializes the active graph (nodes + weighted edges) read-only and
7472    /// runs the pure, deterministic `graph_algorithms::louvain`. Edges are
7473    /// treated as undirected; community ids are assigned in ascending order of
7474    /// each community's smallest node, so identical input + resolution always
7475    /// yields identical rows. Like `components`, the v0 form runs over the
7476    /// whole graph store regardless of the collection argument value.
7477    fn execute_louvain_tvf(
7478        &self,
7479        _collection: &str,
7480        resolution: f64,
7481    ) -> RedDBResult<crate::storage::query::unified::UnifiedResult> {
7482        use crate::storage::engine::graph_algorithms;
7483        use crate::storage::query::unified::UnifiedResult;
7484        use crate::storage::schema::Value;
7485
7486        let graph = super::graph_dsl::materialize_graph_with_projection(
7487            self.inner.db.store().as_ref(),
7488            None,
7489        )?;
7490
7491        let nodes: Vec<String> = graph.iter_nodes().map(|n| n.id.clone()).collect();
7492        let edges: Vec<(String, String, graph_algorithms::Weight)> = graph
7493            .iter_all_edges()
7494            .into_iter()
7495            .map(|e| (e.source_id, e.target_id, e.weight))
7496            .collect();
7497
7498        let assignment = graph_algorithms::louvain(&nodes, &edges, resolution);
7499
7500        // Project into a UnifiedResult with columns ["node_id", "community_id"].
7501        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "community_id".into()]);
7502        for (node_id, community_id) in assignment {
7503            let mut record = UnifiedRecord::new();
7504            record.set("node_id", Value::text(node_id));
7505            record.set("community_id", Value::Integer(community_id as i64));
7506            result.push(record);
7507        }
7508        Ok(result)
7509    }
7510
7511    /// Project `(node_id, score)` centrality rows into a `UnifiedResult` with
7512    /// columns `["node_id", "score"]`; scores are `Value::Float`.
7513    fn centrality_result(
7514        rows: Vec<(String, f64)>,
7515    ) -> crate::storage::query::unified::UnifiedResult {
7516        use crate::storage::query::unified::UnifiedResult;
7517        use crate::storage::schema::Value;
7518        let mut result = UnifiedResult::with_columns(vec!["node_id".into(), "score".into()]);
7519        for (node_id, score) in rows {
7520            let mut record = UnifiedRecord::new();
7521            record.set("node_id", Value::text(node_id));
7522            record.set("score", Value::Float(score));
7523            result.push(record);
7524        }
7525        result
7526    }
7527
7528    /// Ultra-fast path: detect `SELECT * FROM table WHERE _entity_id = N` by string pattern
7529    /// and execute it without SQL parsing or planning. Returns None if pattern doesn't match.
7530    fn try_fast_entity_lookup(&self, query: &str) -> Option<RedDBResult<RuntimeQueryResult>> {
7531        // Pattern: "SELECT * FROM <table> WHERE _entity_id = <id>"
7532        // or "SELECT * FROM <table> WHERE _entity_id =<id>"
7533        let q = query.trim();
7534        if !q.starts_with("SELECT") && !q.starts_with("select") {
7535            return None;
7536        }
7537
7538        // Find "WHERE _entity_id = " or "WHERE _entity_id ="
7539        let where_pos = q
7540            .find("WHERE _entity_id")
7541            .or_else(|| q.find("where _entity_id"))?;
7542        let after_field = &q[where_pos + 16..].trim_start(); // skip "WHERE _entity_id"
7543        let after_eq = after_field.strip_prefix('=')?.trim_start();
7544
7545        // Parse the entity ID number
7546        let id_str = after_eq.trim();
7547        let entity_id: u64 = id_str.parse().ok()?;
7548
7549        // Extract table name: between "FROM " and " WHERE"
7550        let from_pos = q.find("FROM ").or_else(|| q.find("from "))? + 5;
7551        let table = q[from_pos..where_pos].trim();
7552        if table.is_empty()
7553            || table.contains(' ') && !table.contains(" AS ") && !table.contains(" as ")
7554        {
7555            return None; // complex query, fall through
7556        }
7557        let table_name = table.split_whitespace().next()?;
7558
7559        // Direct entity lookup — skips SQL parse, plan cache, result
7560        // cache, view rewriter, RLS gate. Safe because the gating in
7561        // `execute_query` guarantees no scope override / no
7562        // transaction context is active. MVCC visibility is still
7563        // honoured against the current snapshot.
7564        let store = self.inner.db.store();
7565        let entity = store
7566            .get(
7567                table_name,
7568                crate::storage::unified::EntityId::new(entity_id),
7569            )
7570            .filter(entity_visible_under_current_snapshot)
7571            .filter(|entity| {
7572                self.inner
7573                    .db
7574                    .replica_allows_entity_at_read(table_name, entity)
7575            });
7576
7577        let count = if entity.is_some() { 1u64 } else { 0 };
7578
7579        // Materialize a record so downstream consumers that walk
7580        // `result.records` (embedded runtime API, decrypt pass, CLI)
7581        // see the row. Previously only `pre_serialized_json` was
7582        // filled, which caused those consumers to see zero rows and
7583        // skewed benchmarks.
7584        let records: Vec<crate::storage::query::unified::UnifiedRecord> = entity
7585            .as_ref()
7586            .and_then(|e| runtime_table_record_from_entity(e.clone()))
7587            .into_iter()
7588            .collect();
7589
7590        let json = match entity {
7591            Some(ref e) => execute_runtime_serialize_single_entity(e),
7592            None => r#"{"columns":[],"record_count":0,"selection":{"scope":"any"},"records":[]}"#
7593                .to_string(),
7594        };
7595
7596        Some(Ok(RuntimeQueryResult {
7597            query: query.to_string(),
7598            mode: crate::storage::query::modes::QueryMode::Sql,
7599            statement: "select",
7600            engine: "fast-entity-lookup",
7601            result: crate::storage::query::unified::UnifiedResult {
7602                columns: Vec::new(),
7603                records,
7604                stats: crate::storage::query::unified::QueryStats {
7605                    rows_scanned: count,
7606                    ..Default::default()
7607                },
7608                pre_serialized_json: Some(json),
7609            },
7610            affected_rows: 0,
7611            statement_type: "select",
7612            bookmark: None,
7613        }))
7614    }
7615
7616    pub(crate) fn invalidate_plan_cache(&self) {
7617        self.inner.query_cache.write().clear();
7618        self.inner
7619            .ddl_epoch
7620            .fetch_add(1, std::sync::atomic::Ordering::Release);
7621    }
7622
7623    /// Read the monotonic DDL epoch counter. Bumped by every
7624    /// `invalidate_plan_cache` call so prepared-statement holders can
7625    /// detect schema drift between PREPARE and EXECUTE.
7626    pub fn ddl_epoch(&self) -> u64 {
7627        self.inner
7628            .ddl_epoch
7629            .load(std::sync::atomic::Ordering::Acquire)
7630    }
7631
7632    pub(crate) fn clear_table_planner_stats(&self, table: &str) {
7633        let store = self.inner.db.store();
7634        crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
7635        self.invalidate_plan_cache();
7636    }
7637
7638    /// Replay `tenant_tables.*.column` keys from red_config at boot so
7639    /// `CREATE TABLE ... TENANT BY (col)` declarations persist across
7640    /// restarts (Phase 2.5.4). Reads every row of the `red_config`
7641    /// collection, picks the keys matching the tenant-marker shape,
7642    /// and calls `register_tenant_table` for each.
7643    ///
7644    /// Safe no-op when `red_config` doesn't exist (first boot on a
7645    /// fresh datadir).
7646    pub(crate) fn rehydrate_tenant_tables(&self) {
7647        let store = self.inner.db.store();
7648        let Some(manager) = store.get_collection("red_config") else {
7649            return;
7650        };
7651        // Replay in insertion order (SegmentManager iteration). Multiple
7652        // toggles on the same table leave several rows behind — the
7653        // last one processed wins because each register/unregister
7654        // call overwrites the in-memory state.
7655        for entity in manager.query_all(|_| true) {
7656            let crate::storage::unified::entity::EntityData::Row(row) = &entity.data else {
7657                continue;
7658            };
7659            let Some(named) = &row.named else { continue };
7660            let Some(crate::storage::schema::Value::Text(key)) = named.get("key") else {
7661                continue;
7662            };
7663            // Shape: tenant_tables.{table}.column
7664            let Some(rest) = key.strip_prefix("tenant_tables.") else {
7665                continue;
7666            };
7667            let Some((table, suffix)) = rest.rsplit_once('.') else {
7668                // Issue #205 — a `tenant_tables.*` row that doesn't
7669                // split cleanly is a schema-shape regression: the
7670                // metadata writer must always emit the `.column`
7671                // suffix, so reaching this branch means an upgrade
7672                // with incompatible state or external tampering.
7673                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7674                    collection: "red_config".to_string(),
7675                    detail: format!("malformed tenant_tables key: {key}"),
7676                }
7677                .emit_global();
7678                continue;
7679            };
7680            if suffix != "column" {
7681                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7682                    collection: "red_config".to_string(),
7683                    detail: format!("unexpected tenant_tables suffix: {key}"),
7684                }
7685                .emit_global();
7686                continue;
7687            }
7688            match named.get("value") {
7689                Some(crate::storage::schema::Value::Text(column)) => {
7690                    self.register_tenant_table(table, column);
7691                }
7692                // Null / missing value = DISABLE TENANCY marker.
7693                Some(crate::storage::schema::Value::Null) | None => {
7694                    self.unregister_tenant_table(table);
7695                }
7696                _ => {}
7697            }
7698        }
7699    }
7700
7701    /// Replay every persisted `MaterializedViewDescriptor` from the
7702    /// `red_materialized_view_defs` system collection (issue #593
7703    /// slice 9a). For each descriptor, re-parse the original SQL,
7704    /// extract the `QueryExpr::CreateView` it produced, and populate
7705    /// the in-memory registries (`inner.views` and
7706    /// `inner.materialized_views`) directly — no write paths run, so
7707    /// rehydrate does not re-persist what it just read.
7708    ///
7709    /// Malformed rows (missing `name`/`source_sql`, parse errors) are
7710    /// skipped with a `SchemaCorruption` operator event so a single
7711    /// bad entry does not block startup.
7712    pub(crate) fn rehydrate_materialized_view_descriptors(&self) {
7713        let store = self.inner.db.store();
7714        let descriptors = crate::runtime::continuous_materialized_view::load_all(store.as_ref());
7715        for descriptor in descriptors {
7716            let parsed = match crate::storage::query::parser::parse(&descriptor.source_sql) {
7717                Ok(qc) => qc,
7718                Err(err) => {
7719                    crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7720                        collection:
7721                            crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7722                                .to_string(),
7723                        detail: format!(
7724                            "failed to re-parse materialized-view source for {}: {err}",
7725                            descriptor.name
7726                        ),
7727                    }
7728                    .emit_global();
7729                    continue;
7730                }
7731            };
7732            let crate::storage::query::ast::QueryExpr::CreateView(create) = parsed.query else {
7733                crate::telemetry::operator_event::OperatorEvent::SchemaCorruption {
7734                    collection: crate::runtime::continuous_materialized_view::CATALOG_COLLECTION
7735                        .to_string(),
7736                    detail: format!(
7737                        "materialized-view source for {} did not re-parse as CREATE VIEW",
7738                        descriptor.name
7739                    ),
7740                }
7741                .emit_global();
7742                continue;
7743            };
7744            // Populate in-memory view registry.
7745            let view_name = create.name.clone();
7746            self.inner
7747                .views
7748                .write()
7749                .insert(view_name.clone(), Arc::new(create));
7750            // Materialized cache slot (data empty until next REFRESH).
7751            use crate::storage::cache::result::{MaterializedViewDef, RefreshPolicy};
7752            let refresh = match descriptor.refresh_every_ms {
7753                Some(ms) => RefreshPolicy::Periodic(std::time::Duration::from_millis(ms)),
7754                None => RefreshPolicy::Manual,
7755            };
7756            let def = MaterializedViewDef {
7757                name: view_name.clone(),
7758                query: format!("<parsed view {}>", view_name),
7759                dependencies: descriptor.source_collections.clone(),
7760                refresh,
7761                retention_duration_ms: descriptor.retention_duration_ms,
7762            };
7763            self.inner.materialized_views.write().register(def);
7764        }
7765        // A rehydrated view shape may differ from any plans the cache
7766        // bootstrapped before this method ran — flush to be safe.
7767        self.invalidate_plan_cache();
7768    }
7769
7770    pub(crate) fn rehydrate_declared_column_schemas(&self) {
7771        let store = self.inner.db.store();
7772        for contract in self.inner.db.collection_contracts() {
7773            let columns: Vec<String> = contract
7774                .declared_columns
7775                .iter()
7776                .map(|column| column.name.clone())
7777                .collect();
7778            let Some(manager) = store.get_collection(&contract.name) else {
7779                continue;
7780            };
7781            manager.set_column_schema_if_empty(columns);
7782        }
7783    }
7784
7785    /// Register a table as tenant-scoped (Phase 2.5.4). Installs the
7786    /// in-memory column mapping, the implicit RLS policy, and enables
7787    /// row-level security on the table. Idempotent — re-registering
7788    /// the same `(table, column)` replaces the prior auto-policy.
7789    pub fn register_tenant_table(&self, table: &str, column: &str) {
7790        use crate::storage::query::ast::{
7791            CompareOp, CreatePolicyQuery, Expr, FieldRef, Filter, Span,
7792        };
7793        self.inner
7794            .tenant_tables
7795            .write()
7796            .insert(table.to_string(), column.to_string());
7797
7798        // Build the policy: col = CURRENT_TENANT()
7799        // Uses CompareExpr so the comparison happens at runtime against
7800        // the thread-local tenant value read by the CURRENT_TENANT
7801        // scalar. Spans are synthetic — there's no source location for
7802        // an auto-generated policy.
7803        let lhs = Expr::Column {
7804            field: FieldRef::TableColumn {
7805                table: table.to_string(),
7806                column: column.to_string(),
7807            },
7808            span: Span::synthetic(),
7809        };
7810        let rhs = Expr::FunctionCall {
7811            name: "CURRENT_TENANT".to_string(),
7812            args: Vec::new(),
7813            span: Span::synthetic(),
7814        };
7815        let policy_filter = Filter::CompareExpr {
7816            lhs,
7817            op: CompareOp::Eq,
7818            rhs,
7819        };
7820
7821        let policy = CreatePolicyQuery {
7822            name: "__tenant_iso".to_string(),
7823            table: table.to_string(),
7824            action: None, // None = ALL actions (SELECT/INSERT/UPDATE/DELETE)
7825            role: None,   // None = every role
7826            using: Box::new(policy_filter),
7827            // Auto-tenancy defaults to Table targets. Collections of
7828            // other kinds (graph / vector / queue / timeseries) that
7829            // opt in via `ALTER ... ENABLE TENANCY` should use the
7830            // matching kind — but for now we keep the auto-policy
7831            // kind-agnostic so the evaluator can apply it to any
7832            // entity living in the collection.
7833            target_kind: crate::storage::query::ast::PolicyTargetKind::Table,
7834        };
7835
7836        // Replace any prior auto-policy for this table (column rename).
7837        self.inner.rls_policies.write().insert(
7838            (table.to_string(), "__tenant_iso".to_string()),
7839            Arc::new(policy),
7840        );
7841        self.inner
7842            .rls_enabled_tables
7843            .write()
7844            .insert(table.to_string());
7845
7846        // Auto-build a hash index on the tenant column. Every read/write
7847        // against a tenant-scoped table carries an implicit
7848        // `col = CURRENT_TENANT()` predicate from the auto-policy, so an
7849        // index on that column is on the hot path of every query. Without
7850        // it, every SELECT/UPDATE/DELETE degrades to a full scan.
7851        self.ensure_tenant_index(table, column);
7852    }
7853
7854    /// Auto-create the hash index that backs the tenant-iso RLS predicate.
7855    /// Skipped when:
7856    ///   * the column is dotted (nested path — flat secondary indices
7857    ///     don't cover those today; RLS still works via the policy)
7858    ///   * `__tenant_idx_{table}` already exists (idempotent on rehydrate)
7859    ///   * the user already registered an index whose first column matches
7860    ///     (avoids redundant duplicates of a user-defined composite)
7861    fn ensure_tenant_index(&self, table: &str, column: &str) {
7862        if column.contains('.') {
7863            return;
7864        }
7865        let index_name = format!("__tenant_idx_{table}");
7866        let registry = self.inner.index_store.list_indices(table);
7867        if registry.iter().any(|idx| idx.name == index_name) {
7868            return;
7869        }
7870        if registry
7871            .iter()
7872            .any(|idx| idx.columns.first().map(|c| c.as_str()) == Some(column))
7873        {
7874            return;
7875        }
7876
7877        let store = self.inner.db.store();
7878        let Some(manager) = store.get_collection(table) else {
7879            return;
7880        };
7881        let entities = manager.query_all(|_| true);
7882        let entity_fields: Vec<(
7883            crate::storage::unified::EntityId,
7884            Vec<(String, crate::storage::schema::Value)>,
7885        )> = entities
7886            .iter()
7887            .map(|e| {
7888                let fields = match &e.data {
7889                    crate::storage::EntityData::Row(row) => {
7890                        if let Some(ref named) = row.named {
7891                            named.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
7892                        } else if let Some(ref schema) = row.schema {
7893                            schema
7894                                .iter()
7895                                .zip(row.columns.iter())
7896                                .map(|(k, v)| (k.clone(), v.clone()))
7897                                .collect()
7898                        } else {
7899                            Vec::new()
7900                        }
7901                    }
7902                    crate::storage::EntityData::Node(node) => node
7903                        .properties
7904                        .iter()
7905                        .map(|(k, v)| (k.clone(), v.clone()))
7906                        .collect(),
7907                    _ => Vec::new(),
7908                };
7909                (e.id, fields)
7910            })
7911            .collect();
7912
7913        let columns = vec![column.to_string()];
7914        if self
7915            .inner
7916            .index_store
7917            .create_index(
7918                &index_name,
7919                table,
7920                &columns,
7921                super::index_store::IndexMethodKind::Hash,
7922                false,
7923                &entity_fields,
7924            )
7925            .is_err()
7926        {
7927            return;
7928        }
7929        self.inner
7930            .index_store
7931            .register(super::index_store::RegisteredIndex {
7932                name: index_name,
7933                collection: table.to_string(),
7934                columns,
7935                method: super::index_store::IndexMethodKind::Hash,
7936                unique: false,
7937            });
7938        self.invalidate_plan_cache();
7939    }
7940
7941    /// Drop the auto-generated tenant index, if one exists. Called from
7942    /// `unregister_tenant_table` so DISABLE TENANCY / DROP TABLE clean up.
7943    fn drop_tenant_index(&self, table: &str) {
7944        let index_name = format!("__tenant_idx_{table}");
7945        self.inner.index_store.drop_index(&index_name, table);
7946    }
7947
7948    /// Retrieve the tenant column for a table, if any (Phase 2.5.4).
7949    /// Used by the INSERT auto-fill path to know which column to
7950    /// populate with `current_tenant()` when the user didn't name it.
7951    pub fn tenant_column(&self, table: &str) -> Option<String> {
7952        self.inner.tenant_tables.read().get(table).cloned()
7953    }
7954
7955    /// Remove a table's tenant registration (Phase 2.5.4). Called by
7956    /// DROP TABLE / ALTER TABLE DISABLE TENANCY. Removes the auto-policy
7957    /// but leaves any user-installed explicit policies intact.
7958    pub fn unregister_tenant_table(&self, table: &str) {
7959        self.inner.tenant_tables.write().remove(table);
7960        self.inner
7961            .rls_policies
7962            .write()
7963            .remove(&(table.to_string(), "__tenant_iso".to_string()));
7964        self.drop_tenant_index(table);
7965        // Only clear RLS enablement if no other policies remain.
7966        let has_other_policies = self
7967            .inner
7968            .rls_policies
7969            .read()
7970            .keys()
7971            .any(|(t, _)| t == table);
7972        if !has_other_policies {
7973            self.inner.rls_enabled_tables.write().remove(table);
7974        }
7975    }
7976
7977    /// Record that the running transaction has marked `id` in `collection`
7978    /// for deletion (Phase 2.3.2b MVCC tombstones). `stamper_xid` is the
7979    /// xid that was written into `xmax` — either the parent txn xid or
7980    /// the innermost savepoint sub-xid. Savepoint rollback filters by
7981    /// this xid to revive only its own tombstones.
7982    pub(crate) fn record_pending_tombstone(
7983        &self,
7984        conn_id: u64,
7985        collection: &str,
7986        id: crate::storage::unified::entity::EntityId,
7987        stamper_xid: crate::storage::transaction::snapshot::Xid,
7988        previous_xmax: crate::storage::transaction::snapshot::Xid,
7989    ) {
7990        self.inner
7991            .pending_tombstones
7992            .write()
7993            .entry(conn_id)
7994            .or_default()
7995            .push((collection.to_string(), id, stamper_xid, previous_xmax));
7996    }
7997
7998    pub(crate) fn record_pending_versioned_update(
7999        &self,
8000        conn_id: u64,
8001        collection: &str,
8002        old_id: crate::storage::unified::entity::EntityId,
8003        new_id: crate::storage::unified::entity::EntityId,
8004        stamper_xid: crate::storage::transaction::snapshot::Xid,
8005        previous_xmax: crate::storage::transaction::snapshot::Xid,
8006    ) {
8007        self.inner
8008            .pending_versioned_updates
8009            .write()
8010            .entry(conn_id)
8011            .or_default()
8012            .push((
8013                collection.to_string(),
8014                old_id,
8015                new_id,
8016                stamper_xid,
8017                previous_xmax,
8018            ));
8019    }
8020
8021    fn with_deferred_store_wal_if_transaction<T>(
8022        &self,
8023        f: impl FnOnce() -> RedDBResult<T>,
8024    ) -> RedDBResult<T> {
8025        let conn_id = current_connection_id();
8026        if !self.inner.tx_contexts.read().contains_key(&conn_id) {
8027            return f();
8028        }
8029
8030        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8031        let result = f();
8032        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8033        match result {
8034            Ok(value) => {
8035                self.record_pending_store_wal_actions(conn_id, captured);
8036                Ok(value)
8037            }
8038            Err(err) => Err(err),
8039        }
8040    }
8041
8042    fn with_deferred_store_wal_for_dml<T>(
8043        &self,
8044        capture_autocommit_events: bool,
8045        f: impl FnOnce() -> RedDBResult<T>,
8046    ) -> RedDBResult<T> {
8047        let conn_id = current_connection_id();
8048        if self.inner.tx_contexts.read().contains_key(&conn_id) {
8049            return self.with_deferred_store_wal_if_transaction(f);
8050        }
8051        if !capture_autocommit_events {
8052            return f();
8053        }
8054
8055        crate::storage::UnifiedStore::begin_deferred_store_wal_capture();
8056        let result = f();
8057        let captured = crate::storage::UnifiedStore::take_deferred_store_wal_capture();
8058        self.inner
8059            .db
8060            .store()
8061            .append_deferred_store_wal_actions(captured)
8062            .map_err(|err| RedDBError::Internal(err.to_string()))?;
8063        result
8064    }
8065
8066    fn insert_may_emit_events(&self, query: &InsertQuery) -> bool {
8067        !query.suppress_events
8068            && self.collection_has_event_subscriptions_for_operation(
8069                &query.table,
8070                crate::catalog::SubscriptionOperation::Insert,
8071            )
8072    }
8073
8074    fn update_may_emit_events(&self, query: &UpdateQuery) -> bool {
8075        !query.suppress_events
8076            && self.collection_has_event_subscriptions_for_operation(
8077                &query.table,
8078                crate::catalog::SubscriptionOperation::Update,
8079            )
8080    }
8081
8082    fn delete_may_emit_events(&self, query: &DeleteQuery) -> bool {
8083        !query.suppress_events
8084            && self.collection_has_event_subscriptions_for_operation(
8085                &query.table,
8086                crate::catalog::SubscriptionOperation::Delete,
8087            )
8088    }
8089
8090    fn collection_has_event_subscriptions_for_operation(
8091        &self,
8092        collection: &str,
8093        operation: crate::catalog::SubscriptionOperation,
8094    ) -> bool {
8095        let Some(contract) = self.db().collection_contract_arc(collection) else {
8096            return false;
8097        };
8098        contract.subscriptions.iter().any(|subscription| {
8099            subscription.enabled
8100                && (subscription.ops_filter.is_empty()
8101                    || subscription.ops_filter.contains(&operation))
8102        })
8103    }
8104
8105    fn record_pending_store_wal_actions(
8106        &self,
8107        conn_id: u64,
8108        actions: crate::storage::unified::DeferredStoreWalActions,
8109    ) {
8110        if actions.is_empty() {
8111            return;
8112        }
8113        let mut guard = self.inner.pending_store_wal_actions.write();
8114        guard.entry(conn_id).or_default().extend(actions);
8115    }
8116
8117    fn flush_pending_store_wal_actions(&self, conn_id: u64) -> RedDBResult<()> {
8118        let Some(actions) = self
8119            .inner
8120            .pending_store_wal_actions
8121            .write()
8122            .remove(&conn_id)
8123        else {
8124            return Ok(());
8125        };
8126        self.inner
8127            .db
8128            .store()
8129            .append_deferred_store_wal_actions(actions)
8130            .map_err(|err| RedDBError::Internal(err.to_string()))
8131    }
8132
8133    fn discard_pending_store_wal_actions(&self, conn_id: u64) {
8134        self.inner
8135            .pending_store_wal_actions
8136            .write()
8137            .remove(&conn_id);
8138    }
8139
8140    fn xid_conflicts_with_snapshot(
8141        &self,
8142        xid: crate::storage::transaction::snapshot::Xid,
8143        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8144        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8145    ) -> bool {
8146        xid != 0
8147            && !own_xids.contains(&xid)
8148            && !self.inner.snapshot_manager.is_aborted(xid)
8149            && !self.inner.snapshot_manager.is_active(xid)
8150            && (xid > snapshot.xid || snapshot.in_progress.contains(&xid))
8151    }
8152
8153    fn conflict_error(
8154        collection: &str,
8155        logical_id: crate::storage::unified::entity::EntityId,
8156        xid: crate::storage::transaction::snapshot::Xid,
8157    ) -> RedDBError {
8158        RedDBError::Query(format!(
8159            "serialization conflict: table row {collection}/{} was modified by concurrent transaction {xid}",
8160            logical_id.raw()
8161        ))
8162    }
8163
8164    fn check_logical_row_conflict(
8165        &self,
8166        collection: &str,
8167        logical_id: crate::storage::unified::entity::EntityId,
8168        excluded_ids: &[crate::storage::unified::entity::EntityId],
8169        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8170        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8171    ) -> RedDBResult<()> {
8172        let store = self.inner.db.store();
8173        let Some(manager) = store.get_collection(collection) else {
8174            return Ok(());
8175        };
8176
8177        for candidate in manager.query_all(|_| true) {
8178            if excluded_ids.contains(&candidate.id) || candidate.logical_id() != logical_id {
8179                continue;
8180            }
8181            if self.xid_conflicts_with_snapshot(candidate.xmin, snapshot, own_xids) {
8182                return Err(Self::conflict_error(collection, logical_id, candidate.xmin));
8183            }
8184            if self.xid_conflicts_with_snapshot(candidate.xmax, snapshot, own_xids) {
8185                return Err(Self::conflict_error(collection, logical_id, candidate.xmax));
8186            }
8187        }
8188        Ok(())
8189    }
8190
8191    pub(crate) fn check_table_row_write_conflicts(
8192        &self,
8193        conn_id: u64,
8194        snapshot: &crate::storage::transaction::snapshot::Snapshot,
8195        own_xids: &std::collections::HashSet<crate::storage::transaction::snapshot::Xid>,
8196    ) -> RedDBResult<()> {
8197        let versioned_updates = self
8198            .inner
8199            .pending_versioned_updates
8200            .read()
8201            .get(&conn_id)
8202            .cloned()
8203            .unwrap_or_default();
8204        let tombstones = self
8205            .inner
8206            .pending_tombstones
8207            .read()
8208            .get(&conn_id)
8209            .cloned()
8210            .unwrap_or_default();
8211
8212        let store = self.inner.db.store();
8213        for (collection, old_id, new_id, xid, previous_xmax) in versioned_updates {
8214            let Some(manager) = store.get_collection(&collection) else {
8215                continue;
8216            };
8217            let Some(old) = manager.get(old_id) else {
8218                continue;
8219            };
8220            let logical_id = old.logical_id();
8221            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8222                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8223            }
8224            if old.xmax != xid && self.xid_conflicts_with_snapshot(old.xmax, snapshot, own_xids) {
8225                return Err(Self::conflict_error(&collection, logical_id, old.xmax));
8226            }
8227            self.check_logical_row_conflict(
8228                &collection,
8229                logical_id,
8230                &[old_id, new_id],
8231                snapshot,
8232                own_xids,
8233            )?;
8234        }
8235
8236        for (collection, id, xid, previous_xmax) in tombstones {
8237            let Some(manager) = store.get_collection(&collection) else {
8238                continue;
8239            };
8240            let Some(entity) = manager.get(id) else {
8241                continue;
8242            };
8243            let logical_id = entity.logical_id();
8244            if self.xid_conflicts_with_snapshot(previous_xmax, snapshot, own_xids) {
8245                return Err(Self::conflict_error(&collection, logical_id, previous_xmax));
8246            }
8247            if entity.xmax != xid
8248                && self.xid_conflicts_with_snapshot(entity.xmax, snapshot, own_xids)
8249            {
8250                return Err(Self::conflict_error(&collection, logical_id, entity.xmax));
8251            }
8252            self.check_logical_row_conflict(&collection, logical_id, &[id], snapshot, own_xids)?;
8253        }
8254
8255        Ok(())
8256    }
8257
8258    pub(crate) fn restore_pending_write_stamps(&self, conn_id: u64) {
8259        let versioned_updates = self
8260            .inner
8261            .pending_versioned_updates
8262            .read()
8263            .get(&conn_id)
8264            .cloned()
8265            .unwrap_or_default();
8266        let tombstones = self
8267            .inner
8268            .pending_tombstones
8269            .read()
8270            .get(&conn_id)
8271            .cloned()
8272            .unwrap_or_default();
8273
8274        let store = self.inner.db.store();
8275        for (collection, old_id, _new_id, xid, _previous_xmax) in versioned_updates {
8276            if let Some(manager) = store.get_collection(&collection) {
8277                if let Some(mut entity) = manager.get(old_id) {
8278                    entity.set_xmax(xid);
8279                    let _ = manager.update(entity);
8280                }
8281            }
8282        }
8283        for (collection, id, xid, _previous_xmax) in tombstones {
8284            if let Some(manager) = store.get_collection(&collection) {
8285                if let Some(mut entity) = manager.get(id) {
8286                    entity.set_xmax(xid);
8287                    let _ = manager.update(entity);
8288                }
8289            }
8290        }
8291    }
8292
8293    pub(crate) fn finalize_pending_versioned_updates(&self, conn_id: u64) {
8294        self.inner
8295            .pending_versioned_updates
8296            .write()
8297            .remove(&conn_id);
8298    }
8299
8300    pub(crate) fn revive_pending_versioned_updates(&self, conn_id: u64) {
8301        let Some(pending) = self
8302            .inner
8303            .pending_versioned_updates
8304            .write()
8305            .remove(&conn_id)
8306        else {
8307            return;
8308        };
8309
8310        let store = self.inner.db.store();
8311        for (collection, old_id, new_id, xid, previous_xmax) in pending {
8312            if let Some(manager) = store.get_collection(&collection) {
8313                if let Some(mut old) = manager.get(old_id) {
8314                    if old.xmax == xid {
8315                        old.set_xmax(previous_xmax);
8316                        let _ = manager.update(old);
8317                    }
8318                }
8319            }
8320            let _ = store.delete_batch(&collection, &[new_id]);
8321        }
8322    }
8323
8324    pub(crate) fn revive_versioned_updates_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8325        let mut guard = self.inner.pending_versioned_updates.write();
8326        let Some(pending) = guard.get_mut(&conn_id) else {
8327            return 0;
8328        };
8329
8330        let store = self.inner.db.store();
8331        let mut reverted = 0usize;
8332        pending.retain(|(collection, old_id, new_id, xid, previous_xmax)| {
8333            if *xid < stamper_xid {
8334                return true;
8335            }
8336            if let Some(manager) = store.get_collection(collection) {
8337                if let Some(mut old) = manager.get(*old_id) {
8338                    if old.xmax == *xid {
8339                        old.set_xmax(*previous_xmax);
8340                        let _ = manager.update(old);
8341                    }
8342                }
8343            }
8344            let _ = store.delete_batch(collection, &[*new_id]);
8345            reverted += 1;
8346            false
8347        });
8348        if pending.is_empty() {
8349            guard.remove(&conn_id);
8350        }
8351        reverted
8352    }
8353
8354    /// Flush tombstones on COMMIT. The xmax stamp is already the durable
8355    /// delete marker; commit only drops the rollback journal and emits
8356    /// side effects. Physical reclamation is left for VACUUM so old
8357    /// snapshots can still resolve the pre-delete row version.
8358    pub(crate) fn finalize_pending_tombstones(&self, conn_id: u64) {
8359        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8360            return;
8361        };
8362        if pending.is_empty() {
8363            return;
8364        }
8365
8366        let store = self.inner.db.store();
8367        for (collection, id, _xid, _previous_xmax) in pending {
8368            store.context_index().remove_entity(id);
8369            self.cdc_emit(
8370                crate::replication::cdc::ChangeOperation::Delete,
8371                &collection,
8372                id.raw(),
8373                "entity",
8374            );
8375        }
8376    }
8377
8378    /// Revive tombstones on ROLLBACK — reset `xmax` to 0 so the tuples
8379    /// become visible again to future snapshots. Best-effort: a row
8380    /// already reclaimed by a concurrent VACUUM stays gone, but VACUUM
8381    /// never reclaims tuples whose xmax is still referenced by any
8382    /// active snapshot, so this case is only reachable via external
8383    /// storage corruption.
8384    pub(crate) fn revive_pending_tombstones(&self, conn_id: u64) {
8385        let Some(pending) = self.inner.pending_tombstones.write().remove(&conn_id) else {
8386            return;
8387        };
8388
8389        let store = self.inner.db.store();
8390        for (collection, id, xid, previous_xmax) in pending {
8391            let Some(manager) = store.get_collection(&collection) else {
8392                continue;
8393            };
8394            if let Some(mut entity) = manager.get(id) {
8395                if entity.xmax == xid {
8396                    entity.set_xmax(previous_xmax);
8397                    let _ = manager.update(entity);
8398                }
8399            }
8400        }
8401    }
8402
8403    /// Slice C of PRD #718 — accessor for the local wait registry.
8404    pub fn queue_wait_registry(
8405        &self,
8406    ) -> std::sync::Arc<crate::runtime::queue_wait_registry::QueueWaitRegistry> {
8407        self.inner.queue_wait_registry.clone()
8408    }
8409
8410    /// Buffer a `(scope, queue)` wake on the current connection so it
8411    /// fires post-COMMIT, or notify immediately if no transaction is
8412    /// open (autocommit path). The wait registry only ever observes
8413    /// notifies for committed work — rollback drops the buffer.
8414    pub(crate) fn record_queue_wake(&self, scope: &str, queue: &str) {
8415        if self.current_xid().is_some() {
8416            let conn_id = current_connection_id();
8417            self.inner
8418                .pending_queue_wakes
8419                .write()
8420                .entry(conn_id)
8421                .or_default()
8422                .push((scope.to_string(), queue.to_string()));
8423            return;
8424        }
8425        self.inner.queue_wait_registry.notify(scope, queue);
8426    }
8427
8428    pub(crate) fn finalize_pending_queue_wakes(&self, conn_id: u64) {
8429        let Some(pending) = self.inner.pending_queue_wakes.write().remove(&conn_id) else {
8430            return;
8431        };
8432        for (scope, queue) in pending {
8433            self.inner.queue_wait_registry.notify(&scope, &queue);
8434        }
8435    }
8436
8437    pub(crate) fn discard_pending_queue_wakes(&self, conn_id: u64) {
8438        self.inner.pending_queue_wakes.write().remove(&conn_id);
8439    }
8440
8441    pub(crate) fn finalize_pending_kv_watch_events(&self, conn_id: u64) {
8442        let Some(pending) = self.inner.pending_kv_watch_events.write().remove(&conn_id) else {
8443            return;
8444        };
8445        for event in pending {
8446            self.cdc_emit_kv(
8447                event.op,
8448                &event.collection,
8449                &event.key,
8450                0,
8451                event.before,
8452                event.after,
8453            );
8454        }
8455    }
8456
8457    pub(crate) fn discard_pending_kv_watch_events(&self, conn_id: u64) {
8458        self.inner.pending_kv_watch_events.write().remove(&conn_id);
8459    }
8460
8461    /// Materialise the entire graph store while applying MVCC visibility
8462    /// AND per-collection RLS to each candidate node and edge. Mirrors
8463    /// `materialize_graph` but routes every entity through the same
8464    /// gate the SELECT path uses, with the correct `PolicyTargetKind`
8465    /// per entity kind (`Nodes` for graph nodes, `Edges` for graph
8466    /// edges). Returns the filtered `GraphStore` plus the
8467    /// `node_id → properties` map the executor needs for `RETURN n.*`
8468    /// projections.
8469    fn materialize_graph_with_rls(
8470        &self,
8471    ) -> RedDBResult<(
8472        crate::storage::engine::GraphStore,
8473        std::collections::HashMap<
8474            String,
8475            std::collections::HashMap<String, crate::storage::schema::Value>,
8476        >,
8477        crate::storage::query::unified::EdgeProperties,
8478    )> {
8479        use crate::storage::engine::GraphStore;
8480        use crate::storage::query::ast::{PolicyAction, PolicyTargetKind};
8481        use crate::storage::unified::entity::{EntityData, EntityKind};
8482        use std::collections::{HashMap, HashSet};
8483
8484        let store = self.inner.db.store();
8485        let snap_ctx = capture_current_snapshot();
8486        let role = current_auth_identity().map(|(_, r)| r.as_str().to_string());
8487
8488        let graph = GraphStore::new();
8489        let mut node_properties: HashMap<String, HashMap<String, crate::storage::schema::Value>> =
8490            HashMap::new();
8491        let mut edge_properties: crate::storage::query::unified::EdgeProperties = HashMap::new();
8492        let mut allowed_nodes: HashSet<String> = HashSet::new();
8493
8494        // Per-collection cached compiled filters — Nodes-kind for
8495        // first pass, Edges-kind for the second. None entries mean
8496        // "RLS enabled, zero matching policy → deny all of this kind".
8497        let mut node_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8498            HashMap::new();
8499        let mut edge_rls: HashMap<String, Option<crate::storage::query::ast::Filter>> =
8500            HashMap::new();
8501
8502        let collections = store.list_collections();
8503
8504        // First pass — gather nodes.
8505        for collection in &collections {
8506            let Some(manager) = store.get_collection(collection) else {
8507                continue;
8508            };
8509            let entities = manager.query_all(|_| true);
8510            for entity in entities {
8511                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8512                    continue;
8513                }
8514                let EntityKind::GraphNode(ref node) = entity.kind else {
8515                    continue;
8516                };
8517                if !node_passes_rls(self, collection, role.as_deref(), &mut node_rls, &entity) {
8518                    continue;
8519                }
8520                let id_str = entity.id.raw().to_string();
8521                graph
8522                    .add_node_with_label(
8523                        &id_str,
8524                        &node.label,
8525                        &super::graph_node_label(&node.node_type),
8526                    )
8527                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8528                allowed_nodes.insert(id_str.clone());
8529                if let EntityData::Node(node_data) = &entity.data {
8530                    node_properties.insert(id_str, node_data.properties.clone());
8531                }
8532            }
8533        }
8534
8535        // Second pass — gather edges. An edge appears only when both
8536        // endpoint nodes survived the RLS pass AND the edge itself
8537        // passes its own RLS gate.
8538        for collection in &collections {
8539            let Some(manager) = store.get_collection(collection) else {
8540                continue;
8541            };
8542            let entities = manager.query_all(|_| true);
8543            for entity in entities {
8544                if !entity_visible_with_context(snap_ctx.as_ref(), &entity) {
8545                    continue;
8546                }
8547                let EntityKind::GraphEdge(ref edge) = entity.kind else {
8548                    continue;
8549                };
8550                if !allowed_nodes.contains(&edge.from_node)
8551                    || !allowed_nodes.contains(&edge.to_node)
8552                {
8553                    continue;
8554                }
8555                if !edge_passes_rls(self, collection, role.as_deref(), &mut edge_rls, &entity) {
8556                    continue;
8557                }
8558                let weight = match &entity.data {
8559                    EntityData::Edge(e) => e.weight,
8560                    _ => edge.weight as f32 / 1000.0,
8561                };
8562                let edge_label = super::graph_edge_label(&edge.label);
8563                graph
8564                    .add_edge_with_label(&edge.from_node, &edge.to_node, &edge_label, weight)
8565                    .map_err(|err| RedDBError::Query(err.to_string()))?;
8566                if let EntityData::Edge(edge_data) = &entity.data {
8567                    edge_properties.insert(
8568                        (edge.from_node.clone(), edge_label, edge.to_node.clone()),
8569                        edge_data.properties.clone(),
8570                    );
8571                }
8572            }
8573        }
8574
8575        // Suppress unused-PolicyAction/PolicyTargetKind warnings — both
8576        // are used inside the helper closures via the per-kind helpers
8577        // declared at the bottom of this file.
8578        let _ = (PolicyAction::Select, PolicyTargetKind::Nodes);
8579
8580        Ok((graph, node_properties, edge_properties))
8581    }
8582
8583    /// Phase 1.1 MVCC universal: post-save hook that stamps `xmin` on a
8584    /// freshly-inserted entity when the current connection holds an
8585    /// open transaction. Used by graph / vector / queue / timeseries
8586    /// write paths that go through the DevX builder API (`db.node(...)
8587    /// .save()` and friends) — those live in the storage crate and
8588    /// can't reach `current_xid()` without crossing layers, so the
8589    /// application layer calls this helper right after `save()` to
8590    /// finalise the MVCC stamp.
8591    ///
8592    /// Autocommit (outside BEGIN) is a no-op — no extra lookup or
8593    /// write, so the non-transactional hot path stays untouched.
8594    ///
8595    /// Best-effort: if the collection or entity disappears between
8596    /// the save and the stamp (concurrent DROP), we silently skip.
8597    pub(crate) fn stamp_xmin_if_in_txn(
8598        &self,
8599        collection: &str,
8600        id: crate::storage::unified::entity::EntityId,
8601    ) {
8602        let Some(xid) = self.current_xid() else {
8603            return;
8604        };
8605        let store = self.inner.db.store();
8606        let Some(manager) = store.get_collection(collection) else {
8607            return;
8608        };
8609        if let Some(mut entity) = manager.get(id) {
8610            entity.set_xmin(xid);
8611            let _ = manager.update(entity);
8612        }
8613    }
8614
8615    /// Revive tombstones stamped by `stamper_xid` or any sub-xid
8616    /// allocated after it (Phase 2.3.2e savepoint rollback). Any
8617    /// pending entries with `xid < stamper_xid` stay queued because
8618    /// they belong to the enclosing scope — they'll either flush on
8619    /// COMMIT or revive on an outer ROLLBACK TO SAVEPOINT.
8620    ///
8621    /// Returns the number of tuples whose `xmax` was wiped back to 0.
8622    pub(crate) fn revive_tombstones_since(&self, conn_id: u64, stamper_xid: u64) -> usize {
8623        let mut guard = self.inner.pending_tombstones.write();
8624        let Some(pending) = guard.get_mut(&conn_id) else {
8625            return 0;
8626        };
8627
8628        let store = self.inner.db.store();
8629        let mut revived = 0usize;
8630        pending.retain(|(collection, id, xid, previous_xmax)| {
8631            if *xid < stamper_xid {
8632                // Stamped before the savepoint — keep in queue.
8633                return true;
8634            }
8635            if let Some(manager) = store.get_collection(collection) {
8636                if let Some(mut entity) = manager.get(*id) {
8637                    if entity.xmax == *xid {
8638                        entity.set_xmax(*previous_xmax);
8639                        let _ = manager.update(entity);
8640                        revived += 1;
8641                    }
8642                }
8643            }
8644            false
8645        });
8646        if pending.is_empty() {
8647            guard.remove(&conn_id);
8648        }
8649        revived
8650    }
8651
8652    /// Return the snapshot the current connection should use for visibility
8653    /// checks (Phase 2.3 PG parity).
8654    ///
8655    /// * If the connection is inside a BEGIN-wrapped transaction, reuse
8656    ///   the snapshot stored in its `TxnContext`.
8657    /// * Otherwise (autocommit), capture a fresh snapshot tied to an
8658    ///   implicit xid=0 — the read path treats pre-MVCC rows as always
8659    ///   visible so this degrades to "see everything committed".
8660    pub fn current_snapshot(&self) -> crate::storage::transaction::snapshot::Snapshot {
8661        let conn_id = current_connection_id();
8662        if let Some(ctx) = self.inner.tx_contexts.read().get(&conn_id).cloned() {
8663            return ctx.snapshot;
8664        }
8665        // Autocommit: take a fresh snapshot bounded by `peek_next_xid` so
8666        // every already-committed xid (which is strictly less) passes the
8667        // `xmin <= snap.xid` gate, while concurrently-active xids land in
8668        // the `in_progress` set and stay hidden until they commit. Using
8669        // xid=0 would incorrectly hide every MVCC-stamped tuple.
8670        let high_water = self.inner.snapshot_manager.peek_next_xid();
8671        self.inner.snapshot_manager.snapshot(high_water)
8672    }
8673
8674    /// Xid of the current connection's active transaction, or `None` when
8675    /// running outside a BEGIN/COMMIT block. Write paths call this to
8676    /// decide whether to stamp `xmin`/`xmax` on tuples.
8677    /// Phase 2.3.2e: when a savepoint is open, `writer_xid` returns the
8678    /// sub-xid so new writes can be selectively rolled back. Otherwise
8679    /// the parent txn's xid is returned, matching pre-savepoint
8680    /// behaviour. Callers that need the enclosing *transaction* xid
8681    /// (e.g. VACUUM min-active calculations) should read `ctx.xid`
8682    /// directly.
8683    pub fn current_xid(&self) -> Option<crate::storage::transaction::snapshot::Xid> {
8684        let conn_id = current_connection_id();
8685        self.inner
8686            .tx_contexts
8687            .read()
8688            .get(&conn_id)
8689            .map(|ctx| ctx.writer_xid())
8690    }
8691
8692    /// `true` when the given connection id has an open `BEGIN`. Issue
8693    /// #760 — `OpenStream` consults this to refuse output streams that
8694    /// would otherwise collide with an interactive transaction (see
8695    /// ADR 0029 "Transaction interaction"). HTTP requests pre-dating the
8696    /// connection-id plumbing run with id `0`, which never carries a
8697    /// transaction context, so this returns `false` on those paths.
8698    pub fn connection_in_transaction(&self, conn_id: u64) -> bool {
8699        self.inner.tx_contexts.read().contains_key(&conn_id)
8700    }
8701
8702    /// Access the shared `SnapshotManager` — useful for VACUUM to compute
8703    /// the oldest-active xid when reclaiming dead tuples.
8704    pub fn snapshot_manager(&self) -> Arc<crate::storage::transaction::snapshot::SnapshotManager> {
8705        Arc::clone(&self.inner.snapshot_manager)
8706    }
8707
8708    fn mvcc_vacuum_cutoff_xid(&self) -> crate::storage::transaction::snapshot::Xid {
8709        let manager = &self.inner.snapshot_manager;
8710        let next_xid = manager.peek_next_xid();
8711        let mut cutoff = next_xid;
8712        if let Some(oldest_active) = manager.oldest_active_xid() {
8713            cutoff = cutoff.min(oldest_active);
8714        }
8715        if let Some(oldest_pinned) = manager.oldest_pinned_xid() {
8716            cutoff = cutoff.min(oldest_pinned);
8717        }
8718        let retention_xids = self.config_u64("runtime.mvcc.vacuum_retention_xids", 0);
8719        if retention_xids > 0 {
8720            cutoff = cutoff.min(next_xid.saturating_sub(retention_xids));
8721        }
8722        cutoff
8723    }
8724
8725    fn rebuild_runtime_indexes_for_table(&self, table: &str) -> RedDBResult<()> {
8726        let registered = self.inner.index_store.list_indices(table);
8727        if registered.is_empty() {
8728            return Ok(());
8729        }
8730        let store = self.inner.db.store();
8731        let Some(manager) = store.get_collection(table) else {
8732            return Ok(());
8733        };
8734        let entity_fields = manager
8735            .query_all(|entity| matches!(entity.kind, crate::storage::EntityKind::TableRow { .. }))
8736            .into_iter()
8737            .map(|entity| (entity.id, table_row_index_fields(&entity)))
8738            .collect::<Vec<_>>();
8739
8740        for index in registered {
8741            self.inner.index_store.drop_index(&index.name, table);
8742            self.inner
8743                .index_store
8744                .create_index(
8745                    &index.name,
8746                    table,
8747                    &index.columns,
8748                    index.method,
8749                    index.unique,
8750                    &entity_fields,
8751                )
8752                .map_err(RedDBError::Internal)?;
8753            self.inner.index_store.register(index);
8754        }
8755        self.invalidate_plan_cache();
8756        Ok(())
8757    }
8758
8759    pub(crate) fn persist_runtime_index_descriptor(
8760        &self,
8761        index: super::index_store::RegisteredIndex,
8762    ) -> RedDBResult<()> {
8763        let store = self.inner.db.store();
8764        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
8765        let entity = crate::storage::UnifiedEntity::new(
8766            crate::storage::EntityId::new(0),
8767            crate::storage::EntityKind::TableRow {
8768                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
8769                row_id: 0,
8770            },
8771            crate::storage::EntityData::Row(crate::storage::RowData {
8772                columns: Vec::new(),
8773                named: Some(
8774                    [
8775                        (
8776                            "collection".to_string(),
8777                            crate::storage::schema::Value::text(index.collection.clone()),
8778                        ),
8779                        (
8780                            "name".to_string(),
8781                            crate::storage::schema::Value::text(index.name.clone()),
8782                        ),
8783                        (
8784                            "columns".to_string(),
8785                            crate::storage::schema::Value::text(index.columns.join("\u{1f}")),
8786                        ),
8787                        (
8788                            "method".to_string(),
8789                            crate::storage::schema::Value::text(index_method_kind_as_str(
8790                                index.method,
8791                            )),
8792                        ),
8793                        (
8794                            "unique".to_string(),
8795                            crate::storage::schema::Value::Boolean(index.unique),
8796                        ),
8797                        (
8798                            "dropped".to_string(),
8799                            crate::storage::schema::Value::Boolean(false),
8800                        ),
8801                    ]
8802                    .into_iter()
8803                    .collect(),
8804                ),
8805                schema: None,
8806            }),
8807        );
8808        store
8809            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
8810            .map(|_| ())
8811            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
8812    }
8813
8814    pub(crate) fn persist_runtime_index_drop(
8815        &self,
8816        collection: &str,
8817        name: &str,
8818    ) -> RedDBResult<()> {
8819        let store = self.inner.db.store();
8820        let _ = store.get_or_create_collection(RUNTIME_INDEX_REGISTRY_COLLECTION);
8821        let entity = crate::storage::UnifiedEntity::new(
8822            crate::storage::EntityId::new(0),
8823            crate::storage::EntityKind::TableRow {
8824                table: std::sync::Arc::from(RUNTIME_INDEX_REGISTRY_COLLECTION),
8825                row_id: 0,
8826            },
8827            crate::storage::EntityData::Row(crate::storage::RowData {
8828                columns: Vec::new(),
8829                named: Some(
8830                    [
8831                        (
8832                            "collection".to_string(),
8833                            crate::storage::schema::Value::text(collection.to_string()),
8834                        ),
8835                        (
8836                            "name".to_string(),
8837                            crate::storage::schema::Value::text(name.to_string()),
8838                        ),
8839                        (
8840                            "dropped".to_string(),
8841                            crate::storage::schema::Value::Boolean(true),
8842                        ),
8843                    ]
8844                    .into_iter()
8845                    .collect(),
8846                ),
8847                schema: None,
8848            }),
8849        );
8850        store
8851            .insert_auto(RUNTIME_INDEX_REGISTRY_COLLECTION, entity)
8852            .map(|_| ())
8853            .map_err(|err| RedDBError::Internal(format!("{err:?}")))
8854    }
8855
8856    fn rehydrate_runtime_index_registry(&self) -> RedDBResult<()> {
8857        let store = self.inner.db.store();
8858        let Some(manager) = store.get_collection(RUNTIME_INDEX_REGISTRY_COLLECTION) else {
8859            return Ok(());
8860        };
8861        let mut rows = manager.query_all(|_| true);
8862        rows.sort_by_key(|entity| entity.id.raw());
8863
8864        let mut latest = std::collections::HashMap::<
8865            (String, String),
8866            Option<super::index_store::RegisteredIndex>,
8867        >::new();
8868        for entity in rows {
8869            let crate::storage::EntityData::Row(row) = &entity.data else {
8870                continue;
8871            };
8872            let Some(named) = &row.named else {
8873                continue;
8874            };
8875            let Some(collection) = named_text(named, "collection") else {
8876                continue;
8877            };
8878            let Some(name) = named_text(named, "name") else {
8879                continue;
8880            };
8881            let dropped = named_bool(named, "dropped").unwrap_or(false);
8882            let key = (collection.clone(), name.clone());
8883            if dropped {
8884                latest.insert(key, None);
8885                continue;
8886            }
8887            let columns = named_text(named, "columns")
8888                .map(|raw| {
8889                    raw.split('\u{1f}')
8890                        .filter(|part| !part.is_empty())
8891                        .map(str::to_string)
8892                        .collect::<Vec<_>>()
8893                })
8894                .unwrap_or_default();
8895            let Some(method) =
8896                named_text(named, "method").and_then(|raw| index_method_kind_from_str(&raw))
8897            else {
8898                continue;
8899            };
8900            latest.insert(
8901                key,
8902                Some(super::index_store::RegisteredIndex {
8903                    name,
8904                    collection,
8905                    columns,
8906                    method,
8907                    unique: named_bool(named, "unique").unwrap_or(false),
8908                }),
8909            );
8910        }
8911
8912        for index in latest.into_values().flatten() {
8913            let Some(manager) = store.get_collection(&index.collection) else {
8914                continue;
8915            };
8916            let entity_fields = manager
8917                .query_all(|entity| {
8918                    matches!(entity.kind, crate::storage::EntityKind::TableRow { .. })
8919                })
8920                .into_iter()
8921                .map(|entity| (entity.id, table_row_index_fields(&entity)))
8922                .collect::<Vec<_>>();
8923            self.inner
8924                .index_store
8925                .create_index(
8926                    &index.name,
8927                    &index.collection,
8928                    &index.columns,
8929                    index.method,
8930                    index.unique,
8931                    &entity_fields,
8932                )
8933                .map_err(RedDBError::Internal)?;
8934            self.inner.index_store.register(index);
8935        }
8936        self.invalidate_plan_cache();
8937        Ok(())
8938    }
8939
8940    /// Own-tx xids (parent + open/released savepoints) for the current
8941    /// connection. Transports + tests that build a `SnapshotContext`
8942    /// manually (outside the `execute_query` scope) need this set so
8943    /// the writer's own uncommitted tuples stay visible to self.
8944    pub fn current_txn_own_xids(
8945        &self,
8946    ) -> std::collections::HashSet<crate::storage::transaction::snapshot::Xid> {
8947        let mut set = std::collections::HashSet::new();
8948        if let Some(ctx) = self.inner.tx_contexts.read().get(&current_connection_id()) {
8949            set.insert(ctx.xid);
8950            for (_, sub) in &ctx.savepoints {
8951                set.insert(*sub);
8952            }
8953            for sub in &ctx.released_sub_xids {
8954                set.insert(*sub);
8955            }
8956        }
8957        set
8958    }
8959
8960    /// Access the shared `ForeignTableRegistry` (Phase 3.2 PG parity).
8961    ///
8962    /// Callers use this to check whether a table name is a registered
8963    /// foreign table (`registry.is_foreign_table(name)`) and, if so, to
8964    /// scan it (`registry.scan(name)`). The read-path rewriter consults
8965    /// this before dispatching into native-collection lookup.
8966    pub fn foreign_tables(&self) -> Arc<crate::storage::fdw::ForeignTableRegistry> {
8967        Arc::clone(&self.inner.foreign_tables)
8968    }
8969
8970    /// Is Row-Level Security enabled for this table? (Phase 2.5 PG parity)
8971    pub fn is_rls_enabled(&self, table: &str) -> bool {
8972        self.inner.rls_enabled_tables.read().contains(table)
8973    }
8974
8975    /// Collect the USING predicates that apply to this `(table, role, action)`.
8976    ///
8977    /// Returned filters should be OR-combined (a row passes RLS when *any*
8978    /// matching policy accepts it) and then AND-ed into the query's WHERE.
8979    /// When the table has RLS disabled this returns an empty Vec — callers
8980    /// can fast-path back to the unfiltered read.
8981    pub fn matching_rls_policies(
8982        &self,
8983        table: &str,
8984        role: Option<&str>,
8985        action: crate::storage::query::ast::PolicyAction,
8986    ) -> Vec<crate::storage::query::ast::Filter> {
8987        // Default kind = Table preserves the pre-Phase-2.5.5 behaviour:
8988        // callers that don't name a kind only see Table-scoped
8989        // policies (which is what execute SELECT / UPDATE / DELETE
8990        // expect).
8991        self.matching_rls_policies_for_kind(
8992            table,
8993            role,
8994            action,
8995            crate::storage::query::ast::PolicyTargetKind::Table,
8996        )
8997    }
8998
8999    /// Kind-aware variant used by cross-model scans (Phase 2.5.5).
9000    ///
9001    /// Graph scans request `Nodes` / `Edges`, vector ANN requests
9002    /// `Vectors`, queue consumers request `Messages`, and timeseries
9003    /// range scans request `Points`. Policies tagged with a
9004    /// different kind are skipped so a graph-scoped policy doesn't
9005    /// accidentally gate a table SELECT on the same collection.
9006    pub fn matching_rls_policies_for_kind(
9007        &self,
9008        table: &str,
9009        role: Option<&str>,
9010        action: crate::storage::query::ast::PolicyAction,
9011        kind: crate::storage::query::ast::PolicyTargetKind,
9012    ) -> Vec<crate::storage::query::ast::Filter> {
9013        if !self.is_rls_enabled(table) {
9014            return Vec::new();
9015        }
9016        let policies = self.inner.rls_policies.read();
9017        policies
9018            .iter()
9019            .filter_map(|((t, _), p)| {
9020                if t != table {
9021                    return None;
9022                }
9023                // Kind gate — Table policies also apply to every
9024                // other kind *iff* the policy predicate evaluates
9025                // against entity fields that exist uniformly; the
9026                // caller's kind filter is the stricter check, so
9027                // match literally. Auto-tenancy policies stamp
9028                // Table and the caller passes the concrete kind —
9029                // we allow Table policies to apply cross-kind for
9030                // backwards compat.
9031                if p.target_kind != kind
9032                    && p.target_kind != crate::storage::query::ast::PolicyTargetKind::Table
9033                {
9034                    return None;
9035                }
9036                // Action gate — `None` means "ALL" actions.
9037                if let Some(a) = p.action {
9038                    if a != action {
9039                        return None;
9040                    }
9041                }
9042                // Role gate — `None` means "any role".
9043                if let Some(p_role) = p.role.as_deref() {
9044                    match role {
9045                        Some(r) if r == p_role => {}
9046                        _ => return None,
9047                    }
9048                }
9049                Some((*p.using).clone())
9050            })
9051            .collect()
9052    }
9053
9054    pub(crate) fn refresh_table_planner_stats(&self, table: &str) {
9055        let store = self.inner.db.store();
9056        if let Some(stats) =
9057            crate::storage::query::planner::stats_catalog::analyze_collection(store.as_ref(), table)
9058        {
9059            crate::storage::query::planner::stats_catalog::persist_table_stats(
9060                store.as_ref(),
9061                &stats,
9062            );
9063        } else {
9064            crate::storage::query::planner::stats_catalog::clear_table_stats(store.as_ref(), table);
9065        }
9066        self.invalidate_plan_cache();
9067    }
9068
9069    pub(crate) fn note_table_write(&self, table: &str) {
9070        // Skip the write lock when the table is already marked
9071        // dirty. With single-row UPDATEs in a loop this used to
9072        // grab the planner_dirty_tables write lock N times even
9073        // though the first call already flipped the flag.
9074        let already_dirty = self.inner.planner_dirty_tables.read().contains(table);
9075        if !already_dirty {
9076            self.inner
9077                .planner_dirty_tables
9078                .write()
9079                .insert(table.to_string());
9080        }
9081        self.invalidate_result_cache_for_table(table);
9082    }
9083
9084    /// Wrap the planner's `RuntimeQueryExplain` as rows on a
9085    /// `RuntimeQueryResult` so callers over the SQL interface see the
9086    /// plan tree in the same shape a SELECT produces.
9087    ///
9088    /// Columns: `op`, `source`, `est_rows`, `est_cost`, `depth`.
9089    /// Nodes are walked depth-first; `depth` counts from 0 at the
9090    /// root so a text renderer can indent without re-walking.
9091    fn explain_as_rows(&self, raw_query: &str, inner_sql: &str) -> RedDBResult<RuntimeQueryResult> {
9092        let explain = self.explain_query(inner_sql)?;
9093
9094        let columns = vec![
9095            "op".to_string(),
9096            "source".to_string(),
9097            "est_rows".to_string(),
9098            "est_cost".to_string(),
9099            "depth".to_string(),
9100        ];
9101
9102        let mut records: Vec<crate::storage::query::unified::UnifiedRecord> = Vec::new();
9103
9104        // Prepend `CteScan` markers when the query carried a leading
9105        // WITH clause. The CTE bodies are already inlined into the
9106        // main plan tree, but operators reading EXPLAIN need to see
9107        // which named CTEs were resolved — without this row the plan
9108        // would look indistinguishable from a hand-inlined query.
9109        for name in &explain.cte_materializations {
9110            use std::sync::Arc;
9111            let mut rec = crate::storage::query::unified::UnifiedRecord::default();
9112            rec.set_arc(Arc::from("op"), Value::text("CteScan".to_string()));
9113            rec.set_arc(Arc::from("source"), Value::text(name.clone()));
9114            rec.set_arc(Arc::from("est_rows"), Value::Float(0.0));
9115            rec.set_arc(Arc::from("est_cost"), Value::Float(0.0));
9116            rec.set_arc(Arc::from("depth"), Value::Integer(0));
9117            records.push(rec);
9118        }
9119
9120        walk_plan_node(&explain.logical_plan.root, 0, &mut records);
9121
9122        let result = crate::storage::query::unified::UnifiedResult {
9123            columns,
9124            records,
9125            stats: Default::default(),
9126            pre_serialized_json: None,
9127        };
9128
9129        Ok(RuntimeQueryResult {
9130            query: raw_query.to_string(),
9131            mode: explain.mode,
9132            statement: "explain",
9133            engine: "runtime-explain",
9134            result,
9135            affected_rows: 0,
9136            statement_type: "select",
9137            bookmark: None,
9138        })
9139    }
9140
9141    // -----------------------------------------------------------------
9142    // Granular RBAC — privilege gate + GRANT/REVOKE/ALTER USER dispatch
9143    // -----------------------------------------------------------------
9144
9145    /// Project a `QueryExpr` to the (action, resource) pair the
9146    /// privilege engine cares about. Returns `Ok(())` for statements
9147    /// that don't touch user data (transaction control, SHOW, SET, etc.).
9148    pub(crate) fn check_query_privilege(
9149        &self,
9150        expr: &crate::storage::query::ast::QueryExpr,
9151    ) -> Result<(), String> {
9152        use crate::auth::privileges::{Action, AuthzContext, Resource};
9153        use crate::auth::UserId;
9154        use crate::storage::query::ast::QueryExpr;
9155
9156        // No auth store wired (embedded mode / fresh DB / tests) → bypass.
9157        // The bootstrap path itself goes through `execute_query` so this
9158        // is the only sensible default; once auth is wired, the gate
9159        // becomes active.
9160        let auth_store = match self.inner.auth_store.read().clone() {
9161            Some(s) => s,
9162            None => return Ok(()),
9163        };
9164
9165        // Resolve principal + role from the thread-local identity.
9166        // Anonymous (no identity) is allowed to read the bootstrap path
9167        // only when auth_store says so; we treat missing identity as
9168        // platform-admin-equivalent here so embedded test harnesses
9169        // continue to work without setting an identity.
9170        let (username, role) = match current_auth_identity() {
9171            Some(p) => p,
9172            None => return Ok(()),
9173        };
9174        let tenant = current_tenant();
9175
9176        let ctx = AuthzContext {
9177            principal: &username,
9178            effective_role: role,
9179            tenant: tenant.as_deref(),
9180        };
9181        let principal_id = UserId::from_parts(tenant.as_deref(), &username);
9182
9183        // Map QueryExpr → (Action, Resource).
9184        let (action, resource) = match expr {
9185            QueryExpr::Table(t) => (Action::Select, Resource::table_from_name(&t.table)),
9186            QueryExpr::RankOf(_) | QueryExpr::ApproxRankOf(_) | QueryExpr::RankRange(_) => {
9187                (Action::Select, Resource::Database)
9188            }
9189            QueryExpr::QueueSelect(q) => {
9190                return self.check_queue_op_privilege(
9191                    &auth_store,
9192                    &principal_id,
9193                    role,
9194                    tenant.as_deref(),
9195                    "queue:peek",
9196                    &q.queue,
9197                );
9198            }
9199            QueryExpr::QueueCommand(cmd) => {
9200                use crate::storage::query::ast::QueueCommand;
9201                let (queue, action_verb) = match cmd {
9202                    QueueCommand::Push { queue, .. } => (queue.as_str(), "queue:enqueue"),
9203                    QueueCommand::Pop { queue, .. }
9204                    | QueueCommand::GroupRead { queue, .. }
9205                    | QueueCommand::Claim { queue, .. } => (queue.as_str(), "queue:read"),
9206                    QueueCommand::Peek { queue, .. }
9207                    | QueueCommand::Len { queue }
9208                    | QueueCommand::Pending { queue, .. } => (queue.as_str(), "queue:peek"),
9209                    QueueCommand::Ack { queue, .. } => (queue.as_str(), "queue:ack"),
9210                    QueueCommand::Nack {
9211                        queue, delay_ms, ..
9212                    } => {
9213                        // Per-failure retry overrides re-shape retry
9214                        // behaviour for everyone draining the queue and
9215                        // gate on the dedicated `queue:retry` verb so
9216                        // operators can grant base NACK without granting
9217                        // the override capability.
9218                        let verb = if delay_ms.is_some() {
9219                            "queue:retry"
9220                        } else {
9221                            "queue:nack"
9222                        };
9223                        (queue.as_str(), verb)
9224                    }
9225                    QueueCommand::Purge { queue } => (queue.as_str(), "queue:purge"),
9226                    // `GroupCreate` is part of the consumer-setup
9227                    // surface — read-side, never destructive.
9228                    QueueCommand::GroupCreate { queue, .. } => (queue.as_str(), "queue:read"),
9229                    QueueCommand::Move { source, .. } => (source.as_str(), "queue:dlq:move"),
9230                };
9231                return self.check_queue_op_privilege(
9232                    &auth_store,
9233                    &principal_id,
9234                    role,
9235                    tenant.as_deref(),
9236                    action_verb,
9237                    queue,
9238                );
9239            }
9240            QueryExpr::Graph(g) => {
9241                // MATCH … RETURN is the explorer's pattern-traversal
9242                // surface — gate on `graph:traverse` (#757).
9243                self.check_graph_op_privilege(
9244                    &auth_store,
9245                    &principal_id,
9246                    role,
9247                    tenant.as_deref(),
9248                    "graph:traverse",
9249                )?;
9250                if auth_store.iam_authorization_enabled() {
9251                    self.check_graph_property_projection_privilege(
9252                        &auth_store,
9253                        &principal_id,
9254                        role,
9255                        tenant.as_deref(),
9256                        g,
9257                    )?;
9258                    return Ok(());
9259                }
9260                return Ok(());
9261            }
9262            QueryExpr::Path(_) => {
9263                // PATH FROM … TO … is a path-traversal query — gates
9264                // on `graph:traverse` like neighborhood/shortest-path
9265                // (#757).
9266                return self.check_graph_op_privilege(
9267                    &auth_store,
9268                    &principal_id,
9269                    role,
9270                    tenant.as_deref(),
9271                    "graph:traverse",
9272                );
9273            }
9274            QueryExpr::GraphCommand(cmd) => {
9275                use crate::storage::query::ast::GraphCommand;
9276                let action_verb = match cmd {
9277                    // Metadata / property reads.
9278                    GraphCommand::Properties { .. } => "graph:read",
9279                    // Traversal / pattern-walk surface.
9280                    GraphCommand::Neighborhood { .. }
9281                    | GraphCommand::Traverse { .. }
9282                    | GraphCommand::ShortestPath { .. } => "graph:traverse",
9283                    // Analytics algorithms — expensive enough that Red
9284                    // UI needs to gate the runner independently of
9285                    // ordinary traversal.
9286                    GraphCommand::Centrality { .. }
9287                    | GraphCommand::Community { .. }
9288                    | GraphCommand::Components { .. }
9289                    | GraphCommand::Cycles { .. }
9290                    | GraphCommand::Clustering
9291                    | GraphCommand::TopologicalSort => "graph:algorithm:run",
9292                };
9293                return self.check_graph_op_privilege(
9294                    &auth_store,
9295                    &principal_id,
9296                    role,
9297                    tenant.as_deref(),
9298                    action_verb,
9299                );
9300            }
9301            QueryExpr::Vector(v) => {
9302                if auth_store.iam_authorization_enabled() {
9303                    self.check_vector_op_privilege(
9304                        &auth_store,
9305                        &principal_id,
9306                        role,
9307                        tenant.as_deref(),
9308                        "vector:search",
9309                        &v.collection,
9310                    )?;
9311                    self.check_table_like_column_projection_privilege(
9312                        &auth_store,
9313                        &principal_id,
9314                        role,
9315                        tenant.as_deref(),
9316                        &v.collection,
9317                        &["content".to_string()],
9318                    )?;
9319                    return Ok(());
9320                }
9321                return Ok(());
9322            }
9323            QueryExpr::SearchCommand(cmd) => {
9324                use crate::storage::query::ast::SearchCommand;
9325                if auth_store.iam_authorization_enabled() {
9326                    // `SEARCH SIMILAR [..] COLLECTION <c>` and `SEARCH
9327                    // HYBRID ... COLLECTION <c>` are the same UI
9328                    // affordances as `VECTOR SEARCH` / hybrid joins —
9329                    // Red UI must see the same `vector:search` envelope
9330                    // so a single toolbar grant is sufficient.
9331                    let collection = match cmd {
9332                        SearchCommand::Similar { collection, .. }
9333                        | SearchCommand::Hybrid { collection, .. } => Some(collection.as_str()),
9334                        _ => None,
9335                    };
9336                    if let Some(c) = collection {
9337                        self.check_vector_op_privilege(
9338                            &auth_store,
9339                            &principal_id,
9340                            role,
9341                            tenant.as_deref(),
9342                            "vector:search",
9343                            c,
9344                        )?;
9345                        return Ok(());
9346                    }
9347                }
9348                return Ok(());
9349            }
9350            QueryExpr::Hybrid(h) => {
9351                if auth_store.iam_authorization_enabled() {
9352                    // The vector half of a hybrid search is gated under
9353                    // the same `vector:search` verb as a standalone
9354                    // VECTOR SEARCH — Red UI's hybrid-search toolbar
9355                    // must surface the same UI-safe denial envelope
9356                    // when the principal lacks the grant. The
9357                    // structured half is dispatched to its own gate via
9358                    // the inner query during execution.
9359                    self.check_vector_op_privilege(
9360                        &auth_store,
9361                        &principal_id,
9362                        role,
9363                        tenant.as_deref(),
9364                        "vector:search",
9365                        &h.vector.collection,
9366                    )?;
9367                    return Ok(());
9368                }
9369                return Ok(());
9370            }
9371            QueryExpr::Insert(i) => (Action::Insert, Resource::table_from_name(&i.table)),
9372            QueryExpr::Update(u) => (Action::Update, Resource::table_from_name(&u.table)),
9373            QueryExpr::Delete(d) => (Action::Delete, Resource::table_from_name(&d.table)),
9374            // Joins inherit the read privilege from any constituent
9375            // table — for now we emit a single Select on the database
9376            // (admins bypass; non-admins need a Database/Schema grant).
9377            QueryExpr::Join(_) => (Action::Select, Resource::Database),
9378            // GRANT / REVOKE / ALTER USER are authority statements;
9379            // require Admin (the helper methods enforce).
9380            QueryExpr::Grant(_) | QueryExpr::Revoke(_) | QueryExpr::AlterUser(_) => {
9381                return if role == crate::auth::Role::Admin {
9382                    Ok(())
9383                } else {
9384                    Err(format!(
9385                        "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
9386                        username, role
9387                    ))
9388                };
9389            }
9390            QueryExpr::CreateIamPolicy { id, .. } => {
9391                return self.check_policy_management_privilege(
9392                    &auth_store,
9393                    &principal_id,
9394                    role,
9395                    tenant.as_deref(),
9396                    "policy:put",
9397                    "policy",
9398                    id,
9399                );
9400            }
9401            QueryExpr::DropIamPolicy { id } => {
9402                return self.check_policy_management_privilege(
9403                    &auth_store,
9404                    &principal_id,
9405                    role,
9406                    tenant.as_deref(),
9407                    "policy:drop",
9408                    "policy",
9409                    id,
9410                );
9411            }
9412            QueryExpr::AttachPolicy { policy_id, .. } => {
9413                return self.check_policy_management_privilege(
9414                    &auth_store,
9415                    &principal_id,
9416                    role,
9417                    tenant.as_deref(),
9418                    "policy:attach",
9419                    "policy",
9420                    policy_id,
9421                );
9422            }
9423            QueryExpr::DetachPolicy { policy_id, .. } => {
9424                return self.check_policy_management_privilege(
9425                    &auth_store,
9426                    &principal_id,
9427                    role,
9428                    tenant.as_deref(),
9429                    "policy:detach",
9430                    "policy",
9431                    policy_id,
9432                );
9433            }
9434            QueryExpr::ShowPolicies { .. } | QueryExpr::ShowEffectivePermissions { .. } => {
9435                return Ok(());
9436            }
9437            QueryExpr::SimulatePolicy { .. } => {
9438                return self.check_policy_management_privilege(
9439                    &auth_store,
9440                    &principal_id,
9441                    role,
9442                    tenant.as_deref(),
9443                    "policy:simulate",
9444                    "policy",
9445                    "*",
9446                );
9447            }
9448            QueryExpr::LintPolicy { .. } => {
9449                // Linting is a read-only inspection — gate it like
9450                // simulate (policy management role).
9451                return self.check_policy_management_privilege(
9452                    &auth_store,
9453                    &principal_id,
9454                    role,
9455                    tenant.as_deref(),
9456                    "policy:simulate",
9457                    "policy",
9458                    "*",
9459                );
9460            }
9461            QueryExpr::MigratePolicyMode { dry_run, .. } => {
9462                // DRY RUN is a pre-flight inspection (policy:simulate).
9463                // The actual mode flip is a privileged mutation under
9464                // the policy:put action (it persists a new enforcement
9465                // mode to the vault KV through `set_enforcement_mode`).
9466                let action = if *dry_run {
9467                    "policy:simulate"
9468                } else {
9469                    "policy:put"
9470                };
9471                return self.check_policy_management_privilege(
9472                    &auth_store,
9473                    &principal_id,
9474                    role,
9475                    tenant.as_deref(),
9476                    action,
9477                    "policy",
9478                    "*",
9479                );
9480            }
9481            // DROP and TRUNCATE — Write-role gate + per-collection IAM policy
9482            // when IAM mode is active. Other DDL stays role-only for now.
9483            QueryExpr::DropTable(q) => {
9484                return self.check_ddl_collection_privilege(
9485                    &auth_store,
9486                    &principal_id,
9487                    role,
9488                    tenant.as_deref(),
9489                    &username,
9490                    "drop",
9491                    &q.name,
9492                );
9493            }
9494            QueryExpr::DropGraph(q) => {
9495                return self.check_ddl_collection_privilege(
9496                    &auth_store,
9497                    &principal_id,
9498                    role,
9499                    tenant.as_deref(),
9500                    &username,
9501                    "drop",
9502                    &q.name,
9503                );
9504            }
9505            QueryExpr::DropVector(q) => {
9506                return self.check_ddl_collection_privilege(
9507                    &auth_store,
9508                    &principal_id,
9509                    role,
9510                    tenant.as_deref(),
9511                    &username,
9512                    "drop",
9513                    &q.name,
9514                );
9515            }
9516            QueryExpr::DropDocument(q) => {
9517                return self.check_ddl_collection_privilege(
9518                    &auth_store,
9519                    &principal_id,
9520                    role,
9521                    tenant.as_deref(),
9522                    &username,
9523                    "drop",
9524                    &q.name,
9525                );
9526            }
9527            QueryExpr::DropKv(q) => {
9528                return self.check_ddl_collection_privilege(
9529                    &auth_store,
9530                    &principal_id,
9531                    role,
9532                    tenant.as_deref(),
9533                    &username,
9534                    "drop",
9535                    &q.name,
9536                );
9537            }
9538            QueryExpr::DropCollection(q) => {
9539                return self.check_ddl_collection_privilege(
9540                    &auth_store,
9541                    &principal_id,
9542                    role,
9543                    tenant.as_deref(),
9544                    &username,
9545                    "drop",
9546                    &q.name,
9547                );
9548            }
9549            QueryExpr::Truncate(q) => {
9550                return self.check_ddl_collection_privilege(
9551                    &auth_store,
9552                    &principal_id,
9553                    role,
9554                    tenant.as_deref(),
9555                    &username,
9556                    "truncate",
9557                    &q.name,
9558                );
9559            }
9560            // Remaining DDL (#753) — hybrid policy-aware gate. Specific
9561            // create/alter/drop verbs gate operations with a clear
9562            // per-collection target so Red UI can author fine-grained
9563            // policies (`create on collection:users`). Namespace-level
9564            // and grouped DDL fall back to broader `schema:admin` /
9565            // `schema:write` verbs against a `schema:<name>` resource.
9566            // All branches share the [`check_ddl_object_privilege`]
9567            // helper so allows / denies produce the same structured
9568            // "principal=… action=… resource=<kind>:<name> denied by
9569            // IAM policy" reason the Red UI security read contracts
9570            // (#740) already render.
9571            QueryExpr::CreateTable(q) => {
9572                return self.check_ddl_object_privilege(
9573                    &auth_store,
9574                    &principal_id,
9575                    role,
9576                    tenant.as_deref(),
9577                    &username,
9578                    "create",
9579                    "collection",
9580                    &q.name,
9581                    crate::auth::Role::Write,
9582                );
9583            }
9584            QueryExpr::CreateCollection(q) => {
9585                return self.check_ddl_object_privilege(
9586                    &auth_store,
9587                    &principal_id,
9588                    role,
9589                    tenant.as_deref(),
9590                    &username,
9591                    "create",
9592                    "collection",
9593                    &q.name,
9594                    crate::auth::Role::Write,
9595                );
9596            }
9597            QueryExpr::CreateVector(q) => {
9598                return self.check_ddl_object_privilege(
9599                    &auth_store,
9600                    &principal_id,
9601                    role,
9602                    tenant.as_deref(),
9603                    &username,
9604                    "create",
9605                    "collection",
9606                    &q.name,
9607                    crate::auth::Role::Write,
9608                );
9609            }
9610            QueryExpr::AlterTable(q) => {
9611                return self.check_ddl_object_privilege(
9612                    &auth_store,
9613                    &principal_id,
9614                    role,
9615                    tenant.as_deref(),
9616                    &username,
9617                    "alter",
9618                    "collection",
9619                    &q.name,
9620                    crate::auth::Role::Write,
9621                );
9622            }
9623            QueryExpr::CreateIndex(q) => {
9624                return self.check_ddl_object_privilege(
9625                    &auth_store,
9626                    &principal_id,
9627                    role,
9628                    tenant.as_deref(),
9629                    &username,
9630                    "create",
9631                    "collection",
9632                    &q.table,
9633                    crate::auth::Role::Write,
9634                );
9635            }
9636            QueryExpr::DropIndex(q) => {
9637                return self.check_ddl_object_privilege(
9638                    &auth_store,
9639                    &principal_id,
9640                    role,
9641                    tenant.as_deref(),
9642                    &username,
9643                    "drop",
9644                    "collection",
9645                    &q.table,
9646                    crate::auth::Role::Write,
9647                );
9648            }
9649            QueryExpr::CreateSchema(q) => {
9650                return self.check_ddl_object_privilege(
9651                    &auth_store,
9652                    &principal_id,
9653                    role,
9654                    tenant.as_deref(),
9655                    &username,
9656                    "schema:admin",
9657                    "schema",
9658                    &q.name,
9659                    crate::auth::Role::Admin,
9660                );
9661            }
9662            QueryExpr::DropSchema(q) => {
9663                return self.check_ddl_object_privilege(
9664                    &auth_store,
9665                    &principal_id,
9666                    role,
9667                    tenant.as_deref(),
9668                    &username,
9669                    "schema:admin",
9670                    "schema",
9671                    &q.name,
9672                    crate::auth::Role::Admin,
9673                );
9674            }
9675            QueryExpr::CreateSequence(q) => {
9676                return self.check_ddl_object_privilege(
9677                    &auth_store,
9678                    &principal_id,
9679                    role,
9680                    tenant.as_deref(),
9681                    &username,
9682                    "create",
9683                    "collection",
9684                    &q.name,
9685                    crate::auth::Role::Write,
9686                );
9687            }
9688            QueryExpr::DropSequence(q) => {
9689                return self.check_ddl_object_privilege(
9690                    &auth_store,
9691                    &principal_id,
9692                    role,
9693                    tenant.as_deref(),
9694                    &username,
9695                    "drop",
9696                    "collection",
9697                    &q.name,
9698                    crate::auth::Role::Write,
9699                );
9700            }
9701            QueryExpr::CreateView(q) => {
9702                return self.check_ddl_object_privilege(
9703                    &auth_store,
9704                    &principal_id,
9705                    role,
9706                    tenant.as_deref(),
9707                    &username,
9708                    "create",
9709                    "collection",
9710                    &q.name,
9711                    crate::auth::Role::Write,
9712                );
9713            }
9714            QueryExpr::DropView(q) => {
9715                return self.check_ddl_object_privilege(
9716                    &auth_store,
9717                    &principal_id,
9718                    role,
9719                    tenant.as_deref(),
9720                    &username,
9721                    "drop",
9722                    "collection",
9723                    &q.name,
9724                    crate::auth::Role::Write,
9725                );
9726            }
9727            QueryExpr::RefreshMaterializedView(q) => {
9728                return self.check_ddl_object_privilege(
9729                    &auth_store,
9730                    &principal_id,
9731                    role,
9732                    tenant.as_deref(),
9733                    &username,
9734                    "alter",
9735                    "collection",
9736                    &q.name,
9737                    crate::auth::Role::Write,
9738                );
9739            }
9740            QueryExpr::CreatePolicy(q) => {
9741                return self.check_ddl_object_privilege(
9742                    &auth_store,
9743                    &principal_id,
9744                    role,
9745                    tenant.as_deref(),
9746                    &username,
9747                    "create",
9748                    "collection",
9749                    &q.table,
9750                    crate::auth::Role::Write,
9751                );
9752            }
9753            QueryExpr::DropPolicy(q) => {
9754                return self.check_ddl_object_privilege(
9755                    &auth_store,
9756                    &principal_id,
9757                    role,
9758                    tenant.as_deref(),
9759                    &username,
9760                    "drop",
9761                    "collection",
9762                    &q.table,
9763                    crate::auth::Role::Write,
9764                );
9765            }
9766            QueryExpr::CreateServer(q) => {
9767                return self.check_ddl_object_privilege(
9768                    &auth_store,
9769                    &principal_id,
9770                    role,
9771                    tenant.as_deref(),
9772                    &username,
9773                    "schema:admin",
9774                    "schema",
9775                    &q.name,
9776                    crate::auth::Role::Admin,
9777                );
9778            }
9779            QueryExpr::DropServer(q) => {
9780                return self.check_ddl_object_privilege(
9781                    &auth_store,
9782                    &principal_id,
9783                    role,
9784                    tenant.as_deref(),
9785                    &username,
9786                    "schema:admin",
9787                    "schema",
9788                    &q.name,
9789                    crate::auth::Role::Admin,
9790                );
9791            }
9792            QueryExpr::CreateForeignTable(q) => {
9793                return self.check_ddl_object_privilege(
9794                    &auth_store,
9795                    &principal_id,
9796                    role,
9797                    tenant.as_deref(),
9798                    &username,
9799                    "schema:write",
9800                    "schema",
9801                    &q.name,
9802                    crate::auth::Role::Write,
9803                );
9804            }
9805            QueryExpr::DropForeignTable(q) => {
9806                return self.check_ddl_object_privilege(
9807                    &auth_store,
9808                    &principal_id,
9809                    role,
9810                    tenant.as_deref(),
9811                    &username,
9812                    "schema:write",
9813                    "schema",
9814                    &q.name,
9815                    crate::auth::Role::Write,
9816                );
9817            }
9818            QueryExpr::CreateTimeSeries(q) => {
9819                return self.check_ddl_object_privilege(
9820                    &auth_store,
9821                    &principal_id,
9822                    role,
9823                    tenant.as_deref(),
9824                    &username,
9825                    "create",
9826                    "collection",
9827                    &q.name,
9828                    crate::auth::Role::Write,
9829                );
9830            }
9831            QueryExpr::CreateMetric(q) => {
9832                return self.check_ddl_object_privilege(
9833                    &auth_store,
9834                    &principal_id,
9835                    role,
9836                    tenant.as_deref(),
9837                    &username,
9838                    "create",
9839                    "collection",
9840                    &q.path,
9841                    crate::auth::Role::Write,
9842                );
9843            }
9844            QueryExpr::AlterMetric(q) => {
9845                return self.check_ddl_object_privilege(
9846                    &auth_store,
9847                    &principal_id,
9848                    role,
9849                    tenant.as_deref(),
9850                    &username,
9851                    "alter",
9852                    "collection",
9853                    &q.path,
9854                    crate::auth::Role::Write,
9855                );
9856            }
9857            QueryExpr::CreateSlo(q) => {
9858                return self.check_ddl_object_privilege(
9859                    &auth_store,
9860                    &principal_id,
9861                    role,
9862                    tenant.as_deref(),
9863                    &username,
9864                    "create",
9865                    "collection",
9866                    &q.path,
9867                    crate::auth::Role::Write,
9868                );
9869            }
9870            QueryExpr::DropTimeSeries(q) => {
9871                return self.check_ddl_object_privilege(
9872                    &auth_store,
9873                    &principal_id,
9874                    role,
9875                    tenant.as_deref(),
9876                    &username,
9877                    "drop",
9878                    "collection",
9879                    &q.name,
9880                    crate::auth::Role::Write,
9881                );
9882            }
9883            QueryExpr::CreateQueue(q) => {
9884                return self.check_ddl_object_privilege(
9885                    &auth_store,
9886                    &principal_id,
9887                    role,
9888                    tenant.as_deref(),
9889                    &username,
9890                    "create",
9891                    "collection",
9892                    &q.name,
9893                    crate::auth::Role::Write,
9894                );
9895            }
9896            QueryExpr::AlterQueue(q) => {
9897                return self.check_ddl_object_privilege(
9898                    &auth_store,
9899                    &principal_id,
9900                    role,
9901                    tenant.as_deref(),
9902                    &username,
9903                    "alter",
9904                    "collection",
9905                    &q.name,
9906                    crate::auth::Role::Write,
9907                );
9908            }
9909            QueryExpr::DropQueue(q) => {
9910                return self.check_ddl_object_privilege(
9911                    &auth_store,
9912                    &principal_id,
9913                    role,
9914                    tenant.as_deref(),
9915                    &username,
9916                    "drop",
9917                    "collection",
9918                    &q.name,
9919                    crate::auth::Role::Write,
9920                );
9921            }
9922            QueryExpr::CreateTree(q) => {
9923                return self.check_ddl_object_privilege(
9924                    &auth_store,
9925                    &principal_id,
9926                    role,
9927                    tenant.as_deref(),
9928                    &username,
9929                    "create",
9930                    "collection",
9931                    &q.collection,
9932                    crate::auth::Role::Write,
9933                );
9934            }
9935            QueryExpr::DropTree(q) => {
9936                return self.check_ddl_object_privilege(
9937                    &auth_store,
9938                    &principal_id,
9939                    role,
9940                    tenant.as_deref(),
9941                    &username,
9942                    "drop",
9943                    "collection",
9944                    &q.collection,
9945                    crate::auth::Role::Write,
9946                );
9947            }
9948            // Migration DDL — CREATE MIGRATION is grouped DDL on the
9949            // schema namespace; uses the `schema:write` fallback verb
9950            // (no obvious per-collection target).
9951            QueryExpr::CreateMigration(q) => {
9952                return self.check_ddl_object_privilege(
9953                    &auth_store,
9954                    &principal_id,
9955                    role,
9956                    tenant.as_deref(),
9957                    &username,
9958                    "schema:write",
9959                    "schema",
9960                    &q.name,
9961                    crate::auth::Role::Write,
9962                );
9963            }
9964            // APPLY / ROLLBACK change data and schema — require Admin.
9965            QueryExpr::ApplyMigration(_) | QueryExpr::RollbackMigration(_) => {
9966                return if role == crate::auth::Role::Admin {
9967                    Ok(())
9968                } else {
9969                    Err(format!(
9970                        "principal=`{}` role=`{:?}` cannot issue APPLY/ROLLBACK MIGRATION",
9971                        username, role
9972                    ))
9973                };
9974            }
9975            // EXPLAIN MIGRATION is read-only — any authenticated principal.
9976            QueryExpr::ExplainMigration(_) => return Ok(()),
9977            // Everything else (SET, SHOW, transaction control, graph
9978            // commands, queue/tree commands, MaintenanceCommand …)
9979            // is allowed for any authenticated principal.
9980            _ => return Ok(()),
9981        };
9982
9983        if auth_store.iam_authorization_enabled() {
9984            let iam_action = legacy_action_to_iam(action);
9985            let iam_resource = legacy_resource_to_iam(&resource, tenant.as_deref());
9986            let iam_ctx = runtime_iam_context(
9987                role,
9988                tenant.as_deref(),
9989                auth_store.principal_is_system_owned(&principal_id),
9990            );
9991            if !auth_store.check_policy_authz_with_role(
9992                &principal_id,
9993                iam_action,
9994                &iam_resource,
9995                &iam_ctx,
9996                role,
9997            ) {
9998                return Err(format!(
9999                    "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10000                    username, iam_action, iam_resource.kind, iam_resource.name
10001                ));
10002            }
10003
10004            if let QueryExpr::Table(table) = expr {
10005                self.check_table_column_projection_privilege(
10006                    &auth_store,
10007                    &principal_id,
10008                    &iam_ctx,
10009                    table,
10010                )?;
10011            }
10012
10013            if let QueryExpr::Update(update) = expr {
10014                let columns = update_set_target_columns(update);
10015                if !columns.is_empty() {
10016                    let request = column_access_request_for_table_update(&update.table, columns);
10017                    let outcome =
10018                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10019                    if let Some(denied) = outcome.first_denied_column() {
10020                        return Err(format!(
10021                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM column policy",
10022                            username, iam_action, denied.resource.kind, denied.resource.name
10023                        ));
10024                    }
10025                    if !outcome.allowed() {
10026                        return Err(format!(
10027                            "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10028                            username,
10029                            iam_action,
10030                            outcome.table_resource.kind,
10031                            outcome.table_resource.name
10032                        ));
10033                    }
10034                }
10035
10036                if let Some(columns) = update_returning_columns_for_policy(self, update) {
10037                    let request = column_access_request_for_table_select(&update.table, columns);
10038                    let outcome =
10039                        auth_store.check_column_projection_authz(&principal_id, &request, &iam_ctx);
10040                    if let Some(denied) = outcome.first_denied_column() {
10041                        return Err(format!(
10042                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM column policy",
10043                            username, denied.resource.kind, denied.resource.name
10044                        ));
10045                    }
10046                    if !outcome.allowed() {
10047                        return Err(format!(
10048                            "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10049                            username, outcome.table_resource.kind, outcome.table_resource.name
10050                        ));
10051                    }
10052                }
10053            }
10054
10055            Ok(())
10056        } else {
10057            auth_store
10058                .check_grant(&ctx, action, &resource)
10059                .map_err(|e| e.to_string())
10060        }
10061    }
10062
10063    fn check_table_column_projection_privilege(
10064        &self,
10065        auth_store: &Arc<crate::auth::store::AuthStore>,
10066        principal: &crate::auth::UserId,
10067        ctx: &crate::auth::policies::EvalContext,
10068        table: &crate::storage::query::ast::TableQuery,
10069    ) -> Result<(), String> {
10070        use crate::auth::{ColumnAccessRequest, ColumnDecisionEffect};
10071
10072        let columns = requested_table_columns_for_policy(table);
10073        if columns.is_empty() {
10074            return Ok(());
10075        }
10076
10077        let request = ColumnAccessRequest::select(table.table.clone(), columns);
10078        let outcome = auth_store.check_column_projection_authz(principal, &request, ctx);
10079        if outcome.allowed() {
10080            return Ok(());
10081        }
10082
10083        if !matches!(
10084            outcome.table_decision,
10085            crate::auth::policies::Decision::Allow { .. }
10086                | crate::auth::policies::Decision::AdminBypass
10087        ) {
10088            return Err(format!(
10089                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10090                principal, outcome.table_resource.kind, outcome.table_resource.name
10091            ));
10092        }
10093
10094        let denied = outcome
10095            .first_denied_column()
10096            .filter(|decision| decision.effective == ColumnDecisionEffect::Denied);
10097        match denied {
10098            Some(decision) => Err(format!(
10099                "principal=`{}` action=`select` resource=`{}:{}` denied by IAM policy",
10100                principal, decision.resource.kind, decision.resource.name
10101            )),
10102            None => Ok(()),
10103        }
10104    }
10105
10106    fn check_graph_property_projection_privilege(
10107        &self,
10108        auth_store: &Arc<crate::auth::store::AuthStore>,
10109        principal: &crate::auth::UserId,
10110        role: crate::auth::Role,
10111        tenant: Option<&str>,
10112        query: &crate::storage::query::ast::GraphQuery,
10113    ) -> Result<(), String> {
10114        let columns = explicit_graph_projection_properties(query);
10115        if columns.is_empty() {
10116            return Ok(());
10117        }
10118        self.check_table_like_column_projection_privilege(
10119            auth_store, principal, role, tenant, "graph", &columns,
10120        )
10121    }
10122
10123    fn check_table_like_column_projection_privilege(
10124        &self,
10125        auth_store: &Arc<crate::auth::store::AuthStore>,
10126        principal: &crate::auth::UserId,
10127        role: crate::auth::Role,
10128        tenant: Option<&str>,
10129        table: &str,
10130        columns: &[String],
10131    ) -> Result<(), String> {
10132        let iam_ctx = runtime_iam_context(
10133            role,
10134            tenant,
10135            auth_store.principal_is_system_owned(principal),
10136        );
10137        let request =
10138            crate::auth::ColumnAccessRequest::select(table.to_string(), columns.iter().cloned());
10139        let outcome = auth_store.check_column_projection_authz(principal, &request, &iam_ctx);
10140        if outcome.allowed() {
10141            return Ok(());
10142        }
10143        let denied = outcome
10144            .first_denied_column()
10145            .map(|d| d.resource.name.clone())
10146            .unwrap_or_else(|| format!("{table}.<unknown>"));
10147        Err(format!(
10148            "principal=`{}` action=`select` resource=`column:{}` denied by IAM policy",
10149            principal, denied
10150        ))
10151    }
10152
10153    fn check_policy_management_privilege(
10154        &self,
10155        auth_store: &Arc<crate::auth::store::AuthStore>,
10156        principal: &crate::auth::UserId,
10157        role: crate::auth::Role,
10158        tenant: Option<&str>,
10159        action: &str,
10160        resource_kind: &str,
10161        resource_name: &str,
10162    ) -> Result<(), String> {
10163        let ctx = runtime_iam_context(
10164            role,
10165            tenant,
10166            auth_store.principal_is_system_owned(principal),
10167        );
10168
10169        if !auth_store.iam_authorization_enabled() {
10170            return if role == crate::auth::Role::Admin {
10171                Ok(())
10172            } else {
10173                Err(format!(
10174                    "principal=`{}` role=`{:?}` cannot issue ACL/auth DDL",
10175                    principal, role
10176                ))
10177            };
10178        }
10179
10180        let mut resource = crate::auth::policies::ResourceRef::new(
10181            resource_kind.to_string(),
10182            resource_name.to_string(),
10183        );
10184        if let Some(t) = tenant {
10185            resource = resource.with_tenant(t.to_string());
10186        }
10187        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10188            Ok(())
10189        } else {
10190            Err(format!(
10191                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10192                principal, action, resource.kind, resource.name
10193            ))
10194        }
10195    }
10196
10197    fn check_managed_config_write_for_set_config(&self, key: &str) -> RedDBResult<()> {
10198        let Some(auth_store) = self.inner.auth_store.read().clone() else {
10199            return Ok(());
10200        };
10201        let (username, role) = current_auth_identity()
10202            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10203        let tenant = current_tenant();
10204        let principal = crate::auth::UserId::from_parts(tenant.as_deref(), &username);
10205        let ctx = runtime_iam_context(
10206            role,
10207            tenant.as_deref(),
10208            auth_store.principal_is_system_owned(&principal),
10209        );
10210        let gate = crate::auth::managed_config::ManagedConfigGate::new(
10211            self.inner.config_registry.as_ref(),
10212        );
10213        match gate.check_write(&auth_store, &principal, &ctx, key) {
10214            crate::auth::managed_config::ManagedConfigDecision::PassThrough { .. }
10215            | crate::auth::managed_config::ManagedConfigDecision::Allow { .. } => Ok(()),
10216            crate::auth::managed_config::ManagedConfigDecision::Deny { reason, .. } => {
10217                Err(RedDBError::Query(format!(
10218                    "permission denied: managed config mutation blocked for `{key}`: {reason}"
10219                )))
10220            }
10221        }
10222    }
10223
10224    /// IAM privilege check for a granular queue operation (issue #755 /
10225    /// PRD #735).
10226    ///
10227    /// Each queue operation maps to a stable verb in
10228    /// [`crate::auth::action_catalog`] (`queue:enqueue`, `queue:read`,
10229    /// `queue:peek`, `queue:ack`, `queue:nack`, `queue:retry`,
10230    /// `queue:dlq:move`, `queue:purge`, `queue:presence:read`). The
10231    /// resource is `queue:<name>` scoped to the current tenant. In
10232    /// legacy mode (no IAM authorization configured) the check is a
10233    /// no-op — the role gates in `execute_queue_command` still apply
10234    /// and the legacy `select` / `write` grant table continues to
10235    /// govern queue access. In IAM-enabled mode a missing granular
10236    /// grant yields a structured, UI-safe error of the form
10237    /// `principal=… action=queue:… resource=queue:… denied by IAM
10238    /// policy` so Red UI can surface the failing toolbar action.
10239    fn check_queue_op_privilege(
10240        &self,
10241        auth_store: &Arc<crate::auth::store::AuthStore>,
10242        principal: &crate::auth::UserId,
10243        role: crate::auth::Role,
10244        tenant: Option<&str>,
10245        action: &str,
10246        queue: &str,
10247    ) -> Result<(), String> {
10248        if !auth_store.iam_authorization_enabled() {
10249            return Ok(());
10250        }
10251        let mut resource =
10252            crate::auth::policies::ResourceRef::new("queue".to_string(), queue.to_string());
10253        if let Some(t) = tenant {
10254            resource = resource.with_tenant(t.to_string());
10255        }
10256        let ctx = runtime_iam_context(
10257            role,
10258            tenant,
10259            auth_store.principal_is_system_owned(principal),
10260        );
10261        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10262            Ok(())
10263        } else {
10264            Err(format!(
10265                "principal=`{}` action=`{}` resource=`queue:{}` denied by IAM policy",
10266                principal, action, queue
10267            ))
10268        }
10269    }
10270
10271    /// IAM privilege check for a graph operation (issue #757 / PRD
10272    /// #735).
10273    ///
10274    /// Each graph operation maps to a stable verb in
10275    /// [`crate::auth::action_catalog`] — `graph:read` for
10276    /// metadata/property lookups, `graph:traverse` for MATCH / PATH /
10277    /// NEIGHBORHOOD / TRAVERSE / SHORTEST_PATH, and
10278    /// `graph:algorithm:run` for analytics algorithms (centrality,
10279    /// community, components, cycles, clustering, topological sort).
10280    /// The resource is `graph:*` scoped to the current tenant — the
10281    /// runtime today operates on a singleton graph store so the name
10282    /// has no concrete identifier; policies grant the explorer
10283    /// surface by writing `graph:*` as the resource pattern.
10284    ///
10285    /// In legacy mode (no IAM authorization configured) the check is
10286    /// a no-op so the existing role-based defaults continue to
10287    /// govern. In IAM-enabled mode a missing grant produces the
10288    /// UI-safe envelope `principal=… action=graph:… resource=graph:*
10289    /// denied by IAM policy` Red UI keys on.
10290    fn check_graph_op_privilege(
10291        &self,
10292        auth_store: &Arc<crate::auth::store::AuthStore>,
10293        principal: &crate::auth::UserId,
10294        role: crate::auth::Role,
10295        tenant: Option<&str>,
10296        action: &str,
10297    ) -> Result<(), String> {
10298        if !auth_store.iam_authorization_enabled() {
10299            return Ok(());
10300        }
10301        let mut resource =
10302            crate::auth::policies::ResourceRef::new("graph".to_string(), "*".to_string());
10303        if let Some(t) = tenant {
10304            resource = resource.with_tenant(t.to_string());
10305        }
10306        let ctx = runtime_iam_context(
10307            role,
10308            tenant,
10309            auth_store.principal_is_system_owned(principal),
10310        );
10311        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10312            Ok(())
10313        } else {
10314            Err(format!(
10315                "principal=`{}` action=`{}` resource=`graph:*` denied by IAM policy",
10316                principal, action
10317            ))
10318        }
10319    }
10320
10321    /// IAM privilege check for a granular vector operation (issue #756
10322    /// / PRD #735).
10323    ///
10324    /// Each vector operation maps to a stable verb in
10325    /// [`crate::auth::action_catalog`] (`vector:read`, `vector:search`,
10326    /// `vector:artifact:read`, `vector:artifact:rebuild`,
10327    /// `vector:admin`). The resource is `vector:<collection>` scoped to
10328    /// the current tenant. In legacy mode (no IAM authorization
10329    /// configured) the check is a no-op — the role gates and existing
10330    /// `select` / column-projection grants continue to govern access.
10331    /// In IAM-enabled mode a missing granular grant yields a
10332    /// structured, UI-safe error of the form `principal=…
10333    /// action=vector:… resource=vector:… denied by IAM policy` so Red
10334    /// UI can surface the failing toolbar action.
10335    fn check_vector_op_privilege(
10336        &self,
10337        auth_store: &Arc<crate::auth::store::AuthStore>,
10338        principal: &crate::auth::UserId,
10339        role: crate::auth::Role,
10340        tenant: Option<&str>,
10341        action: &str,
10342        collection: &str,
10343    ) -> Result<(), String> {
10344        if !auth_store.iam_authorization_enabled() {
10345            return Ok(());
10346        }
10347        let mut resource =
10348            crate::auth::policies::ResourceRef::new("vector".to_string(), collection.to_string());
10349        if let Some(t) = tenant {
10350            resource = resource.with_tenant(t.to_string());
10351        }
10352        let ctx = runtime_iam_context(
10353            role,
10354            tenant,
10355            auth_store.principal_is_system_owned(principal),
10356        );
10357        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10358            Ok(())
10359        } else {
10360            Err(format!(
10361                "principal=`{}` action=`{}` resource=`vector:{}` denied by IAM policy",
10362                principal, action, collection
10363            ))
10364        }
10365    }
10366
10367    /// IAM privilege check for DROP / TRUNCATE on a named collection.
10368    ///
10369    /// Delegates to [`check_ddl_object_privilege`] with `resource_kind =
10370    /// "collection"`. Kept as a thin wrapper so the existing DROP/TRUNCATE
10371    /// callsites stay readable.
10372    fn check_ddl_collection_privilege(
10373        &self,
10374        auth_store: &Arc<crate::auth::store::AuthStore>,
10375        principal: &crate::auth::UserId,
10376        role: crate::auth::Role,
10377        tenant: Option<&str>,
10378        username: &str,
10379        action: &str,
10380        collection: &str,
10381    ) -> Result<(), String> {
10382        self.check_ddl_object_privilege(
10383            auth_store,
10384            principal,
10385            role,
10386            tenant,
10387            username,
10388            action,
10389            "collection",
10390            collection,
10391            crate::auth::Role::Write,
10392        )
10393    }
10394
10395    /// Generalised IAM privilege check for DDL on a named object.
10396    ///
10397    /// `action` is the stable verb advertised through the action catalog
10398    /// (`create`, `alter`, `drop`, `truncate`, `schema:write`,
10399    /// `schema:admin`). `resource_kind` / `resource_name` form the policy
10400    /// resource (`collection:<name>`, `schema:<name>`). `min_role` is the
10401    /// legacy gate when IAM is not yet enabled.
10402    ///
10403    /// Behaviour:
10404    /// * Role below `min_role` → structured "principal=… role=… cannot
10405    ///   issue DDL" denial, audit recorded.
10406    /// * IAM disabled → audit-record success and allow (legacy path).
10407    /// * IAM enabled → call `check_policy_authz_with_role`. Explicit Deny
10408    ///   and DefaultDeny in PolicyOnly mode both produce a UI-safe
10409    ///   "principal=… action=… resource=<kind>:<name> denied by IAM
10410    ///   policy" string. Explicit Allow and the LegacyRbac fallback
10411    ///   allow the action.
10412    #[allow(clippy::too_many_arguments)]
10413    fn check_ddl_object_privilege(
10414        &self,
10415        auth_store: &Arc<crate::auth::store::AuthStore>,
10416        principal: &crate::auth::UserId,
10417        role: crate::auth::Role,
10418        tenant: Option<&str>,
10419        username: &str,
10420        action: &str,
10421        resource_kind: &str,
10422        resource_name: &str,
10423        min_role: crate::auth::Role,
10424    ) -> Result<(), String> {
10425        if role < min_role {
10426            let msg = format!(
10427                "principal=`{}` role=`{:?}` cannot issue DDL action=`{}` resource=`{}:{}`",
10428                username, role, action, resource_kind, resource_name
10429            );
10430            self.inner.audit_log.record(
10431                action,
10432                username,
10433                resource_name,
10434                "denied",
10435                crate::json::Value::Null,
10436            );
10437            return Err(msg);
10438        }
10439
10440        if !auth_store.iam_authorization_enabled() {
10441            self.inner.audit_log.record(
10442                action,
10443                username,
10444                resource_name,
10445                "ok",
10446                crate::json::Value::Null,
10447            );
10448            return Ok(());
10449        }
10450
10451        let mut resource = crate::auth::policies::ResourceRef::new(
10452            resource_kind.to_string(),
10453            resource_name.to_string(),
10454        );
10455        if let Some(t) = tenant {
10456            resource = resource.with_tenant(t.to_string());
10457        }
10458        let ctx = runtime_iam_context(
10459            role,
10460            tenant,
10461            auth_store.principal_is_system_owned(principal),
10462        );
10463        if auth_store.check_policy_authz_with_role(principal, action, &resource, &ctx, role) {
10464            self.inner.audit_log.record(
10465                action,
10466                username,
10467                resource_name,
10468                "ok",
10469                crate::json::Value::Null,
10470            );
10471            Ok(())
10472        } else {
10473            self.inner.audit_log.record(
10474                action,
10475                username,
10476                resource_name,
10477                "denied",
10478                crate::json::Value::Null,
10479            );
10480            Err(format!(
10481                "principal=`{}` action=`{}` resource=`{}:{}` denied by IAM policy",
10482                username, action, resource_kind, resource_name
10483            ))
10484        }
10485    }
10486
10487    /// Translate the parsed [`GrantStmt`] into AuthStore mutations.
10488    fn execute_grant_statement(
10489        &self,
10490        query: &str,
10491        stmt: &crate::storage::query::ast::GrantStmt,
10492    ) -> RedDBResult<RuntimeQueryResult> {
10493        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10494        use crate::auth::UserId;
10495        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10496
10497        let auth_store = self
10498            .inner
10499            .auth_store
10500            .read()
10501            .clone()
10502            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10503
10504        // Granter identity + role.
10505        let (gname, grole) = current_auth_identity().ok_or_else(|| {
10506            RedDBError::Query("GRANT requires an authenticated principal".to_string())
10507        })?;
10508        let granter = UserId::from_parts(current_tenant().as_deref(), &gname);
10509        let granter_role = grole;
10510
10511        // Build the action set.
10512        let mut actions: Vec<Action> = Vec::new();
10513        if stmt.all {
10514            actions.push(Action::All);
10515        } else {
10516            for kw in &stmt.actions {
10517                let a = Action::from_keyword(kw).ok_or_else(|| {
10518                    RedDBError::Query(format!("unknown privilege keyword `{}`", kw))
10519                })?;
10520                actions.push(a);
10521            }
10522        }
10523
10524        // Audit emit (printed; structured emission is Agent #4's lane).
10525        let mut applied = 0usize;
10526        for obj in &stmt.objects {
10527            let resource = match stmt.object_kind {
10528                GrantObjectKind::Table => Resource::Table {
10529                    schema: obj.schema.clone(),
10530                    table: obj.name.clone(),
10531                },
10532                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10533                GrantObjectKind::Database => Resource::Database,
10534                GrantObjectKind::Function => Resource::Function {
10535                    schema: obj.schema.clone(),
10536                    name: obj.name.clone(),
10537                },
10538            };
10539            for principal in &stmt.principals {
10540                let p = match principal {
10541                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10542                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10543                    GrantPrincipalRef::User { tenant, name } => {
10544                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10545                    }
10546                };
10547                // Tenant of the grant follows the granter's tenant
10548                // (cross-tenant guard inside `AuthStore::grant`).
10549                let tenant = granter.tenant.clone();
10550                auth_store
10551                    .grant(
10552                        &granter,
10553                        granter_role,
10554                        p.clone(),
10555                        resource.clone(),
10556                        actions.clone(),
10557                        stmt.with_grant_option,
10558                        tenant.clone(),
10559                    )
10560                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10561
10562                // IAM policy translation: every GRANT also lands as a
10563                // synthetic `_grant_<id>` policy attached to the
10564                // principal so the new evaluator sees it.
10565                if let Some(policy) =
10566                    grant_to_iam_policy(&p, &resource, &actions, tenant.as_deref())
10567                {
10568                    let pid = policy.id.clone();
10569                    auth_store
10570                        .put_policy_internal(policy)
10571                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10572                    let attachment = match &p {
10573                        GrantPrincipal::User(uid) => {
10574                            crate::auth::store::PrincipalRef::User(uid.clone())
10575                        }
10576                        GrantPrincipal::Group(group) => {
10577                            crate::auth::store::PrincipalRef::Group(group.clone())
10578                        }
10579                        GrantPrincipal::Public => crate::auth::store::PrincipalRef::Group(
10580                            crate::auth::store::PUBLIC_IAM_GROUP.to_string(),
10581                        ),
10582                    };
10583                    auth_store
10584                        .attach_policy(attachment, &pid)
10585                        .map_err(|e| RedDBError::Query(e.to_string()))?;
10586                }
10587                applied += 1;
10588                tracing::info!(
10589                    target: "audit",
10590                    principal = %granter,
10591                    action = "grant",
10592                    "GRANT applied"
10593                );
10594            }
10595        }
10596
10597        self.invalidate_result_cache();
10598        Ok(RuntimeQueryResult::ok_message(
10599            query.to_string(),
10600            &format!("GRANT applied to {} target(s)", applied),
10601            "grant",
10602        ))
10603    }
10604
10605    /// Translate the parsed [`RevokeStmt`] into AuthStore mutations.
10606    fn execute_revoke_statement(
10607        &self,
10608        query: &str,
10609        stmt: &crate::storage::query::ast::RevokeStmt,
10610    ) -> RedDBResult<RuntimeQueryResult> {
10611        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
10612        use crate::auth::UserId;
10613        use crate::storage::query::ast::{GrantObjectKind, GrantPrincipalRef};
10614
10615        let auth_store = self
10616            .inner
10617            .auth_store
10618            .read()
10619            .clone()
10620            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10621
10622        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10623            RedDBError::Query("REVOKE requires an authenticated principal".to_string())
10624        })?;
10625        let granter_role = grole;
10626
10627        let actions: Vec<Action> = if stmt.all {
10628            vec![Action::All]
10629        } else {
10630            stmt.actions
10631                .iter()
10632                .map(|kw| Action::from_keyword(kw).unwrap_or(Action::Select))
10633                .collect()
10634        };
10635
10636        let mut total_removed = 0usize;
10637        for obj in &stmt.objects {
10638            let resource = match stmt.object_kind {
10639                GrantObjectKind::Table => Resource::Table {
10640                    schema: obj.schema.clone(),
10641                    table: obj.name.clone(),
10642                },
10643                GrantObjectKind::Schema => Resource::Schema(obj.name.clone()),
10644                GrantObjectKind::Database => Resource::Database,
10645                GrantObjectKind::Function => Resource::Function {
10646                    schema: obj.schema.clone(),
10647                    name: obj.name.clone(),
10648                },
10649            };
10650            for principal in &stmt.principals {
10651                let p = match principal {
10652                    GrantPrincipalRef::Public => GrantPrincipal::Public,
10653                    GrantPrincipalRef::Group(g) => GrantPrincipal::Group(g.clone()),
10654                    GrantPrincipalRef::User { tenant, name } => {
10655                        GrantPrincipal::User(UserId::from_parts(tenant.as_deref(), name))
10656                    }
10657                };
10658                let removed = auth_store
10659                    .revoke(granter_role, &p, &resource, &actions)
10660                    .map_err(|e| RedDBError::Query(e.to_string()))?;
10661                let _removed_policies =
10662                    auth_store.delete_synthetic_grant_policies(&p, &resource, &actions);
10663                total_removed += removed;
10664            }
10665        }
10666
10667        self.invalidate_result_cache();
10668        Ok(RuntimeQueryResult::ok_message(
10669            query.to_string(),
10670            &format!("REVOKE removed {} grant(s)", total_removed),
10671            "revoke",
10672        ))
10673    }
10674
10675    /// Translate the parsed [`AlterUserStmt`] into AuthStore mutations.
10676    fn execute_alter_user_statement(
10677        &self,
10678        query: &str,
10679        stmt: &crate::storage::query::ast::AlterUserStmt,
10680    ) -> RedDBResult<RuntimeQueryResult> {
10681        use crate::auth::privileges::UserAttributes;
10682        use crate::auth::UserId;
10683        use crate::storage::query::ast::AlterUserAttribute;
10684
10685        let auth_store = self
10686            .inner
10687            .auth_store
10688            .read()
10689            .clone()
10690            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10691
10692        let (_gname, grole) = current_auth_identity().ok_or_else(|| {
10693            RedDBError::Query("ALTER USER requires an authenticated principal".to_string())
10694        })?;
10695        if grole != crate::auth::Role::Admin {
10696            return Err(RedDBError::Query(
10697                "ALTER USER requires Admin role".to_string(),
10698            ));
10699        }
10700
10701        let target = UserId::from_parts(stmt.tenant.as_deref(), &stmt.username);
10702
10703        // Apply attributes incrementally — each one reads the current
10704        // record, mutates the relevant field, writes back.
10705        let mut attrs = auth_store.user_attributes(&target);
10706        let mut enable_change: Option<bool> = None;
10707
10708        for a in &stmt.attributes {
10709            match a {
10710                AlterUserAttribute::ValidUntil(ts) => {
10711                    // Parse ISO-ish timestamp → ms since epoch. Fall
10712                    // back to integer-ms parsing for callers that pass
10713                    // `'1234567890123'`.
10714                    let ms = parse_timestamp_to_ms(ts).ok_or_else(|| {
10715                        RedDBError::Query(format!("invalid VALID UNTIL timestamp `{ts}`"))
10716                    })?;
10717                    attrs.valid_until = Some(ms);
10718                }
10719                AlterUserAttribute::ConnectionLimit(n) => {
10720                    if *n < 0 {
10721                        return Err(RedDBError::Query(
10722                            "CONNECTION LIMIT must be non-negative".to_string(),
10723                        ));
10724                    }
10725                    attrs.connection_limit = Some(*n as u32);
10726                }
10727                AlterUserAttribute::SetSearchPath(p) => {
10728                    attrs.search_path = Some(p.clone());
10729                }
10730                AlterUserAttribute::AddGroup(g) => {
10731                    if !attrs.groups.iter().any(|existing| existing == g) {
10732                        attrs.groups.push(g.clone());
10733                        attrs.groups.sort();
10734                    }
10735                }
10736                AlterUserAttribute::DropGroup(g) => {
10737                    attrs.groups.retain(|existing| existing != g);
10738                }
10739                AlterUserAttribute::Enable => enable_change = Some(true),
10740                AlterUserAttribute::Disable => enable_change = Some(false),
10741                AlterUserAttribute::Password(_) => {
10742                    // Out of scope — accept the AST but no-op so the
10743                    // parser stays compatible with future password
10744                    // rotation work.
10745                }
10746            }
10747        }
10748
10749        auth_store
10750            .set_user_attributes(&target, attrs)
10751            .map_err(|e| RedDBError::Query(e.to_string()))?;
10752        if let Some(en) = enable_change {
10753            auth_store
10754                .set_user_enabled(&target, en)
10755                .map_err(|e| RedDBError::Query(e.to_string()))?;
10756        }
10757        self.invalidate_result_cache();
10758        tracing::info!(
10759            target: "audit",
10760            principal = %target,
10761            action = "alter_user",
10762            "ALTER USER applied"
10763        );
10764
10765        Ok(RuntimeQueryResult::ok_message(
10766            query.to_string(),
10767            &format!("ALTER USER {} applied", target),
10768            "alter_user",
10769        ))
10770    }
10771
10772    // -----------------------------------------------------------------
10773    // IAM policy executors
10774    // -----------------------------------------------------------------
10775
10776    fn execute_create_iam_policy(
10777        &self,
10778        query: &str,
10779        id: &str,
10780        json: &str,
10781    ) -> RedDBResult<RuntimeQueryResult> {
10782        use crate::auth::policies::Policy;
10783
10784        let auth_store = self
10785            .inner
10786            .auth_store
10787            .read()
10788            .clone()
10789            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10790
10791        // Parse + validate. The kernel rejects oversize / bad shape /
10792        // bad action keywords. If the supplied id differs from the JSON
10793        // id, override it with the SQL-provided id (the JSON id is
10794        // optional context — the SQL DDL form is authoritative).
10795        let mut policy = Policy::from_json_str(json)
10796            .map_err(|e| RedDBError::Query(format!("policy parse: {e}")))?;
10797        if policy.id != id {
10798            policy.id = id.to_string();
10799        }
10800        let pid = policy.id.clone();
10801        let tenant = current_tenant();
10802        let (actor_name, actor_role) = current_auth_identity()
10803            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10804        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10805        let eval_ctx = runtime_iam_context(
10806            actor_role,
10807            tenant.as_deref(),
10808            auth_store.principal_is_system_owned(&actor),
10809        );
10810        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10811        let ledger = self.inner.control_event_ledger.read();
10812        let control = crate::auth::store::PolicyMutationControl {
10813            ctx: &event_ctx,
10814            ledger: ledger.as_ref(),
10815            config: self.inner.control_event_config,
10816            registry: Some(self.inner.config_registry.as_ref()),
10817            actor: &actor,
10818            eval_ctx: &eval_ctx,
10819        };
10820        auth_store
10821            .put_policy_with_control_events(policy, &control)
10822            .map_err(|e| RedDBError::Query(e.to_string()))?;
10823
10824        let principal = actor_name;
10825        tracing::info!(
10826            target: "audit",
10827            principal = %principal,
10828            action = "iam:policy.put",
10829            matched_policy_id = %pid,
10830            "CREATE POLICY applied"
10831        );
10832        self.inner.audit_log.record(
10833            "iam/policy.put",
10834            &principal,
10835            &pid,
10836            "ok",
10837            crate::json::Value::Null,
10838        );
10839
10840        self.invalidate_result_cache();
10841        Ok(RuntimeQueryResult::ok_message(
10842            query.to_string(),
10843            &format!("policy `{pid}` stored"),
10844            "create_iam_policy",
10845        ))
10846    }
10847
10848    fn execute_drop_iam_policy(&self, query: &str, id: &str) -> RedDBResult<RuntimeQueryResult> {
10849        let auth_store = self
10850            .inner
10851            .auth_store
10852            .read()
10853            .clone()
10854            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10855        let tenant = current_tenant();
10856        let (actor_name, actor_role) = current_auth_identity()
10857            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10858        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10859        let eval_ctx = runtime_iam_context(
10860            actor_role,
10861            tenant.as_deref(),
10862            auth_store.principal_is_system_owned(&actor),
10863        );
10864        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10865        let ledger = self.inner.control_event_ledger.read();
10866        let control = crate::auth::store::PolicyMutationControl {
10867            ctx: &event_ctx,
10868            ledger: ledger.as_ref(),
10869            config: self.inner.control_event_config,
10870            registry: Some(self.inner.config_registry.as_ref()),
10871            actor: &actor,
10872            eval_ctx: &eval_ctx,
10873        };
10874        auth_store
10875            .delete_policy_with_control_events(id, &control)
10876            .map_err(|e| RedDBError::Query(e.to_string()))?;
10877
10878        let principal = actor_name;
10879        tracing::info!(
10880            target: "audit",
10881            principal = %principal,
10882            action = "iam:policy.drop",
10883            matched_policy_id = %id,
10884            "DROP POLICY applied"
10885        );
10886        self.inner.audit_log.record(
10887            "iam/policy.drop",
10888            &principal,
10889            id,
10890            "ok",
10891            crate::json::Value::Null,
10892        );
10893
10894        self.invalidate_result_cache();
10895        Ok(RuntimeQueryResult::ok_message(
10896            query.to_string(),
10897            &format!("policy `{id}` dropped"),
10898            "drop_iam_policy",
10899        ))
10900    }
10901
10902    fn execute_attach_policy(
10903        &self,
10904        query: &str,
10905        policy_id: &str,
10906        principal: &crate::storage::query::ast::PolicyPrincipalRef,
10907    ) -> RedDBResult<RuntimeQueryResult> {
10908        use crate::auth::store::PrincipalRef;
10909        use crate::auth::UserId;
10910        use crate::storage::query::ast::PolicyPrincipalRef;
10911
10912        let auth_store = self
10913            .inner
10914            .auth_store
10915            .read()
10916            .clone()
10917            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10918        let p = match principal {
10919            PolicyPrincipalRef::User(u) => {
10920                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
10921            }
10922            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
10923        };
10924        let pretty_target = principal_label(principal);
10925        let tenant = current_tenant();
10926        let (actor_name, actor_role) = current_auth_identity()
10927            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10928        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
10929        let eval_ctx = runtime_iam_context(
10930            actor_role,
10931            tenant.as_deref(),
10932            auth_store.principal_is_system_owned(&actor),
10933        );
10934        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
10935        let ledger = self.inner.control_event_ledger.read();
10936        let control = crate::auth::store::PolicyMutationControl {
10937            ctx: &event_ctx,
10938            ledger: ledger.as_ref(),
10939            config: self.inner.control_event_config,
10940            registry: Some(self.inner.config_registry.as_ref()),
10941            actor: &actor,
10942            eval_ctx: &eval_ctx,
10943        };
10944        auth_store
10945            .attach_policy_with_control_events(p, policy_id, &control)
10946            .map_err(|e| RedDBError::Query(e.to_string()))?;
10947
10948        let principal_str = actor_name;
10949        tracing::info!(
10950            target: "audit",
10951            principal = %principal_str,
10952            action = "iam:policy.attach",
10953            matched_policy_id = %policy_id,
10954            target = %pretty_target,
10955            "ATTACH POLICY applied"
10956        );
10957        self.inner.audit_log.record(
10958            "iam/policy.attach",
10959            &principal_str,
10960            &pretty_target,
10961            "ok",
10962            crate::json::Value::Null,
10963        );
10964
10965        self.invalidate_result_cache();
10966        Ok(RuntimeQueryResult::ok_message(
10967            query.to_string(),
10968            &format!("policy `{policy_id}` attached to {pretty_target}"),
10969            "attach_policy",
10970        ))
10971    }
10972
10973    fn execute_detach_policy(
10974        &self,
10975        query: &str,
10976        policy_id: &str,
10977        principal: &crate::storage::query::ast::PolicyPrincipalRef,
10978    ) -> RedDBResult<RuntimeQueryResult> {
10979        use crate::auth::store::PrincipalRef;
10980        use crate::auth::UserId;
10981        use crate::storage::query::ast::PolicyPrincipalRef;
10982
10983        let auth_store = self
10984            .inner
10985            .auth_store
10986            .read()
10987            .clone()
10988            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
10989        let p = match principal {
10990            PolicyPrincipalRef::User(u) => {
10991                PrincipalRef::User(UserId::from_parts(u.tenant.as_deref(), &u.username))
10992            }
10993            PolicyPrincipalRef::Group(g) => PrincipalRef::Group(g.clone()),
10994        };
10995        let pretty_target = principal_label(principal);
10996        let tenant = current_tenant();
10997        let (actor_name, actor_role) = current_auth_identity()
10998            .unwrap_or_else(|| ("anonymous".to_string(), crate::auth::Role::Read));
10999        let actor = crate::auth::UserId::from_parts(tenant.as_deref(), &actor_name);
11000        let eval_ctx = runtime_iam_context(
11001            actor_role,
11002            tenant.as_deref(),
11003            auth_store.principal_is_system_owned(&actor),
11004        );
11005        let event_ctx = self.policy_mutation_control_ctx(&actor, tenant.as_deref());
11006        let ledger = self.inner.control_event_ledger.read();
11007        let control = crate::auth::store::PolicyMutationControl {
11008            ctx: &event_ctx,
11009            ledger: ledger.as_ref(),
11010            config: self.inner.control_event_config,
11011            registry: Some(self.inner.config_registry.as_ref()),
11012            actor: &actor,
11013            eval_ctx: &eval_ctx,
11014        };
11015        auth_store
11016            .detach_policy_with_control_events(p, policy_id, &control)
11017            .map_err(|e| RedDBError::Query(e.to_string()))?;
11018
11019        let principal_str = actor_name;
11020        tracing::info!(
11021            target: "audit",
11022            principal = %principal_str,
11023            action = "iam:policy.detach",
11024            matched_policy_id = %policy_id,
11025            target = %pretty_target,
11026            "DETACH POLICY applied"
11027        );
11028        self.inner.audit_log.record(
11029            "iam/policy.detach",
11030            &principal_str,
11031            &pretty_target,
11032            "ok",
11033            crate::json::Value::Null,
11034        );
11035
11036        self.invalidate_result_cache();
11037        Ok(RuntimeQueryResult::ok_message(
11038            query.to_string(),
11039            &format!("policy `{policy_id}` detached from {pretty_target}"),
11040            "detach_policy",
11041        ))
11042    }
11043
11044    fn execute_show_policies(
11045        &self,
11046        query: &str,
11047        filter: Option<&crate::storage::query::ast::PolicyPrincipalRef>,
11048    ) -> RedDBResult<RuntimeQueryResult> {
11049        use crate::auth::UserId;
11050        use crate::storage::query::ast::PolicyPrincipalRef;
11051        use crate::storage::query::unified::UnifiedRecord;
11052        use crate::storage::schema::Value as SchemaValue;
11053        use std::sync::Arc;
11054
11055        let auth_store = self
11056            .inner
11057            .auth_store
11058            .read()
11059            .clone()
11060            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11061
11062        let pols = match filter {
11063            None => auth_store.list_policies(),
11064            Some(PolicyPrincipalRef::User(u)) => {
11065                let id = UserId::from_parts(u.tenant.as_deref(), &u.username);
11066                auth_store.effective_policies(&id)
11067            }
11068            Some(PolicyPrincipalRef::Group(g)) => auth_store.group_policies(g),
11069        };
11070
11071        let mut records = Vec::with_capacity(pols.len() + 1);
11072
11073        // Header row (#712 / S5A): synthetic record at index 0 that
11074        // reports the active PolicyEnforcementMode and the hard-cutover
11075        // version, so an operator running SHOW POLICIES can see the
11076        // current posture without a separate command.
11077        let mode = auth_store.enforcement_mode();
11078        let mut header = UnifiedRecord::default();
11079        header.set_arc(
11080            Arc::from("id"),
11081            SchemaValue::text("<enforcement_mode>".to_string()),
11082        );
11083        header.set_arc(Arc::from("statements"), SchemaValue::Integer(0));
11084        header.set_arc(Arc::from("tenant"), SchemaValue::Null);
11085        let header_json = format!(
11086            r#"{{"enforcement_mode":"{}","policy_only_hard_version":"{}"}}"#,
11087            mode.as_str(),
11088            crate::auth::enforcement_mode::POLICY_ONLY_HARD_VERSION
11089        );
11090        header.set_arc(Arc::from("json"), SchemaValue::text(header_json));
11091        records.push(header);
11092
11093        for p in pols.iter() {
11094            let mut rec = UnifiedRecord::default();
11095            rec.set_arc(Arc::from("id"), SchemaValue::text(p.id.clone()));
11096            rec.set_arc(
11097                Arc::from("statements"),
11098                SchemaValue::Integer(p.statements.len() as i64),
11099            );
11100            rec.set_arc(
11101                Arc::from("tenant"),
11102                p.tenant
11103                    .as_deref()
11104                    .map(|t| SchemaValue::text(t.to_string()))
11105                    .unwrap_or(SchemaValue::Null),
11106            );
11107            rec.set_arc(Arc::from("json"), SchemaValue::text(p.to_json_string()));
11108            records.push(rec);
11109        }
11110        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11111        result.records = records;
11112        Ok(RuntimeQueryResult {
11113            query: query.to_string(),
11114            mode: crate::storage::query::modes::QueryMode::Sql,
11115            statement: "show_policies",
11116            engine: "iam-policies",
11117            result,
11118            affected_rows: 0,
11119            statement_type: "select",
11120            bookmark: None,
11121        })
11122    }
11123
11124    fn execute_show_effective_permissions(
11125        &self,
11126        query: &str,
11127        user: &crate::storage::query::ast::PolicyUserRef,
11128        resource: Option<&crate::storage::query::ast::PolicyResourceRef>,
11129    ) -> RedDBResult<RuntimeQueryResult> {
11130        use crate::auth::UserId;
11131        use crate::storage::query::unified::UnifiedRecord;
11132        use crate::storage::schema::Value as SchemaValue;
11133        use std::sync::Arc;
11134
11135        let auth_store = self
11136            .inner
11137            .auth_store
11138            .read()
11139            .clone()
11140            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11141        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11142        let pols = auth_store.effective_policies(&id);
11143
11144        // Show one row per (policy, statement) tuple, plus any
11145        // resource-level filter passed by the caller.
11146        let mut records = Vec::new();
11147        for p in pols.iter() {
11148            for (idx, st) in p.statements.iter().enumerate() {
11149                if let Some(_r) = resource {
11150                    // Naive filter: render statement targets to strings
11151                    // and skip if no match. Conservative default = include
11152                    // (the simulator handles fine-grained matching).
11153                }
11154                let mut rec = UnifiedRecord::default();
11155                rec.set_arc(Arc::from("policy_id"), SchemaValue::text(p.id.clone()));
11156                rec.set_arc(
11157                    Arc::from("statement_index"),
11158                    SchemaValue::Integer(idx as i64),
11159                );
11160                rec.set_arc(
11161                    Arc::from("sid"),
11162                    st.sid
11163                        .as_deref()
11164                        .map(|s| SchemaValue::text(s.to_string()))
11165                        .unwrap_or(SchemaValue::Null),
11166                );
11167                rec.set_arc(
11168                    Arc::from("effect"),
11169                    SchemaValue::text(match st.effect {
11170                        crate::auth::policies::Effect::Allow => "allow",
11171                        crate::auth::policies::Effect::Deny => "deny",
11172                    }),
11173                );
11174                rec.set_arc(
11175                    Arc::from("actions"),
11176                    SchemaValue::Integer(st.actions.len() as i64),
11177                );
11178                rec.set_arc(
11179                    Arc::from("resources"),
11180                    SchemaValue::Integer(st.resources.len() as i64),
11181                );
11182                records.push(rec);
11183            }
11184        }
11185        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11186        result.records = records;
11187        Ok(RuntimeQueryResult {
11188            query: query.to_string(),
11189            mode: crate::storage::query::modes::QueryMode::Sql,
11190            statement: "show_effective_permissions",
11191            engine: "iam-policies",
11192            result,
11193            affected_rows: 0,
11194            statement_type: "select",
11195            bookmark: None,
11196        })
11197    }
11198
11199    fn execute_lint_policy(
11200        &self,
11201        query: &str,
11202        source: &crate::storage::query::ast::LintPolicySource,
11203    ) -> RedDBResult<RuntimeQueryResult> {
11204        use crate::auth::policy_linter::lint;
11205        use crate::storage::query::ast::LintPolicySource;
11206        use crate::storage::query::unified::UnifiedRecord;
11207        use crate::storage::schema::Value as SchemaValue;
11208        use std::sync::Arc;
11209
11210        // Resolve the policy text. `JSON` source lints the literal
11211        // verbatim; `Id` source fetches the stored document so
11212        // operators can lint a policy by name without rebuilding the
11213        // JSON from `SHOW POLICY`.
11214        let policy_text = match source {
11215            LintPolicySource::Json(text) => text.clone(),
11216            LintPolicySource::Id(id) => {
11217                let auth_store =
11218                    self.inner.auth_store.read().clone().ok_or_else(|| {
11219                        RedDBError::Query("auth store not configured".to_string())
11220                    })?;
11221                let policy = auth_store
11222                    .get_policy(id)
11223                    .ok_or_else(|| RedDBError::Query(format!("policy `{id}` not found")))?;
11224                policy.to_json_string()
11225            }
11226        };
11227        let diagnostics = lint(&policy_text);
11228
11229        let principal_str = current_auth_identity()
11230            .map(|(u, _)| u)
11231            .unwrap_or_else(|| "anonymous".into());
11232        tracing::info!(
11233            target: "audit",
11234            principal = %principal_str,
11235            action = "iam:policy.lint",
11236            diagnostic_count = diagnostics.len(),
11237            "LINT POLICY issued"
11238        );
11239        self.inner.audit_log.record(
11240            "iam/policy.lint",
11241            &principal_str,
11242            match source {
11243                LintPolicySource::Id(id) => id.as_str(),
11244                LintPolicySource::Json(_) => "<json>",
11245            },
11246            "ok",
11247            crate::json::Value::Null,
11248        );
11249
11250        // One row per diagnostic. Column order matches the HTTP
11251        // surface's JSON keys so the two contracts line up.
11252        const COLUMNS: [&str; 5] = ["severity", "code", "message", "suggested_fix", "location"];
11253        let schema = Arc::new(
11254            COLUMNS
11255                .iter()
11256                .map(|name| Arc::<str>::from(*name))
11257                .collect::<Vec<_>>(),
11258        );
11259        let records: Vec<UnifiedRecord> = diagnostics
11260            .iter()
11261            .map(|d| {
11262                UnifiedRecord::with_schema(
11263                    Arc::clone(&schema),
11264                    vec![
11265                        SchemaValue::text(d.severity.as_str()),
11266                        SchemaValue::text(d.code.as_str()),
11267                        SchemaValue::text(d.message.clone()),
11268                        d.suggested_fix
11269                            .as_deref()
11270                            .map(SchemaValue::text)
11271                            .unwrap_or(SchemaValue::Null),
11272                        d.location
11273                            .as_deref()
11274                            .map(SchemaValue::text)
11275                            .unwrap_or(SchemaValue::Null),
11276                    ],
11277                )
11278            })
11279            .collect();
11280        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11281            COLUMNS.iter().map(|c| c.to_string()).collect(),
11282        );
11283        result.records = records;
11284        Ok(RuntimeQueryResult {
11285            query: query.to_string(),
11286            mode: crate::storage::query::modes::QueryMode::Sql,
11287            statement: "lint_policy",
11288            engine: "iam-policies",
11289            result,
11290            affected_rows: 0,
11291            statement_type: "select",
11292            bookmark: None,
11293        })
11294    }
11295
11296    /// `MIGRATE POLICY MODE TO '<target>' [DRY RUN]` — flip the install
11297    /// from `legacy_rbac` to `policy_only` after the pre-flight delta
11298    /// simulator confirms no non-admin principal would lose access.
11299    /// Issue #714.
11300    fn execute_migrate_policy_mode(
11301        &self,
11302        query: &str,
11303        target: &str,
11304        dry_run: bool,
11305    ) -> RedDBResult<RuntimeQueryResult> {
11306        use crate::auth::enforcement_mode::PolicyEnforcementMode;
11307        use crate::auth::migrate_policy_mode::{
11308            principal_label, simulate_migration_delta, MigratePolicyDelta,
11309        };
11310        use crate::auth::policies::ResourceRef;
11311        use crate::storage::query::unified::UnifiedRecord;
11312        use crate::storage::schema::Value as SchemaValue;
11313        use std::sync::Arc;
11314
11315        // Only `policy_only` is a meaningful destination for this
11316        // command — flipping back to `legacy_rbac` is supported via
11317        // direct config writes (it doesn't need a pre-flight). We
11318        // reject everything else with the same allowlist `parse` uses.
11319        let parsed = PolicyEnforcementMode::parse(target).ok_or_else(|| {
11320            RedDBError::Query(format!(
11321                "MIGRATE POLICY MODE: invalid target `{target}` (expected `policy_only`)"
11322            ))
11323        })?;
11324        if parsed != PolicyEnforcementMode::PolicyOnly {
11325            return Err(RedDBError::Query(format!(
11326                "MIGRATE POLICY MODE: target `{target}` is not supported — only `policy_only` may be migrated to via this command"
11327            )));
11328        }
11329
11330        let auth_store = self
11331            .inner
11332            .auth_store
11333            .read()
11334            .clone()
11335            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11336
11337        // Resource enumeration: every existing collection probed as
11338        // `table:<name>`. This is the realistic resource surface for
11339        // the legacy_rbac fallback (the role floors gate per-table
11340        // actions). Wildcard / column-scoped resources are still
11341        // covered by the policy evaluator because evaluate() resolves
11342        // resource patterns relative to the concrete resources we
11343        // probe here.
11344        let snapshot = self.inner.db.catalog_model_snapshot();
11345        let resources: Vec<ResourceRef> = snapshot
11346            .collections
11347            .iter()
11348            .map(|c| ResourceRef::new("table", c.name.clone()))
11349            .collect();
11350
11351        let now_ms = crate::utils::now_unix_millis() as u128;
11352        let deltas: Vec<MigratePolicyDelta> =
11353            simulate_migration_delta(auth_store.as_ref(), &resources, now_ms);
11354
11355        let principal_str = current_auth_identity()
11356            .map(|(u, _)| u)
11357            .unwrap_or_else(|| "anonymous".into());
11358
11359        // Audit every issuance. The outcome line differentiates
11360        // dry-run, refused, and applied — operators can grep for these
11361        // strings in the audit log.
11362        let outcome_str = if dry_run {
11363            "dry_run"
11364        } else if deltas.is_empty() {
11365            "applied"
11366        } else {
11367            "refused"
11368        };
11369        tracing::info!(
11370            target: "audit",
11371            principal = %principal_str,
11372            action = "iam:policy.migrate_mode",
11373            target = %target,
11374            dry_run,
11375            delta_count = deltas.len(),
11376            outcome = outcome_str,
11377            "MIGRATE POLICY MODE issued"
11378        );
11379        self.inner.audit_log.record(
11380            "iam/policy.migrate_mode",
11381            &principal_str,
11382            target,
11383            outcome_str,
11384            crate::json::Value::Null,
11385        );
11386
11387        // Refuse the non-dry-run path when any principal would lose
11388        // access. The error string carries a compact summary plus the
11389        // delta count so operators can re-run with DRY RUN to inspect.
11390        if !dry_run && !deltas.is_empty() {
11391            let summary = deltas
11392                .iter()
11393                .take(5)
11394                .map(|d| {
11395                    format!(
11396                        "{}:{}/{}:{}",
11397                        principal_label(&d.principal),
11398                        d.action,
11399                        d.resource_kind,
11400                        d.resource_name
11401                    )
11402                })
11403                .collect::<Vec<_>>()
11404                .join(", ");
11405            let more = if deltas.len() > 5 {
11406                format!(" (and {} more)", deltas.len() - 5)
11407            } else {
11408                String::new()
11409            };
11410            return Err(RedDBError::Query(format!(
11411                "MIGRATE POLICY MODE refused: {n} principal/action/resource pair(s) would lose access under `policy_only`. Run `MIGRATE POLICY MODE TO '{target}' DRY RUN` to inspect. Sample: {summary}{more}",
11412                n = deltas.len(),
11413            )));
11414        }
11415
11416        // Mutate the live enforcement mode only on the non-dry-run
11417        // path with an empty delta. `set_enforcement_mode` also
11418        // persists to vault_kv so the new mode survives restart.
11419        if !dry_run {
11420            auth_store.set_enforcement_mode(parsed);
11421        }
11422
11423        const COLUMNS: [&str; 5] = [
11424            "principal",
11425            "role",
11426            "action",
11427            "resource_kind",
11428            "resource_name",
11429        ];
11430        let schema = Arc::new(
11431            COLUMNS
11432                .iter()
11433                .map(|name| Arc::<str>::from(*name))
11434                .collect::<Vec<_>>(),
11435        );
11436        let records: Vec<UnifiedRecord> = deltas
11437            .iter()
11438            .map(|d| {
11439                UnifiedRecord::with_schema(
11440                    Arc::clone(&schema),
11441                    vec![
11442                        SchemaValue::text(principal_label(&d.principal)),
11443                        SchemaValue::text(d.role.as_str()),
11444                        SchemaValue::text(d.action.clone()),
11445                        SchemaValue::text(d.resource_kind.clone()),
11446                        SchemaValue::text(d.resource_name.clone()),
11447                    ],
11448                )
11449            })
11450            .collect();
11451        let mut result = crate::storage::query::unified::UnifiedResult::with_columns(
11452            COLUMNS.iter().map(|c| c.to_string()).collect(),
11453        );
11454        result.records = records;
11455        Ok(RuntimeQueryResult {
11456            query: query.to_string(),
11457            mode: crate::storage::query::modes::QueryMode::Sql,
11458            statement: "migrate_policy_mode",
11459            engine: "iam-policies",
11460            result,
11461            affected_rows: 0,
11462            statement_type: "select",
11463            bookmark: None,
11464        })
11465    }
11466
11467    fn execute_simulate_policy(
11468        &self,
11469        query: &str,
11470        user: &crate::storage::query::ast::PolicyUserRef,
11471        action: &str,
11472        resource: &crate::storage::query::ast::PolicyResourceRef,
11473    ) -> RedDBResult<RuntimeQueryResult> {
11474        use crate::auth::policies::ResourceRef;
11475        use crate::auth::store::SimCtx;
11476        use crate::auth::UserId;
11477        use crate::storage::query::unified::UnifiedRecord;
11478        use crate::storage::schema::Value as SchemaValue;
11479        use std::sync::Arc;
11480
11481        let auth_store = self
11482            .inner
11483            .auth_store
11484            .read()
11485            .clone()
11486            .ok_or_else(|| RedDBError::Query("auth store not configured".to_string()))?;
11487        let id = UserId::from_parts(user.tenant.as_deref(), &user.username);
11488        let r = ResourceRef::new(resource.kind.clone(), resource.name.clone());
11489        let outcome = auth_store.simulate(&id, action, &r, SimCtx::default());
11490
11491        let principal_str = current_auth_identity()
11492            .map(|(u, _)| u)
11493            .unwrap_or_else(|| "anonymous".into());
11494        let (decision_str, matched_pid, matched_sid) = decision_to_strings(&outcome.decision);
11495        tracing::info!(
11496            target: "audit",
11497            principal = %principal_str,
11498            action = "iam:policy.simulate",
11499            decision = %decision_str,
11500            matched_policy_id = ?matched_pid,
11501            matched_sid = ?matched_sid,
11502            "SIMULATE issued"
11503        );
11504        self.inner.audit_log.record(
11505            "iam/policy.simulate",
11506            &principal_str,
11507            &id.to_string(),
11508            "ok",
11509            crate::json::Value::Null,
11510        );
11511
11512        let mut rec = UnifiedRecord::default();
11513        rec.set_arc(Arc::from("decision"), SchemaValue::text(decision_str));
11514        rec.set_arc(
11515            Arc::from("matched_policy_id"),
11516            matched_pid
11517                .map(SchemaValue::text)
11518                .unwrap_or(SchemaValue::Null),
11519        );
11520        rec.set_arc(
11521            Arc::from("matched_sid"),
11522            matched_sid
11523                .map(SchemaValue::text)
11524                .unwrap_or(SchemaValue::Null),
11525        );
11526        rec.set_arc(Arc::from("reason"), SchemaValue::text(outcome.reason));
11527        rec.set_arc(
11528            Arc::from("trail_len"),
11529            SchemaValue::Integer(outcome.trail.len() as i64),
11530        );
11531        let mut result = crate::storage::query::unified::UnifiedResult::empty();
11532        result.records = vec![rec];
11533        Ok(RuntimeQueryResult {
11534            query: query.to_string(),
11535            mode: crate::storage::query::modes::QueryMode::Sql,
11536            statement: "simulate_policy",
11537            engine: "iam-policies",
11538            result,
11539            affected_rows: 0,
11540            statement_type: "select",
11541            bookmark: None,
11542        })
11543    }
11544}
11545
11546/// Translate a parsed GRANT into a synthetic IAM policy whose id
11547/// starts with `_grant_<unique>`. PUBLIC is represented as an
11548/// implicit IAM group; legacy GROUP grants are still rejected by the
11549/// grant store and are not translated here.
11550fn grant_to_iam_policy(
11551    principal: &crate::auth::privileges::GrantPrincipal,
11552    resource: &crate::auth::privileges::Resource,
11553    actions: &[crate::auth::privileges::Action],
11554    tenant: Option<&str>,
11555) -> Option<crate::auth::policies::Policy> {
11556    use crate::auth::policies::{
11557        compile_action, ActionPattern, Effect, Policy, ResourcePattern, Statement,
11558    };
11559    use crate::auth::privileges::{Action, GrantPrincipal, Resource};
11560
11561    if matches!(principal, GrantPrincipal::Group(_)) {
11562        return None;
11563    }
11564
11565    let now = crate::auth::now_ms();
11566    let id = format!("_grant_{:x}_{:x}", now, std::process::id());
11567
11568    let resource_str = match resource {
11569        Resource::Database => "table:*".to_string(),
11570        Resource::Schema(s) => format!("table:{s}.*"),
11571        Resource::Table { schema, table } => match schema {
11572            Some(s) => format!("table:{s}.{table}"),
11573            None => format!("table:{table}"),
11574        },
11575        Resource::Function { schema, name } => match schema {
11576            Some(s) => format!("function:{s}.{name}"),
11577            None => format!("function:{name}"),
11578        },
11579    };
11580
11581    // Compile actions — fall back to `*` only when the grant included
11582    // `Action::All`. Map every other action keyword to its lowercase
11583    // form so it lines up with the kernel's allowlist.
11584    let action_patterns: Vec<ActionPattern> = if actions.contains(&Action::All) {
11585        vec![ActionPattern::Wildcard]
11586    } else {
11587        actions
11588            .iter()
11589            .map(|a| compile_action(&a.as_str().to_ascii_lowercase()))
11590            .collect()
11591    };
11592    if action_patterns.is_empty() {
11593        return None;
11594    }
11595
11596    // Inline resource compilation matching the kernel's `compile_resource`:
11597    //   * `*` → wildcard
11598    //   * contains `*` → glob
11599    //   * `kind:name` → exact
11600    let resource_patterns = if resource_str == "*" {
11601        vec![ResourcePattern::Wildcard]
11602    } else if resource_str.contains('*') {
11603        vec![ResourcePattern::Glob(resource_str.clone())]
11604    } else if let Some((kind, name)) = resource_str.split_once(':') {
11605        vec![ResourcePattern::Exact {
11606            kind: kind.to_string(),
11607            name: name.to_string(),
11608        }]
11609    } else {
11610        vec![ResourcePattern::Wildcard]
11611    };
11612
11613    let policy = Policy {
11614        id,
11615        version: 1,
11616        tenant: tenant.map(|t| t.to_string()),
11617        created_at: now,
11618        updated_at: now,
11619        statements: vec![Statement {
11620            sid: None,
11621            effect: Effect::Allow,
11622            actions: action_patterns,
11623            resources: resource_patterns,
11624            condition: None,
11625        }],
11626    };
11627    if policy.validate().is_err() {
11628        return None;
11629    }
11630    Some(policy)
11631}
11632
11633/// Coerce a `key => <number>` table-function named argument into a positive
11634/// iteration count for the centrality TVFs (issue #797). The parser lexes all
11635/// named values as `f64`, so an integral, finite, strictly-positive value is
11636/// required here; anything else (fractional, zero, negative, NaN/inf) is a
11637/// clear query error. `func` names the function for the message.
11638fn parse_positive_iterations(func: &str, value: &f64) -> RedDBResult<usize> {
11639    if !value.is_finite() || *value < 1.0 || value.fract() != 0.0 {
11640        return Err(RedDBError::Query(format!(
11641            "table function '{func}' max_iterations must be a positive integer, got {value}"
11642        )));
11643    }
11644    Ok(*value as usize)
11645}
11646
11647fn legacy_action_to_iam(action: crate::auth::privileges::Action) -> &'static str {
11648    use crate::auth::privileges::Action;
11649    match action {
11650        Action::Select => "select",
11651        Action::Insert => "insert",
11652        Action::Update => "update",
11653        Action::Delete => "delete",
11654        Action::Truncate => "truncate",
11655        Action::References => "references",
11656        Action::Execute => "execute",
11657        Action::Usage => "usage",
11658        Action::All => "*",
11659    }
11660}
11661
11662fn update_set_target_columns(query: &crate::storage::query::ast::UpdateQuery) -> Vec<String> {
11663    let mut columns = Vec::new();
11664    for (column, _) in &query.assignment_exprs {
11665        if !columns.iter().any(|seen| seen == column) {
11666            columns.push(column.clone());
11667        }
11668    }
11669    columns
11670}
11671
11672fn column_access_request_for_table_update(
11673    table_name: &str,
11674    columns: Vec<String>,
11675) -> crate::auth::ColumnAccessRequest {
11676    match table_name.split_once('.') {
11677        Some((schema, table)) => {
11678            crate::auth::ColumnAccessRequest::update(table.to_string(), columns)
11679                .with_schema(schema.to_string())
11680        }
11681        None => crate::auth::ColumnAccessRequest::update(table_name.to_string(), columns),
11682    }
11683}
11684
11685fn column_access_request_for_table_select(
11686    table_name: &str,
11687    columns: Vec<String>,
11688) -> crate::auth::ColumnAccessRequest {
11689    match table_name.split_once('.') {
11690        Some((schema, table)) => {
11691            crate::auth::ColumnAccessRequest::select(table.to_string(), columns)
11692                .with_schema(schema.to_string())
11693        }
11694        None => crate::auth::ColumnAccessRequest::select(table_name.to_string(), columns),
11695    }
11696}
11697
11698fn update_returning_columns_for_policy(
11699    runtime: &RedDBRuntime,
11700    query: &crate::storage::query::ast::UpdateQuery,
11701) -> Option<Vec<String>> {
11702    let items = query.returning.as_ref()?;
11703    let mut columns = Vec::new();
11704    let project_all = items
11705        .iter()
11706        .any(|item| matches!(item, crate::storage::query::ast::ReturningItem::All));
11707    if project_all {
11708        collect_returning_star_columns(runtime, query, &mut columns);
11709    } else {
11710        for item in items {
11711            let crate::storage::query::ast::ReturningItem::Column(column) = item else {
11712                continue;
11713            };
11714            push_returning_policy_column(&mut columns, column);
11715        }
11716    }
11717    (!columns.is_empty()).then_some(columns)
11718}
11719
11720fn collect_returning_star_columns(
11721    runtime: &RedDBRuntime,
11722    query: &crate::storage::query::ast::UpdateQuery,
11723    columns: &mut Vec<String>,
11724) {
11725    let store = runtime.db().store();
11726    let Some(manager) = store.get_collection(&query.table) else {
11727        return;
11728    };
11729    if let Some(schema) = manager.column_schema() {
11730        for column in schema.iter() {
11731            push_returning_policy_column(columns, column);
11732        }
11733    }
11734    for entity in manager.query_all(|_| true) {
11735        if !returning_entity_matches_update_target(&entity, query.target) {
11736            continue;
11737        }
11738        match &entity.data {
11739            crate::storage::EntityData::Row(row) => {
11740                for (column, _) in row.iter_fields() {
11741                    push_returning_policy_column(columns, column);
11742                }
11743            }
11744            crate::storage::EntityData::Node(node) => {
11745                push_returning_policy_column(columns, "label");
11746                push_returning_policy_column(columns, "node_type");
11747                for column in node.properties.keys() {
11748                    push_returning_policy_column(columns, column);
11749                }
11750            }
11751            crate::storage::EntityData::Edge(edge) => {
11752                push_returning_policy_column(columns, "label");
11753                push_returning_policy_column(columns, "from_rid");
11754                push_returning_policy_column(columns, "to_rid");
11755                push_returning_policy_column(columns, "weight");
11756                for column in edge.properties.keys() {
11757                    push_returning_policy_column(columns, column);
11758                }
11759            }
11760            _ => {}
11761        }
11762    }
11763}
11764
11765fn push_returning_policy_column(columns: &mut Vec<String>, column: &str) {
11766    if returning_public_envelope_column(column) {
11767        return;
11768    }
11769    if !columns.iter().any(|seen| seen == column) {
11770        columns.push(column.to_string());
11771    }
11772}
11773
/// Report whether `column` is one of the fixed envelope column names
/// (`rid`, `collection`, `kind`, `tenant`, `created_at`, `updated_at`,
/// `red_entity_id`), compared ASCII-case-insensitively.
///
/// The comparison is done in place with `eq_ignore_ascii_case`, so no
/// lowercase copy of `column` is allocated. This helper runs once per
/// candidate column while RETURNING columns are collected, so the per-call
/// allocation was pure overhead.
fn returning_public_envelope_column(column: &str) -> bool {
    const ENVELOPE_COLUMNS: [&str; 7] = [
        "rid",
        "collection",
        "kind",
        "tenant",
        "created_at",
        "updated_at",
        "red_entity_id",
    ];
    ENVELOPE_COLUMNS
        .iter()
        .any(|name| column.eq_ignore_ascii_case(name))
}
11780
11781fn returning_entity_matches_update_target(
11782    entity: &crate::storage::UnifiedEntity,
11783    target: crate::storage::query::ast::UpdateTarget,
11784) -> bool {
11785    use crate::storage::query::ast::UpdateTarget;
11786    match target {
11787        UpdateTarget::Rows => {
11788            matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Row))
11789        }
11790        UpdateTarget::Documents => {
11791            matches!(
11792                returning_row_item_kind(entity),
11793                Some(ReturningRowKind::Document)
11794            )
11795        }
11796        UpdateTarget::Kv => matches!(returning_row_item_kind(entity), Some(ReturningRowKind::Kv)),
11797        UpdateTarget::Nodes => matches!(
11798            (&entity.kind, &entity.data),
11799            (
11800                crate::storage::EntityKind::GraphNode(_),
11801                crate::storage::EntityData::Node(_)
11802            )
11803        ),
11804        UpdateTarget::Edges => matches!(
11805            (&entity.kind, &entity.data),
11806            (
11807                crate::storage::EntityKind::GraphEdge(_),
11808                crate::storage::EntityData::Edge(_)
11809            )
11810        ),
11811    }
11812}
11813
11814#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11815enum ReturningRowKind {
11816    Row,
11817    Document,
11818    Kv,
11819}
11820
11821fn returning_row_item_kind(entity: &crate::storage::UnifiedEntity) -> Option<ReturningRowKind> {
11822    let row = entity.data.as_row()?;
11823    let is_kv = row.iter_fields().all(|(column, _)| {
11824        column.eq_ignore_ascii_case("key") || column.eq_ignore_ascii_case("value")
11825    });
11826    if is_kv {
11827        return Some(ReturningRowKind::Kv);
11828    }
11829    let is_document = row
11830        .iter_fields()
11831        .any(|(_, value)| matches!(value, crate::storage::schema::Value::Json(_)));
11832    if is_document {
11833        Some(ReturningRowKind::Document)
11834    } else {
11835        Some(ReturningRowKind::Row)
11836    }
11837}
11838
11839fn requested_table_columns_for_policy(
11840    table: &crate::storage::query::ast::TableQuery,
11841) -> Vec<String> {
11842    use crate::storage::query::sql_lowering::{
11843        effective_table_filter, effective_table_group_by_exprs, effective_table_having_filter,
11844        effective_table_projections,
11845    };
11846
11847    let table_name = table.table.as_str();
11848    let table_alias = table.alias.as_deref();
11849    let mut columns = std::collections::BTreeSet::new();
11850
11851    for projection in effective_table_projections(table) {
11852        collect_projection_columns(&projection, table_name, table_alias, &mut columns);
11853    }
11854    if let Some(filter) = effective_table_filter(table) {
11855        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
11856    }
11857    for expr in effective_table_group_by_exprs(table) {
11858        collect_expr_columns(&expr, table_name, table_alias, &mut columns);
11859    }
11860    if let Some(filter) = effective_table_having_filter(table) {
11861        collect_filter_columns(&filter, table_name, table_alias, &mut columns);
11862    }
11863    for order in &table.order_by {
11864        if let Some(expr) = order.expr.as_ref() {
11865            collect_expr_columns(expr, table_name, table_alias, &mut columns);
11866        } else {
11867            collect_field_ref_column(&order.field, table_name, table_alias, &mut columns);
11868        }
11869    }
11870
11871    columns.into_iter().collect()
11872}
11873
11874fn collect_projection_columns(
11875    projection: &crate::storage::query::ast::Projection,
11876    table_name: &str,
11877    table_alias: Option<&str>,
11878    columns: &mut std::collections::BTreeSet<String>,
11879) {
11880    use crate::storage::query::ast::Projection;
11881    match projection {
11882        Projection::All => {
11883            columns.insert("*".to_string());
11884        }
11885        Projection::Column(column) | Projection::Alias(column, _) => {
11886            if column != "*" {
11887                columns.insert(column.clone());
11888            }
11889        }
11890        Projection::Function(_, args) => {
11891            for arg in args {
11892                collect_projection_columns(arg, table_name, table_alias, columns);
11893            }
11894        }
11895        Projection::Expression(filter, _) => {
11896            collect_filter_columns(filter, table_name, table_alias, columns);
11897        }
11898        Projection::Field(field, _) => {
11899            collect_field_ref_column(field, table_name, table_alias, columns);
11900        }
11901        // Slice 7a (#589): no runtime support yet; recurse into args so
11902        // any column references are still tracked in case a future
11903        // executor needs the column set.
11904        Projection::Window { args, .. } => {
11905            for arg in args {
11906                collect_projection_columns(arg, table_name, table_alias, columns);
11907            }
11908        }
11909    }
11910}
11911
11912fn collect_filter_columns(
11913    filter: &crate::storage::query::ast::Filter,
11914    table_name: &str,
11915    table_alias: Option<&str>,
11916    columns: &mut std::collections::BTreeSet<String>,
11917) {
11918    use crate::storage::query::ast::Filter;
11919    match filter {
11920        Filter::Compare { field, .. }
11921        | Filter::IsNull(field)
11922        | Filter::IsNotNull(field)
11923        | Filter::In { field, .. }
11924        | Filter::Between { field, .. }
11925        | Filter::Like { field, .. }
11926        | Filter::StartsWith { field, .. }
11927        | Filter::EndsWith { field, .. }
11928        | Filter::Contains { field, .. } => {
11929            collect_field_ref_column(field, table_name, table_alias, columns);
11930        }
11931        Filter::CompareFields { left, right, .. } => {
11932            collect_field_ref_column(left, table_name, table_alias, columns);
11933            collect_field_ref_column(right, table_name, table_alias, columns);
11934        }
11935        Filter::CompareExpr { lhs, rhs, .. } => {
11936            collect_expr_columns(lhs, table_name, table_alias, columns);
11937            collect_expr_columns(rhs, table_name, table_alias, columns);
11938        }
11939        Filter::And(left, right) | Filter::Or(left, right) => {
11940            collect_filter_columns(left, table_name, table_alias, columns);
11941            collect_filter_columns(right, table_name, table_alias, columns);
11942        }
11943        Filter::Not(inner) => collect_filter_columns(inner, table_name, table_alias, columns),
11944    }
11945}
11946
11947fn collect_expr_columns(
11948    expr: &crate::storage::query::ast::Expr,
11949    table_name: &str,
11950    table_alias: Option<&str>,
11951    columns: &mut std::collections::BTreeSet<String>,
11952) {
11953    use crate::storage::query::ast::Expr;
11954    match expr {
11955        Expr::Column { field, .. } => {
11956            collect_field_ref_column(field, table_name, table_alias, columns);
11957        }
11958        Expr::Literal { .. } | Expr::Parameter { .. } => {}
11959        Expr::UnaryOp { operand, .. } | Expr::Cast { inner: operand, .. } => {
11960            collect_expr_columns(operand, table_name, table_alias, columns);
11961        }
11962        Expr::BinaryOp { lhs, rhs, .. } => {
11963            collect_expr_columns(lhs, table_name, table_alias, columns);
11964            collect_expr_columns(rhs, table_name, table_alias, columns);
11965        }
11966        Expr::FunctionCall { args, .. } => {
11967            for arg in args {
11968                collect_expr_columns(arg, table_name, table_alias, columns);
11969            }
11970        }
11971        Expr::Case {
11972            branches, else_, ..
11973        } => {
11974            for (condition, value) in branches {
11975                collect_expr_columns(condition, table_name, table_alias, columns);
11976                collect_expr_columns(value, table_name, table_alias, columns);
11977            }
11978            if let Some(value) = else_ {
11979                collect_expr_columns(value, table_name, table_alias, columns);
11980            }
11981        }
11982        Expr::IsNull { operand, .. } => {
11983            collect_expr_columns(operand, table_name, table_alias, columns);
11984        }
11985        Expr::InList { target, values, .. } => {
11986            collect_expr_columns(target, table_name, table_alias, columns);
11987            for value in values {
11988                collect_expr_columns(value, table_name, table_alias, columns);
11989            }
11990        }
11991        Expr::Between {
11992            target, low, high, ..
11993        } => {
11994            collect_expr_columns(target, table_name, table_alias, columns);
11995            collect_expr_columns(low, table_name, table_alias, columns);
11996            collect_expr_columns(high, table_name, table_alias, columns);
11997        }
11998        Expr::Subquery { .. } => {}
11999        Expr::WindowFunctionCall { args, window, .. } => {
12000            for arg in args {
12001                collect_expr_columns(arg, table_name, table_alias, columns);
12002            }
12003            for e in &window.partition_by {
12004                collect_expr_columns(e, table_name, table_alias, columns);
12005            }
12006            for o in &window.order_by {
12007                collect_expr_columns(&o.expr, table_name, table_alias, columns);
12008            }
12009        }
12010    }
12011}
12012
12013fn collect_field_ref_column(
12014    field: &crate::storage::query::ast::FieldRef,
12015    table_name: &str,
12016    table_alias: Option<&str>,
12017    columns: &mut std::collections::BTreeSet<String>,
12018) {
12019    if let Some(column) = policy_column_name_from_field_ref(field, table_name, table_alias) {
12020        if column != "*" {
12021            columns.insert(column);
12022        }
12023    }
12024}
12025
12026fn policy_column_name_from_field_ref(
12027    field: &crate::storage::query::ast::FieldRef,
12028    table_name: &str,
12029    table_alias: Option<&str>,
12030) -> Option<String> {
12031    match field {
12032        crate::storage::query::ast::FieldRef::TableColumn { table, column } => {
12033            if column == "*" {
12034                return Some("*".to_string());
12035            }
12036            if table.is_empty() || table == table_name || Some(table.as_str()) == table_alias {
12037                Some(column.clone())
12038            } else {
12039                Some(format!("{table}.{column}"))
12040            }
12041        }
12042        _ => None,
12043    }
12044}
12045
12046fn legacy_resource_to_iam(
12047    resource: &crate::auth::privileges::Resource,
12048    tenant: Option<&str>,
12049) -> crate::auth::policies::ResourceRef {
12050    use crate::auth::privileges::Resource;
12051
12052    let (kind, name) = match resource {
12053        Resource::Database => ("database".to_string(), "*".to_string()),
12054        Resource::Schema(s) => ("schema".to_string(), format!("{s}.*")),
12055        Resource::Table { schema, table } => (
12056            "table".to_string(),
12057            match schema {
12058                Some(s) => format!("{s}.{table}"),
12059                None => table.clone(),
12060            },
12061        ),
12062        Resource::Function { schema, name } => (
12063            "function".to_string(),
12064            match schema {
12065                Some(s) => format!("{s}.{name}"),
12066                None => name.clone(),
12067            },
12068        ),
12069    };
12070
12071    let mut out = crate::auth::policies::ResourceRef::new(kind, name);
12072    if let Some(t) = tenant {
12073        out = out.with_tenant(t.to_string());
12074    }
12075    out
12076}
12077
12078#[derive(Debug)]
12079struct JoinTableSide {
12080    table: String,
12081    alias: String,
12082}
12083
12084fn table_side_context(expr: &QueryExpr) -> Option<JoinTableSide> {
12085    match expr {
12086        QueryExpr::Table(table) => Some(JoinTableSide {
12087            table: table.table.clone(),
12088            alias: table.alias.clone().unwrap_or_else(|| table.table.clone()),
12089        }),
12090        _ => None,
12091    }
12092}
12093
12094fn collect_projection_columns_for_table(
12095    projection: &Projection,
12096    table: &str,
12097    alias: Option<&str>,
12098    out: &mut BTreeSet<String>,
12099) {
12100    match projection {
12101        Projection::Column(column) | Projection::Alias(column, _) => {
12102            match split_qualified_column(column) {
12103                Some((qualifier, column))
12104                    if qualifier == table || alias.is_some_and(|alias| qualifier == alias) =>
12105                {
12106                    push_policy_column(column, out);
12107                }
12108                Some(_) => {}
12109                None => push_policy_column(column, out),
12110            }
12111        }
12112        Projection::Field(
12113            FieldRef::TableColumn {
12114                table: qualifier,
12115                column,
12116            },
12117            _,
12118        ) => {
12119            if qualifier.is_empty()
12120                || qualifier == table
12121                || alias.is_some_and(|alias| qualifier == alias)
12122            {
12123                push_policy_column(column, out);
12124            }
12125        }
12126        Projection::Field(
12127            FieldRef::NodeProperty {
12128                alias: qualifier,
12129                property,
12130            },
12131            _,
12132        )
12133        | Projection::Field(
12134            FieldRef::EdgeProperty {
12135                alias: qualifier,
12136                property,
12137            },
12138            _,
12139        ) => {
12140            if qualifier == table || alias.is_some_and(|alias| qualifier == alias) {
12141                push_policy_column(property, out);
12142            }
12143        }
12144        Projection::Function(_, args) => {
12145            for arg in args {
12146                collect_projection_columns_for_table(arg, table, alias, out);
12147            }
12148        }
12149        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12150        Projection::Window { args, .. } => {
12151            for arg in args {
12152                collect_projection_columns_for_table(arg, table, alias, out);
12153            }
12154        }
12155    }
12156}
12157
12158fn collect_projection_columns_for_join_side(
12159    projection: &Projection,
12160    left: Option<&JoinTableSide>,
12161    right: Option<&JoinTableSide>,
12162    out: &mut HashMap<String, BTreeSet<String>>,
12163) -> RedDBResult<()> {
12164    match projection {
12165        Projection::Column(column) | Projection::Alias(column, _) => {
12166            if let Some((qualifier, column)) = split_qualified_column(column) {
12167                push_qualified_join_column(qualifier, column, left, right, out);
12168            } else {
12169                push_unqualified_join_column(column, left, right, out);
12170            }
12171        }
12172        Projection::Field(FieldRef::TableColumn { table, column }, _) => {
12173            if table.is_empty() {
12174                push_unqualified_join_column(column, left, right, out);
12175            } else if let Some(side) = [left, right]
12176                .into_iter()
12177                .flatten()
12178                .find(|side| table == side.table.as_str() || table == side.alias.as_str())
12179            {
12180                push_join_column(&side.table, column, out);
12181            }
12182        }
12183        Projection::Field(FieldRef::NodeProperty { alias, property }, _)
12184        | Projection::Field(FieldRef::EdgeProperty { alias, property }, _) => {
12185            push_qualified_join_column(alias, property, left, right, out);
12186        }
12187        Projection::Function(_, args) => {
12188            for arg in args {
12189                collect_projection_columns_for_join_side(arg, left, right, out)?;
12190            }
12191        }
12192        Projection::Expression(_, _) | Projection::All | Projection::Field(_, _) => {}
12193        Projection::Window { args, .. } => {
12194            for arg in args {
12195                collect_projection_columns_for_join_side(arg, left, right, out)?;
12196            }
12197        }
12198    }
12199    Ok(())
12200}
12201
/// Split a `qualifier.column` name at its single dot.
///
/// Returns `None` when there is no dot, when either part is empty, or when
/// the part after the first dot contains another dot.
fn split_qualified_column(column: &str) -> Option<(&str, &str)> {
    column.split_once('.').filter(|(qualifier, name)| {
        !qualifier.is_empty() && !name.is_empty() && !name.contains('.')
    })
}
12209
12210fn push_qualified_join_column(
12211    qualifier: &str,
12212    column: &str,
12213    left: Option<&JoinTableSide>,
12214    right: Option<&JoinTableSide>,
12215    out: &mut HashMap<String, BTreeSet<String>>,
12216) {
12217    if let Some(side) = [left, right]
12218        .into_iter()
12219        .flatten()
12220        .find(|side| qualifier == side.table.as_str() || qualifier == side.alias.as_str())
12221    {
12222        push_join_column(&side.table, column, out);
12223    }
12224}
12225
12226fn push_unqualified_join_column(
12227    column: &str,
12228    left: Option<&JoinTableSide>,
12229    right: Option<&JoinTableSide>,
12230    out: &mut HashMap<String, BTreeSet<String>>,
12231) {
12232    for side in [left, right].into_iter().flatten() {
12233        push_join_column(&side.table, column, out);
12234    }
12235}
12236
12237fn push_join_column(table: &str, column: &str, out: &mut HashMap<String, BTreeSet<String>>) {
12238    if is_policy_column_name(column) {
12239        out.entry(table.to_string())
12240            .or_default()
12241            .insert(column.to_string());
12242    }
12243}
12244
12245fn push_policy_column(column: &str, out: &mut BTreeSet<String>) {
12246    if is_policy_column_name(column) {
12247        out.insert(column.to_string());
12248    }
12249}
12250
/// Report whether `column` is a name the column policy should see.
///
/// Rejected: the empty string, the bare wildcard `"*"`, and any name that
/// starts with the `LIT:` or `TYPE:` prefix (case-sensitive).
fn is_policy_column_name(column: &str) -> bool {
    const PSEUDO_PREFIXES: [&str; 2] = ["LIT:", "TYPE:"];
    match column {
        "" | "*" => false,
        name => !PSEUDO_PREFIXES
            .iter()
            .any(|prefix| name.starts_with(*prefix)),
    }
}
12257
12258fn runtime_iam_context(
12259    role: crate::auth::Role,
12260    tenant: Option<&str>,
12261    principal_is_system_owned: bool,
12262) -> crate::auth::policies::EvalContext {
12263    crate::auth::policies::EvalContext {
12264        principal_tenant: tenant.map(|t| t.to_string()),
12265        current_tenant: tenant.map(|t| t.to_string()),
12266        peer_ip: None,
12267        mfa_present: false,
12268        now_ms: crate::auth::now_ms(),
12269        principal_is_admin_role: role == crate::auth::Role::Admin,
12270        principal_is_system_owned,
12271        principal_is_platform_scoped: tenant.is_none(),
12272    }
12273}
12274
12275fn explicit_table_projection_columns(
12276    query: &crate::storage::query::ast::TableQuery,
12277) -> Vec<String> {
12278    use crate::storage::query::ast::{FieldRef, Projection};
12279
12280    let mut columns = Vec::new();
12281    for projection in crate::storage::query::sql_lowering::effective_table_projections(query) {
12282        match projection {
12283            Projection::Column(column) | Projection::Alias(column, _) => {
12284                push_unique(&mut columns, column)
12285            }
12286            Projection::Field(FieldRef::TableColumn { column, .. }, _) => {
12287                push_unique(&mut columns, column)
12288            }
12289            // SELECT * and expression/function projections need the
12290            // executor-wide column-policy context mapped in
12291            // docs/security/select-relational-column-policy-audit-2026-05-08.md.
12292            _ => {}
12293        }
12294    }
12295    columns
12296}
12297
12298fn explicit_graph_projection_properties(
12299    query: &crate::storage::query::ast::GraphQuery,
12300) -> Vec<String> {
12301    use crate::storage::query::ast::{FieldRef, Projection};
12302
12303    let mut columns = Vec::new();
12304    for projection in &query.return_ {
12305        match projection {
12306            Projection::Field(FieldRef::NodeProperty { property, .. }, _)
12307            | Projection::Field(FieldRef::EdgeProperty { property, .. }, _) => {
12308                push_unique(&mut columns, property.clone())
12309            }
12310            _ => {}
12311        }
12312    }
12313    columns
12314}
12315
/// Append `column` to `columns` unless an equal string is already present
/// (exact, case-sensitive comparison). Existing order is preserved.
fn push_unique(columns: &mut Vec<String>, column: String) {
    if columns.contains(&column) {
        return;
    }
    columns.push(column);
}
12321
12322fn principal_label(p: &crate::storage::query::ast::PolicyPrincipalRef) -> String {
12323    use crate::storage::query::ast::PolicyPrincipalRef;
12324    match p {
12325        PolicyPrincipalRef::User(u) => match &u.tenant {
12326            Some(t) => format!("user:{t}/{}", u.username),
12327            None => format!("user:{}", u.username),
12328        },
12329        PolicyPrincipalRef::Group(g) => format!("group:{g}"),
12330    }
12331}
12332
12333/// Render a `Decision` into the (decision, matched_policy_id, matched_sid)
12334/// shape used by every audit emit + the simulator response.
12335pub(crate) fn decision_to_strings(
12336    d: &crate::auth::policies::Decision,
12337) -> (String, Option<String>, Option<String>) {
12338    use crate::auth::policies::Decision;
12339    match d {
12340        Decision::Allow {
12341            matched_policy_id,
12342            matched_sid,
12343        } => (
12344            "allow".into(),
12345            Some(matched_policy_id.clone()),
12346            matched_sid.clone(),
12347        ),
12348        Decision::Deny {
12349            matched_policy_id,
12350            matched_sid,
12351        } => (
12352            "deny".into(),
12353            Some(matched_policy_id.clone()),
12354            matched_sid.clone(),
12355        ),
12356        Decision::DefaultDeny => ("default_deny".into(), None, None),
12357        Decision::AdminBypass => ("admin_bypass".into(), None, None),
12358    }
12359}
12360
12361fn relation_scopes_for_query(query: &QueryExpr) -> Vec<String> {
12362    let mut scopes = Vec::new();
12363    collect_relation_scopes(query, &mut scopes);
12364    scopes.sort();
12365    scopes.dedup();
12366    scopes
12367}
12368
12369fn collect_relation_scopes(query: &QueryExpr, scopes: &mut Vec<String>) {
12370    match query {
12371        QueryExpr::Table(table) => {
12372            if !table.table.is_empty() {
12373                scopes.push(table.table.clone());
12374            }
12375            if let Some(alias) = &table.alias {
12376                scopes.push(alias.clone());
12377            }
12378        }
12379        QueryExpr::Join(join) => {
12380            collect_relation_scopes(&join.left, scopes);
12381            collect_relation_scopes(&join.right, scopes);
12382        }
12383        _ => {}
12384    }
12385}
12386
12387fn query_references_outer_scope(query: &QueryExpr, outer_scopes: &[String]) -> bool {
12388    let inner_scopes = relation_scopes_for_query(query);
12389    query_expr_references_outer_scope(query, outer_scopes, &inner_scopes)
12390}
12391
12392fn query_expr_references_outer_scope(
12393    query: &QueryExpr,
12394    outer_scopes: &[String],
12395    inner_scopes: &[String],
12396) -> bool {
12397    match query {
12398        QueryExpr::Table(table) => {
12399            table.select_items.iter().any(|item| match item {
12400                crate::storage::query::ast::SelectItem::Wildcard => false,
12401                crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12402                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12403                }
12404            }) || table
12405                .where_expr
12406                .as_ref()
12407                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12408                || table.filter.as_ref().is_some_and(|filter| {
12409                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12410                })
12411                || table.having_expr.as_ref().is_some_and(|expr| {
12412                    expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12413                })
12414                || table.having.as_ref().is_some_and(|filter| {
12415                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12416                })
12417                || table
12418                    .group_by_exprs
12419                    .iter()
12420                    .any(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12421                || table.order_by.iter().any(|clause| {
12422                    clause.expr.as_ref().is_some_and(|expr| {
12423                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12424                    })
12425                })
12426        }
12427        QueryExpr::Join(join) => {
12428            query_expr_references_outer_scope(&join.left, outer_scopes, inner_scopes)
12429                || query_expr_references_outer_scope(&join.right, outer_scopes, inner_scopes)
12430                || join.filter.as_ref().is_some_and(|filter| {
12431                    filter_references_outer_scope(filter, outer_scopes, inner_scopes)
12432                })
12433                || join.return_items.iter().any(|item| match item {
12434                    crate::storage::query::ast::SelectItem::Wildcard => false,
12435                    crate::storage::query::ast::SelectItem::Expr { expr, .. } => {
12436                        expr_references_outer_scope(expr, outer_scopes, inner_scopes)
12437                    }
12438                })
12439        }
12440        _ => false,
12441    }
12442}
12443
12444fn filter_references_outer_scope(
12445    filter: &crate::storage::query::ast::Filter,
12446    outer_scopes: &[String],
12447    inner_scopes: &[String],
12448) -> bool {
12449    use crate::storage::query::ast::Filter;
12450    match filter {
12451        Filter::Compare { field, .. }
12452        | Filter::IsNull(field)
12453        | Filter::IsNotNull(field)
12454        | Filter::In { field, .. }
12455        | Filter::Between { field, .. }
12456        | Filter::Like { field, .. }
12457        | Filter::StartsWith { field, .. }
12458        | Filter::EndsWith { field, .. }
12459        | Filter::Contains { field, .. } => {
12460            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12461        }
12462        Filter::CompareFields { left, right, .. } => {
12463            field_ref_references_outer_scope(left, outer_scopes, inner_scopes)
12464                || field_ref_references_outer_scope(right, outer_scopes, inner_scopes)
12465        }
12466        Filter::CompareExpr { lhs, rhs, .. } => {
12467            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12468                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12469        }
12470        Filter::And(left, right) | Filter::Or(left, right) => {
12471            filter_references_outer_scope(left, outer_scopes, inner_scopes)
12472                || filter_references_outer_scope(right, outer_scopes, inner_scopes)
12473        }
12474        Filter::Not(inner) => filter_references_outer_scope(inner, outer_scopes, inner_scopes),
12475    }
12476}
12477
12478fn expr_references_outer_scope(
12479    expr: &crate::storage::query::ast::Expr,
12480    outer_scopes: &[String],
12481    inner_scopes: &[String],
12482) -> bool {
12483    use crate::storage::query::ast::Expr;
12484    match expr {
12485        Expr::Column { field, .. } => {
12486            field_ref_references_outer_scope(field, outer_scopes, inner_scopes)
12487        }
12488        Expr::BinaryOp { lhs, rhs, .. } => {
12489            expr_references_outer_scope(lhs, outer_scopes, inner_scopes)
12490                || expr_references_outer_scope(rhs, outer_scopes, inner_scopes)
12491        }
12492        Expr::UnaryOp { operand, .. }
12493        | Expr::Cast { inner: operand, .. }
12494        | Expr::IsNull { operand, .. } => {
12495            expr_references_outer_scope(operand, outer_scopes, inner_scopes)
12496        }
12497        Expr::FunctionCall { args, .. } => args
12498            .iter()
12499            .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes)),
12500        Expr::Case {
12501            branches, else_, ..
12502        } => {
12503            branches.iter().any(|(cond, value)| {
12504                expr_references_outer_scope(cond, outer_scopes, inner_scopes)
12505                    || expr_references_outer_scope(value, outer_scopes, inner_scopes)
12506            }) || else_
12507                .as_ref()
12508                .is_some_and(|expr| expr_references_outer_scope(expr, outer_scopes, inner_scopes))
12509        }
12510        Expr::InList { target, values, .. } => {
12511            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12512                || values
12513                    .iter()
12514                    .any(|value| expr_references_outer_scope(value, outer_scopes, inner_scopes))
12515        }
12516        Expr::Between {
12517            target, low, high, ..
12518        } => {
12519            expr_references_outer_scope(target, outer_scopes, inner_scopes)
12520                || expr_references_outer_scope(low, outer_scopes, inner_scopes)
12521                || expr_references_outer_scope(high, outer_scopes, inner_scopes)
12522        }
12523        Expr::Subquery { query, .. } => query_references_outer_scope(&query.query, inner_scopes),
12524        Expr::Literal { .. } | Expr::Parameter { .. } => false,
12525        Expr::WindowFunctionCall { args, window, .. } => {
12526            args.iter()
12527                .any(|arg| expr_references_outer_scope(arg, outer_scopes, inner_scopes))
12528                || window
12529                    .partition_by
12530                    .iter()
12531                    .any(|e| expr_references_outer_scope(e, outer_scopes, inner_scopes))
12532                || window
12533                    .order_by
12534                    .iter()
12535                    .any(|o| expr_references_outer_scope(&o.expr, outer_scopes, inner_scopes))
12536        }
12537    }
12538}
12539
12540fn field_ref_references_outer_scope(
12541    field: &crate::storage::query::ast::FieldRef,
12542    outer_scopes: &[String],
12543    inner_scopes: &[String],
12544) -> bool {
12545    match field {
12546        crate::storage::query::ast::FieldRef::TableColumn { table, .. } if !table.is_empty() => {
12547            outer_scopes.iter().any(|scope| scope == table)
12548                && !inner_scopes.iter().any(|scope| scope == table)
12549        }
12550        _ => false,
12551    }
12552}
12553
12554fn first_column_values(
12555    result: crate::storage::query::unified::UnifiedResult,
12556) -> RedDBResult<Vec<Value>> {
12557    if result.columns.len() > 1 {
12558        return Err(RedDBError::Query(
12559            "expression subquery must return exactly one column".to_string(),
12560        ));
12561    }
12562    let fallback_column = result
12563        .records
12564        .first()
12565        .and_then(|record| record.column_names().into_iter().next())
12566        .map(|name| name.to_string());
12567    let column = result.columns.first().cloned().or(fallback_column);
12568    let Some(column) = column else {
12569        return Ok(Vec::new());
12570    };
12571    Ok(result
12572        .records
12573        .iter()
12574        .map(|record| record.get(column.as_str()).cloned().unwrap_or(Value::Null))
12575        .collect())
12576}
12577
/// Parses a timestamp literal into milliseconds since the Unix epoch.
///
/// Two forms are accepted:
///
/// * a bare non-negative integer, returned unchanged as milliseconds;
/// * a `YYYY-MM-DD` date, optionally followed by whitespace and further text
///   (e.g. `2030-01-02 03:04:05`). Only the date portion is used, so the
///   result is midnight UTC of that day and any time-of-day is ignored.
///
/// Returns `None` when the input matches neither form, when the month is
/// outside `1..=12` or the day outside `1..=31`, when the year is too large
/// for the day arithmetic, or when the date precedes 1970-01-01 (the unsigned
/// result cannot represent a negative offset).
///
/// The day is not checked against the length of the specific month, so
/// `2030-02-30` resolves to the same instant as `2030-03-02`.
fn parse_timestamp_to_ms(s: &str) -> Option<u128> {
    const MS_PER_DAY: u128 = 86_400_000;
    // Largest year for which `days_from_civil` cannot overflow `i64`
    // (`era * 146_097` stays below `i64::MAX`).
    const MAX_YEAR: i64 = i64::MAX / 400;

    // Bare integer ms.
    if let Ok(ms) = s.parse::<u128>() {
        return Some(ms);
    }

    // Date portion only: everything before the first whitespace.
    let date = s.split_whitespace().next()?;
    let mut fields = date.split('-');
    let (year, month, day) = (fields.next()?, fields.next()?, fields.next()?);
    if fields.next().is_some() {
        return None;
    }
    let year = year.parse::<i64>().ok()?;
    let month = month.parse::<u32>().ok()?;
    let day = day.parse::<u32>().ok()?;
    if year > MAX_YEAR || !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return None;
    }

    // A negative day count (pre-1970 date) fails the conversion and maps to
    // `None` instead of wrapping into a huge unsigned value.
    u128::try_from(days_from_civil(year, month, day))
        .ok()
        .map(|days| days * MS_PER_DAY)
}

/// Days between 1970-01-01 and the given proleptic Gregorian date, using
/// H. Hinnant's `days_from_civil` algorithm. Negative for earlier dates.
///
/// Expects `m` in `1..=12` and `d` in `1..=31`; `parse_timestamp_to_ms`
/// validates both before calling. All arithmetic is signed, so out-of-range
/// values cannot underflow, but the result is only meaningful for real dates.
fn days_from_civil(y: i64, m: u32, d: u32) -> i64 {
    let (m, d) = (i64::from(m), i64::from(d));
    // Treat January and February as months of the previous year so the leap
    // day falls at the end of the shifted year.
    let y = if m <= 2 { y - 1 } else { y };
    let era = y.div_euclid(400); // floor division, also for negative years
    let yoe = y.rem_euclid(400); // [0, 399]
    let mp = if m > 2 { m - 3 } else { m + 9 }; // March = 0 ... February = 11
    let doy = (153 * mp + 2) / 5 + d - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    era * 146_097 + doe - 719_468
}
12612
12613fn walk_plan_node(
12614    node: &crate::storage::query::planner::CanonicalLogicalNode,
12615    depth: usize,
12616    out: &mut Vec<crate::storage::query::unified::UnifiedRecord>,
12617) {
12618    use std::sync::Arc;
12619    let mut rec = crate::storage::query::unified::UnifiedRecord::default();
12620    rec.set_arc(Arc::from("op"), Value::text(node.operator.clone()));
12621    rec.set_arc(
12622        Arc::from("source"),
12623        node.source.clone().map(Value::text).unwrap_or(Value::Null),
12624    );
12625    rec.set_arc(Arc::from("est_rows"), Value::Float(node.estimated_rows));
12626    rec.set_arc(Arc::from("est_cost"), Value::Float(node.operator_cost));
12627    rec.set_arc(Arc::from("depth"), Value::Integer(depth as i64));
12628    out.push(rec);
12629    for child in &node.children {
12630        walk_plan_node(child, depth + 1, out);
12631    }
12632}
12633
#[cfg(test)]
mod inline_graph_tvf_tests {
    use super::*;

    /// Parses `sql` and returns the result-cache scopes derived from the
    /// parsed query expression. Panics if the statement does not parse.
    fn scopes_for(sql: &str) -> HashSet<String> {
        let parsed = crate::storage::query::parser::parse(sql).expect("parse");
        query_expr_result_cache_scopes(&parsed.query)
    }

    #[test]
    fn inline_tvf_cache_scopes_include_source_collections() {
        // Issue #799: for the inline form the result-cache key has to be
        // derived from the `nodes`/`edges` source collections, so that a
        // write to either one invalidates the cached result.
        let sql = "SELECT * FROM components(nodes => (SELECT id FROM hosts), edges => (SELECT src, dst FROM links))";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("hosts"), "nodes source scoped: {scopes:?}");
        assert!(scopes.contains("links"), "edges source scoped: {scopes:?}");
    }

    #[test]
    fn graph_collection_tvf_cache_scope_is_graph_argument() {
        // The graph-collection form still materializes the active graph, yet
        // cache invalidation is keyed on the named graph argument so that
        // INSERT INTO g NODE/EDGE drops cached TVF rows.
        let sql = "SELECT * FROM components(g)";
        let scopes = scopes_for(sql);
        assert!(scopes.contains("g"), "collection form scoped: {scopes:?}");
    }

    #[test]
    fn abstract_degree_centrality_counts_undirected_endpoints() {
        // Path graph a - b - c: the middle node touches both edges.
        let nodes: Vec<String> = ["a", "b", "c"].iter().map(|name| name.to_string()).collect();
        let edges = vec![
            (String::from("a"), String::from("b"), 1.0_f32),
            (String::from("b"), String::from("c"), 1.0_f32),
        ];
        let expected = vec![
            (String::from("a"), 1),
            (String::from("b"), 2),
            (String::from("c"), 1),
        ];
        let degrees = abstract_degree_centrality(&nodes, &edges);
        assert_eq!(degrees, expected);
    }
}
12679                ("c".to_string(), 1),
12680            ]
12681        );
12682    }
12683}