Skip to main content

reddb_server/runtime/
authorized_search.rs

1//! Authorized search entry point — issue #119.
2//!
3//! Wraps every SEARCH SIMILAR / SEARCH TEXT / SEARCH CONTEXT runtime
4//! call so that the candidate set is *pre-filtered* by the calling
5//! identity's `EffectiveScope.visible_collections` BEFORE any
6//! similarity score is computed. Without this gate AI commands could
7//! return rows outside the calling user's RLS scope and leak them via
8//! the LLM context window.
9//!
10//! The contract is intentionally narrow: every public function takes
11//! `scope: &dyn ReadFrame`, refuses with a structured `RedDBError` if
12//! the frame carries `None` for `visible_collections()`, and trims the
13//! candidate set to the intersection of the user-supplied collection
14//! list (if any) and the scope's allow-list. The legacy direct entry
15//! points (`RedDBRuntime::search_similar`, `search_text`,
16//! `search_context`) remain in place for tests and for callers that
17//! have already opened a frame; this module is the *canonical* entry
18//! that the `SEARCH SIMILAR / SEARCH CONTEXT` SQL commands and ASK go
19//! through.
20
21use std::collections::HashSet;
22
23use tracing::{debug, info_span, warn};
24
25use super::statement_frame::ReadFrame;
26use super::RedDBRuntime;
27use crate::api::{RedDBError, RedDBResult};
28use crate::application::SearchContextInput;
29use crate::storage::unified::devx::SimilarResult;
30
31/// Namespace for the authorized search entry points used by the AI
32/// runtime. Deliberately an empty (uninhabited) enum: it carries no
33/// state and cannot be instantiated, so callers always spell
34/// `AuthorizedSearch::execute_*(...)` (the shape asked for by issue #119).
35pub enum AuthorizedSearch {}
36
37impl AuthorizedSearch {
38    /// Authorized SEARCH SIMILAR. Refuses with a structured error when
39    /// the caller's scope has no `visible_collections` set, or when the
40    /// requested collection is outside that set. Otherwise dispatches
41    /// to the underlying `RedDBRuntime::search_similar`.
42    pub(crate) fn execute_similar(
43        runtime: &RedDBRuntime,
44        scope: &dyn ReadFrame,
45        collection: &str,
46        vector: &[f32],
47        k: usize,
48        min_score: f32,
49    ) -> RedDBResult<Vec<SimilarResult>> {
50        let span = info_span!(
51            "authorized_search.similar",
52            collection = collection,
53            tenant = ?scope.effective_scope(),
54        );
55        let _enter = span.enter();
56
57        let visible = require_visible(scope, "SEARCH SIMILAR")?;
58        if !visible.contains(collection) {
59            warn!(
60                target: "authorized_search",
61                collection = collection,
62                "denied: collection outside visible scope"
63            );
64            return Err(RedDBError::Query(format!(
65                "permission denied: collection `{collection}` is not in the caller's visible scope"
66            )));
67        }
68        debug!(target: "authorized_search", "scope-checked, dispatching");
69        runtime.search_similar(collection, vector, k, min_score)
70    }
71
72    /// Authorized SEARCH TEXT. The underlying executor accepts an
73    /// optional collection list; we intersect it with the visible set
74    /// before forwarding so collections outside scope never enter the
75    /// candidate pool.
76    #[allow(clippy::too_many_arguments)]
77    pub(crate) fn execute_text(
78        runtime: &RedDBRuntime,
79        scope: &dyn ReadFrame,
80        query: String,
81        collections: Option<Vec<String>>,
82        entity_types: Option<Vec<String>>,
83        capabilities: Option<Vec<String>>,
84        fields: Option<Vec<String>>,
85        limit: Option<usize>,
86        fuzzy: bool,
87    ) -> RedDBResult<crate::storage::unified::dsl::QueryResult> {
88        let span = info_span!(
89            "authorized_search.text",
90            tenant = ?scope.effective_scope(),
91        );
92        let _enter = span.enter();
93
94        let visible = require_visible(scope, "SEARCH TEXT")?;
95        let constrained = constrain_collections(collections, visible);
96        if let Some(ref c) = constrained {
97            if c.is_empty() {
98                // Caller asked for collections outside their scope —
99                // refuse loudly rather than fall through to the global
100                // path (which would scan every collection ignoring the
101                // intent).
102                return Err(RedDBError::Query(
103                    "permission denied: no requested collection is in the caller's visible scope"
104                        .to_string(),
105                ));
106            }
107        }
108        runtime.search_text(
109            query,
110            constrained,
111            entity_types,
112            capabilities,
113            fields,
114            limit,
115            fuzzy,
116        )
117    }
118
119    /// Authorized SEARCH CONTEXT. Pre-filters the input collection list
120    /// against the visible scope and post-filters every result bucket
121    /// so cross-ref / graph / vector expansion can't leak rows from
122    /// outside the scope. Refuses with a structured error when the
123    /// scope has no visible-collections set.
124    pub(crate) fn execute_context(
125        runtime: &RedDBRuntime,
126        scope: &dyn ReadFrame,
127        mut input: SearchContextInput,
128    ) -> RedDBResult<crate::runtime::ContextSearchResult> {
129        let span = info_span!(
130            "authorized_search.context",
131            tenant = ?scope.effective_scope(),
132        );
133        let _enter = span.enter();
134
135        let visible = require_visible(scope, "SEARCH CONTEXT")?;
136
137        // Pre-filter the request: drop any caller-supplied collection
138        // that's outside the visible set, and force the global-scan
139        // path to also stay inside the visible set by passing the
140        // intersection through.
141        input.collections = constrain_collections(input.collections, visible);
142        if let Some(ref c) = input.collections {
143            if c.is_empty() {
144                return Err(RedDBError::Query(
145                    "permission denied: no requested collection is in the caller's visible scope"
146                        .to_string(),
147                ));
148            }
149        } else {
150            // No caller list — substitute the visible set so the
151            // global-scan tier stays bounded by it.
152            let mut bounded: Vec<String> = visible.iter().cloned().collect();
153            bounded.sort();
154            input.collections = Some(bounded);
155        }
156
157        let mut result = runtime.search_context(input)?;
158        post_filter_context_result(&mut result, visible);
159        Ok(result)
160    }
161}
162
163/// Defence-in-depth pass run after `search_context` returns. The
164/// `input.collections` pre-filter already bounds the corpus, but the
165/// cross-ref / graph / vector expansion paths resolve
166/// `xref.target_collection` and `entity.kind.collection()`
167/// independently. Re-filtering each bucket here ensures a regression
168/// in one of those paths can't leak rows from outside `allowed`.
169///
170/// Factored out as a free function so the property test (256 cases)
171/// can drive the invariant without booting a runtime.
172fn post_filter_context_result(
173    result: &mut crate::runtime::ContextSearchResult,
174    allowed: &HashSet<String>,
175) {
176    let retain = |bucket: &mut Vec<crate::runtime::ContextEntity>| {
177        bucket.retain(|e| allowed.contains(&e.collection));
178    };
179    retain(&mut result.tables);
180    retain(&mut result.graph.nodes);
181    retain(&mut result.graph.edges);
182    retain(&mut result.vectors);
183    retain(&mut result.documents);
184    retain(&mut result.key_values);
185
186    // Connections reference entity ids; recompute visible ids so a
187    // dangling edge into a filtered-out row is dropped.
188    let visible_ids: HashSet<u64> = std::iter::empty()
189        .chain(result.tables.iter().map(|e| e.entity.id.raw()))
190        .chain(result.graph.nodes.iter().map(|e| e.entity.id.raw()))
191        .chain(result.graph.edges.iter().map(|e| e.entity.id.raw()))
192        .chain(result.vectors.iter().map(|e| e.entity.id.raw()))
193        .chain(result.documents.iter().map(|e| e.entity.id.raw()))
194        .chain(result.key_values.iter().map(|e| e.entity.id.raw()))
195        .collect();
196    result
197        .connections
198        .retain(|c| visible_ids.contains(&c.from_id) && visible_ids.contains(&c.to_id));
199
200    result.summary.total_entities = result.tables.len()
201        + result.graph.nodes.len()
202        + result.graph.edges.len()
203        + result.vectors.len()
204        + result.documents.len()
205        + result.key_values.len();
206}
207
208/// Resolve the visible-collections set on a frame, refusing with a
209/// structured error when none is wired. Centralised so every entry
210/// point produces the same error string and tracing event.
211fn require_visible<'a>(
212    scope: &'a dyn ReadFrame,
213    op: &'static str,
214) -> RedDBResult<&'a HashSet<String>> {
215    match scope.visible_collections() {
216        Some(set) => Ok(set),
217        None => {
218            warn!(
219                target: "authorized_search",
220                op = op,
221                "refused: no visible-collections scope on frame"
222            );
223            Err(RedDBError::Query(format!(
224                "{op} requires an authenticated scope with visible_collections; \
225                 none was attached to the runtime frame"
226            )))
227        }
228    }
229}
230
/// Keep only the requested collections that are also in `visible`.
///
/// The order (and any duplicates) of the caller's list is preserved.
/// `None` — no caller list — is returned as `None`; this helper does
/// not substitute the visible set, so a caller that needs a bounded
/// corpus for the "no list" case has to do that itself.
fn constrain_collections(
    requested: Option<Vec<String>>,
    visible: &HashSet<String>,
) -> Option<Vec<String>> {
    requested.map(|names| {
        names
            .into_iter()
            .filter(|name| visible.contains(name))
            .collect()
    })
}
249
250#[cfg(test)]
251mod tests {
252    use super::*;
253    use crate::api::RedDBOptions;
254    use crate::auth::Role;
255    use crate::runtime::statement_frame::test_support::FakeReadFrame;
256    use crate::runtime::RedDBRuntime;
257
258    fn rt() -> RedDBRuntime {
259        RedDBRuntime::with_options(RedDBOptions::in_memory()).expect("rt")
260    }
261
262    fn set(items: &[&str]) -> HashSet<String> {
263        items.iter().map(|s| s.to_string()).collect()
264    }
265
266    #[test]
267    fn execute_similar_refuses_without_scope() {
268        let rt = rt();
269        let frame = FakeReadFrame::without_scope();
270        let err = AuthorizedSearch::execute_similar(&rt, &frame, "orders", &[0.1], 1, 0.0)
271            .expect_err("refuses without scope");
272        assert!(format!("{err}").contains("requires an authenticated scope"));
273    }
274
275    #[test]
276    fn execute_similar_refuses_collection_outside_scope() {
277        let rt = rt();
278        let frame = FakeReadFrame::with_visible(set(&["orders"]));
279        let err = AuthorizedSearch::execute_similar(&rt, &frame, "secrets", &[0.1], 1, 0.0)
280            .expect_err("refuses out-of-scope collection");
281        assert!(format!("{err}").contains("not in the caller's visible scope"));
282    }
283
284    #[test]
285    fn execute_context_refuses_without_scope() {
286        let rt = rt();
287        let frame = FakeReadFrame::without_scope();
288        let err = AuthorizedSearch::execute_context(
289            &rt,
290            &frame,
291            SearchContextInput {
292                query: "x".into(),
293                field: None,
294                vector: None,
295                collections: None,
296                graph_depth: None,
297                graph_max_edges: None,
298                max_cross_refs: None,
299                follow_cross_refs: None,
300                expand_graph: None,
301                global_scan: None,
302                reindex: None,
303                limit: None,
304                min_score: None,
305            },
306        )
307        .expect_err("refuses without scope");
308        assert!(format!("{err}").contains("requires an authenticated scope"));
309    }
310
311    #[test]
312    fn constrain_collections_drops_out_of_scope_items() {
313        let visible = set(&["a", "b"]);
314        let got = constrain_collections(Some(vec!["a".into(), "c".into()]), &visible);
315        assert_eq!(got, Some(vec!["a".into()]));
316        // None -> None passthrough.
317        assert!(constrain_collections(None, &visible).is_none());
318        // Touch `Role` so the import isn't dropped if test fixtures grow.
319        let _ = Role::Read;
320    }
321
322    // -----------------------------------------------------------------
323    // Property test (issue #119): every result row's collection ∈
324    // scope.visible_collections after `post_filter_context_result`.
325    // 256 cases as the issue requires.
326    // -----------------------------------------------------------------
327
328    use crate::runtime::{
329        ContextConnection, ContextConnectionType, ContextEntity, ContextGraphResult,
330        ContextSearchResult, ContextSummary, DiscoveryMethod,
331    };
332    use crate::storage::unified::entity::{EntityData, EntityKind, RowData, UnifiedEntity};
333    use crate::storage::unified::EntityId;
334    use proptest::prelude::*;
335
336    fn fake_entity(id: u64, collection: &str) -> UnifiedEntity {
337        UnifiedEntity::new(
338            EntityId::new(id),
339            EntityKind::TableRow {
340                table: std::sync::Arc::from(collection),
341                row_id: id,
342            },
343            EntityData::Row(RowData::new(Vec::new())),
344        )
345    }
346
347    fn fake_ctx_entity(id: u64, collection: &str) -> ContextEntity {
348        ContextEntity {
349            entity: fake_entity(id, collection),
350            score: 0.5,
351            discovery: DiscoveryMethod::GlobalScan,
352            collection: collection.to_string(),
353        }
354    }
355
356    fn empty_summary() -> ContextSummary {
357        ContextSummary {
358            total_entities: 0,
359            direct_matches: 0,
360            expanded_via_graph: 0,
361            expanded_via_cross_refs: 0,
362            expanded_via_vector_query: 0,
363            collections_searched: 0,
364            execution_time_us: 0,
365            tiers_used: Vec::new(),
366            entities_reindexed: 0,
367        }
368    }
369
370    fn build_result(rows: &[(u64, &str)]) -> ContextSearchResult {
371        let entities: Vec<ContextEntity> =
372            rows.iter().map(|(id, c)| fake_ctx_entity(*id, c)).collect();
373        ContextSearchResult {
374            query: "x".into(),
375            tables: entities.clone(),
376            graph: ContextGraphResult {
377                nodes: entities.clone(),
378                edges: Vec::new(),
379            },
380            vectors: entities.clone(),
381            documents: Vec::new(),
382            key_values: Vec::new(),
383            connections: vec![ContextConnection {
384                from_id: rows.first().map(|(id, _)| *id).unwrap_or(0),
385                to_id: rows.last().map(|(id, _)| *id).unwrap_or(0),
386                connection_type: ContextConnectionType::CrossRef("x".into()),
387                weight: 1.0,
388            }],
389            summary: empty_summary(),
390        }
391    }
392
393    proptest! {
394        // Issue #119: every result row's collection MUST be in
395        // `visible_collections` after AuthorizedSearch's defence-in-
396        // depth post-filter. Run 256 cases with arbitrary mixes of
397        // collection names and visible sets.
398        #![proptest_config(ProptestConfig::with_cases(256))]
399        #[test]
400        fn every_result_row_is_in_visible_set(
401            row_collections in proptest::collection::vec("[a-z]{1,4}", 0..10),
402            visible in proptest::collection::hash_set("[a-z]{1,4}", 0..6),
403        ) {
404            let rows: Vec<(u64, &str)> = row_collections
405                .iter()
406                .enumerate()
407                .map(|(i, c)| (i as u64 + 1, c.as_str()))
408                .collect();
409            let mut result = build_result(&rows);
410            post_filter_context_result(&mut result, &visible);
411
412            // The invariant: nothing escapes the visible scope.
413            for e in result.tables.iter()
414                .chain(result.graph.nodes.iter())
415                .chain(result.graph.edges.iter())
416                .chain(result.vectors.iter())
417                .chain(result.documents.iter())
418                .chain(result.key_values.iter())
419            {
420                prop_assert!(visible.contains(&e.collection),
421                    "leaked row collection={} not in visible={:?}",
422                    e.collection, visible);
423            }
424            // Connections only reference visible-id pairs.
425            let visible_ids: HashSet<u64> = std::iter::empty()
426                .chain(result.tables.iter().map(|e| e.entity.id.raw()))
427                .chain(result.graph.nodes.iter().map(|e| e.entity.id.raw()))
428                .chain(result.graph.edges.iter().map(|e| e.entity.id.raw()))
429                .chain(result.vectors.iter().map(|e| e.entity.id.raw()))
430                .chain(result.documents.iter().map(|e| e.entity.id.raw()))
431                .chain(result.key_values.iter().map(|e| e.entity.id.raw()))
432                .collect();
433            for c in &result.connections {
434                prop_assert!(visible_ids.contains(&c.from_id) && visible_ids.contains(&c.to_id),
435                    "dangling connection {} -> {} survived filter",
436                    c.from_id, c.to_id);
437            }
438        }
439    }
440
441    // -----------------------------------------------------------------
442    // Regression test (issue #119): tenant A user runs SEARCH SIMILAR;
443    // tenant B rows never enter the result.
444    //
445    // Drives the boundary check directly through `AuthorizedSearch::
446    // execute_similar` with an `EffectiveScope` whose visible set
447    // mirrors what `AuthStore::visible_collections_for_scope` returns
448    // for tenant A. Asking for a tenant-B collection must refuse.
449    // -----------------------------------------------------------------
450
451    #[test]
452    fn tenant_a_cannot_see_tenant_b_collection() {
453        let rt = rt();
454        // Tenant A's caller — visible set restricted to A's collection.
455        let frame_a = FakeReadFrame::with_visible(set(&["a_orders"]));
456        // SEARCH SIMILAR against `b_orders` (tenant B's collection)
457        // must refuse with the structured permission-denied error,
458        // BEFORE any similarity score is computed (the underlying
459        // `search_similar` call is never reached because
460        // `visible.contains` short-circuits first).
461        let err = AuthorizedSearch::execute_similar(&rt, &frame_a, "b_orders", &[0.1], 1, 0.0)
462            .expect_err("tenant-A scope must refuse tenant-B collection");
463        assert!(format!("{err}").contains("not in the caller's visible scope"));
464    }
465
466    /// Companion regression: SEARCH CONTEXT also rejects when every
467    /// requested collection is outside the caller's visible scope.
468    /// Without the pre-filter, the global-scan tier would scan every
469    /// collection in the DB (including tenant B's) and only the
470    /// per-row RLS gate would catch leaks — which is exactly the
471    /// failure mode #119 closes.
472    #[test]
473    fn search_context_refuses_all_out_of_scope_collections() {
474        let rt = rt();
475        let frame = FakeReadFrame::with_visible(set(&["a_orders"]));
476        let err = AuthorizedSearch::execute_context(
477            &rt,
478            &frame,
479            SearchContextInput {
480                query: "x".into(),
481                field: None,
482                vector: None,
483                collections: Some(vec!["b_orders".into(), "b_customers".into()]),
484                graph_depth: None,
485                graph_max_edges: None,
486                max_cross_refs: None,
487                follow_cross_refs: None,
488                expand_graph: None,
489                global_scan: None,
490                reindex: None,
491                limit: None,
492                min_score: None,
493            },
494        )
495        .expect_err("all-out-of-scope SEARCH CONTEXT must refuse");
496        let msg = format!("{err}");
497        assert!(
498            msg.contains("no requested collection is in the caller's visible scope"),
499            "expected scope-refusal, got: {msg}"
500        );
501    }
502
503    // -----------------------------------------------------------------
504    // Cache hit-rate metric is exposed (issue #119 acceptance).
505    // -----------------------------------------------------------------
506
507    #[test]
508    fn auth_cache_stats_are_exposed_via_authstore() {
509        use crate::auth::store::AuthStore;
510        use crate::auth::AuthConfig;
511        let store = AuthStore::new(AuthConfig::default());
512        let stats0 = store.auth_cache_stats();
513        assert_eq!(stats0.hits + stats0.misses, 0);
514        // Drive a miss + insert via the public API.
515        let _ = store.visible_collections_for_scope(
516            None,
517            Role::Read,
518            "alice",
519            &vec!["orders".to_string()],
520        );
521        let stats1 = store.auth_cache_stats();
522        assert!(
523            stats1.misses >= 1,
524            "first lookup must record a miss, got {stats1:?}"
525        );
526        // Second call hits the freshly-populated entry.
527        let _ = store.visible_collections_for_scope(
528            None,
529            Role::Read,
530            "alice",
531            &vec!["orders".to_string()],
532        );
533        let stats2 = store.auth_cache_stats();
534        assert!(
535            stats2.hits >= 1,
536            "second lookup must record a hit, got {stats2:?}"
537        );
538        // Hit rate is computable.
539        let _ = stats2.hit_rate();
540    }
541
542    #[test]
543    fn visible_collections_cache_keeps_principals_separate() {
544        use crate::auth::privileges::{Action, GrantPrincipal, Resource};
545        use crate::auth::store::AuthStore;
546        use crate::auth::{AuthConfig, UserId};
547
548        let store = AuthStore::new(AuthConfig::default());
549        store.create_user("admin", "p", Role::Admin).unwrap();
550        store.create_user("alice", "p", Role::Read).unwrap();
551        store.create_user("bob", "p", Role::Read).unwrap();
552        store
553            .grant(
554                &UserId::platform("admin"),
555                Role::Admin,
556                GrantPrincipal::User(UserId::platform("alice")),
557                Resource::table_from_name("orders"),
558                vec![Action::Select],
559                false,
560                None,
561            )
562            .expect("grant alice orders");
563
564        let collections = vec!["orders".to_string()];
565        let alice = store.visible_collections_for_scope(None, Role::Read, "alice", &collections);
566        let bob = store.visible_collections_for_scope(None, Role::Read, "bob", &collections);
567
568        assert!(alice.contains("orders"));
569        assert!(
570            !bob.contains("orders"),
571            "bob must not reuse alice's visible-collections cache entry"
572        );
573    }
574}