Skip to main content

smooth_operator/
access_control.rs

1//! Document-level access control (feature gap G3).
2//!
3//! Org isolation already exists (every conversation / knowledge row carries an
4//! `organizationId`, and the Postgres knowledge base filters on it). This module
5//! adds the **within-org user/group entitlement** layer the industry calls
6//! document-level permissions: even inside one organization, a document may be
7//! restricted to specific users or groups, and a retrieval must only ever return
8//! documents the requester is entitled to read.
9//!
10//! ## Why enforcement lives in our layer
11//!
12//! smooth-operator's [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) trait is
13//! upstream and **read-only to us**: its `query` returns a
14//! [`KnowledgeResult`](smooth_operator_core::KnowledgeResult) that carries only
15//! `document_id` / `chunk` / `score` / `source` — *not* the stored metadata —
16//! and the in-memory backend drops document metadata on ingest entirely. So we
17//! cannot read an ACL back out of a query result. Instead this module:
18//!
19//! 1. Records the document → [`DocAcl`] mapping **at ingest** (parsed from the
20//!    [`DocAcl::ACL_METADATA_KEY`] metadata the document carries) into a side
21//!    table the [`AclKnowledgeStore`] owns, while forwarding the document
22//!    unchanged to the inner backend.
23//! 2. **Filters at read**: an [`AclReader`] bound to the requester's
24//!    [`AccessContext`] over-fetches from the inner backend, looks each result's
25//!    ACL up in the side table, and drops any the requester cannot access before
26//!    truncating to the requested `K`.
27//!
28//! This is backend-agnostic: the same [`AclKnowledgeStore`] wraps the in-memory,
29//! Postgres, or DynamoDB knowledge base identically (the post-filter happens in
30//! our layer, after the backend's own org-scoped query).
31//!
32//! ## No-ACL default semantics — **no-acl ⇒ org-public**
33//!
//! A document ingested **without** an ACL (the legacy / existing-seed path) has
//! no entry in the side table and is treated as **org-public**: visible to
//! anyone whose query reaches it (org isolation already happened upstream). This
//! keeps existing seeded knowledge retrievable and makes ACLs strictly additive
//! — you opt a document *into* restriction by attaching a [`DocAcl`]. An
//! explicit `DocAcl::default()` (all fields empty, `public: false`) is the
//! opposite: a fully-locked document that nobody can read until users or
//! groups are added to its allow-list.
41
42use std::collections::HashMap;
43use std::sync::{Arc, RwLock};
44
45use serde::{Deserialize, Serialize};
46use smooth_operator_core::{Document, KnowledgeBase, KnowledgeResult};
47
/// Candidate-pool multiplier used by the ACL-filtering reader.
///
/// The reader asks the inner backend for `limit * OVERFETCH_FACTOR` results
/// (subject to a minimum pool size) so that dropping inaccessible documents
/// still leaves a full top-K whenever enough accessible documents exist. The
/// same idea as the over-fetch the Postgres backend already performs for RRF
/// fusion.
const OVERFETCH_FACTOR: usize = 5;
54
/// Minimum number of candidates requested from the inner backend, so that a
/// small `limit` still pulls a pool large enough to survive ACL filtering.
const OVERFETCH_FLOOR: usize = 20;
58
59/// A document's allow-list — who may read it within the organization.
60///
61/// A requester may read the document when **any** of these hold:
62/// - the document is [`public`](DocAcl::public),
63/// - the requester's `user_id` is in [`users`](DocAcl::users),
64/// - any of the requester's groups is in [`groups`](DocAcl::groups).
65///
66/// The default (`public: false`, empty `users`/`groups`) is a fully-locked
67/// document. Note that "no `DocAcl` recorded at all" is *different* — that is
68/// org-public (see the module-level no-ACL default semantics).
69#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
70pub struct DocAcl {
71    /// When true, any requester reaching this document may read it.
72    #[serde(default)]
73    pub public: bool,
74    /// User ids explicitly allowed to read this document.
75    #[serde(default)]
76    pub users: Vec<String>,
77    /// Group ids explicitly allowed to read this document.
78    #[serde(default)]
79    pub groups: Vec<String>,
80}
81
82impl DocAcl {
83    /// The document-metadata key under which a [`DocAcl`] is serialized (as
84    /// JSON) so it survives the trip through the ingestion pipeline and into the
85    /// [`AclKnowledgeStore`]'s side table.
86    pub const ACL_METADATA_KEY: &'static str = "acl_v2";
87
88    /// A document readable by anyone reaching it.
89    #[must_use]
90    pub fn public() -> Self {
91        Self {
92            public: true,
93            ..Self::default()
94        }
95    }
96
97    /// A document readable only by the listed users.
98    #[must_use]
99    pub fn for_users<I, S>(users: I) -> Self
100    where
101        I: IntoIterator<Item = S>,
102        S: Into<String>,
103    {
104        Self {
105            public: false,
106            users: users.into_iter().map(Into::into).collect(),
107            groups: Vec::new(),
108        }
109    }
110
111    /// A document readable only by members of the listed groups.
112    #[must_use]
113    pub fn for_groups<I, S>(groups: I) -> Self
114    where
115        I: IntoIterator<Item = S>,
116        S: Into<String>,
117    {
118        Self {
119            public: false,
120            users: Vec::new(),
121            groups: groups.into_iter().map(Into::into).collect(),
122        }
123    }
124
125    /// Allow these additional users (builder).
126    #[must_use]
127    pub fn with_users<I, S>(mut self, users: I) -> Self
128    where
129        I: IntoIterator<Item = S>,
130        S: Into<String>,
131    {
132        self.users.extend(users.into_iter().map(Into::into));
133        self
134    }
135
136    /// Allow these additional groups (builder).
137    #[must_use]
138    pub fn with_groups<I, S>(mut self, groups: I) -> Self
139    where
140        I: IntoIterator<Item = S>,
141        S: Into<String>,
142    {
143        self.groups.extend(groups.into_iter().map(Into::into));
144        self
145    }
146
147    /// Serialize this ACL into a document's metadata under
148    /// [`ACL_METADATA_KEY`](DocAcl::ACL_METADATA_KEY) (builder over a
149    /// [`Document`]). This is how a connector / ingest path stamps an ACL onto a
150    /// document so the [`AclKnowledgeStore`] records it.
151    ///
152    /// # Panics
153    /// Never — [`DocAcl`] always serializes to JSON.
154    #[must_use]
155    pub fn attach_to(&self, doc: Document) -> Document {
156        let json = serde_json::to_string(self).expect("DocAcl always serializes");
157        doc.with_metadata(Self::ACL_METADATA_KEY, json)
158    }
159
160    /// Parse a [`DocAcl`] out of a document's metadata, if one is present and
161    /// well-formed. Returns `None` when the key is absent (no-ACL ⇒ org-public)
162    /// or the value fails to parse (treated as absent so a malformed stamp can't
163    /// silently lock or unlock a document — it falls back to the default).
164    #[must_use]
165    pub fn from_metadata(metadata: &HashMap<String, String>) -> Option<Self> {
166        let raw = metadata.get(Self::ACL_METADATA_KEY)?;
167        serde_json::from_str(raw).ok()
168    }
169}
170
/// The requester's entitlements — the identity a retrieval runs *as*.
///
/// Assembled from the authenticated user and their group memberships (both
/// resolved upstream from the auth context) and handed to the
/// knowledge-retrieval path, where [`AccessContext::can_access`] decides which
/// results survive.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AccessContext {
    /// User id of the requester when authenticated as a user; `None` for an
    /// anonymous / system requester, which can then only see public and
    /// no-ACL documents.
    pub user_id: Option<String>,
    /// Group memberships of the requester.
    pub groups: Vec<String>,
}
184
185impl AccessContext {
186    /// Build a context from an optional user id and a set of groups.
187    #[must_use]
188    pub fn new(user_id: Option<String>, groups: Vec<String>) -> Self {
189        Self { user_id, groups }
190    }
191
192    /// A context for a specific user with no group memberships.
193    #[must_use]
194    pub fn for_user(user_id: impl Into<String>) -> Self {
195        Self {
196            user_id: Some(user_id.into()),
197            groups: Vec::new(),
198        }
199    }
200
201    /// An anonymous requester: no user id, no groups. Sees only public and
202    /// no-ACL (org-public) documents.
203    #[must_use]
204    pub fn anonymous() -> Self {
205        Self::default()
206    }
207
208    /// Add a group membership (builder).
209    #[must_use]
210    pub fn with_group(mut self, group: impl Into<String>) -> Self {
211        self.groups.push(group.into());
212        self
213    }
214
215    /// Whether this requester may read a document with the given [`DocAcl`].
216    ///
217    /// `true` when the doc is public, or the requester's user id is in the
218    /// allow-list, or any of the requester's groups is in the allow-list.
219    #[must_use]
220    pub fn can_access(&self, acl: &DocAcl) -> bool {
221        if acl.public {
222            return true;
223        }
224        if let Some(uid) = &self.user_id {
225            if acl.users.iter().any(|u| u == uid) {
226                return true;
227            }
228        }
229        self.groups.iter().any(|g| acl.groups.contains(g))
230    }
231}
232
233/// Side table mapping a stored `document_id` to its [`DocAcl`]. Shared (`Arc`)
234/// between the ingest handle that populates it and every per-request reader that
235/// consults it. Documents absent from the table are org-public (no-ACL default).
236type AclTable = Arc<RwLock<HashMap<String, DocAcl>>>;
237
238/// An ACL-aware knowledge store: wraps any inner
239/// [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) and records document ACLs
240/// at ingest so retrieval can be filtered per requester.
241///
242/// Construction does **not** itself implement `KnowledgeBase` for reading,
243/// because reads must be bound to a requester. Instead:
244/// - [`ingest_handle`](AclKnowledgeStore::ingest_handle) returns an
245///   `Arc<dyn KnowledgeBase>` that records ACLs as it ingests (used by the
246///   ingestion pipeline);
247/// - [`reader`](AclKnowledgeStore::reader) mints an ACL-filtering
248///   `Arc<dyn KnowledgeBase>` bound to a specific [`AccessContext`] (used by the
249///   runtime + `knowledge_search` tool for a turn).
250#[derive(Clone)]
251pub struct AclKnowledgeStore {
252    inner: Arc<dyn KnowledgeBase>,
253    acls: AclTable,
254}
255
256impl AclKnowledgeStore {
257    /// Wrap an inner knowledge base. The store starts with an empty ACL table;
258    /// every document ingested through [`ingest_handle`](Self::ingest_handle)
259    /// that carries a [`DocAcl`] in its metadata is recorded.
260    #[must_use]
261    pub fn new(inner: Arc<dyn KnowledgeBase>) -> Self {
262        Self {
263            inner,
264            acls: Arc::new(RwLock::new(HashMap::new())),
265        }
266    }
267
268    /// An ingest-side handle: a [`KnowledgeBase`] whose `ingest` records the
269    /// document's ACL (if any) in the shared side table, then forwards to the
270    /// inner backend. Its `query` is the **unfiltered** inner query (callers
271    /// that read for a specific requester use [`reader`](Self::reader) instead).
272    #[must_use]
273    pub fn ingest_handle(&self) -> Arc<dyn KnowledgeBase> {
274        Arc::new(AclIngestHandle {
275            inner: Arc::clone(&self.inner),
276            acls: Arc::clone(&self.acls),
277        })
278    }
279
280    /// A read-side handle bound to `ctx`: a [`KnowledgeBase`] whose `query`
281    /// over-fetches from the inner backend and drops every result the requester
282    /// is not entitled to before truncating to the requested limit.
283    #[must_use]
284    pub fn reader(&self, ctx: AccessContext) -> Arc<dyn KnowledgeBase> {
285        Arc::new(AclReader {
286            inner: Arc::clone(&self.inner),
287            acls: Arc::clone(&self.acls),
288            ctx,
289        })
290    }
291
292    /// Record `document_id → acl` directly (without ingesting a document) — for
293    /// callers that store documents through some other path but still want the
294    /// ACL enforced at read.
295    ///
296    /// # Errors
297    /// Returns an error if the ACL table lock is poisoned.
298    pub fn record_acl(&self, document_id: impl Into<String>, acl: DocAcl) -> anyhow::Result<()> {
299        let mut table = self
300            .acls
301            .write()
302            .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
303        table.insert(document_id.into(), acl);
304        Ok(())
305    }
306}
307
308/// Records ACLs at ingest, forwarding documents to the inner backend.
309struct AclIngestHandle {
310    inner: Arc<dyn KnowledgeBase>,
311    acls: AclTable,
312}
313
314impl KnowledgeBase for AclIngestHandle {
315    fn ingest(&self, doc: Document) -> anyhow::Result<()> {
316        // Record the ACL (if the document carries one) keyed by document id, so
317        // a later query result with that document_id can be access-checked.
318        if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
319            let mut table = self
320                .acls
321                .write()
322                .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
323            table.insert(doc.id.clone(), acl);
324        }
325        self.inner.ingest(doc)
326    }
327
328    fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
329        // Ingest handle reads are unfiltered (no requester bound). Production
330        // reads go through `reader(ctx)`.
331        self.inner.query(query, limit)
332    }
333}
334
335/// Filters query results by a bound [`AccessContext`].
336struct AclReader {
337    inner: Arc<dyn KnowledgeBase>,
338    acls: AclTable,
339    ctx: AccessContext,
340}
341
342impl KnowledgeBase for AclReader {
343    fn ingest(&self, doc: Document) -> anyhow::Result<()> {
344        // A reader can still ingest (recording ACLs), so the same handle is
345        // usable end to end in tests — but production ingest uses ingest_handle.
346        if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
347            let mut table = self
348                .acls
349                .write()
350                .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
351            table.insert(doc.id.clone(), acl);
352        }
353        self.inner.ingest(doc)
354    }
355
356    fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
357        // Over-fetch so the post-filter top-K is full whenever enough accessible
358        // documents exist.
359        let candidate_n = limit.saturating_mul(OVERFETCH_FACTOR).max(OVERFETCH_FLOOR);
360        let candidates = self.inner.query(query, candidate_n)?;
361
362        let table = self
363            .acls
364            .read()
365            .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
366
367        let mut out = Vec::with_capacity(limit.min(candidates.len()));
368        for result in candidates {
369            // No recorded ACL ⇒ org-public (backward-compatible default).
370            let allowed = match table.get(&result.document_id) {
371                Some(acl) => self.ctx.can_access(acl),
372                None => true,
373            };
374            if allowed {
375                out.push(result);
376                if out.len() == limit {
377                    break;
378                }
379            }
380        }
381        Ok(out)
382    }
383}
384
#[cfg(test)]
mod tests {
    use super::*;

    /// Requester with a user id and the given group memberships.
    fn member(user: &str, groups: &[&str]) -> AccessContext {
        AccessContext::new(
            Some(user.to_string()),
            groups.iter().map(|g| (*g).to_string()).collect(),
        )
    }

    /// A minimal document carrying no ACL stamp.
    fn plain_doc() -> Document {
        Document::new("c", "s", smooth_operator_core::DocumentType::Documentation)
    }

    // ---- can_access matrix ----------------------------------------------

    #[test]
    fn can_access_public_allows_anyone() {
        let open = DocAcl::public();
        let requesters = [AccessContext::anonymous(), AccessContext::for_user("anyone")];
        for requester in requesters.iter() {
            assert!(requester.can_access(&open));
        }
    }

    #[test]
    fn can_access_user_match() {
        let alice_only = DocAcl::for_users(["alice"]);
        let alice = AccessContext::for_user("alice");
        assert!(alice.can_access(&alice_only));
    }

    #[test]
    fn can_access_user_no_match_is_denied() {
        let alice_only = DocAcl::for_users(["alice"]);
        // A different user, and an anonymous requester (no user id at all),
        // are both refused a user-restricted document.
        let outsiders = [AccessContext::for_user("bob"), AccessContext::anonymous()];
        for outsider in outsiders.iter() {
            assert!(!outsider.can_access(&alice_only));
        }
    }

    #[test]
    fn can_access_group_match() {
        let support_only = DocAcl::for_groups(["support"]);
        assert!(member("carol", &["support"]).can_access(&support_only));
    }

    #[test]
    fn can_access_group_no_match_is_denied() {
        let support_only = DocAcl::for_groups(["support"]);
        assert!(!member("dave", &["billing"]).can_access(&support_only));
    }

    #[test]
    fn can_access_empty_acl_is_fully_locked() {
        // An explicit but empty DocAcl (public:false, no users/groups) admits
        // nobody. "No DocAcl recorded" is a different case (org-public).
        let locked = DocAcl::default();
        let requesters = [
            AccessContext::for_user("alice"),
            AccessContext::anonymous(),
            member("x", &["g"]),
        ];
        for requester in requesters.iter() {
            assert!(!requester.can_access(&locked));
        }
    }

    #[test]
    fn can_access_mixed_user_or_group() {
        // Not public, but either being user alice or being in group support
        // is sufficient on its own.
        let mixed = DocAcl::for_users(["alice"]).with_groups(["support"]);
        assert!(AccessContext::for_user("alice").can_access(&mixed));
        assert!(member("zed", &["support"]).can_access(&mixed));
        assert!(!member("zed", &["billing"]).can_access(&mixed));
    }

    // ---- DocAcl (de)serialization round-trip ----------------------------

    #[test]
    fn docacl_round_trips_through_metadata() {
        let original = DocAcl::for_users(["alice", "bob"]).with_groups(["support"]);
        let stamped = original.attach_to(plain_doc());
        let recovered = DocAcl::from_metadata(&stamped.metadata).expect("acl present");
        assert_eq!(recovered, original);
    }

    #[test]
    fn from_metadata_absent_is_none() {
        let unstamped = plain_doc();
        assert!(DocAcl::from_metadata(&unstamped.metadata).is_none());
    }

    #[test]
    fn from_metadata_malformed_is_none() {
        let metadata: HashMap<String, String> = std::iter::once((
            DocAcl::ACL_METADATA_KEY.to_string(),
            "{not json".to_string(),
        ))
        .collect();
        assert!(DocAcl::from_metadata(&metadata).is_none());
    }
}