smooth_operator/access_control.rs
1//! Document-level access control (feature gap G3).
2//!
3//! Org isolation already exists (every conversation / knowledge row carries an
4//! `organizationId`, and the Postgres knowledge base filters on it). This module
5//! adds the **within-org user/group entitlement** layer the industry calls
6//! document-level permissions: even inside one organization, a document may be
7//! restricted to specific users or groups, and a retrieval must only ever return
8//! documents the requester is entitled to read.
9//!
10//! ## Why enforcement lives in our layer
11//!
12//! smooth-operator's [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) trait is
13//! upstream and **read-only to us**: its `query` returns a
14//! [`KnowledgeResult`](smooth_operator_core::KnowledgeResult) that carries only
15//! `document_id` / `chunk` / `score` / `source` — *not* the stored metadata —
16//! and the in-memory backend drops document metadata on ingest entirely. So we
17//! cannot read an ACL back out of a query result. Instead this module:
18//!
19//! 1. Records the document → [`DocAcl`] mapping **at ingest** (parsed from the
20//! [`DocAcl::ACL_METADATA_KEY`] metadata the document carries) into a side
21//! table the [`AclKnowledgeStore`] owns, while forwarding the document
22//! unchanged to the inner backend.
23//! 2. **Filters at read**: an [`AclReader`] bound to the requester's
24//! [`AccessContext`] over-fetches from the inner backend, looks each result's
25//! ACL up in the side table, and drops any the requester cannot access before
26//! truncating to the requested `K`.
27//!
28//! This is backend-agnostic: the same [`AclKnowledgeStore`] wraps the in-memory,
29//! Postgres, or DynamoDB knowledge base identically (the post-filter happens in
30//! our layer, after the backend's own org-scoped query).
31//!
32//! ## No-ACL default semantics — **no-acl ⇒ org-public**
33//!
34//! A document ingested **without** an ACL (the legacy / existing-seed path) has
35//! no entry in the side table and is treated as **org-public**: visible to
36//! anyone whose query reaches it (org isolation already happened upstream). This
37//! keeps existing seeded knowledge retrievable and makes ACLs strictly additive
38//! — you opt a document *into* restriction by attaching a [`DocAcl`]. An
39//! explicit `DocAcl::default()` (all fields empty, `public: false`) is the
40//! opposite: a fully-locked document that nobody can read until users or groups are listed.
41
42use std::collections::HashMap;
43use std::sync::{Arc, RwLock};
44
45use serde::{Deserialize, Serialize};
46use smooth_operator_core::{Document, KnowledgeBase, KnowledgeResult};
47
/// Multiplier applied to the caller's `limit` when asking the inner backend
/// for candidates. Some candidates are discarded by the ACL post-filter, so
/// requesting `limit * OVERFETCH_FACTOR` keeps the filtered top-K full whenever
/// enough accessible documents exist. Mirrors the over-fetch the Postgres
/// backend already does for RRF fusion.
const OVERFETCH_FACTOR: usize = 5;

/// Minimum candidate-pool size. Applied after the multiplication so that even
/// a very small `limit` over-fetches enough to survive filtering.
const OVERFETCH_FLOOR: usize = 20;
58
59/// A document's allow-list — who may read it within the organization.
60///
61/// A requester may read the document when **any** of these hold:
62/// - the document is [`public`](DocAcl::public),
63/// - the requester's `user_id` is in [`users`](DocAcl::users),
64/// - any of the requester's groups is in [`groups`](DocAcl::groups).
65///
66/// The default (`public: false`, empty `users`/`groups`) is a fully-locked
67/// document. Note that "no `DocAcl` recorded at all" is *different* — that is
68/// org-public (see the module-level no-ACL default semantics).
69#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
70pub struct DocAcl {
71 /// When true, any requester reaching this document may read it.
72 #[serde(default)]
73 pub public: bool,
74 /// User ids explicitly allowed to read this document.
75 #[serde(default)]
76 pub users: Vec<String>,
77 /// Group ids explicitly allowed to read this document.
78 #[serde(default)]
79 pub groups: Vec<String>,
80}
81
82impl DocAcl {
83 /// The document-metadata key under which a [`DocAcl`] is serialized (as
84 /// JSON) so it survives the trip through the ingestion pipeline and into the
85 /// [`AclKnowledgeStore`]'s side table.
86 pub const ACL_METADATA_KEY: &'static str = "acl_v2";
87
88 /// A document readable by anyone reaching it.
89 #[must_use]
90 pub fn public() -> Self {
91 Self {
92 public: true,
93 ..Self::default()
94 }
95 }
96
97 /// A document readable only by the listed users.
98 #[must_use]
99 pub fn for_users<I, S>(users: I) -> Self
100 where
101 I: IntoIterator<Item = S>,
102 S: Into<String>,
103 {
104 Self {
105 public: false,
106 users: users.into_iter().map(Into::into).collect(),
107 groups: Vec::new(),
108 }
109 }
110
111 /// A document readable only by members of the listed groups.
112 #[must_use]
113 pub fn for_groups<I, S>(groups: I) -> Self
114 where
115 I: IntoIterator<Item = S>,
116 S: Into<String>,
117 {
118 Self {
119 public: false,
120 users: Vec::new(),
121 groups: groups.into_iter().map(Into::into).collect(),
122 }
123 }
124
125 /// Allow these additional users (builder).
126 #[must_use]
127 pub fn with_users<I, S>(mut self, users: I) -> Self
128 where
129 I: IntoIterator<Item = S>,
130 S: Into<String>,
131 {
132 self.users.extend(users.into_iter().map(Into::into));
133 self
134 }
135
136 /// Allow these additional groups (builder).
137 #[must_use]
138 pub fn with_groups<I, S>(mut self, groups: I) -> Self
139 where
140 I: IntoIterator<Item = S>,
141 S: Into<String>,
142 {
143 self.groups.extend(groups.into_iter().map(Into::into));
144 self
145 }
146
147 /// Serialize this ACL into a document's metadata under
148 /// [`ACL_METADATA_KEY`](DocAcl::ACL_METADATA_KEY) (builder over a
149 /// [`Document`]). This is how a connector / ingest path stamps an ACL onto a
150 /// document so the [`AclKnowledgeStore`] records it.
151 ///
152 /// # Panics
153 /// Never — [`DocAcl`] always serializes to JSON.
154 #[must_use]
155 pub fn attach_to(&self, doc: Document) -> Document {
156 let json = serde_json::to_string(self).expect("DocAcl always serializes");
157 doc.with_metadata(Self::ACL_METADATA_KEY, json)
158 }
159
160 /// Parse a [`DocAcl`] out of a document's metadata, if one is present and
161 /// well-formed. Returns `None` when the key is absent (no-ACL ⇒ org-public)
162 /// or the value fails to parse (treated as absent so a malformed stamp can't
163 /// silently lock or unlock a document — it falls back to the default).
164 #[must_use]
165 pub fn from_metadata(metadata: &HashMap<String, String>) -> Option<Self> {
166 let raw = metadata.get(Self::ACL_METADATA_KEY)?;
167 serde_json::from_str(raw).ok()
168 }
169}
170
/// Who a retrieval runs *as*: the requester's user id plus group memberships.
///
/// Constructed from the authenticated user and their groups (resolved upstream
/// from the auth context) and handed to the knowledge-retrieval path, where
/// [`AccessContext::can_access`] decides which results survive.
///
/// The `Default` value has no user id and no groups, i.e. an anonymous
/// requester.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct AccessContext {
    /// User id of the requester when authenticated as a user; `None` for an
    /// anonymous / system requester, which can then only match public ACLs
    /// (and documents with no ACL recorded).
    pub user_id: Option<String>,
    /// Ids of the groups the requester is a member of.
    pub groups: Vec<String>,
}
184
185impl AccessContext {
186 /// Build a context from an optional user id and a set of groups.
187 #[must_use]
188 pub fn new(user_id: Option<String>, groups: Vec<String>) -> Self {
189 Self { user_id, groups }
190 }
191
192 /// A context for a specific user with no group memberships.
193 #[must_use]
194 pub fn for_user(user_id: impl Into<String>) -> Self {
195 Self {
196 user_id: Some(user_id.into()),
197 groups: Vec::new(),
198 }
199 }
200
201 /// An anonymous requester: no user id, no groups. Sees only public and
202 /// no-ACL (org-public) documents.
203 #[must_use]
204 pub fn anonymous() -> Self {
205 Self::default()
206 }
207
208 /// Add a group membership (builder).
209 #[must_use]
210 pub fn with_group(mut self, group: impl Into<String>) -> Self {
211 self.groups.push(group.into());
212 self
213 }
214
215 /// Whether this requester may read a document with the given [`DocAcl`].
216 ///
217 /// `true` when the doc is public, or the requester's user id is in the
218 /// allow-list, or any of the requester's groups is in the allow-list.
219 #[must_use]
220 pub fn can_access(&self, acl: &DocAcl) -> bool {
221 if acl.public {
222 return true;
223 }
224 if let Some(uid) = &self.user_id {
225 if acl.users.iter().any(|u| u == uid) {
226 return true;
227 }
228 }
229 self.groups.iter().any(|g| acl.groups.contains(g))
230 }
231}
232
233/// Side table mapping a stored `document_id` to its [`DocAcl`]. Shared (`Arc`)
234/// between the ingest handle that populates it and every per-request reader that
235/// consults it. Documents absent from the table are org-public (no-ACL default).
236type AclTable = Arc<RwLock<HashMap<String, DocAcl>>>;
237
238/// An ACL-aware knowledge store: wraps any inner
239/// [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) and records document ACLs
240/// at ingest so retrieval can be filtered per requester.
241///
242/// Construction does **not** itself implement `KnowledgeBase` for reading,
243/// because reads must be bound to a requester. Instead:
244/// - [`ingest_handle`](AclKnowledgeStore::ingest_handle) returns an
245/// `Arc<dyn KnowledgeBase>` that records ACLs as it ingests (used by the
246/// ingestion pipeline);
247/// - [`reader`](AclKnowledgeStore::reader) mints an ACL-filtering
248/// `Arc<dyn KnowledgeBase>` bound to a specific [`AccessContext`] (used by the
249/// runtime + `knowledge_search` tool for a turn).
250#[derive(Clone)]
251pub struct AclKnowledgeStore {
252 inner: Arc<dyn KnowledgeBase>,
253 acls: AclTable,
254}
255
256impl AclKnowledgeStore {
257 /// Wrap an inner knowledge base. The store starts with an empty ACL table;
258 /// every document ingested through [`ingest_handle`](Self::ingest_handle)
259 /// that carries a [`DocAcl`] in its metadata is recorded.
260 #[must_use]
261 pub fn new(inner: Arc<dyn KnowledgeBase>) -> Self {
262 Self {
263 inner,
264 acls: Arc::new(RwLock::new(HashMap::new())),
265 }
266 }
267
268 /// An ingest-side handle: a [`KnowledgeBase`] whose `ingest` records the
269 /// document's ACL (if any) in the shared side table, then forwards to the
270 /// inner backend. Its `query` is the **unfiltered** inner query (callers
271 /// that read for a specific requester use [`reader`](Self::reader) instead).
272 #[must_use]
273 pub fn ingest_handle(&self) -> Arc<dyn KnowledgeBase> {
274 Arc::new(AclIngestHandle {
275 inner: Arc::clone(&self.inner),
276 acls: Arc::clone(&self.acls),
277 })
278 }
279
280 /// A read-side handle bound to `ctx`: a [`KnowledgeBase`] whose `query`
281 /// over-fetches from the inner backend and drops every result the requester
282 /// is not entitled to before truncating to the requested limit.
283 #[must_use]
284 pub fn reader(&self, ctx: AccessContext) -> Arc<dyn KnowledgeBase> {
285 Arc::new(AclReader {
286 inner: Arc::clone(&self.inner),
287 acls: Arc::clone(&self.acls),
288 ctx,
289 })
290 }
291
292 /// Record `document_id → acl` directly (without ingesting a document) — for
293 /// callers that store documents through some other path but still want the
294 /// ACL enforced at read.
295 ///
296 /// # Errors
297 /// Returns an error if the ACL table lock is poisoned.
298 pub fn record_acl(&self, document_id: impl Into<String>, acl: DocAcl) -> anyhow::Result<()> {
299 let mut table = self
300 .acls
301 .write()
302 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
303 table.insert(document_id.into(), acl);
304 Ok(())
305 }
306}
307
308/// Records ACLs at ingest, forwarding documents to the inner backend.
309struct AclIngestHandle {
310 inner: Arc<dyn KnowledgeBase>,
311 acls: AclTable,
312}
313
314impl KnowledgeBase for AclIngestHandle {
315 fn ingest(&self, doc: Document) -> anyhow::Result<()> {
316 // Record the ACL (if the document carries one) keyed by document id, so
317 // a later query result with that document_id can be access-checked.
318 if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
319 let mut table = self
320 .acls
321 .write()
322 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
323 table.insert(doc.id.clone(), acl);
324 }
325 self.inner.ingest(doc)
326 }
327
328 fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
329 // Ingest handle reads are unfiltered (no requester bound). Production
330 // reads go through `reader(ctx)`.
331 self.inner.query(query, limit)
332 }
333}
334
335/// Filters query results by a bound [`AccessContext`].
336struct AclReader {
337 inner: Arc<dyn KnowledgeBase>,
338 acls: AclTable,
339 ctx: AccessContext,
340}
341
342impl KnowledgeBase for AclReader {
343 fn ingest(&self, doc: Document) -> anyhow::Result<()> {
344 // A reader can still ingest (recording ACLs), so the same handle is
345 // usable end to end in tests — but production ingest uses ingest_handle.
346 if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
347 let mut table = self
348 .acls
349 .write()
350 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
351 table.insert(doc.id.clone(), acl);
352 }
353 self.inner.ingest(doc)
354 }
355
356 fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
357 // Over-fetch so the post-filter top-K is full whenever enough accessible
358 // documents exist.
359 let candidate_n = limit.saturating_mul(OVERFETCH_FACTOR).max(OVERFETCH_FLOOR);
360 let candidates = self.inner.query(query, candidate_n)?;
361
362 let table = self
363 .acls
364 .read()
365 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
366
367 let mut out = Vec::with_capacity(limit.min(candidates.len()));
368 for result in candidates {
369 // No recorded ACL ⇒ org-public (backward-compatible default).
370 let allowed = match table.get(&result.document_id) {
371 Some(acl) => self.ctx.can_access(acl),
372 None => true,
373 };
374 if allowed {
375 out.push(result);
376 if out.len() == limit {
377 break;
378 }
379 }
380 }
381 Ok(out)
382 }
383}
384
#[cfg(test)]
mod tests {
    use super::*;

    /// An authenticated requester with the given group memberships.
    fn member(user: &str, groups: &[&str]) -> AccessContext {
        AccessContext::new(
            Some(user.to_string()),
            groups.iter().map(|g| (*g).to_string()).collect(),
        )
    }

    /// A minimal document carrying no ACL stamp.
    fn plain_doc() -> Document {
        Document::new("c", "s", smooth_operator_core::DocumentType::Documentation)
    }

    // ---- can_access matrix ----------------------------------------------

    #[test]
    fn can_access_public_allows_anyone() {
        let acl = DocAcl::public();
        for ctx in [AccessContext::anonymous(), AccessContext::for_user("anyone")] {
            assert!(ctx.can_access(&acl));
        }
    }

    #[test]
    fn can_access_user_match() {
        let alice = AccessContext::for_user("alice");
        assert!(alice.can_access(&DocAcl::for_users(["alice"])));
    }

    #[test]
    fn can_access_user_no_match_is_denied() {
        let acl = DocAcl::for_users(["alice"]);
        // A different user is refused ...
        assert!(!AccessContext::for_user("bob").can_access(&acl));
        // ... and so is an anonymous requester (no user id at all).
        assert!(!AccessContext::anonymous().can_access(&acl));
    }

    #[test]
    fn can_access_group_match() {
        let acl = DocAcl::for_groups(["support"]);
        assert!(member("carol", &["support"]).can_access(&acl));
    }

    #[test]
    fn can_access_group_no_match_is_denied() {
        let acl = DocAcl::for_groups(["support"]);
        assert!(!member("dave", &["billing"]).can_access(&acl));
    }

    #[test]
    fn can_access_empty_acl_is_fully_locked() {
        // An explicit but empty DocAcl (public: false, no users, no groups)
        // refuses everyone — unlike "no DocAcl recorded", which is org-public.
        let locked = DocAcl::default();
        let requesters = [
            AccessContext::for_user("alice"),
            AccessContext::anonymous(),
            member("x", &["g"]),
        ];
        for ctx in &requesters {
            assert!(!ctx.can_access(&locked));
        }
    }

    #[test]
    fn can_access_mixed_user_or_group() {
        // Not public, but user alice OR group support is enough on its own.
        let acl = DocAcl::for_users(["alice"]).with_groups(["support"]);
        assert!(AccessContext::for_user("alice").can_access(&acl));
        assert!(member("zed", &["support"]).can_access(&acl));
        assert!(!member("zed", &["billing"]).can_access(&acl));
    }

    // ---- DocAcl (de)serialization round-trip ----------------------------

    #[test]
    fn docacl_round_trips_through_metadata() {
        let acl = DocAcl::for_users(["alice", "bob"]).with_groups(["support"]);
        let stamped = acl.attach_to(plain_doc());
        let recovered = DocAcl::from_metadata(&stamped.metadata).expect("acl present");
        assert_eq!(recovered, acl);
    }

    #[test]
    fn from_metadata_absent_is_none() {
        let doc = plain_doc();
        assert!(DocAcl::from_metadata(&doc.metadata).is_none());
    }

    #[test]
    fn from_metadata_malformed_is_none() {
        let key = DocAcl::ACL_METADATA_KEY.to_string();
        let metadata: HashMap<String, String> =
            [(key, "{not json".to_string())].into_iter().collect();
        assert!(DocAcl::from_metadata(&metadata).is_none());
    }
}