smooth_operator/curation.rs
1//! Document sets, curation boosting, and query-time metadata filters
2//! (feature gap, Phase 11 — "Document sets / curation / boosting").
3//!
4//! Curation lets a user (a) group documents into named **document sets** so a
5//! query can be scoped to "only the dev-support repo" or "only the HR handbook",
6//! (b) **boost** canonical documents (the README, the policy of record) so they
7//! outrank merely-similar matches, and (c) filter retrieval by arbitrary
8//! **metadata equality** ("only prose, not code"). This module adds all three to
9//! smooth-operator, as a retrieval-time filter applied in *our* layer — exactly
10//! like [`AclKnowledgeStore`](crate::access_control::AclKnowledgeStore).
11//!
12//! ## Why enforcement lives in our layer (same reason as ACL)
13//!
14//! The engine's [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) trait is
15//! upstream and read-only to us; its `query` returns a
16//! [`KnowledgeResult`](smooth_operator_core::KnowledgeResult) carrying only
17//! `document_id` / `chunk` / `score` / `source` — **not** the stored metadata —
18//! and the in-memory backend drops document metadata on ingest entirely. So we
19//! cannot read a document's set membership / boost / metadata back out of a
20//! query result. Instead this module, mirroring [`access_control`](crate::access_control):
21//!
22//! 1. Records each document's [`DocMeta`] (parsed from the metadata stamped at
23//! ingest — see the [metadata convention](#metadata-convention)) into a side
24//! table the [`CuratedKnowledgeStore`] owns, while forwarding the document
25//! unchanged to the inner backend.
26//! 2. **Filters + boosts at read**: a reader bound to a [`RetrievalFilter`]
27//! over-fetches from the inner backend, drops documents not in the requested
28//! sets / not matching the metadata equalities, multiplies each surviving
29//! result's score by its [`boost`](DocMeta::boost), **re-sorts** by the boosted
30//! score, and truncates to the requested `K`.
31//!
32//! ## Metadata convention
33//!
34//! Stamped onto [`Document::metadata`](smooth_operator_core::Document) at ingest
35//! (the ingestion pipeline writes these; a connector/ingest config supplies the
36//! values):
37//!
38//! - **`document_set`** — set membership. **Multi-valued via a comma list**:
39//! `"alpha"` is one set, `"alpha,beta"` is both. Names are trimmed of
40//! surrounding whitespace; empty names are dropped. A document with no
41//! `document_set` belongs to no named set (so a set-scoped query never
42//! surfaces it, but an unscoped query — `document_sets: None` — still does).
43//! - **`boost`** — a parsed `f32` multiplier on the similarity score, default
44//! **1.0**. Absent or malformed (`"abc"`, `""`, `NaN`, non-finite) ⇒ `1.0`, so
45//! a bad stamp can never silently zero out or explode a document's ranking.
46//! Negative boosts are clamped to `0.0` (a curator can bury a doc, never invert
47//! ordering). `boost > 1.0` promotes; `0.0 ≤ boost < 1.0` demotes.
48//!
49//! ## Composition with ACL
50//!
51//! A [`CuratedKnowledgeStore`] also records [`DocAcl`](crate::access_control::DocAcl)s
52//! at ingest (same `acl_v2` key) and its reader takes an
53//! [`AccessContext`](crate::access_control::AccessContext) alongside the
54//! [`RetrievalFilter`]. **Both filters apply (logical AND)**: a result is
55//! returned only if the requester is entitled to it (ACL) *and* it is in the
56//! requested sets *and* it matches the metadata equalities. ACL is checked first
57//! (a curation filter must never widen what a requester can see).
58
59use std::collections::HashMap;
60use std::sync::{Arc, RwLock};
61
62use serde::{Deserialize, Serialize};
63use smooth_operator_core::{Document, KnowledgeBase, KnowledgeResult};
64
65use crate::access_control::{AccessContext, DocAcl};
66
/// Over-fetch multiplier: a curated reader asks the inner backend for
/// `limit * OVERFETCH_FACTOR` candidates (never fewer than [`OVERFETCH_FLOOR`])
/// so that dropping non-matching documents and re-ranking by boost is less
/// likely to leave the final top-K short.
///
/// The candidate pool is still bounded: a matching document the inner backend
/// ranks beyond the pool is never seen by the reader.
const OVERFETCH_FACTOR: usize = 5;
72
/// Minimum size of the candidate pool requested from the inner backend, so a
/// small `limit` (where `limit * OVERFETCH_FACTOR` would be tiny) still
/// over-fetches enough candidates to survive filtering and boost re-ranking.
const OVERFETCH_FLOOR: usize = 20;
76
/// Boost used when a document carries no `boost` metadata value, or one that
/// does not parse to a finite number. Multiplying a score by `1.0` leaves the
/// raw similarity score unchanged.
pub const DEFAULT_BOOST: f32 = 1.0;
80
/// Curation metadata kept for one stored document: the named sets it belongs
/// to, its retrieval boost, and a copy of the raw metadata map so a
/// [`RetrievalFilter`]'s `metadata_eq` equalities can be evaluated at read time.
///
/// Built from a metadata map by [`DocMeta::from_metadata`], or from a whole
/// [`Document`] by [`DocMeta::from_document`].
#[derive(Debug, Clone, PartialEq)]
pub struct DocMeta {
    /// Names of the sets this document belongs to, as parsed from the
    /// comma-separated `document_set` metadata value. Empty when the document
    /// is in no set.
    pub document_sets: Vec<String>,
    /// Multiplier applied to the document's score at read time. Parsed from the
    /// `boost` metadata value; [`DEFAULT_BOOST`] when that value is absent or
    /// unusable, and never negative when produced by [`DocMeta::parse_boost`].
    pub boost: f32,
    /// Copy of the full metadata map the document was stamped with, retained so
    /// arbitrary key/value equalities can be tested against it.
    pub metadata: HashMap<String, String>,
}
98
99impl DocMeta {
100 /// The document-metadata key under which set membership is stamped. The
101 /// value is a **comma-separated** list (e.g. `"alpha"` or `"alpha,beta"`).
102 pub const DOCUMENT_SET_KEY: &'static str = "document_set";
103
104 /// The document-metadata key under which the numeric boost is stamped (a
105 /// stringified `f32`, e.g. `"3.0"`).
106 pub const BOOST_KEY: &'static str = "boost";
107
108 /// Parse the document-set list from a `document_set` metadata value:
109 /// comma-split, trim each name, drop empties. Returns an empty vec when the
110 /// key is absent or holds only whitespace/commas.
111 #[must_use]
112 pub fn parse_sets(metadata: &HashMap<String, String>) -> Vec<String> {
113 metadata
114 .get(Self::DOCUMENT_SET_KEY)
115 .map(|raw| {
116 raw.split(',')
117 .map(str::trim)
118 .filter(|s| !s.is_empty())
119 .map(ToString::to_string)
120 .collect()
121 })
122 .unwrap_or_default()
123 }
124
125 /// Parse the boost from a `boost` metadata value. Absent / unparseable /
126 /// non-finite ⇒ [`DEFAULT_BOOST`]; a parsed value is clamped to `≥ 0.0` so a
127 /// negative boost can only bury a document, never invert ordering.
128 #[must_use]
129 pub fn parse_boost(metadata: &HashMap<String, String>) -> f32 {
130 metadata
131 .get(Self::BOOST_KEY)
132 .and_then(|raw| raw.trim().parse::<f32>().ok())
133 .filter(|b| b.is_finite())
134 .map(|b| b.max(0.0))
135 .unwrap_or(DEFAULT_BOOST)
136 }
137
138 /// Build a [`DocMeta`] from a stored document's metadata.
139 #[must_use]
140 pub fn from_metadata(metadata: &HashMap<String, String>) -> Self {
141 Self {
142 document_sets: Self::parse_sets(metadata),
143 boost: Self::parse_boost(metadata),
144 metadata: metadata.clone(),
145 }
146 }
147
148 /// Build a [`DocMeta`] from a [`Document`] (convenience over
149 /// [`from_metadata`](Self::from_metadata)).
150 #[must_use]
151 pub fn from_document(doc: &Document) -> Self {
152 Self::from_metadata(&doc.metadata)
153 }
154
155 /// Whether this document belongs to the named set.
156 #[must_use]
157 pub fn in_set(&self, set: &str) -> bool {
158 self.document_sets.iter().any(|s| s == set)
159 }
160}
161
162/// Stamp a document-set membership onto a [`Document`]'s metadata (builder).
163///
164/// Multi-valued: pass several names to tag the document into all of them (stored
165/// as the comma-separated `document_set` value the [`CuratedKnowledgeStore`]
166/// parses). This is how a connector / ingest config tags a repo's docs into a
167/// named set (e.g. dev-support tags every doc from `acme/app` into set
168/// `"acme/app"`).
169#[must_use]
170pub fn with_document_set<I, S>(doc: Document, sets: I) -> Document
171where
172 I: IntoIterator<Item = S>,
173 S: Into<String>,
174{
175 let joined = sets
176 .into_iter()
177 .map(Into::into)
178 .filter(|s| !s.trim().is_empty())
179 .collect::<Vec<_>>()
180 .join(",");
181 if joined.is_empty() {
182 doc
183 } else {
184 doc.with_metadata(DocMeta::DOCUMENT_SET_KEY, joined)
185 }
186}
187
188/// Stamp a numeric boost onto a [`Document`]'s metadata (builder). A non-finite
189/// or negative value is normalized so the stamp is always a sane, parseable
190/// multiplier.
191#[must_use]
192pub fn with_boost(doc: Document, boost: f32) -> Document {
193 let boost = if boost.is_finite() {
194 boost.max(0.0)
195 } else {
196 DEFAULT_BOOST
197 };
198 doc.with_metadata(DocMeta::BOOST_KEY, format!("{boost}"))
199}
200
/// A query-time retrieval filter: restrict results to named document sets
/// and/or require metadata equalities.
///
/// - `document_sets: None` ⇒ **no set scoping**; every document is eligible.
///   `Some([])` ⇒ scoped to *no* set, which matches nothing.
///   `Some(["alpha"])` ⇒ only documents in set `"alpha"`. With several names a
///   document in **any** of them matches (union).
/// - `metadata_eq` ⇒ every `(key, value)` pair must be present with an equal
///   value in the document's stamped metadata (logical AND across the map).
///   An empty map imposes no metadata constraint.
///
/// The all-default value (also available as [`RetrievalFilter::none`]) matches
/// every document.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct RetrievalFilter {
    /// Restrict to documents in any of these sets. `None` ⇒ unscoped.
    /// Defaults to `None` when missing from serialized input.
    #[serde(default)]
    pub document_sets: Option<Vec<String>>,
    /// Required `key == value` metadata equalities; all must hold.
    /// Defaults to an empty map when missing from serialized input.
    #[serde(default)]
    pub metadata_eq: HashMap<String, String>,
}
223
224impl RetrievalFilter {
225 /// The no-op filter: no set scoping, no metadata constraint — matches every
226 /// document (preserves default retrieval behavior).
227 #[must_use]
228 pub fn none() -> Self {
229 Self::default()
230 }
231
232 /// Scope retrieval to the given document sets (a doc in any of them matches).
233 #[must_use]
234 pub fn in_sets<I, S>(sets: I) -> Self
235 where
236 I: IntoIterator<Item = S>,
237 S: Into<String>,
238 {
239 Self {
240 document_sets: Some(sets.into_iter().map(Into::into).collect()),
241 metadata_eq: HashMap::new(),
242 }
243 }
244
245 /// Add a required metadata equality (builder).
246 #[must_use]
247 pub fn with_metadata_eq(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
248 self.metadata_eq.insert(key.into(), value.into());
249 self
250 }
251
252 /// Whether this filter imposes no constraint at all (so retrieval is
253 /// unchanged). Used to short-circuit the over-fetch when there's nothing to
254 /// filter on.
255 #[must_use]
256 pub fn is_unconstrained(&self) -> bool {
257 self.document_sets.is_none() && self.metadata_eq.is_empty()
258 }
259
260 /// Whether a document with the given [`DocMeta`] passes this filter.
261 ///
262 /// True when (a) `document_sets` is `None` *or* the doc is in at least one of
263 /// the listed sets, **and** (b) every `metadata_eq` entry is present and
264 /// equal in the doc's metadata.
265 #[must_use]
266 pub fn matches(&self, meta: &DocMeta) -> bool {
267 if let Some(sets) = &self.document_sets {
268 if !sets.iter().any(|s| meta.in_set(s)) {
269 return false;
270 }
271 }
272 self.metadata_eq
273 .iter()
274 .all(|(k, v)| meta.metadata.get(k).is_some_and(|mv| mv == v))
275 }
276}
277
/// Side table mapping a stored `document_id` to its [`DocMeta`].
///
/// Wrapped in `Arc<RwLock<..>>` so the one table can be shared between the
/// ingest handle that writes it and the readers that consult it on each query.
type MetaTable = Arc<RwLock<HashMap<String, DocMeta>>>;
281
/// Side table mapping a stored `document_id` to its [`DocAcl`], so a curated
/// reader can check ACL and curation filter in the same pass over the
/// candidates. A document with no entry is treated as accessible by the reader.
type AclTable = Arc<RwLock<HashMap<String, DocAcl>>>;
286
/// A curation-aware knowledge store: wraps an inner
/// [`KnowledgeBase`](smooth_operator_core::KnowledgeBase), records each
/// ingested document's [`DocMeta`] (set membership / boost / metadata) **and**
/// its [`DocAcl`] (when stamped), and mints readers that apply a
/// [`RetrievalFilter`] and an [`AccessContext`] together at read time.
///
/// The store has no `KnowledgeBase` impl of its own in this module; reads and
/// writes go through handles it hands out:
/// - [`ingest_handle`](Self::ingest_handle) returns an `Arc<dyn KnowledgeBase>`
///   that records curation metadata + ACL as it ingests;
/// - [`reader`](Self::reader) returns a filtering/boosting
///   `Arc<dyn KnowledgeBase>` bound to a [`RetrievalFilter`] + [`AccessContext`].
///
/// Cloning the store is cheap and shares state: all three fields are `Arc`s.
#[derive(Clone)]
pub struct CuratedKnowledgeStore {
    // The wrapped backend that actually stores and searches documents.
    inner: Arc<dyn KnowledgeBase>,
    // document_id → curation metadata, shared with every handle.
    meta: MetaTable,
    // document_id → ACL, shared with every handle.
    acls: AclTable,
}
306
307impl CuratedKnowledgeStore {
308 /// Wrap an inner knowledge base. The store starts with empty side tables;
309 /// every document ingested through [`ingest_handle`](Self::ingest_handle) has
310 /// its [`DocMeta`] (and [`DocAcl`], if stamped) recorded.
311 #[must_use]
312 pub fn new(inner: Arc<dyn KnowledgeBase>) -> Self {
313 Self {
314 inner,
315 meta: Arc::new(RwLock::new(HashMap::new())),
316 acls: Arc::new(RwLock::new(HashMap::new())),
317 }
318 }
319
320 /// An ingest-side handle: a [`KnowledgeBase`] whose `ingest` records the
321 /// document's [`DocMeta`] (always) and [`DocAcl`] (if stamped) in the shared
322 /// side tables, then forwards to the inner backend. Its `query` is the
323 /// **unfiltered** inner query — production reads use [`reader`](Self::reader).
324 #[must_use]
325 pub fn ingest_handle(&self) -> Arc<dyn KnowledgeBase> {
326 Arc::new(CuratedIngestHandle {
327 inner: Arc::clone(&self.inner),
328 meta: Arc::clone(&self.meta),
329 acls: Arc::clone(&self.acls),
330 })
331 }
332
333 /// A read-side handle bound to a [`RetrievalFilter`] + [`AccessContext`]: a
334 /// [`KnowledgeBase`] whose `query` over-fetches from the inner backend, drops
335 /// every result the requester is not entitled to (ACL) or that does not match
336 /// the filter (sets/metadata), multiplies each survivor's score by its boost,
337 /// re-sorts, and truncates to the requested limit.
338 ///
339 /// Pass [`RetrievalFilter::none`] + [`AccessContext::anonymous`] for the
340 /// unscoped, ACL-default path (boost still applies — a boosted doc still
341 /// re-ranks).
342 #[must_use]
343 pub fn reader(&self, filter: RetrievalFilter, access: AccessContext) -> Arc<dyn KnowledgeBase> {
344 Arc::new(CuratedReader {
345 inner: Arc::clone(&self.inner),
346 meta: Arc::clone(&self.meta),
347 acls: Arc::clone(&self.acls),
348 filter,
349 access,
350 })
351 }
352
353 /// Record `document_id → meta` directly (without ingesting a document) — for
354 /// callers that store documents through some other path but still want the
355 /// curation filter/boost applied at read.
356 ///
357 /// # Errors
358 /// Returns an error if the metadata table lock is poisoned.
359 pub fn record_meta(&self, document_id: impl Into<String>, meta: DocMeta) -> anyhow::Result<()> {
360 let mut table = self
361 .meta
362 .write()
363 .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
364 table.insert(document_id.into(), meta);
365 Ok(())
366 }
367}
368
/// Ingest-side handle minted by [`CuratedKnowledgeStore::ingest_handle`]:
/// records curation metadata + ACL into the shared side tables at ingest and
/// forwards documents to the inner backend.
struct CuratedIngestHandle {
    // Backend that receives the forwarded documents and serves raw queries.
    inner: Arc<dyn KnowledgeBase>,
    // Shared document_id → DocMeta table, written on every ingest.
    meta: MetaTable,
    // Shared document_id → DocAcl table, written when a document carries an ACL.
    acls: AclTable,
}
376
377/// Shared ingest bookkeeping: record a document's [`DocMeta`] (always) and
378/// [`DocAcl`] (when stamped) into the side tables before forwarding it.
379fn record_ingest_metadata(meta: &MetaTable, acls: &AclTable, doc: &Document) -> anyhow::Result<()> {
380 {
381 let mut table = meta
382 .write()
383 .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
384 table.insert(doc.id.clone(), DocMeta::from_document(doc));
385 }
386 if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
387 let mut table = acls
388 .write()
389 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
390 table.insert(doc.id.clone(), acl);
391 }
392 Ok(())
393}
394
395impl KnowledgeBase for CuratedIngestHandle {
396 fn ingest(&self, doc: Document) -> anyhow::Result<()> {
397 record_ingest_metadata(&self.meta, &self.acls, &doc)?;
398 self.inner.ingest(doc)
399 }
400
401 fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
402 self.inner.query(query, limit)
403 }
404}
405
/// Read-side handle minted by [`CuratedKnowledgeStore::reader`]: filters and
/// boosts query results according to the [`RetrievalFilter`] and
/// [`AccessContext`] it was bound to.
struct CuratedReader {
    // Backend that is over-fetched from on every query.
    inner: Arc<dyn KnowledgeBase>,
    // Shared document_id → DocMeta table consulted for filter + boost.
    meta: MetaTable,
    // Shared document_id → DocAcl table consulted for the access check.
    acls: AclTable,
    // Set / metadata constraints every returned result must satisfy.
    filter: RetrievalFilter,
    // The requester on whose behalf results are access-checked.
    access: AccessContext,
}
415
416impl KnowledgeBase for CuratedReader {
417 fn ingest(&self, doc: Document) -> anyhow::Result<()> {
418 // A reader can still ingest (recording metadata + ACL), so the same
419 // handle is usable end to end in tests — production ingest uses
420 // ingest_handle.
421 record_ingest_metadata(&self.meta, &self.acls, &doc)?;
422 self.inner.ingest(doc)
423 }
424
425 fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
426 if limit == 0 {
427 return Ok(Vec::new());
428 }
429
430 // Over-fetch so the post-filter, post-boost top-K is full whenever
431 // enough matching documents exist.
432 let candidate_n = limit.saturating_mul(OVERFETCH_FACTOR).max(OVERFETCH_FLOOR);
433 let candidates = self.inner.query(query, candidate_n)?;
434
435 let meta_table = self
436 .meta
437 .read()
438 .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
439 let acl_table = self
440 .acls
441 .read()
442 .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
443
444 let mut kept: Vec<KnowledgeResult> = Vec::with_capacity(candidates.len());
445 for mut result in candidates {
446 // ACL first: a curation filter must never widen what a requester can
447 // see. No recorded ACL ⇒ org-public (backward-compatible default).
448 let acl_ok = match acl_table.get(&result.document_id) {
449 Some(acl) => self.access.can_access(acl),
450 None => true,
451 };
452 if !acl_ok {
453 continue;
454 }
455
456 // Then the curation filter (sets + metadata). A document with no
457 // recorded DocMeta is treated as an empty DocMeta: it belongs to no
458 // set (so a set-scoped query skips it) and has the default boost.
459 let doc_meta = meta_table.get(&result.document_id).cloned();
460 let meta_for_match = doc_meta.clone().unwrap_or_else(|| DocMeta {
461 document_sets: Vec::new(),
462 boost: DEFAULT_BOOST,
463 metadata: HashMap::new(),
464 });
465 if !self.filter.matches(&meta_for_match) {
466 continue;
467 }
468
469 // Apply the boost to the score before re-ranking.
470 result.score *= meta_for_match.boost;
471 kept.push(result);
472 }
473
474 // Re-sort by the boosted score (descending), then truncate to K. A
475 // stable, total order: NaN-safe via `total_cmp`.
476 kept.sort_by(|a, b| b.score.total_cmp(&a.score));
477 kept.truncate(limit);
478 Ok(kept)
479 }
480}
481
#[cfg(test)]
mod tests {
    use super::*;
    use smooth_operator_core::DocumentType;

    /// A documentation-typed test document with a fixed id.
    fn doc(id: &str, content: &str) -> Document {
        let mut built = Document::new(content, "s", DocumentType::Documentation);
        built.id = id.to_owned();
        built
    }

    /// Owned strings from literals, for comparing parsed set lists.
    fn names(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_owned()).collect()
    }

    // ---- DocMeta::parse_sets --------------------------------------------

    #[test]
    fn parse_sets_single_and_multi() {
        let single = with_document_set(doc("a", "x"), ["alpha"]);
        assert_eq!(DocMeta::parse_sets(&single.metadata), names(&["alpha"]));

        let double = with_document_set(doc("b", "x"), ["alpha", "beta"]);
        assert_eq!(
            DocMeta::parse_sets(&double.metadata),
            names(&["alpha", "beta"])
        );
    }

    #[test]
    fn parse_sets_trims_and_drops_empties() {
        let stamped = doc("c", "x").with_metadata(DocMeta::DOCUMENT_SET_KEY, " alpha , , beta ,");
        assert_eq!(
            DocMeta::parse_sets(&stamped.metadata),
            names(&["alpha", "beta"])
        );
    }

    #[test]
    fn parse_sets_absent_is_empty() {
        let unstamped = doc("d", "x");
        let parsed = DocMeta::parse_sets(&unstamped.metadata);
        assert!(parsed.is_empty());
    }

    // ---- DocMeta::parse_boost (boost math: default + malformed → 1.0) ----

    #[test]
    fn parse_boost_default_when_absent() {
        let unstamped = doc("e", "x");
        assert_eq!(DocMeta::parse_boost(&unstamped.metadata), DEFAULT_BOOST);
    }

    #[test]
    fn parse_boost_parses_valid() {
        let boosted = with_boost(doc("f", "x"), 3.0);
        let delta = (DocMeta::parse_boost(&boosted.metadata) - 3.0).abs();
        assert!(delta < f32::EPSILON);
    }

    #[test]
    fn parse_boost_malformed_falls_back_to_default() {
        for bad in ["abc", "", " ", "NaN", "inf", "1.2.3"] {
            let stamped = doc("g", "x").with_metadata(DocMeta::BOOST_KEY, bad);
            let parsed = DocMeta::parse_boost(&stamped.metadata);
            assert_eq!(
                parsed, DEFAULT_BOOST,
                "malformed boost {bad:?} must fall back to default"
            );
        }
    }

    #[test]
    fn parse_boost_negative_is_clamped_to_zero() {
        let stamped = doc("h", "x").with_metadata(DocMeta::BOOST_KEY, "-2.0");
        assert_eq!(DocMeta::parse_boost(&stamped.metadata), 0.0);
    }

    #[test]
    fn with_boost_normalizes_non_finite() {
        // A non-finite boost handed to the builder is stamped as the default.
        for (id, raw) in [("i", f32::NAN), ("j", f32::INFINITY)] {
            let stamped = with_boost(doc(id, "x"), raw);
            assert_eq!(DocMeta::parse_boost(&stamped.metadata), DEFAULT_BOOST);
        }
    }

    // ---- RetrievalFilter::matches (the filter predicate) ----------------

    /// Hand-built `DocMeta` for predicate tests (bypasses metadata parsing).
    fn meta(sets: &[&str], boost: f32, kv: &[(&str, &str)]) -> DocMeta {
        DocMeta {
            document_sets: names(sets),
            boost,
            metadata: kv
                .iter()
                .map(|(key, value)| ((*key).to_owned(), (*value).to_owned()))
                .collect(),
        }
    }

    #[test]
    fn unconstrained_filter_matches_everything() {
        let filter = RetrievalFilter::none();
        assert!(filter.is_unconstrained());
        assert!(filter.matches(&meta(&[], 1.0, &[])));
        assert!(filter.matches(&meta(&["alpha"], 1.0, &[("kind", "code")])));
    }

    #[test]
    fn set_scope_matches_only_members() {
        let filter = RetrievalFilter::in_sets(["alpha"]);
        assert!(!filter.is_unconstrained());
        assert!(filter.matches(&meta(&["alpha"], 1.0, &[])));
        // A member of several sets, one of which is requested.
        assert!(filter.matches(&meta(&["alpha", "beta"], 1.0, &[])));
        assert!(!filter.matches(&meta(&["beta"], 1.0, &[])));
        // In no set at all.
        assert!(!filter.matches(&meta(&[], 1.0, &[])));
    }

    #[test]
    fn set_scope_union_across_listed_sets() {
        let filter = RetrievalFilter::in_sets(["alpha", "gamma"]);
        assert!(filter.matches(&meta(&["gamma"], 1.0, &[])));
        assert!(filter.matches(&meta(&["alpha"], 1.0, &[])));
        assert!(!filter.matches(&meta(&["beta"], 1.0, &[])));
    }

    #[test]
    fn empty_set_list_matches_nothing() {
        let filter = RetrievalFilter {
            document_sets: Some(Vec::new()),
            ..RetrievalFilter::none()
        };
        assert!(!filter.matches(&meta(&["alpha"], 1.0, &[])));
        assert!(!filter.matches(&meta(&[], 1.0, &[])));
    }

    #[test]
    fn metadata_eq_requires_all_equalities() {
        let filter = RetrievalFilter::none()
            .with_metadata_eq("kind", "prose")
            .with_metadata_eq("lang", "en");
        assert!(filter.matches(&meta(&[], 1.0, &[("kind", "prose"), ("lang", "en")])));
        // One required key is missing.
        assert!(!filter.matches(&meta(&[], 1.0, &[("kind", "prose")])));
        // A required key has the wrong value.
        assert!(!filter.matches(&meta(&[], 1.0, &[("kind", "code"), ("lang", "en")])));
    }

    #[test]
    fn set_and_metadata_compose_with_and() {
        let filter = RetrievalFilter::in_sets(["alpha"]).with_metadata_eq("kind", "prose");
        assert!(filter.matches(&meta(&["alpha"], 1.0, &[("kind", "prose")])));
        // Set matches, metadata does not.
        assert!(!filter.matches(&meta(&["alpha"], 1.0, &[("kind", "code")])));
        // Metadata matches, set does not.
        assert!(!filter.matches(&meta(&["beta"], 1.0, &[("kind", "prose")])));
    }

    // ---- RetrievalFilter round-trips through serde ----------------------

    #[test]
    fn filter_round_trips_through_json() {
        let original = RetrievalFilter::in_sets(["alpha", "beta"]).with_metadata_eq("kind", "prose");
        let json = serde_json::to_string(&original).expect("serialize");
        let parsed: RetrievalFilter = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed, original);
    }

    // ---- store-level: ingest records DocMeta, reader applies it ---------

    /// A curated store over a fresh in-memory backend.
    fn curated_store() -> CuratedKnowledgeStore {
        let backend = smooth_operator_core::InMemoryKnowledge::new();
        CuratedKnowledgeStore::new(Arc::new(backend))
    }

    #[test]
    fn reader_with_no_filter_returns_all_with_boost_applied() {
        let store = curated_store();
        let ingest = store.ingest_handle();
        ingest
            .ingest(with_document_set(
                doc("a", "clearance alpha fact"),
                ["alpha"],
            ))
            .unwrap();
        ingest.ingest(doc("plain", "clearance plain fact")).unwrap();

        // Unconstrained filter: both documents come back.
        let reader = store.reader(RetrievalFilter::none(), AccessContext::anonymous());
        let hits = reader.query("clearance", 10).unwrap();
        let ids: Vec<String> = hits.into_iter().map(|hit| hit.document_id).collect();
        assert!(ids.iter().any(|id| id == "a"));
        assert!(ids.iter().any(|id| id == "plain"));
    }

    #[test]
    fn malformed_boost_metadata_yields_default_boost_at_read() {
        let store = curated_store();
        let ingest = store.ingest_handle();
        // A garbage boost stamp must not make the document vanish or explode;
        // it ranks with the default 1.0 boost.
        let garbage = doc("bad", "clearance fact").with_metadata(DocMeta::BOOST_KEY, "not-a-number");
        ingest.ingest(garbage).unwrap();

        let reader = store.reader(RetrievalFilter::none(), AccessContext::anonymous());
        let hits = reader.query("clearance", 10).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].document_id, "bad");
    }
}