Skip to main content

smooth_operator/
curation.rs

1//! Document sets, curation boosting, and query-time metadata filters
2//! (feature gap, Phase 11 — "Document sets / curation / boosting").
3//!
4//! Curation lets a user (a) group documents into named **document sets** so a
5//! query can be scoped to "only the dev-support repo" or "only the HR handbook",
6//! (b) **boost** canonical documents (the README, the policy of record) so they
7//! outrank merely-similar matches, and (c) filter retrieval by arbitrary
8//! **metadata equality** ("only prose, not code"). This module adds all three to
9//! smooth-operator, as a retrieval-time filter applied in *our* layer — exactly
10//! like [`AclKnowledgeStore`](crate::access_control::AclKnowledgeStore).
11//!
12//! ## Why enforcement lives in our layer (same reason as ACL)
13//!
14//! The engine's [`KnowledgeBase`](smooth_operator_core::KnowledgeBase) trait is
15//! upstream and read-only to us; its `query` returns a
16//! [`KnowledgeResult`](smooth_operator_core::KnowledgeResult) carrying only
17//! `document_id` / `chunk` / `score` / `source` — **not** the stored metadata —
18//! and the in-memory backend drops document metadata on ingest entirely. So we
19//! cannot read a document's set membership / boost / metadata back out of a
20//! query result. Instead this module, mirroring [`access_control`](crate::access_control):
21//!
22//! 1. Records each document's [`DocMeta`] (parsed from the metadata stamped at
23//!    ingest — see the [metadata convention](#metadata-convention)) into a side
24//!    table the [`CuratedKnowledgeStore`] owns, while forwarding the document
25//!    unchanged to the inner backend.
26//! 2. **Filters + boosts at read**: a reader bound to a [`RetrievalFilter`]
27//!    over-fetches from the inner backend, drops documents not in the requested
28//!    sets / not matching the metadata equalities, multiplies each surviving
29//!    result's score by its [`boost`](DocMeta::boost), **re-sorts** by the boosted
30//!    score, and truncates to the requested `K`.
31//!
32//! ## Metadata convention
33//!
34//! Stamped onto [`Document::metadata`](smooth_operator_core::Document) at ingest
35//! (the ingestion pipeline writes these; a connector/ingest config supplies the
36//! values):
37//!
38//! - **`document_set`** — set membership. **Multi-valued via a comma list**:
39//!   `"alpha"` is one set, `"alpha,beta"` is both. Names are trimmed of
40//!   surrounding whitespace; empty names are dropped. A document with no
41//!   `document_set` belongs to no named set (so a set-scoped query never
42//!   surfaces it, but an unscoped query — `document_sets: None` — still does).
43//! - **`boost`** — a parsed `f32` multiplier on the similarity score, default
44//!   **1.0**. Absent or malformed (`"abc"`, `""`, `NaN`, non-finite) ⇒ `1.0`, so
45//!   a bad stamp can never silently zero out or explode a document's ranking.
46//!   Negative boosts are clamped to `0.0` (a curator can bury a doc, never invert
47//!   ordering). `boost > 1.0` promotes; `0.0 ≤ boost < 1.0` demotes.
48//!
49//! ## Composition with ACL
50//!
51//! A [`CuratedKnowledgeStore`] also records [`DocAcl`](crate::access_control::DocAcl)s
52//! at ingest (same `acl_v2` key) and its reader takes an
53//! [`AccessContext`](crate::access_control::AccessContext) alongside the
54//! [`RetrievalFilter`]. **Both filters apply (logical AND)**: a result is
55//! returned only if the requester is entitled to it (ACL) *and* it is in the
56//! requested sets *and* it matches the metadata equalities. ACL is checked first
57//! (a curation filter must never widen what a requester can see).
58
59use std::collections::HashMap;
60use std::sync::{Arc, RwLock};
61
62use serde::{Deserialize, Serialize};
63use smooth_operator_core::{Document, KnowledgeBase, KnowledgeResult};
64
65use crate::access_control::{AccessContext, DocAcl};
66
/// Over-fetch multiplier: the inner backend is queried for `limit * this` (with
/// a floor of [`OVERFETCH_FLOOR`]) candidates so that, after dropping
/// non-matching documents and re-ranking by boost, the post-filter top-K is
/// still full whenever enough matching documents fall *inside the over-fetched
/// candidate pool*. Matches ranked below the pool by the inner backend are
/// never seen, so this is a heuristic, not a guarantee. Mirrors the ACL
/// reader's over-fetch.
const OVERFETCH_FACTOR: usize = 5;

/// Lower bound on the candidate pool, so a small `limit` still over-fetches
/// enough to survive filtering + boost re-ranking (the reader asks the inner
/// backend for `max(limit * OVERFETCH_FACTOR, OVERFETCH_FLOOR)` candidates).
const OVERFETCH_FLOOR: usize = 20;

/// The default boost applied to a document with no (or a malformed) `boost`
/// metadata value: a no-op multiplier that preserves the raw similarity score.
/// Also used for query results whose document has no recorded [`DocMeta`].
pub const DEFAULT_BOOST: f32 = 1.0;
80
/// Curation metadata recorded for a stored document: its document-set
/// membership and its retrieval boost, plus the raw metadata map so the
/// [`RetrievalFilter`]'s `metadata_eq` equalities can be evaluated at read.
///
/// Parsed from a [`Document`]'s metadata at ingest by [`DocMeta::from_document`].
///
/// All fields are public, so a hand-built value (e.g. one passed to
/// `CuratedKnowledgeStore::record_meta`) is stored as-is: the reader multiplies
/// scores by `boost` without re-validating it. Only the `parse_*` constructors
/// apply the trimming / clamping described on each field.
#[derive(Debug, Clone, PartialEq)]
pub struct DocMeta {
    /// The named sets this document belongs to (parsed from the comma-separated
    /// `document_set` metadata value; empty when the document is in no set).
    pub document_sets: Vec<String>,
    /// The retrieval boost multiplier (parsed from the `boost` metadata value;
    /// [`DEFAULT_BOOST`] when absent/malformed). Clamped to `≥ 0.0` when
    /// produced by [`DocMeta::parse_boost`].
    pub boost: f32,
    /// The full stamped metadata map (a clone of the document's metadata),
    /// retained so `metadata_eq` filters can test arbitrary key/value
    /// equalities against it.
    pub metadata: HashMap<String, String>,
}
98
99impl DocMeta {
100    /// The document-metadata key under which set membership is stamped. The
101    /// value is a **comma-separated** list (e.g. `"alpha"` or `"alpha,beta"`).
102    pub const DOCUMENT_SET_KEY: &'static str = "document_set";
103
104    /// The document-metadata key under which the numeric boost is stamped (a
105    /// stringified `f32`, e.g. `"3.0"`).
106    pub const BOOST_KEY: &'static str = "boost";
107
108    /// Parse the document-set list from a `document_set` metadata value:
109    /// comma-split, trim each name, drop empties. Returns an empty vec when the
110    /// key is absent or holds only whitespace/commas.
111    #[must_use]
112    pub fn parse_sets(metadata: &HashMap<String, String>) -> Vec<String> {
113        metadata
114            .get(Self::DOCUMENT_SET_KEY)
115            .map(|raw| {
116                raw.split(',')
117                    .map(str::trim)
118                    .filter(|s| !s.is_empty())
119                    .map(ToString::to_string)
120                    .collect()
121            })
122            .unwrap_or_default()
123    }
124
125    /// Parse the boost from a `boost` metadata value. Absent / unparseable /
126    /// non-finite ⇒ [`DEFAULT_BOOST`]; a parsed value is clamped to `≥ 0.0` so a
127    /// negative boost can only bury a document, never invert ordering.
128    #[must_use]
129    pub fn parse_boost(metadata: &HashMap<String, String>) -> f32 {
130        metadata
131            .get(Self::BOOST_KEY)
132            .and_then(|raw| raw.trim().parse::<f32>().ok())
133            .filter(|b| b.is_finite())
134            .map(|b| b.max(0.0))
135            .unwrap_or(DEFAULT_BOOST)
136    }
137
138    /// Build a [`DocMeta`] from a stored document's metadata.
139    #[must_use]
140    pub fn from_metadata(metadata: &HashMap<String, String>) -> Self {
141        Self {
142            document_sets: Self::parse_sets(metadata),
143            boost: Self::parse_boost(metadata),
144            metadata: metadata.clone(),
145        }
146    }
147
148    /// Build a [`DocMeta`] from a [`Document`] (convenience over
149    /// [`from_metadata`](Self::from_metadata)).
150    #[must_use]
151    pub fn from_document(doc: &Document) -> Self {
152        Self::from_metadata(&doc.metadata)
153    }
154
155    /// Whether this document belongs to the named set.
156    #[must_use]
157    pub fn in_set(&self, set: &str) -> bool {
158        self.document_sets.iter().any(|s| s == set)
159    }
160}
161
162/// Stamp a document-set membership onto a [`Document`]'s metadata (builder).
163///
164/// Multi-valued: pass several names to tag the document into all of them (stored
165/// as the comma-separated `document_set` value the [`CuratedKnowledgeStore`]
166/// parses). This is how a connector / ingest config tags a repo's docs into a
167/// named set (e.g. dev-support tags every doc from `acme/app` into set
168/// `"acme/app"`).
169#[must_use]
170pub fn with_document_set<I, S>(doc: Document, sets: I) -> Document
171where
172    I: IntoIterator<Item = S>,
173    S: Into<String>,
174{
175    let joined = sets
176        .into_iter()
177        .map(Into::into)
178        .filter(|s| !s.trim().is_empty())
179        .collect::<Vec<_>>()
180        .join(",");
181    if joined.is_empty() {
182        doc
183    } else {
184        doc.with_metadata(DocMeta::DOCUMENT_SET_KEY, joined)
185    }
186}
187
188/// Stamp a numeric boost onto a [`Document`]'s metadata (builder). A non-finite
189/// or negative value is normalized so the stamp is always a sane, parseable
190/// multiplier.
191#[must_use]
192pub fn with_boost(doc: Document, boost: f32) -> Document {
193    let boost = if boost.is_finite() {
194        boost.max(0.0)
195    } else {
196        DEFAULT_BOOST
197    };
198    doc.with_metadata(DocMeta::BOOST_KEY, format!("{boost}"))
199}
200
/// A query-time retrieval filter: scope results to named document sets and/or
/// require metadata equalities.
///
/// - `document_sets: None` ⇒ **no set scoping** (every document is eligible —
///   the current/default behavior). `Some([])` ⇒ scope to *no* set (matches
///   nothing); `Some(["alpha"])` ⇒ only documents in set `"alpha"`; a doc in
///   **any** of the listed sets matches (union). Set names are compared
///   exactly as given (case-sensitive, not trimmed).
/// - `metadata_eq` ⇒ every `(key, value)` must be present and equal in the
///   document's stamped metadata (logical AND across the map). Empty ⇒ no
///   metadata constraint.
///
/// An all-default `RetrievalFilter` ([`RetrievalFilter::none`]) matches every
/// document — the no-op that preserves current retrieval behavior.
///
/// Both fields are `#[serde(default)]`, so a serialized filter may omit either
/// one and still deserialize (to `None` / an empty map respectively).
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct RetrievalFilter {
    /// Scope to documents in any of these sets. `None` ⇒ unscoped (all docs).
    #[serde(default)]
    pub document_sets: Option<Vec<String>>,
    /// Require these `key == value` metadata equalities (all must hold).
    #[serde(default)]
    pub metadata_eq: HashMap<String, String>,
}
223
224impl RetrievalFilter {
225    /// The no-op filter: no set scoping, no metadata constraint — matches every
226    /// document (preserves default retrieval behavior).
227    #[must_use]
228    pub fn none() -> Self {
229        Self::default()
230    }
231
232    /// Scope retrieval to the given document sets (a doc in any of them matches).
233    #[must_use]
234    pub fn in_sets<I, S>(sets: I) -> Self
235    where
236        I: IntoIterator<Item = S>,
237        S: Into<String>,
238    {
239        Self {
240            document_sets: Some(sets.into_iter().map(Into::into).collect()),
241            metadata_eq: HashMap::new(),
242        }
243    }
244
245    /// Add a required metadata equality (builder).
246    #[must_use]
247    pub fn with_metadata_eq(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
248        self.metadata_eq.insert(key.into(), value.into());
249        self
250    }
251
252    /// Whether this filter imposes no constraint at all (so retrieval is
253    /// unchanged). Used to short-circuit the over-fetch when there's nothing to
254    /// filter on.
255    #[must_use]
256    pub fn is_unconstrained(&self) -> bool {
257        self.document_sets.is_none() && self.metadata_eq.is_empty()
258    }
259
260    /// Whether a document with the given [`DocMeta`] passes this filter.
261    ///
262    /// True when (a) `document_sets` is `None` *or* the doc is in at least one of
263    /// the listed sets, **and** (b) every `metadata_eq` entry is present and
264    /// equal in the doc's metadata.
265    #[must_use]
266    pub fn matches(&self, meta: &DocMeta) -> bool {
267        if let Some(sets) = &self.document_sets {
268            if !sets.iter().any(|s| meta.in_set(s)) {
269                return false;
270            }
271        }
272        self.metadata_eq
273            .iter()
274            .all(|(k, v)| meta.metadata.get(k).is_some_and(|mv| mv == v))
275    }
276}
277
/// Side table mapping a stored `document_id` to its [`DocMeta`]. Shared (`Arc`)
/// between the ingest handle that populates it and every per-request reader;
/// the `RwLock` lets readers look entries up concurrently while ingest takes
/// the write side.
type MetaTable = Arc<RwLock<HashMap<String, DocMeta>>>;

/// Side table mapping a stored `document_id` to its [`DocAcl`] (same role as in
/// [`access_control`](crate::access_control)) so a [`CuratedKnowledgeStore`]
/// enforces ACL ∧ curation in one read pass. A document with no entry here is
/// treated by the reader as having no ACL restriction.
type AclTable = Arc<RwLock<HashMap<String, DocAcl>>>;
286
/// A curation-aware knowledge store: wraps any inner
/// [`KnowledgeBase`](smooth_operator_core::KnowledgeBase), records each
/// document's [`DocMeta`] (set membership / boost / metadata) **and** its
/// [`DocAcl`] at ingest, and mints readers that apply a [`RetrievalFilter`] and
/// [`AccessContext`] together at read time.
///
/// Like [`AclKnowledgeStore`](crate::access_control::AclKnowledgeStore), it does
/// not itself implement `KnowledgeBase` for reading (reads must be bound to a
/// filter + requester). Instead:
/// - [`ingest_handle`](Self::ingest_handle) returns an `Arc<dyn KnowledgeBase>`
///   that records curation metadata + ACL as it ingests;
/// - [`reader`](Self::reader) mints a filtering/boosting `Arc<dyn KnowledgeBase>`
///   bound to a [`RetrievalFilter`] + [`AccessContext`].
///
/// Cloning the store is cheap and shares state: every field is an `Arc`, so
/// clones (and all handles minted from them) see the same backend and the same
/// side tables.
#[derive(Clone)]
pub struct CuratedKnowledgeStore {
    // The wrapped backend that actually stores and searches documents.
    inner: Arc<dyn KnowledgeBase>,
    // document_id → curation metadata, populated at ingest / via `record_meta`.
    meta: MetaTable,
    // document_id → ACL, populated at ingest when the document carries one.
    acls: AclTable,
}
306
307impl CuratedKnowledgeStore {
308    /// Wrap an inner knowledge base. The store starts with empty side tables;
309    /// every document ingested through [`ingest_handle`](Self::ingest_handle) has
310    /// its [`DocMeta`] (and [`DocAcl`], if stamped) recorded.
311    #[must_use]
312    pub fn new(inner: Arc<dyn KnowledgeBase>) -> Self {
313        Self {
314            inner,
315            meta: Arc::new(RwLock::new(HashMap::new())),
316            acls: Arc::new(RwLock::new(HashMap::new())),
317        }
318    }
319
320    /// An ingest-side handle: a [`KnowledgeBase`] whose `ingest` records the
321    /// document's [`DocMeta`] (always) and [`DocAcl`] (if stamped) in the shared
322    /// side tables, then forwards to the inner backend. Its `query` is the
323    /// **unfiltered** inner query — production reads use [`reader`](Self::reader).
324    #[must_use]
325    pub fn ingest_handle(&self) -> Arc<dyn KnowledgeBase> {
326        Arc::new(CuratedIngestHandle {
327            inner: Arc::clone(&self.inner),
328            meta: Arc::clone(&self.meta),
329            acls: Arc::clone(&self.acls),
330        })
331    }
332
333    /// A read-side handle bound to a [`RetrievalFilter`] + [`AccessContext`]: a
334    /// [`KnowledgeBase`] whose `query` over-fetches from the inner backend, drops
335    /// every result the requester is not entitled to (ACL) or that does not match
336    /// the filter (sets/metadata), multiplies each survivor's score by its boost,
337    /// re-sorts, and truncates to the requested limit.
338    ///
339    /// Pass [`RetrievalFilter::none`] + [`AccessContext::anonymous`] for the
340    /// unscoped, ACL-default path (boost still applies — a boosted doc still
341    /// re-ranks).
342    #[must_use]
343    pub fn reader(&self, filter: RetrievalFilter, access: AccessContext) -> Arc<dyn KnowledgeBase> {
344        Arc::new(CuratedReader {
345            inner: Arc::clone(&self.inner),
346            meta: Arc::clone(&self.meta),
347            acls: Arc::clone(&self.acls),
348            filter,
349            access,
350        })
351    }
352
353    /// Record `document_id → meta` directly (without ingesting a document) — for
354    /// callers that store documents through some other path but still want the
355    /// curation filter/boost applied at read.
356    ///
357    /// # Errors
358    /// Returns an error if the metadata table lock is poisoned.
359    pub fn record_meta(&self, document_id: impl Into<String>, meta: DocMeta) -> anyhow::Result<()> {
360        let mut table = self
361            .meta
362            .write()
363            .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
364        table.insert(document_id.into(), meta);
365        Ok(())
366    }
367}
368
/// Records curation metadata + ACL at ingest, forwarding documents to the inner
/// backend. Holds `Arc` clones of the owning [`CuratedKnowledgeStore`]'s
/// backend and side tables, so what it records is visible to every reader
/// minted from that store.
struct CuratedIngestHandle {
    // Backend that receives every ingested document and serves raw queries.
    inner: Arc<dyn KnowledgeBase>,
    // Shared document_id → DocMeta table written on each ingest.
    meta: MetaTable,
    // Shared document_id → DocAcl table written when a document carries an ACL.
    acls: AclTable,
}
376
377/// Shared ingest bookkeeping: record a document's [`DocMeta`] (always) and
378/// [`DocAcl`] (when stamped) into the side tables before forwarding it.
379fn record_ingest_metadata(meta: &MetaTable, acls: &AclTable, doc: &Document) -> anyhow::Result<()> {
380    {
381        let mut table = meta
382            .write()
383            .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
384        table.insert(doc.id.clone(), DocMeta::from_document(doc));
385    }
386    if let Some(acl) = DocAcl::from_metadata(&doc.metadata) {
387        let mut table = acls
388            .write()
389            .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
390        table.insert(doc.id.clone(), acl);
391    }
392    Ok(())
393}
394
395impl KnowledgeBase for CuratedIngestHandle {
396    fn ingest(&self, doc: Document) -> anyhow::Result<()> {
397        record_ingest_metadata(&self.meta, &self.acls, &doc)?;
398        self.inner.ingest(doc)
399    }
400
401    fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
402        self.inner.query(query, limit)
403    }
404}
405
/// Filters + boosts query results by a bound [`RetrievalFilter`] +
/// [`AccessContext`]. One reader is minted per (filter, requester) pair; it
/// shares the backend and side tables with the store that created it.
struct CuratedReader {
    // Backend queried for the over-fetched candidate pool.
    inner: Arc<dyn KnowledgeBase>,
    // Shared document_id → DocMeta table consulted for set/metadata/boost.
    meta: MetaTable,
    // Shared document_id → DocAcl table consulted for the ACL check.
    acls: AclTable,
    // The set / metadata constraints every returned result must satisfy.
    filter: RetrievalFilter,
    // The requester on whose behalf results are ACL-checked.
    access: AccessContext,
}
415
416impl KnowledgeBase for CuratedReader {
417    fn ingest(&self, doc: Document) -> anyhow::Result<()> {
418        // A reader can still ingest (recording metadata + ACL), so the same
419        // handle is usable end to end in tests — production ingest uses
420        // ingest_handle.
421        record_ingest_metadata(&self.meta, &self.acls, &doc)?;
422        self.inner.ingest(doc)
423    }
424
425    fn query(&self, query: &str, limit: usize) -> anyhow::Result<Vec<KnowledgeResult>> {
426        if limit == 0 {
427            return Ok(Vec::new());
428        }
429
430        // Over-fetch so the post-filter, post-boost top-K is full whenever
431        // enough matching documents exist.
432        let candidate_n = limit.saturating_mul(OVERFETCH_FACTOR).max(OVERFETCH_FLOOR);
433        let candidates = self.inner.query(query, candidate_n)?;
434
435        let meta_table = self
436            .meta
437            .read()
438            .map_err(|e| anyhow::anyhow!("curation meta table lock poisoned: {e}"))?;
439        let acl_table = self
440            .acls
441            .read()
442            .map_err(|e| anyhow::anyhow!("acl table lock poisoned: {e}"))?;
443
444        let mut kept: Vec<KnowledgeResult> = Vec::with_capacity(candidates.len());
445        for mut result in candidates {
446            // ACL first: a curation filter must never widen what a requester can
447            // see. No recorded ACL ⇒ org-public (backward-compatible default).
448            let acl_ok = match acl_table.get(&result.document_id) {
449                Some(acl) => self.access.can_access(acl),
450                None => true,
451            };
452            if !acl_ok {
453                continue;
454            }
455
456            // Then the curation filter (sets + metadata). A document with no
457            // recorded DocMeta is treated as an empty DocMeta: it belongs to no
458            // set (so a set-scoped query skips it) and has the default boost.
459            let doc_meta = meta_table.get(&result.document_id).cloned();
460            let meta_for_match = doc_meta.clone().unwrap_or_else(|| DocMeta {
461                document_sets: Vec::new(),
462                boost: DEFAULT_BOOST,
463                metadata: HashMap::new(),
464            });
465            if !self.filter.matches(&meta_for_match) {
466                continue;
467            }
468
469            // Apply the boost to the score before re-ranking.
470            result.score *= meta_for_match.boost;
471            kept.push(result);
472        }
473
474        // Re-sort by the boosted score (descending), then truncate to K. A
475        // stable, total order: NaN-safe via `total_cmp`.
476        kept.sort_by(|a, b| b.score.total_cmp(&a.score));
477        kept.truncate(limit);
478        Ok(kept)
479    }
480}
481
/// Unit tests for the curation layer: metadata parsing, the filter predicate,
/// serde round-tripping, and store-level ingest → read behavior.
#[cfg(test)]
mod tests {
    use super::*;
    use smooth_operator_core::DocumentType;

    /// Build a documentation-type test document with a fixed id and the given
    /// content, so assertions can refer to documents by id.
    fn doc(id: &str, content: &str) -> Document {
        let mut d = Document::new(content, "s", DocumentType::Documentation);
        d.id = id.to_string();
        d
    }

    // ---- DocMeta::parse_sets --------------------------------------------

    /// A stamp written by `with_document_set` parses back to the same names,
    /// in order, for both one and several sets.
    #[test]
    fn parse_sets_single_and_multi() {
        let d = with_document_set(doc("a", "x"), ["alpha"]);
        assert_eq!(DocMeta::parse_sets(&d.metadata), vec!["alpha".to_string()]);

        let d = with_document_set(doc("b", "x"), ["alpha", "beta"]);
        assert_eq!(
            DocMeta::parse_sets(&d.metadata),
            vec!["alpha".to_string(), "beta".to_string()]
        );
    }

    /// Surrounding whitespace is trimmed and empty segments (including a
    /// trailing comma) are dropped.
    #[test]
    fn parse_sets_trims_and_drops_empties() {
        let d = doc("c", "x").with_metadata(DocMeta::DOCUMENT_SET_KEY, " alpha , , beta ,");
        assert_eq!(
            DocMeta::parse_sets(&d.metadata),
            vec!["alpha".to_string(), "beta".to_string()]
        );
    }

    /// No `document_set` key ⇒ the document is in no set.
    #[test]
    fn parse_sets_absent_is_empty() {
        let d = doc("d", "x");
        assert!(DocMeta::parse_sets(&d.metadata).is_empty());
    }

    // ---- DocMeta::parse_boost (boost math: default + malformed → 1.0) ----

    /// No `boost` key ⇒ the default multiplier.
    #[test]
    fn parse_boost_default_when_absent() {
        let d = doc("e", "x");
        assert_eq!(DocMeta::parse_boost(&d.metadata), DEFAULT_BOOST);
    }

    /// A boost written by `with_boost` parses back to the same value.
    #[test]
    fn parse_boost_parses_valid() {
        let d = with_boost(doc("f", "x"), 3.0);
        assert!((DocMeta::parse_boost(&d.metadata) - 3.0).abs() < f32::EPSILON);
    }

    /// Unparseable, blank, and non-finite stamps all fall back to the default.
    #[test]
    fn parse_boost_malformed_falls_back_to_default() {
        for bad in ["abc", "", "  ", "NaN", "inf", "1.2.3"] {
            let d = doc("g", "x").with_metadata(DocMeta::BOOST_KEY, bad);
            assert_eq!(
                DocMeta::parse_boost(&d.metadata),
                DEFAULT_BOOST,
                "malformed boost {bad:?} must fall back to default"
            );
        }
    }

    /// A negative stamp is clamped to zero rather than inverting the ranking.
    #[test]
    fn parse_boost_negative_is_clamped_to_zero() {
        let d = doc("h", "x").with_metadata(DocMeta::BOOST_KEY, "-2.0");
        assert_eq!(DocMeta::parse_boost(&d.metadata), 0.0);
    }

    #[test]
    fn with_boost_normalizes_non_finite() {
        // A non-finite boost passed to the builder is normalized to the default.
        let d = with_boost(doc("i", "x"), f32::NAN);
        assert_eq!(DocMeta::parse_boost(&d.metadata), DEFAULT_BOOST);
        let d = with_boost(doc("j", "x"), f32::INFINITY);
        assert_eq!(DocMeta::parse_boost(&d.metadata), DEFAULT_BOOST);
    }

    // ---- RetrievalFilter::matches (the filter predicate) ----------------

    /// Hand-build a `DocMeta` from set names, a boost, and key/value pairs,
    /// bypassing the parsers so the predicate is tested in isolation.
    fn meta(sets: &[&str], boost: f32, kv: &[(&str, &str)]) -> DocMeta {
        let mut metadata = HashMap::new();
        for (k, v) in kv {
            metadata.insert((*k).to_string(), (*v).to_string());
        }
        DocMeta {
            document_sets: sets.iter().map(ToString::to_string).collect(),
            boost,
            metadata,
        }
    }

    /// The no-op filter reports itself unconstrained and accepts any document.
    #[test]
    fn unconstrained_filter_matches_everything() {
        let f = RetrievalFilter::none();
        assert!(f.is_unconstrained());
        assert!(f.matches(&meta(&[], 1.0, &[])));
        assert!(f.matches(&meta(&["alpha"], 1.0, &[("kind", "code")])));
    }

    /// Scoping to one set admits only its members.
    #[test]
    fn set_scope_matches_only_members() {
        let f = RetrievalFilter::in_sets(["alpha"]);
        assert!(!f.is_unconstrained());
        assert!(f.matches(&meta(&["alpha"], 1.0, &[])));
        assert!(f.matches(&meta(&["alpha", "beta"], 1.0, &[]))); // multi-set member
        assert!(!f.matches(&meta(&["beta"], 1.0, &[])));
        assert!(!f.matches(&meta(&[], 1.0, &[]))); // in no set
    }

    /// Several listed sets are a union: membership in any one suffices.
    #[test]
    fn set_scope_union_across_listed_sets() {
        let f = RetrievalFilter::in_sets(["alpha", "gamma"]);
        assert!(f.matches(&meta(&["gamma"], 1.0, &[])));
        assert!(f.matches(&meta(&["alpha"], 1.0, &[])));
        assert!(!f.matches(&meta(&["beta"], 1.0, &[])));
    }

    /// `Some([])` is "scope to no set", which is distinct from `None` and
    /// matches nothing.
    #[test]
    fn empty_set_list_matches_nothing() {
        let f = RetrievalFilter {
            document_sets: Some(vec![]),
            metadata_eq: HashMap::new(),
        };
        assert!(!f.matches(&meta(&["alpha"], 1.0, &[])));
        assert!(!f.matches(&meta(&[], 1.0, &[])));
    }

    /// Every metadata equality must hold; a missing key or a differing value
    /// fails the match.
    #[test]
    fn metadata_eq_requires_all_equalities() {
        let f = RetrievalFilter::none()
            .with_metadata_eq("kind", "prose")
            .with_metadata_eq("lang", "en");
        assert!(f.matches(&meta(&[], 1.0, &[("kind", "prose"), ("lang", "en")])));
        // Missing one key.
        assert!(!f.matches(&meta(&[], 1.0, &[("kind", "prose")])));
        // Wrong value.
        assert!(!f.matches(&meta(&[], 1.0, &[("kind", "code"), ("lang", "en")])));
    }

    /// Set scope and metadata equalities combine with logical AND.
    #[test]
    fn set_and_metadata_compose_with_and() {
        let f = RetrievalFilter::in_sets(["alpha"]).with_metadata_eq("kind", "prose");
        assert!(f.matches(&meta(&["alpha"], 1.0, &[("kind", "prose")])));
        assert!(!f.matches(&meta(&["alpha"], 1.0, &[("kind", "code")]))); // set ok, meta no
        assert!(!f.matches(&meta(&["beta"], 1.0, &[("kind", "prose")]))); // meta ok, set no
    }

    // ---- RetrievalFilter round-trips through serde ----------------------

    /// Serializing to JSON and back yields an equal filter.
    #[test]
    fn filter_round_trips_through_json() {
        let f = RetrievalFilter::in_sets(["alpha", "beta"]).with_metadata_eq("kind", "prose");
        let json = serde_json::to_string(&f).expect("serialize");
        let parsed: RetrievalFilter = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(parsed, f);
    }

    // ---- store-level: ingest records DocMeta, reader applies it ---------

    /// A curated store over the in-memory backend.
    fn curated_store() -> CuratedKnowledgeStore {
        CuratedKnowledgeStore::new(Arc::new(smooth_operator_core::InMemoryKnowledge::new()))
    }

    /// With no filter and an anonymous requester, both the set-tagged and the
    /// untagged document are returned. (Only membership is asserted here; the
    /// boosted ordering itself is not checked by this test.)
    #[test]
    fn reader_with_no_filter_returns_all_with_boost_applied() {
        let store = curated_store();
        let h = store.ingest_handle();
        h.ingest(with_document_set(
            doc("a", "clearance alpha fact"),
            ["alpha"],
        ))
        .unwrap();
        h.ingest(doc("plain", "clearance plain fact")).unwrap();

        // Unconstrained filter: both come back.
        let r = store.reader(RetrievalFilter::none(), AccessContext::anonymous());
        let ids: Vec<String> = r
            .query("clearance", 10)
            .unwrap()
            .into_iter()
            .map(|x| x.document_id)
            .collect();
        assert!(ids.contains(&"a".to_string()));
        assert!(ids.contains(&"plain".to_string()));
    }

    #[test]
    fn malformed_boost_metadata_yields_default_boost_at_read() {
        let store = curated_store();
        let h = store.ingest_handle();
        // A doc whose boost stamp is garbage must not vanish or explode — it
        // ranks with the default 1.0 boost.
        h.ingest(doc("bad", "clearance fact").with_metadata(DocMeta::BOOST_KEY, "not-a-number"))
            .unwrap();
        let r = store.reader(RetrievalFilter::none(), AccessContext::anonymous());
        let hits = r.query("clearance", 10).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].document_id, "bad");
    }
}