Skip to main content

cortex_retrieval/
lexical.rs

1//! Minimal in-memory lexical retrieval over accepted memory documents.
2
3use std::collections::HashSet;
4
5use cortex_core::{CortexError, CortexResult, MemoryId};
6
7/// Read-only document shape consumed by lexical retrieval.
8///
9/// The store integration can populate this from active/accepted memory rows once
10/// a read API exists. The retrieval layer itself does not mutate memory.
11#[derive(Debug, Clone, PartialEq, Eq)]
12pub struct LexicalDocument {
13    /// Memory identifier.
14    pub id: MemoryId,
15    /// Durable memory claim text.
16    pub claim: String,
17    /// Domain tags attached to the memory.
18    pub domains: Vec<String>,
19}
20
21impl LexicalDocument {
22    /// Creates a lexical document from accepted memory fields.
23    #[must_use]
24    pub fn accepted_memory(id: MemoryId, claim: impl Into<String>, domains: Vec<String>) -> Self {
25        Self {
26            id,
27            claim: claim.into(),
28            domains,
29        }
30    }
31}
32
33/// In-memory lexical index.
34#[derive(Debug, Clone, Default, PartialEq, Eq)]
35pub struct LexicalIndex {
36    documents: Vec<LexicalDocument>,
37}
38
39impl LexicalIndex {
40    /// Builds an index from already accepted memory documents.
41    #[must_use]
42    pub fn new(documents: Vec<LexicalDocument>) -> Self {
43        Self { documents }
44    }
45
46    /// Returns matching documents ranked by lexical overlap with `query`.
47    pub fn search(&self, query: &str) -> CortexResult<Vec<LexicalHit>> {
48        self.search_with_tag_filter(query, &[])
49    }
50
51    /// Returns matching documents ranked by lexical overlap with `query`,
52    /// restricted to documents whose `domains` tags carry every entry in
53    /// `required_tags` (AND semantics).
54    ///
55    /// The tag filter runs as a pre-pass: documents that fail the AND
56    /// closure are skipped before any lexical scoring work. An empty
57    /// `required_tags` slice disables filtering and yields the same hits
58    /// as [`Self::search`]. Duplicate tag inputs are coalesced.
59    pub fn search_with_tag_filter(
60        &self,
61        query: &str,
62        required_tags: &[String],
63    ) -> CortexResult<Vec<LexicalHit>> {
64        if query.trim().is_empty() {
65            return Err(CortexError::Validation(
66                "search query must not be empty".into(),
67            ));
68        }
69        let query_terms = tokenize_unique(query);
70        if query_terms.is_empty() {
71            // Query has content but no ASCII-alphanumeric tokens (e.g. a
72            // Unicode-only string like "μνήμη"). The ASCII-tokenized index
73            // cannot match anything, so return no results rather than an error.
74            return Ok(Vec::new());
75        }
76
77        let mut unique_tags: Vec<String> = Vec::with_capacity(required_tags.len());
78        for tag in required_tags {
79            if !unique_tags.iter().any(|existing| existing == tag) {
80                unique_tags.push(tag.clone());
81            }
82        }
83
84        let mut hits: Vec<_> = self
85            .documents
86            .iter()
87            .filter(|document| document_carries_all_tags(document, &unique_tags))
88            .filter_map(|document| lexical_hit(document, &query_terms))
89            .collect();
90        hits.sort_by(|left, right| {
91            right
92                .explanation
93                .lexical_match
94                .total_cmp(&left.explanation.lexical_match)
95                .then_with(|| {
96                    left.document
97                        .id
98                        .to_string()
99                        .cmp(&right.document.id.to_string())
100                })
101        });
102        Ok(hits)
103    }
104}
105
106fn document_carries_all_tags(document: &LexicalDocument, required: &[String]) -> bool {
107    if required.is_empty() {
108        return true;
109    }
110    required
111        .iter()
112        .all(|tag| document.domains.iter().any(|domain| domain == tag))
113}
114
115/// A lexical search hit and its explanation.
116#[derive(Debug, Clone, PartialEq)]
117pub struct LexicalHit {
118    /// Matching document.
119    pub document: LexicalDocument,
120    /// Lexical match explanation.
121    pub explanation: LexicalExplanation,
122}
123
124/// Explanation for the lexical component of retrieval.
125#[derive(Debug, Clone, PartialEq)]
126pub struct LexicalExplanation {
127    /// Normalized lexical match in `[0, 1]`.
128    pub lexical_match: f32,
129    /// Distinct normalized query terms used for matching.
130    pub query_terms: Vec<String>,
131    /// Query terms found in the memory claim.
132    pub matched_claim_terms: Vec<String>,
133    /// Query terms found in the memory domain tags.
134    pub matched_domain_terms: Vec<String>,
135    /// Distinct query terms found in either indexed field.
136    pub matched_terms: Vec<String>,
137}
138
139fn lexical_hit(document: &LexicalDocument, query_terms: &[String]) -> Option<LexicalHit> {
140    let claim_terms: HashSet<_> = tokenize_unique(&document.claim).into_iter().collect();
141    let domain_terms: HashSet<_> = document
142        .domains
143        .iter()
144        .flat_map(|domain| tokenize_unique(domain))
145        .collect();
146
147    let mut matched_claim_terms = Vec::new();
148    let mut matched_domain_terms = Vec::new();
149    let mut matched_terms = Vec::new();
150
151    for term in query_terms {
152        let claim_match = claim_terms.contains(term);
153        let domain_match = domain_terms.contains(term);
154        if claim_match {
155            matched_claim_terms.push(term.clone());
156        }
157        if domain_match {
158            matched_domain_terms.push(term.clone());
159        }
160        if claim_match || domain_match {
161            matched_terms.push(term.clone());
162        }
163    }
164
165    if matched_terms.is_empty() {
166        return None;
167    }
168
169    let lexical_match = matched_terms.len() as f32 / query_terms.len() as f32;
170    Some(LexicalHit {
171        document: document.clone(),
172        explanation: LexicalExplanation {
173            lexical_match,
174            query_terms: query_terms.to_vec(),
175            matched_claim_terms,
176            matched_domain_terms,
177            matched_terms,
178        },
179    })
180}
181
182fn tokenize_unique(text: &str) -> Vec<String> {
183    let mut seen = HashSet::new();
184    let mut terms = Vec::new();
185    for token in text
186        .split(|character: char| !character.is_ascii_alphanumeric())
187        .map(str::trim)
188        .filter(|token| !token.is_empty())
189        .map(str::to_ascii_lowercase)
190    {
191        if seen.insert(token.clone()) {
192            terms.push(token);
193        }
194    }
195    terms
196}
197
198#[cfg(test)]
199mod tests {
200    use super::*;
201
202    #[test]
203    fn insert_and_query_returns_expected_hit() {
204        let expected = LexicalDocument::accepted_memory(
205            MemoryId::new(),
206            "SQLite repositories preserve Cortex rows",
207            vec!["store".into(), "retrieval".into()],
208        );
209        let unrelated = LexicalDocument::accepted_memory(
210            MemoryId::new(),
211            "Agent prompts require redaction before export",
212            vec!["privacy".into()],
213        );
214        let index = LexicalIndex::new(vec![unrelated, expected.clone()]);
215
216        let hits = index.search("sqlite retrieval").expect("search succeeds");
217
218        assert_eq!(hits.len(), 1);
219        assert_eq!(hits[0].document, expected);
220        assert_eq!(hits[0].explanation.lexical_match, 1.0);
221        assert_eq!(hits[0].explanation.query_terms, ["sqlite", "retrieval"]);
222        assert_eq!(hits[0].explanation.matched_claim_terms, ["sqlite"]);
223        assert_eq!(hits[0].explanation.matched_domain_terms, ["retrieval"]);
224    }
225
226    #[test]
227    fn search_orders_by_lexical_match() {
228        let strongest = LexicalDocument::accepted_memory(
229            MemoryId::new(),
230            "memory search explains lexical salience scoring",
231            vec!["retrieval".into()],
232        );
233        let weaker = LexicalDocument::accepted_memory(
234            MemoryId::new(),
235            "memory lifecycle accepts durable claims",
236            vec!["memory".into()],
237        );
238        let index = LexicalIndex::new(vec![weaker.clone(), strongest.clone()]);
239
240        let hits = index
241            .search("memory lexical salience")
242            .expect("search succeeds");
243
244        assert_eq!(hits[0].document, strongest);
245        assert_eq!(hits[0].explanation.lexical_match, 1.0);
246        assert_eq!(hits[1].document, weaker);
247        assert_eq!(hits[1].explanation.lexical_match, 1.0 / 3.0);
248    }
249
250    #[test]
251    fn empty_query_is_validation_error() {
252        let index = LexicalIndex::default();
253
254        let err = index.search(" \n\t ").unwrap_err();
255        assert!(
256            err.to_string().contains("must not be empty"),
257            "expected 'must not be empty' in error, got: {err}"
258        );
259    }
260
261    #[test]
262    fn unicode_only_query_returns_no_matches_not_error() {
263        let doc = LexicalDocument::accepted_memory(
264            MemoryId::new(),
265            "memory recall in Greek",
266            vec!["retrieval".into()],
267        );
268        let index = LexicalIndex::new(vec![doc]);
269
270        // A query composed entirely of non-ASCII Unicode should succeed with
271        // zero hits rather than returning a validation error.
272        let hits = index.search("μνήμη").expect("unicode query should not error");
273        assert!(hits.is_empty(), "expected no hits for Unicode-only query");
274    }
275
276    #[test]
277    fn tag_filter_excludes_documents_missing_required_tag() {
278        let with_rust = LexicalDocument::accepted_memory(
279            MemoryId::new(),
280            "rust memory matters",
281            vec!["rust".into(), "store".into()],
282        );
283        let without_rust = LexicalDocument::accepted_memory(
284            MemoryId::new(),
285            "memory matters in python",
286            vec!["python".into(), "store".into()],
287        );
288        let index = LexicalIndex::new(vec![with_rust.clone(), without_rust]);
289
290        let hits = index
291            .search_with_tag_filter("memory matters", &["rust".into()])
292            .expect("filtered search succeeds");
293
294        assert_eq!(hits.len(), 1);
295        assert_eq!(hits[0].document, with_rust);
296    }
297
298    #[test]
299    fn tag_filter_applies_and_semantics() {
300        let only_a = LexicalDocument::accepted_memory(
301            MemoryId::new(),
302            "claim mentions retrieval",
303            vec!["a".into()],
304        );
305        let only_b = LexicalDocument::accepted_memory(
306            MemoryId::new(),
307            "claim mentions retrieval too",
308            vec!["b".into()],
309        );
310        let both = LexicalDocument::accepted_memory(
311            MemoryId::new(),
312            "claim mentions retrieval thrice",
313            vec!["a".into(), "b".into()],
314        );
315        let index = LexicalIndex::new(vec![only_a, only_b, both.clone()]);
316
317        let hits = index
318            .search_with_tag_filter("retrieval", &["a".into(), "b".into()])
319            .expect("filtered search succeeds");
320
321        assert_eq!(hits.len(), 1);
322        assert_eq!(hits[0].document, both);
323    }
324
325    #[test]
326    fn tag_filter_empty_required_tags_matches_unfiltered_search() {
327        let document = LexicalDocument::accepted_memory(
328            MemoryId::new(),
329            "memory matters",
330            vec!["rust".into()],
331        );
332        let index = LexicalIndex::new(vec![document]);
333
334        let baseline = index.search("memory matters").expect("baseline search");
335        let filtered = index
336            .search_with_tag_filter("memory matters", &[])
337            .expect("filtered search");
338
339        assert_eq!(baseline, filtered);
340    }
341}