1use std::collections::HashSet;
4
5use cortex_core::{CortexError, CortexResult, MemoryId};
6
7#[derive(Debug, Clone, PartialEq, Eq)]
12pub struct LexicalDocument {
13 pub id: MemoryId,
15 pub claim: String,
17 pub domains: Vec<String>,
19}
20
21impl LexicalDocument {
22 #[must_use]
24 pub fn accepted_memory(id: MemoryId, claim: impl Into<String>, domains: Vec<String>) -> Self {
25 Self {
26 id,
27 claim: claim.into(),
28 domains,
29 }
30 }
31}
32
33#[derive(Debug, Clone, Default, PartialEq, Eq)]
35pub struct LexicalIndex {
36 documents: Vec<LexicalDocument>,
37}
38
39impl LexicalIndex {
40 #[must_use]
42 pub fn new(documents: Vec<LexicalDocument>) -> Self {
43 Self { documents }
44 }
45
46 pub fn search(&self, query: &str) -> CortexResult<Vec<LexicalHit>> {
48 self.search_with_tag_filter(query, &[])
49 }
50
51 pub fn search_with_tag_filter(
60 &self,
61 query: &str,
62 required_tags: &[String],
63 ) -> CortexResult<Vec<LexicalHit>> {
64 if query.trim().is_empty() {
65 return Err(CortexError::Validation(
66 "search query must not be empty".into(),
67 ));
68 }
69 let query_terms = tokenize_unique(query);
70 if query_terms.is_empty() {
71 return Ok(Vec::new());
75 }
76
77 let mut unique_tags: Vec<String> = Vec::with_capacity(required_tags.len());
78 for tag in required_tags {
79 if !unique_tags.iter().any(|existing| existing == tag) {
80 unique_tags.push(tag.clone());
81 }
82 }
83
84 let mut hits: Vec<_> = self
85 .documents
86 .iter()
87 .filter(|document| document_carries_all_tags(document, &unique_tags))
88 .filter_map(|document| lexical_hit(document, &query_terms))
89 .collect();
90 hits.sort_by(|left, right| {
91 right
92 .explanation
93 .lexical_match
94 .total_cmp(&left.explanation.lexical_match)
95 .then_with(|| {
96 left.document
97 .id
98 .to_string()
99 .cmp(&right.document.id.to_string())
100 })
101 });
102 Ok(hits)
103 }
104}
105
106fn document_carries_all_tags(document: &LexicalDocument, required: &[String]) -> bool {
107 if required.is_empty() {
108 return true;
109 }
110 required
111 .iter()
112 .all(|tag| document.domains.iter().any(|domain| domain == tag))
113}
114
115#[derive(Debug, Clone, PartialEq)]
117pub struct LexicalHit {
118 pub document: LexicalDocument,
120 pub explanation: LexicalExplanation,
122}
123
124#[derive(Debug, Clone, PartialEq)]
126pub struct LexicalExplanation {
127 pub lexical_match: f32,
129 pub query_terms: Vec<String>,
131 pub matched_claim_terms: Vec<String>,
133 pub matched_domain_terms: Vec<String>,
135 pub matched_terms: Vec<String>,
137}
138
139fn lexical_hit(document: &LexicalDocument, query_terms: &[String]) -> Option<LexicalHit> {
140 let claim_terms: HashSet<_> = tokenize_unique(&document.claim).into_iter().collect();
141 let domain_terms: HashSet<_> = document
142 .domains
143 .iter()
144 .flat_map(|domain| tokenize_unique(domain))
145 .collect();
146
147 let mut matched_claim_terms = Vec::new();
148 let mut matched_domain_terms = Vec::new();
149 let mut matched_terms = Vec::new();
150
151 for term in query_terms {
152 let claim_match = claim_terms.contains(term);
153 let domain_match = domain_terms.contains(term);
154 if claim_match {
155 matched_claim_terms.push(term.clone());
156 }
157 if domain_match {
158 matched_domain_terms.push(term.clone());
159 }
160 if claim_match || domain_match {
161 matched_terms.push(term.clone());
162 }
163 }
164
165 if matched_terms.is_empty() {
166 return None;
167 }
168
169 let lexical_match = matched_terms.len() as f32 / query_terms.len() as f32;
170 Some(LexicalHit {
171 document: document.clone(),
172 explanation: LexicalExplanation {
173 lexical_match,
174 query_terms: query_terms.to_vec(),
175 matched_claim_terms,
176 matched_domain_terms,
177 matched_terms,
178 },
179 })
180}
181
182fn tokenize_unique(text: &str) -> Vec<String> {
183 let mut seen = HashSet::new();
184 let mut terms = Vec::new();
185 for token in text
186 .split(|character: char| !character.is_ascii_alphanumeric())
187 .map(str::trim)
188 .filter(|token| !token.is_empty())
189 .map(str::to_ascii_lowercase)
190 {
191 if seen.insert(token.clone()) {
192 terms.push(token);
193 }
194 }
195 terms
196}
197
198#[cfg(test)]
199mod tests {
200 use super::*;
201
202 #[test]
203 fn insert_and_query_returns_expected_hit() {
204 let expected = LexicalDocument::accepted_memory(
205 MemoryId::new(),
206 "SQLite repositories preserve Cortex rows",
207 vec!["store".into(), "retrieval".into()],
208 );
209 let unrelated = LexicalDocument::accepted_memory(
210 MemoryId::new(),
211 "Agent prompts require redaction before export",
212 vec!["privacy".into()],
213 );
214 let index = LexicalIndex::new(vec![unrelated, expected.clone()]);
215
216 let hits = index.search("sqlite retrieval").expect("search succeeds");
217
218 assert_eq!(hits.len(), 1);
219 assert_eq!(hits[0].document, expected);
220 assert_eq!(hits[0].explanation.lexical_match, 1.0);
221 assert_eq!(hits[0].explanation.query_terms, ["sqlite", "retrieval"]);
222 assert_eq!(hits[0].explanation.matched_claim_terms, ["sqlite"]);
223 assert_eq!(hits[0].explanation.matched_domain_terms, ["retrieval"]);
224 }
225
226 #[test]
227 fn search_orders_by_lexical_match() {
228 let strongest = LexicalDocument::accepted_memory(
229 MemoryId::new(),
230 "memory search explains lexical salience scoring",
231 vec!["retrieval".into()],
232 );
233 let weaker = LexicalDocument::accepted_memory(
234 MemoryId::new(),
235 "memory lifecycle accepts durable claims",
236 vec!["memory".into()],
237 );
238 let index = LexicalIndex::new(vec![weaker.clone(), strongest.clone()]);
239
240 let hits = index
241 .search("memory lexical salience")
242 .expect("search succeeds");
243
244 assert_eq!(hits[0].document, strongest);
245 assert_eq!(hits[0].explanation.lexical_match, 1.0);
246 assert_eq!(hits[1].document, weaker);
247 assert_eq!(hits[1].explanation.lexical_match, 1.0 / 3.0);
248 }
249
250 #[test]
251 fn empty_query_is_validation_error() {
252 let index = LexicalIndex::default();
253
254 let err = index.search(" \n\t ").unwrap_err();
255 assert!(
256 err.to_string().contains("must not be empty"),
257 "expected 'must not be empty' in error, got: {err}"
258 );
259 }
260
261 #[test]
262 fn unicode_only_query_returns_no_matches_not_error() {
263 let doc = LexicalDocument::accepted_memory(
264 MemoryId::new(),
265 "memory recall in Greek",
266 vec!["retrieval".into()],
267 );
268 let index = LexicalIndex::new(vec![doc]);
269
270 let hits = index.search("μνήμη").expect("unicode query should not error");
273 assert!(hits.is_empty(), "expected no hits for Unicode-only query");
274 }
275
276 #[test]
277 fn tag_filter_excludes_documents_missing_required_tag() {
278 let with_rust = LexicalDocument::accepted_memory(
279 MemoryId::new(),
280 "rust memory matters",
281 vec!["rust".into(), "store".into()],
282 );
283 let without_rust = LexicalDocument::accepted_memory(
284 MemoryId::new(),
285 "memory matters in python",
286 vec!["python".into(), "store".into()],
287 );
288 let index = LexicalIndex::new(vec![with_rust.clone(), without_rust]);
289
290 let hits = index
291 .search_with_tag_filter("memory matters", &["rust".into()])
292 .expect("filtered search succeeds");
293
294 assert_eq!(hits.len(), 1);
295 assert_eq!(hits[0].document, with_rust);
296 }
297
298 #[test]
299 fn tag_filter_applies_and_semantics() {
300 let only_a = LexicalDocument::accepted_memory(
301 MemoryId::new(),
302 "claim mentions retrieval",
303 vec!["a".into()],
304 );
305 let only_b = LexicalDocument::accepted_memory(
306 MemoryId::new(),
307 "claim mentions retrieval too",
308 vec!["b".into()],
309 );
310 let both = LexicalDocument::accepted_memory(
311 MemoryId::new(),
312 "claim mentions retrieval thrice",
313 vec!["a".into(), "b".into()],
314 );
315 let index = LexicalIndex::new(vec![only_a, only_b, both.clone()]);
316
317 let hits = index
318 .search_with_tag_filter("retrieval", &["a".into(), "b".into()])
319 .expect("filtered search succeeds");
320
321 assert_eq!(hits.len(), 1);
322 assert_eq!(hits[0].document, both);
323 }
324
325 #[test]
326 fn tag_filter_empty_required_tags_matches_unfiltered_search() {
327 let document = LexicalDocument::accepted_memory(
328 MemoryId::new(),
329 "memory matters",
330 vec!["rust".into()],
331 );
332 let index = LexicalIndex::new(vec![document]);
333
334 let baseline = index.search("memory matters").expect("baseline search");
335 let filtered = index
336 .search_with_tag_filter("memory matters", &[])
337 .expect("filtered search");
338
339 assert_eq!(baseline, filtered);
340 }
341}