use std::collections::HashSet;
use cortex_core::{CortexError, CortexResult, MemoryId};
/// A single searchable record: an identifier, a free-text claim, and a list of
/// domain strings.
///
/// Both `claim` and `domains` are tokenized and matched against query terms
/// during search; `domains` is additionally compared verbatim against required
/// tags when a tag filter is supplied.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalDocument {
/// Identifier of the record; its string form breaks ties when ordering hits.
pub id: MemoryId,
/// Free text that query terms are matched against.
pub claim: String,
/// Domain labels; matched as tokens and, for tag filters, as exact strings.
pub domains: Vec<String>,
}
impl LexicalDocument {
    /// Assembles a [`LexicalDocument`] from its three parts.
    ///
    /// `claim` may be anything convertible into a `String` (for example a
    /// `&str` literal); it is converted once and stored as owned text. `id`
    /// and `domains` are stored as given.
    #[must_use]
    pub fn accepted_memory(id: MemoryId, claim: impl Into<String>, domains: Vec<String>) -> Self {
        let claim: String = claim.into();
        Self { id, claim, domains }
    }
}
/// An in-memory collection of [`LexicalDocument`]s that can be searched by
/// term overlap.
///
/// The `Default` value holds no documents.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct LexicalIndex {
// Documents are kept in the order supplied to `new`; every search walks the whole list.
documents: Vec<LexicalDocument>,
}
impl LexicalIndex {
    /// Creates an index over the given documents.
    #[must_use]
    pub fn new(documents: Vec<LexicalDocument>) -> Self {
        Self { documents }
    }

    /// Searches every document for the terms in `query`, without any tag filter.
    ///
    /// Equivalent to [`LexicalIndex::search_with_tag_filter`] with an empty
    /// `required_tags` slice.
    ///
    /// # Errors
    ///
    /// Returns [`CortexError::Validation`] when `query` is empty or consists
    /// only of whitespace.
    pub fn search(&self, query: &str) -> CortexResult<Vec<LexicalHit>> {
        self.search_with_tag_filter(query, &[])
    }

    /// Searches the documents that carry every tag in `required_tags`.
    ///
    /// A document passes the filter when each required tag is equal to one of
    /// its `domains` entries (exact string comparison). Duplicate tags in
    /// `required_tags` are ignored. An empty slice disables filtering.
    ///
    /// Hits are ordered by descending `lexical_match`; ties are broken by the
    /// ascending string form of the document id so the order is deterministic.
    ///
    /// A query that is non-blank but yields no tokens (for example text with
    /// no ASCII alphanumeric characters) produces `Ok` with an empty list
    /// rather than an error.
    ///
    /// # Errors
    ///
    /// Returns [`CortexError::Validation`] when `query` is empty or consists
    /// only of whitespace.
    pub fn search_with_tag_filter(
        &self,
        query: &str,
        required_tags: &[String],
    ) -> CortexResult<Vec<LexicalHit>> {
        if query.trim().is_empty() {
            return Err(CortexError::Validation(
                "search query must not be empty".into(),
            ));
        }

        let query_terms = tokenize_unique(query);
        if query_terms.is_empty() {
            return Ok(Vec::new());
        }

        // Collapse repeated tags once so the per-document check does not
        // re-test the same tag for every document.
        let mut unique_tags: Vec<String> = Vec::with_capacity(required_tags.len());
        for tag in required_tags {
            if !unique_tags.contains(tag) {
                unique_tags.push(tag.clone());
            }
        }

        // Render each id to a string exactly once. Doing this inside the sort
        // comparator would allocate two strings per comparison.
        let mut keyed_hits: Vec<(String, LexicalHit)> = self
            .documents
            .iter()
            .filter(|document| document_carries_all_tags(document, &unique_tags))
            .filter_map(|document| lexical_hit(document, &query_terms))
            .map(|hit| (hit.document.id.to_string(), hit))
            .collect();

        // Stable sort: best score first, then id string ascending.
        keyed_hits.sort_by(|(left_id, left), (right_id, right)| {
            right
                .explanation
                .lexical_match
                .total_cmp(&left.explanation.lexical_match)
                .then_with(|| left_id.cmp(right_id))
        });

        Ok(keyed_hits.into_iter().map(|(_, hit)| hit).collect())
    }
}
/// Reports whether `document` lists every tag in `required` among its domains.
///
/// Tags are compared to domain entries by exact string equality. An empty
/// `required` slice is satisfied by any document.
fn document_carries_all_tags(document: &LexicalDocument, required: &[String]) -> bool {
    for wanted in required {
        if !document.domains.contains(wanted) {
            // One missing tag is enough to reject the document.
            return false;
        }
    }
    true
}
/// One search result: the matching document together with the details of how
/// it matched.
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalHit {
/// A copy of the indexed document that matched.
pub document: LexicalDocument,
/// Score and per-field term breakdown for this match.
pub explanation: LexicalExplanation,
}
/// Describes why a document matched a query.
///
/// All term lists hold lowercase tokens and follow the order in which the
/// terms appeared in the query.
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalExplanation {
/// Fraction of query terms that matched: `matched_terms.len() / query_terms.len()`.
pub lexical_match: f32,
/// The distinct terms extracted from the query.
pub query_terms: Vec<String>,
/// Query terms found among the tokens of the document's claim.
pub matched_claim_terms: Vec<String>,
/// Query terms found among the tokens of the document's domains.
pub matched_domain_terms: Vec<String>,
/// Query terms found in the claim, the domains, or both.
pub matched_terms: Vec<String>,
}
/// Scores `document` against `query_terms`, returning `None` when no query
/// term occurs in either the claim or the domains.
///
/// The score is the number of query terms found in the claim or the domains
/// divided by the total number of query terms. The returned hit owns a clone
/// of the document and a copy of the query terms.
fn lexical_hit(document: &LexicalDocument, query_terms: &[String]) -> Option<LexicalHit> {
    let claim_vocabulary: HashSet<String> =
        tokenize_unique(&document.claim).into_iter().collect();

    let mut domain_vocabulary: HashSet<String> = HashSet::new();
    for domain in &document.domains {
        domain_vocabulary.extend(tokenize_unique(domain));
    }

    // Each pass walks the query in order, so every list keeps query order.
    let matched_claim_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| claim_vocabulary.contains(*term))
        .cloned()
        .collect();
    let matched_domain_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| domain_vocabulary.contains(*term))
        .cloned()
        .collect();
    let matched_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| claim_vocabulary.contains(*term) || domain_vocabulary.contains(*term))
        .cloned()
        .collect();

    if matched_terms.is_empty() {
        return None;
    }

    // Non-empty `matched_terms` implies non-empty `query_terms`, so the
    // divisor is never zero here.
    let lexical_match = matched_terms.len() as f32 / query_terms.len() as f32;

    let explanation = LexicalExplanation {
        lexical_match,
        query_terms: query_terms.to_vec(),
        matched_claim_terms,
        matched_domain_terms,
        matched_terms,
    };
    Some(LexicalHit {
        document: document.clone(),
        explanation,
    })
}
/// Splits `text` into lowercase tokens, keeping only the first occurrence of
/// each token and preserving first-seen order.
///
/// Any character that is not an ASCII letter or digit acts as a separator, so
/// non-ASCII text contributes no tokens of its own.
fn tokenize_unique(text: &str) -> Vec<String> {
    let mut already_emitted: HashSet<String> = HashSet::new();
    text.split(|symbol: char| !symbol.is_ascii_alphanumeric())
        .filter(|piece| !piece.is_empty())
        .map(|piece| piece.to_ascii_lowercase())
        // `insert` returns false for a repeat, which drops the duplicate.
        .filter(|term| already_emitted.insert(term.clone()))
        .collect()
}
// Unit tests for query matching, hit ordering, query validation, and tag filtering.
#[cfg(test)]
mod tests {
use super::*;
// A query whose terms are split between the claim and the domains matches
// fully, and the explanation attributes each term to the field it came from.
#[test]
fn insert_and_query_returns_expected_hit() {
let expected = LexicalDocument::accepted_memory(
MemoryId::new(),
"SQLite repositories preserve Cortex rows",
vec!["store".into(), "retrieval".into()],
);
let unrelated = LexicalDocument::accepted_memory(
MemoryId::new(),
"Agent prompts require redaction before export",
vec!["privacy".into()],
);
let index = LexicalIndex::new(vec![unrelated, expected.clone()]);
let hits = index.search("sqlite retrieval").expect("search succeeds");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].document, expected);
assert_eq!(hits[0].explanation.lexical_match, 1.0);
assert_eq!(hits[0].explanation.query_terms, ["sqlite", "retrieval"]);
assert_eq!(hits[0].explanation.matched_claim_terms, ["sqlite"]);
assert_eq!(hits[0].explanation.matched_domain_terms, ["retrieval"]);
}
// The document matching all three query terms must rank ahead of the one
// matching a single term, regardless of insertion order.
#[test]
fn search_orders_by_lexical_match() {
let strongest = LexicalDocument::accepted_memory(
MemoryId::new(),
"memory search explains lexical salience scoring",
vec!["retrieval".into()],
);
let weaker = LexicalDocument::accepted_memory(
MemoryId::new(),
"memory lifecycle accepts durable claims",
vec!["memory".into()],
);
let index = LexicalIndex::new(vec![weaker.clone(), strongest.clone()]);
let hits = index
.search("memory lexical salience")
.expect("search succeeds");
assert_eq!(hits[0].document, strongest);
assert_eq!(hits[0].explanation.lexical_match, 1.0);
assert_eq!(hits[1].document, weaker);
// "memory" counts once even though it appears in both claim and domains.
assert_eq!(hits[1].explanation.lexical_match, 1.0 / 3.0);
}
// A whitespace-only query is rejected with a validation error.
#[test]
fn empty_query_is_validation_error() {
let index = LexicalIndex::default();
let err = index.search(" \n\t ").unwrap_err();
assert!(
err.to_string().contains("must not be empty"),
"expected 'must not be empty' in error, got: {err}"
);
}
// A non-blank query with no ASCII alphanumeric characters yields no tokens;
// that is an empty result, not an error.
#[test]
fn unicode_only_query_returns_no_matches_not_error() {
let doc = LexicalDocument::accepted_memory(
MemoryId::new(),
"memory recall in Greek",
vec!["retrieval".into()],
);
let index = LexicalIndex::new(vec![doc]);
let hits = index.search("μνήμη").expect("unicode query should not error");
assert!(hits.is_empty(), "expected no hits for Unicode-only query");
}
// Both documents match the query text, but only the one tagged "rust"
// survives the filter.
#[test]
fn tag_filter_excludes_documents_missing_required_tag() {
let with_rust = LexicalDocument::accepted_memory(
MemoryId::new(),
"rust memory matters",
vec!["rust".into(), "store".into()],
);
let without_rust = LexicalDocument::accepted_memory(
MemoryId::new(),
"memory matters in python",
vec!["python".into(), "store".into()],
);
let index = LexicalIndex::new(vec![with_rust.clone(), without_rust]);
let hits = index
.search_with_tag_filter("memory matters", &["rust".into()])
.expect("filtered search succeeds");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].document, with_rust);
}
// Multiple required tags combine with AND: a document must carry all of them.
#[test]
fn tag_filter_applies_and_semantics() {
let only_a = LexicalDocument::accepted_memory(
MemoryId::new(),
"claim mentions retrieval",
vec!["a".into()],
);
let only_b = LexicalDocument::accepted_memory(
MemoryId::new(),
"claim mentions retrieval too",
vec!["b".into()],
);
let both = LexicalDocument::accepted_memory(
MemoryId::new(),
"claim mentions retrieval thrice",
vec!["a".into(), "b".into()],
);
let index = LexicalIndex::new(vec![only_a, only_b, both.clone()]);
let hits = index
.search_with_tag_filter("retrieval", &["a".into(), "b".into()])
.expect("filtered search succeeds");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].document, both);
}
// An empty tag list must behave exactly like the unfiltered `search`.
#[test]
fn tag_filter_empty_required_tags_matches_unfiltered_search() {
let document = LexicalDocument::accepted_memory(
MemoryId::new(),
"memory matters",
vec!["rust".into()],
);
let index = LexicalIndex::new(vec![document]);
let baseline = index.search("memory matters").expect("baseline search");
let filtered = index
.search_with_tag_filter("memory matters", &[])
.expect("filtered search");
assert_eq!(baseline, filtered);
}
}