crtx-retrieval 0.1.1

Hybrid retrieval over memory views (lexical + salience; vectors later).
Documentation
//! Minimal in-memory lexical retrieval over accepted memory documents.

use std::collections::HashSet;

use cortex_core::{CortexError, CortexResult, MemoryId};

/// Read-only document shape consumed by lexical retrieval.
///
/// The store integration can populate this from active/accepted memory rows once
/// a read API exists. The retrieval layer itself does not mutate memory.
///
/// During a search both `claim` and `domains` are tokenized and compared
/// against the query terms; `domains` is additionally compared verbatim
/// against the required tags of a tag-filtered search.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalDocument {
    /// Memory identifier. Its string form breaks ties between equally scored hits.
    pub id: MemoryId,
    /// Durable memory claim text.
    pub claim: String,
    /// Domain tags attached to the memory.
    pub domains: Vec<String>,
}

impl LexicalDocument {
    /// Creates a lexical document from accepted memory fields.
    #[must_use]
    pub fn accepted_memory(id: MemoryId, claim: impl Into<String>, domains: Vec<String>) -> Self {
        Self {
            id,
            claim: claim.into(),
            domains,
        }
    }
}

/// In-memory lexical index.
///
/// The documents are kept in a plain vector and every search walks all of
/// them. The derived `Default` produces an index holding no documents.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct LexicalIndex {
    // Documents in the order they were supplied to the constructor.
    documents: Vec<LexicalDocument>,
}

impl LexicalIndex {
    /// Builds an index from already accepted memory documents.
    #[must_use]
    pub fn new(documents: Vec<LexicalDocument>) -> Self {
        Self { documents }
    }

    /// Returns matching documents ranked by lexical overlap with `query`.
    pub fn search(&self, query: &str) -> CortexResult<Vec<LexicalHit>> {
        self.search_with_tag_filter(query, &[])
    }

    /// Returns matching documents ranked by lexical overlap with `query`,
    /// restricted to documents whose `domains` tags carry every entry in
    /// `required_tags` (AND semantics).
    ///
    /// The tag filter runs as a pre-pass: documents that fail the AND
    /// closure are skipped before any lexical scoring work. An empty
    /// `required_tags` slice disables filtering and yields the same hits
    /// as [`Self::search`]. Duplicate tag inputs are coalesced.
    pub fn search_with_tag_filter(
        &self,
        query: &str,
        required_tags: &[String],
    ) -> CortexResult<Vec<LexicalHit>> {
        if query.trim().is_empty() {
            return Err(CortexError::Validation(
                "search query must not be empty".into(),
            ));
        }
        let query_terms = tokenize_unique(query);
        if query_terms.is_empty() {
            // Query has content but no ASCII-alphanumeric tokens (e.g. a
            // Unicode-only string like "μνήμη"). The ASCII-tokenized index
            // cannot match anything, so return no results rather than an error.
            return Ok(Vec::new());
        }

        let mut unique_tags: Vec<String> = Vec::with_capacity(required_tags.len());
        for tag in required_tags {
            if !unique_tags.iter().any(|existing| existing == tag) {
                unique_tags.push(tag.clone());
            }
        }

        let mut hits: Vec<_> = self
            .documents
            .iter()
            .filter(|document| document_carries_all_tags(document, &unique_tags))
            .filter_map(|document| lexical_hit(document, &query_terms))
            .collect();
        hits.sort_by(|left, right| {
            right
                .explanation
                .lexical_match
                .total_cmp(&left.explanation.lexical_match)
                .then_with(|| {
                    left.document
                        .id
                        .to_string()
                        .cmp(&right.document.id.to_string())
                })
        });
        Ok(hits)
    }
}

/// Reports whether `document` carries every tag listed in `required`.
///
/// Tags are compared to the document's `domains` entries by exact string
/// equality. An empty `required` slice is satisfied by every document.
fn document_carries_all_tags(document: &LexicalDocument, required: &[String]) -> bool {
    for tag in required {
        if !document.domains.contains(tag) {
            return false;
        }
    }
    true
}

/// A lexical search hit and its explanation.
///
/// A hit is only produced for a document that matches at least one query term.
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalHit {
    /// Matching document, cloned from the index entry.
    pub document: LexicalDocument,
    /// Lexical match explanation.
    pub explanation: LexicalExplanation,
}

/// Explanation for the lexical component of retrieval.
///
/// All term lists hold lowercased ASCII-alphanumeric tokens and follow the
/// order in which the terms first appear in the query.
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalExplanation {
    /// Normalized lexical match in `[0, 1]`: the number of `matched_terms`
    /// divided by the number of `query_terms`.
    pub lexical_match: f32,
    /// Distinct normalized query terms used for matching.
    pub query_terms: Vec<String>,
    /// Query terms found in the memory claim.
    pub matched_claim_terms: Vec<String>,
    /// Query terms found in the memory domain tags.
    pub matched_domain_terms: Vec<String>,
    /// Distinct query terms found in either indexed field.
    pub matched_terms: Vec<String>,
}

/// Scores `document` against the distinct `query_terms`.
///
/// The claim text and every domain tag are tokenized the same way as the
/// query. Returns `None` when no query term occurs in either field; otherwise
/// returns a hit holding a clone of the document, the per-field matches (in
/// query order) and the fraction of query terms that matched.
fn lexical_hit(document: &LexicalDocument, query_terms: &[String]) -> Option<LexicalHit> {
    let claim_vocabulary: HashSet<String> =
        tokenize_unique(&document.claim).into_iter().collect();
    let mut domain_vocabulary: HashSet<String> = HashSet::new();
    for domain in &document.domains {
        domain_vocabulary.extend(tokenize_unique(domain));
    }

    let matched_claim_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| claim_vocabulary.contains(*term))
        .cloned()
        .collect();
    let matched_domain_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| domain_vocabulary.contains(*term))
        .cloned()
        .collect();
    // A term present in both fields is counted once here.
    let matched_terms: Vec<String> = query_terms
        .iter()
        .filter(|term| claim_vocabulary.contains(*term) || domain_vocabulary.contains(*term))
        .cloned()
        .collect();

    if matched_terms.is_empty() {
        return None;
    }

    // At least one term matched, so `query_terms` is non-empty here.
    let matched_count = matched_terms.len() as f32;
    let query_count = query_terms.len() as f32;
    let explanation = LexicalExplanation {
        lexical_match: matched_count / query_count,
        query_terms: query_terms.to_vec(),
        matched_claim_terms,
        matched_domain_terms,
        matched_terms,
    };
    Some(LexicalHit {
        document: document.clone(),
        explanation,
    })
}

/// Breaks `text` into distinct lowercase ASCII-alphanumeric terms.
///
/// Every character that is not an ASCII letter or digit acts as a separator,
/// so non-ASCII letters split words instead of becoming part of a term. Terms
/// are returned in the order of their first occurrence, without repeats.
fn tokenize_unique(text: &str) -> Vec<String> {
    let mut already_emitted: HashSet<String> = HashSet::new();
    let mut ordered_terms: Vec<String> = Vec::new();

    let is_separator = |character: char| !character.is_ascii_alphanumeric();
    for fragment in text.split(is_separator) {
        let fragment = fragment.trim();
        if fragment.is_empty() {
            // Adjacent separators produce empty fragments; skip them.
            continue;
        }
        let term = fragment.to_ascii_lowercase();
        let first_occurrence = already_emitted.insert(term.clone());
        if first_occurrence {
            ordered_terms.push(term);
        }
    }
    ordered_terms
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an accepted-memory document with a fresh id from string literals.
    fn document(claim: &str, tags: &[&str]) -> LexicalDocument {
        let domains: Vec<String> = tags.iter().map(|tag| (*tag).to_owned()).collect();
        LexicalDocument::accepted_memory(MemoryId::new(), claim, domains)
    }

    /// A query matching the claim and a domain tag returns only that document.
    #[test]
    fn insert_and_query_returns_expected_hit() {
        let expected = document(
            "SQLite repositories preserve Cortex rows",
            &["store", "retrieval"],
        );
        let unrelated = document("Agent prompts require redaction before export", &["privacy"]);
        let index = LexicalIndex::new(vec![unrelated, expected.clone()]);

        let hits = index.search("sqlite retrieval").expect("search succeeds");

        assert_eq!(hits.len(), 1);
        let explanation = &hits[0].explanation;
        assert_eq!(hits[0].document, expected);
        assert_eq!(explanation.lexical_match, 1.0);
        assert_eq!(explanation.query_terms, ["sqlite", "retrieval"]);
        assert_eq!(explanation.matched_claim_terms, ["sqlite"]);
        assert_eq!(explanation.matched_domain_terms, ["retrieval"]);
    }

    /// The document matching more query terms is ranked first.
    #[test]
    fn search_orders_by_lexical_match() {
        let strongest = document(
            "memory search explains lexical salience scoring",
            &["retrieval"],
        );
        let weaker = document("memory lifecycle accepts durable claims", &["memory"]);
        let index = LexicalIndex::new(vec![weaker.clone(), strongest.clone()]);

        let hits = index
            .search("memory lexical salience")
            .expect("search succeeds");

        let (first, second) = (&hits[0], &hits[1]);
        assert_eq!(first.document, strongest);
        assert_eq!(first.explanation.lexical_match, 1.0);
        assert_eq!(second.document, weaker);
        assert_eq!(second.explanation.lexical_match, 1.0 / 3.0);
    }

    /// A whitespace-only query is rejected with a validation error.
    #[test]
    fn empty_query_is_validation_error() {
        let index = LexicalIndex::default();

        let err = index.search(" \n\t ").unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("must not be empty"),
            "expected 'must not be empty' in error, got: {err}"
        );
    }

    /// A query composed entirely of non-ASCII Unicode succeeds with zero hits
    /// rather than returning a validation error.
    #[test]
    fn unicode_only_query_returns_no_matches_not_error() {
        let index = LexicalIndex::new(vec![document("memory recall in Greek", &["retrieval"])]);

        let hits = index.search("μνήμη").expect("unicode query should not error");
        assert!(hits.is_empty(), "expected no hits for Unicode-only query");
    }

    /// Documents lacking the required tag are dropped even if the text matches.
    #[test]
    fn tag_filter_excludes_documents_missing_required_tag() {
        let with_rust = document("rust memory matters", &["rust", "store"]);
        let without_rust = document("memory matters in python", &["python", "store"]);
        let index = LexicalIndex::new(vec![with_rust.clone(), without_rust]);

        let required = vec![String::from("rust")];
        let hits = index
            .search_with_tag_filter("memory matters", &required)
            .expect("filtered search succeeds");

        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].document, with_rust);
    }

    /// Only a document carrying every required tag survives the filter.
    #[test]
    fn tag_filter_applies_and_semantics() {
        let only_a = document("claim mentions retrieval", &["a"]);
        let only_b = document("claim mentions retrieval too", &["b"]);
        let both = document("claim mentions retrieval thrice", &["a", "b"]);
        let index = LexicalIndex::new(vec![only_a, only_b, both.clone()]);

        let required = vec![String::from("a"), String::from("b")];
        let hits = index
            .search_with_tag_filter("retrieval", &required)
            .expect("filtered search succeeds");

        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].document, both);
    }

    /// An empty tag list behaves exactly like the unfiltered search.
    #[test]
    fn tag_filter_empty_required_tags_matches_unfiltered_search() {
        let index = LexicalIndex::new(vec![document("memory matters", &["rust"])]);

        let baseline = index.search("memory matters").expect("baseline search");
        let filtered = index
            .search_with_tag_filter("memory matters", &[])
            .expect("filtered search");

        assert_eq!(baseline, filtered);
    }
}