crtx-retrieval 0.1.1

Hybrid retrieval over memory views (lexical + salience; vectors later).
Documentation
//! Snippet extraction: highlighted text windows from memory claims for search
//! result display.
//!
//! The public surface is intentionally pure — no I/O, no global state.

/// A highlighted text fragment from a memory claim suitable for search
/// result display. Contains the surrounding context window and the
/// byte offsets of the matched terms within it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Snippet {
    /// The extracted window text (UTF-8, at most `MAX_SNIPPET_CHARS` chars).
    pub text: String,
    /// Byte ranges within `text` that correspond to matched query terms.
    /// Callers render highlights using these ranges (e.g. ANSI bold,
    /// HTML `<mark>`, JSON `highlight_ranges`).
    pub highlight_ranges: Vec<std::ops::Range<usize>>,
    /// True if `text` was truncated from the source claim (i.e. the claim
    /// is longer than the window).
    pub truncated: bool,
}

/// Maximum character length of an extracted snippet window.
pub const MAX_SNIPPET_CHARS: usize = 240;

/// Minimum context characters before the first match in the window.
pub const SNIPPET_LEAD_CHARS: usize = 40;

/// Extract a snippet window from `claim` for the given `query_terms`.
///
/// Algorithm:
/// 1. Case-fold both claim and query terms to lowercase for matching.
/// 2. Find the first occurrence of any query term in the claim.
/// 3. Start the window `SNIPPET_LEAD_CHARS` before that match (clamped
///    to a valid UTF-8 character boundary and the start of the string).
/// 4. Extend the window up to `MAX_SNIPPET_CHARS` characters.
/// 5. Within the window, collect all occurrences of all query terms
///    as `highlight_ranges` (byte offsets relative to the window start).
/// 6. If no query term appears in the claim, return the first
///    `MAX_SNIPPET_CHARS` characters as a plain window with no
///    highlights (the claim may still be relevant via semantic score).
///
/// The function is pure (no I/O, no allocation beyond the returned
/// `Snippet`) and UTF-8 correct: it never splits inside a multi-byte
/// character.
pub fn extract(claim: &str, query_terms: &[&str]) -> Snippet {
    let claim_lower = claim.to_lowercase();

    // Step 1–2: find the earliest byte offset of any query term match.
    let first_match_byte = query_terms
        .iter()
        .filter(|term| !term.is_empty())
        .filter_map(|term| {
            let term_lower = term.to_lowercase();
            claim_lower.find(term_lower.as_str())
        })
        .min();

    // Step 3: determine window start in chars.
    let window_start_byte = match first_match_byte {
        None => 0,
        Some(match_byte) => {
            // Convert byte offset to char index.
            let match_char_idx = claim[..match_byte].chars().count();
            // Apply lead, clamped to 0.
            let lead_start_char = match_char_idx.saturating_sub(SNIPPET_LEAD_CHARS);
            // Convert back to byte offset at a character boundary.
            claim
                .char_indices()
                .nth(lead_start_char)
                .map(|(b, _)| b)
                .unwrap_or(0)
        }
    };

    // Step 4: extract up to MAX_SNIPPET_CHARS characters from window_start_byte.
    let window_slice = &claim[window_start_byte..];
    let window_char_count = window_slice.chars().count();
    let truncated_from_end = window_char_count > MAX_SNIPPET_CHARS;

    let window_text: String = if truncated_from_end {
        window_slice.chars().take(MAX_SNIPPET_CHARS).collect()
    } else {
        window_slice.to_string()
    };

    // The window also truncates if we didn't start at byte 0.
    let truncated = window_start_byte > 0 || truncated_from_end;

    // Step 5: collect all occurrences of all terms within the window.
    let window_lower = window_text.to_lowercase();
    let mut highlight_ranges: Vec<std::ops::Range<usize>> = Vec::new();

    for term in query_terms {
        if term.is_empty() {
            continue;
        }
        let term_lower = term.to_lowercase();
        let term_bytes = term_lower.len();
        if term_bytes == 0 {
            continue;
        }
        let mut search_start = 0usize;
        while search_start <= window_lower.len().saturating_sub(term_bytes) {
            match window_lower[search_start..].find(term_lower.as_str()) {
                None => break,
                Some(rel_offset) => {
                    let abs_start = search_start + rel_offset;
                    let abs_end = abs_start + term_bytes;
                    highlight_ranges.push(abs_start..abs_end);
                    // Advance by at least one byte to avoid infinite loops on
                    // zero-length matches (guarded above, but belt-and-suspenders).
                    search_start = abs_start + term_bytes.max(1);
                }
            }
        }
    }

    // Sort and deduplicate overlapping ranges.
    highlight_ranges.sort_by_key(|r| r.start);
    highlight_ranges.dedup_by(|later, earlier| {
        // Merge if `later` starts inside `earlier`.
        if later.start < earlier.end {
            earlier.end = earlier.end.max(later.end);
            true
        } else {
            false
        }
    });

    Snippet {
        text: window_text,
        highlight_ranges,
        truncated,
    }
}

/// Split a search query string into individual terms, lowercased, filtered
/// to terms ≥ 2 chars, and deduplicated.
pub fn tokenize_query(query: &str) -> Vec<String> {
    let mut seen: Vec<String> = Vec::new();
    for token in query
        .split(|c: char| !c.is_alphanumeric())
        .map(str::trim)
        .filter(|t| t.len() >= 2)
        .map(|t| t.to_lowercase())
    {
        if !seen.contains(&token) {
            seen.push(token);
        }
    }
    seen
}

/// Returns `snippet.text` with `[…]` prepended if `truncated`.
///
/// Useful for non-ANSI terminals or plain-text contexts.
pub fn snippet_plain_text(snippet: &Snippet) -> String {
    if snippet.truncated {
        format!("[…]{}", snippet.text)
    } else {
        snippet.text.clone()
    }
}

/// ANSI escape sequences used for highlighting.
const ANSI_BOLD_ON: &str = "\x1b[1m";
const ANSI_UNDERLINE_ON: &str = "\x1b[4m";
const ANSI_RESET: &str = "\x1b[0m";

/// Returns `snippet.text` with ANSI bold+underline applied to each
/// `highlight_range`. Callers that don't want ANSI use
/// [`snippet_plain_text`] instead.
pub fn snippet_ansi_highlighted(snippet: &Snippet) -> String {
    if snippet.highlight_ranges.is_empty() {
        return snippet.text.clone();
    }

    let text = &snippet.text;
    let bytes = text.as_bytes();
    let mut result = String::with_capacity(text.len() + snippet.highlight_ranges.len() * 20);
    let mut cursor = 0usize;

    for range in &snippet.highlight_ranges {
        // Safety: highlight_ranges are byte-aligned to UTF-8 boundaries
        // (they were produced from `find` on a UTF-8 string).
        if range.start > cursor {
            result.push_str(&text[cursor..range.start]);
        }
        result.push_str(ANSI_BOLD_ON);
        result.push_str(ANSI_UNDERLINE_ON);
        // Clamp end to valid byte boundary within text length.
        let end = range.end.min(bytes.len());
        result.push_str(&text[range.start..end]);
        result.push_str(ANSI_RESET);
        cursor = end;
    }

    if cursor < text.len() {
        result.push_str(&text[cursor..]);
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // extract
    // -----------------------------------------------------------------------

    #[test]
    fn extract_finds_first_match_and_centers_window() {
        // Build a claim longer than MAX_SNIPPET_CHARS so the window is visible.
        let padding = "word ".repeat(20); // 100 chars of padding before the term
        let claim = format!("{padding}the quick brown fox jumps over the lazy dog");
        let snippet = extract(&claim, &["fox"]);
        assert!(
            snippet.text.contains("fox"),
            "window must contain the matched term"
        );
        assert!(
            !snippet.highlight_ranges.is_empty(),
            "should have at least one highlight"
        );
    }

    #[test]
    fn extract_clamps_window_to_string_start_when_match_is_early() {
        let claim = "fox is the very first word in this claim";
        let snippet = extract(claim, &["fox"]);
        // Window starts at byte 0 — not truncated from start.
        assert!(snippet.text.starts_with("fox"));
        assert!(!snippet.highlight_ranges.is_empty());
    }

    #[test]
    fn extract_returns_truncated_true_when_claim_is_longer_than_max() {
        // A claim of 300 chars (> MAX_SNIPPET_CHARS=240) with term near start.
        let suffix = "x".repeat(300);
        let claim = format!("needle {suffix}");
        let snippet = extract(&claim, &["needle"]);
        assert!(snippet.truncated);
        assert!(snippet.text.len() <= claim.len());
        // Window char count must not exceed MAX_SNIPPET_CHARS.
        assert!(snippet.text.chars().count() <= MAX_SNIPPET_CHARS);
    }

    #[test]
    fn extract_marks_all_occurrences_of_all_terms_in_window() {
        let claim = "rust and memory and rust again and memory everywhere";
        let snippet = extract(claim, &["rust", "memory"]);
        // "rust" appears twice, "memory" appears twice — expect 4 ranges.
        assert_eq!(snippet.highlight_ranges.len(), 4);
        // All ranges must point to "rust" or "memory" in the text.
        for range in &snippet.highlight_ranges {
            let frag = &snippet.text[range.clone()];
            assert!(
                frag == "rust" || frag == "memory",
                "unexpected fragment: {frag:?}"
            );
        }
    }

    #[test]
    fn extract_case_insensitive_match() {
        let claim = "The QUICK Brown Fox Jumps OVER the lazy dog";
        let snippet = extract(claim, &["quick", "FOX"]);
        assert!(!snippet.highlight_ranges.is_empty());
        // Byte ranges are in the original-case window text.
        for range in &snippet.highlight_ranges {
            let frag = snippet.text[range.clone()].to_lowercase();
            assert!(frag == "quick" || frag == "fox");
        }
    }

    #[test]
    fn extract_handles_multi_byte_utf8_without_splitting() {
        // Emoji are 4 bytes each; if we split mid-char the string ops would panic.
        let claim = "🦀 Rust is memory-safe 🦀 and fast 🦀 by design";
        let snippet = extract(claim, &["rust"]);
        // Must not panic and result must be valid UTF-8.
        assert!(std::str::from_utf8(snippet.text.as_bytes()).is_ok());
        assert!(!snippet.highlight_ranges.is_empty());
    }

    #[test]
    fn extract_no_match_returns_first_window_with_empty_highlights() {
        let claim = "no matching term anywhere in this sentence at all";
        let snippet = extract(claim, &["zzz_absent"]);
        assert!(snippet.highlight_ranges.is_empty());
        // Window is the first MAX_SNIPPET_CHARS chars (claim is short here).
        assert_eq!(snippet.text, claim);
        // Short claim not truncated.
        assert!(!snippet.truncated);
    }

    // -----------------------------------------------------------------------
    // tokenize_query
    // -----------------------------------------------------------------------

    #[test]
    fn tokenize_query_lowercases_and_deduplicates() {
        let terms = tokenize_query("Rust RUST rust Memory memory");
        assert_eq!(terms, vec!["rust", "memory"]);
    }

    #[test]
    fn tokenize_query_filters_short_terms() {
        let terms = tokenize_query("a to the memory");
        // "a" (1 char) and "to" (2 chars) — "to" passes (>=2), "a" fails.
        assert!(terms.contains(&"memory".to_string()));
        assert!(terms.contains(&"to".to_string()));
        assert!(!terms.contains(&"a".to_string()));
    }

    // -----------------------------------------------------------------------
    // snippet_plain_text
    // -----------------------------------------------------------------------

    #[test]
    fn snippet_plain_text_prepends_ellipsis_when_truncated() {
        let snippet = Snippet {
            text: "some window text".to_string(),
            highlight_ranges: vec![],
            truncated: true,
        };
        let plain = snippet_plain_text(&snippet);
        assert!(plain.starts_with("[…]"));
        assert!(plain.ends_with("some window text"));
    }

    #[test]
    fn snippet_plain_text_no_prefix_when_not_truncated() {
        let snippet = Snippet {
            text: "full text".to_string(),
            highlight_ranges: vec![],
            truncated: false,
        };
        let plain = snippet_plain_text(&snippet);
        assert_eq!(plain, "full text");
    }

    // -----------------------------------------------------------------------
    // snippet_ansi_highlighted
    // -----------------------------------------------------------------------

    #[test]
    fn snippet_ansi_highlighted_wraps_ranges() {
        let claim = "the quick brown fox jumps over the lazy dog";
        let snippet = extract(claim, &["fox"]);
        let ansi = snippet_ansi_highlighted(&snippet);

        // Must contain the ANSI bold escape before "fox".
        assert!(
            ansi.contains(ANSI_BOLD_ON),
            "ANSI bold escape must be present"
        );
        assert!(
            ansi.contains(ANSI_UNDERLINE_ON),
            "ANSI underline escape must be present"
        );
        assert!(
            ansi.contains(ANSI_RESET),
            "ANSI reset escape must be present"
        );

        // The highlighted region must still contain the matched text.
        assert!(ansi.contains("fox"));

        // Bold escape must appear before "fox" in the output.
        let bold_pos = ansi.find(ANSI_BOLD_ON).unwrap();
        let fox_pos = ansi.find("fox").unwrap();
        assert!(
            bold_pos < fox_pos,
            "bold escape should precede the match text"
        );
    }
}