// collet 0.1.1 - Docs.rs

//! Code-aware tokenizer shared across BM25 consumers.
//!
//! Handles camelCase, snake_case, dotted paths, and filters noise words.
//! Produces lowercased tokens suitable for BM25 term-frequency maps.

use std::collections::HashMap;

// ── Public API ───────────────────────────────────────────────────────

/// Count how often each normalized token occurs in free-form text
/// (facts, summaries, etc.).
///
/// The text is broken into words by `split_and_normalize`, which drops short
/// words and stop words and expands camelCase / snake_case compounds; every
/// token it yields increments that token's counter by one.
pub fn tokenize_text(text: &str) -> HashMap<String, u32> {
    split_and_normalize(text)
        .into_iter()
        .fold(HashMap::new(), |mut counts, token| {
            *counts.entry(token).or_insert(0) += 1;
            counts
        })
}

/// Count how often each normalized token occurs in source code.
///
/// Words come from `split_code_words`, which skips the contents of quoted
/// literals. Words shorter than two bytes and noise words (see
/// `is_noise_word`) are discarded; each remaining word is expanded into its
/// camelCase / snake_case parts plus the full lowercased word.
pub fn tokenize_code(content: &str) -> HashMap<String, u32> {
    let mut counts: HashMap<String, u32> = HashMap::new();

    let identifiers = split_code_words(content)
        .into_iter()
        .filter(|candidate| candidate.len() >= 2 && !is_noise_word(candidate));

    for identifier in identifiers {
        for token in expand_compound(&identifier) {
            *counts.entry(token).or_insert(0) += 1;
        }
    }

    counts
}

/// Tokenize a file path into lowercased tokens.
///
/// The path is split on `/`. Each segment is then split at its **last** `.`
/// into a name and an extension (so `archive.tar.gz` yields `archive.tar` and
/// `gz`); a segment without a dot is kept whole. Pieces shorter than two bytes
/// after lowercasing are dropped. The full lowercased path is appended as the
/// final token so exact-path queries can match.
///
/// An empty `path` yields no tokens at all, rather than a single empty-string
/// token.
pub fn tokenize_path(path: &str) -> Vec<String> {
    if path.is_empty() {
        // Nothing to index; pushing the "full path" here would add an empty token.
        return Vec::new();
    }

    let mut tokens = Vec::new();

    for segment in path.split('/') {
        let (name, ext) = match segment.rsplit_once('.') {
            Some((name, ext)) => (name, Some(ext)),
            None => (segment, None),
        };

        // Lowercase first, then apply the length filter to the lowercased form.
        tokens.extend(
            std::iter::once(name)
                .chain(ext)
                .map(str::to_lowercase)
                .filter(|piece| piece.len() >= 2),
        );
    }

    // Also index the full path (not length-filtered).
    tokens.push(path.to_lowercase());
    tokens
}

/// Tokenize a user/agent query — deduplicates and filters query stop words.
///
/// Each whitespace-separated word is reduced to its alphanumeric, `_`, `-`
/// and `.` characters. Dots and hyphens left hanging at either end (sentence
/// punctuation such as `error.`, option dashes such as `--flag`) are then
/// trimmed so they cannot glue themselves onto a token or hide a stop word.
/// Words shorter than two bytes and query stop words are skipped. The
/// remaining words are expanded with `expand_compound`, and tokens are
/// returned in first-seen order without duplicates.
pub fn tokenize_query(query: &str) -> Vec<String> {
    let mut seen = std::collections::HashSet::new();
    let mut tokens = Vec::new();
    // Reusable buffer — declared outside the loop so its capacity is reused
    // across words.
    let mut clean_buf = String::new();

    for word in query.split_whitespace() {
        clean_buf.clear();
        clean_buf.extend(
            word.chars()
                .filter(|c| c.is_alphanumeric() || matches!(*c, '_' | '-' | '.')),
        );

        // Interior dots/hyphens (e.g. `foo.bar`, `kebab-case`) are kept;
        // only leading/trailing ones are removed.
        let cleaned = clean_buf.trim_matches(|c: char| c == '.' || c == '-');
        if cleaned.len() < 2 {
            continue;
        }

        // Stop-word lookup is done on the lowercased form.
        if is_query_stop_word(&cleaned.to_lowercase()) {
            continue;
        }

        // Pass original-case input so expand_compound can split camelCase.
        for token in expand_compound(cleaned) {
            if seen.insert(token.clone()) {
                tokens.push(token);
            }
        }
    }

    tokens
}

// ── camelCase / snake_case Splitting ─────────────────────────────────

/// Split camelCase / PascalCase into parts, preserving the original casing.
///
/// A new part starts at an uppercase character when either
/// * the previous character is lowercase (`camel|Case`), or
/// * the next character is lowercase and the part collected so far is longer
///   than one character (`HTML|Parser` — the end of an acronym run).
///
/// Lengths are measured in characters, not bytes, so a leading non-ASCII
/// capital behaves exactly like an ASCII one (`"ÉParser"` stays whole, just
/// like `"AParser"`).
///
/// ```text
/// "camelCase"  → ["camel", "Case"]
/// "PascalCase" → ["Pascal", "Case"]
/// "HTMLParser" → ["HTML", "Parser"]
/// "simple"     → ["simple"]
/// ""           → []
/// ```
pub fn split_camel_case(s: &str) -> Vec<String> {
    let mut parts = Vec::new();
    let mut current = String::new();
    // Number of chars (not bytes) accumulated in `current`.
    let mut current_chars = 0usize;
    let mut prev: Option<char> = None;
    let mut rest = s.chars().peekable();

    while let Some(ch) = rest.next() {
        if ch.is_uppercase() && current_chars > 0 {
            let prev_lower = prev.map_or(false, char::is_lowercase);
            let next_lower = rest.peek().map_or(false, |next| next.is_lowercase());

            if prev_lower || (next_lower && current_chars > 1) {
                parts.push(std::mem::take(&mut current));
                current_chars = 0;
            }
        }

        current.push(ch);
        current_chars += 1;
        prev = Some(ch);
    }

    if !current.is_empty() {
        parts.push(current);
    }

    parts
}

/// Expand a compound token (camelCase or snake_case) into sub-tokens + full token.
///
/// Every returned token is lowercased with `str::to_lowercase`, including
/// words whose only capitals are non-ASCII.
///
/// * If `word` contains an uppercase character and `split_camel_case` yields
///   more than one part, the result is the lowercased parts (dropping parts
///   shorter than two bytes and noise words) followed by the whole lowercased
///   word.
/// * Otherwise the lowercased word is split on `_`; when at least two pieces
///   of two or more bytes remain, the non-noise pieces are returned followed
///   by the whole lowercased word.
/// * Otherwise only the lowercased word is returned.
///
/// Callers that want camelCase splitting must pass the original-case word.
fn expand_compound(word: &str) -> Vec<String> {
    let lower = word.to_lowercase();
    let mut tokens = Vec::new();

    // Try camelCase split first (only if there are uppercase chars).
    if word.chars().any(char::is_uppercase) {
        let camel_parts = split_camel_case(word);
        if camel_parts.len() > 1 {
            tokens.extend(
                camel_parts
                    .iter()
                    .map(|part| part.to_lowercase())
                    .filter(|part| part.len() >= 2 && !is_noise_word(part)),
            );
            tokens.push(lower);
            return tokens;
        }
    }

    // Try snake_case split. Scoped so the borrow of `lower` ends before it is
    // moved into `tokens`.
    {
        let snake_parts: Vec<&str> = lower.split('_').filter(|part| part.len() >= 2).collect();
        if snake_parts.len() > 1 {
            tokens.extend(
                snake_parts
                    .iter()
                    .filter(|part| !is_noise_word(part))
                    .map(|part| (*part).to_owned()),
            );
        }
    }

    tokens.push(lower);
    tokens
}

// ── Word Splitting ───────────────────────────────────────────────────

/// Split text on non-alphanumeric boundaries (generic text, not source code).
fn split_and_normalize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();

    for raw_word in text.split(|c: char| !c.is_alphanumeric() && c != '_' && c != '-') {
        let word = raw_word.trim();
        if word.len() < 2 {
            continue;
        }
        if is_stop_word(word) {
            continue;
        }

        tokens.extend(expand_compound(word));
    }

    tokens
}

/// Split raw source code content into word-like tokens, skipping string literals.
///
/// A word is a maximal run of alphanumeric characters and `_`. A `"` or `'`
/// opens a literal whose contents are skipped up to the next unescaped
/// occurrence of the same quote character. Inside a literal a backslash
/// escapes the following character, so `\"` does not end a double-quoted
/// string. An unterminated literal swallows the rest of the input.
///
/// NOTE(review): every `'` is treated as a quote, so apostrophes in comments
/// and Rust lifetimes (`'a`) also open a skipped region — confirm this is
/// acceptable for the languages being indexed.
fn split_code_words(content: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut current = String::new();
    // `Some(q)` while inside a literal that was opened by quote char `q`.
    let mut open_quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;

    for ch in content.chars() {
        if let Some(quote) = open_quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == quote {
                open_quote = None;
            }
            continue;
        }

        if ch.is_alphanumeric() || ch == '_' {
            current.push(ch);
            continue;
        }

        // Any other character ends the word in progress.
        if !current.is_empty() {
            words.push(std::mem::take(&mut current));
        }
        if ch == '"' || ch == '\'' {
            open_quote = Some(ch);
        }
    }

    if !current.is_empty() {
        words.push(current);
    }

    words
}

// ── Stop Word Lists ──────────────────────────────────────────────────

/// Unified noise words filtered during indexing.
///
/// Superset of code keywords + common English words.
/// Expects already-lowercased input to avoid repeated allocation.
pub fn is_noise_word(word: &str) -> bool {
    // Fast ASCII-only lowercase check — avoids allocating a new String.
    // For mixed-case input, fall back to allocation.
    if word
        .bytes()
        .all(|b| b.is_ascii_lowercase() || !b.is_ascii_alphabetic())
    {
        is_stop_word(word)
            || matches!(
                word,
                "ok" | "err"
                    | "const"
                    | "static"
                    | "async"
                    | "await"
                    | "match"
                    | "while"
                    | "loop"
                    | "break"
                    | "continue"
                    | "crate"
                    | "super"
                    | "as"
                    | "in"
                    | "ref"
                    | "dyn"
                    | "if"
                    | "else"
            )
    } else {
        let w = word.to_lowercase();
        is_stop_word(&w)
            || matches!(
                w.as_str(),
                "ok" | "err"
                    | "const"
                    | "static"
                    | "async"
                    | "await"
                    | "match"
                    | "while"
                    | "loop"
                    | "break"
                    | "continue"
                    | "crate"
                    | "super"
                    | "as"
                    | "in"
                    | "ref"
                    | "dyn"
                    | "if"
                    | "else"
            )
    }
}

/// Report whether `word` is a core stop word (shared by indexing and
/// knowledge retrieval).
///
/// Matching ignores case: input containing ASCII uppercase letters is
/// lowercased (allocating) before the lookup; any other input is checked
/// as-is without allocating.
pub fn is_stop_word(word: &str) -> bool {
    /// Lookup against the stop-word list; `candidate` must already be lowercase.
    fn listed(candidate: &str) -> bool {
        const CORE_STOP_WORDS: &[&str] = &[
            "the", "and", "for", "with", "from", "this", "that", "self", "mut", "let", "pub",
            "use", "mod", "fn", "impl", "struct", "enum", "type", "trait", "where", "return",
            "true", "false", "none", "some", "is", "are", "was", "has", "had", "not", "but",
            "all", "can", "will", "into", "then", "than",
        ];
        CORE_STOP_WORDS.contains(&candidate)
    }

    let has_ascii_upper = word.bytes().any(|b| b.is_ascii_uppercase());
    if has_ascii_upper {
        listed(&word.to_lowercase())
    } else {
        listed(word)
    }
}

/// Query-time stop words (natural-language filler filtered from queries).
///
/// Returns `true` when `word` equals one of the listed words. The comparison
/// ignores ASCII case, matching the case-insensitive behaviour of
/// `is_stop_word` and `is_noise_word`, and does not allocate.
pub fn is_query_stop_word(word: &str) -> bool {
    const QUERY_STOP_WORDS: &[&str] = &[
        "the", "is", "at", "which", "on", "a", "an", "be", "to", "of", "it", "in", "do", "does",
        "was", "were", "been", "being", "have", "has", "had", "having", "can", "could", "would",
        "should", "will", "shall", "may", "might", "are", "am",
    ];

    QUERY_STOP_WORDS
        .iter()
        .any(|stop| stop.eq_ignore_ascii_case(word))
}

// ── Tests ────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // ── split_camel_case ─────────────────────────────────────────────

    #[test]
    fn test_split_camel_case() {
        let check = |input: &str, expected: &[&str]| {
            assert_eq!(split_camel_case(input), expected);
        };
        check("camelCase", &["camel", "Case"]);
        check("PascalCase", &["Pascal", "Case"]);
        check("HTMLParser", &["HTML", "Parser"]);
        check("simple", &["simple"]);
    }

    // ── tokenize_text ────────────────────────────────────────────────

    #[test]
    fn test_tokenize_text_snake_case() {
        // Both the parts and the full snake_case word must be indexed.
        let tf = tokenize_text("shared_knowledge base");
        for key in &["shared", "knowledge", "shared_knowledge", "base"] {
            assert!(tf.contains_key(*key), "missing token {key:?}");
        }
    }

    #[test]
    fn test_tokenize_text_camel_case() {
        let tf = tokenize_text("buildContextSummary");
        for key in &["build", "context", "summary"] {
            assert!(tf.contains_key(*key), "missing token {key:?}");
        }
    }

    // ── tokenize_code ────────────────────────────────────────────────

    #[test]
    fn test_tokenize_code_skips_strings() {
        // Words inside the string literal must not be indexed.
        let tf = tokenize_code(r#"let x = "hello world";"#);
        for key in &["hello", "world"] {
            assert!(!tf.contains_key(*key), "unexpected token {key:?}");
        }
    }

    #[test]
    fn test_tokenize_code_snake_case() {
        let tf = tokenize_code("let my_variable = 42;");
        for key in &["my", "variable", "my_variable"] {
            assert!(tf.contains_key(*key), "missing token {key:?}");
        }
    }

    #[test]
    fn test_tokenize_code_camel_case() {
        let tf = tokenize_code("myFunctionName()");
        for key in &["my", "function", "name", "myfunctionname"] {
            assert!(tf.contains_key(*key), "missing token {key:?}");
        }
    }

    // ── tokenize_path ────────────────────────────────────────────────

    #[test]
    fn test_tokenize_path_segments() {
        let tokens = tokenize_path("src/agent/context.rs");
        for expected in &["src", "agent", "context", "rs"] {
            assert!(
                tokens.iter().any(|t| t == expected),
                "missing token {expected:?}"
            );
        }
    }

    // ── tokenize_query ───────────────────────────────────────────────

    #[test]
    fn test_tokenize_query_deduplicates() {
        let tokens = tokenize_query("error error handling");
        let error_count = tokens.iter().filter(|t| t.as_str() == "error").count();
        assert_eq!(error_count, 1, "query tokens should be deduplicated");
        assert!(tokens.iter().any(|t| t == "handling"));
    }

    #[test]
    fn test_tokenize_query_camel_case() {
        // Regression: expand_compound must receive original-case input so camelCase splits work
        let tokens = tokenize_query("parseURL");
        assert!(
            tokens.iter().any(|t| t == "parse"),
            "camelCase query should split: got {tokens:?}"
        );
        assert!(tokens.iter().any(|t| t == "url"));
    }

    #[test]
    fn test_tokenize_query_filters_stop_words() {
        let tokens = tokenize_query("where is the error?");
        let has = |needle: &str| tokens.iter().any(|t| t == needle);
        assert!(!has("the"));
        assert!(!has("is"));
        assert!(has("error"));
    }

    // ── stop-word lists ──────────────────────────────────────────────

    #[test]
    fn test_noise_word_superset() {
        // Code-specific keywords beyond stop_word, followed by two words
        // from the core stop_word list.
        for word in &["async", "await", "crate", "the", "pub"] {
            assert!(is_noise_word(word), "{word:?} should be a noise word");
        }
    }
}