ripvec-core 3.1.2

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! Identifier tokenizer for BM25 indexing.
//!
//! Port of `~/src/semble/src/semble/tokens.py`. Splits identifiers on
//! camelCase / PascalCase / snake_case boundaries and emits sub-tokens
//! alongside the lowercased compound, so partial matches work in BM25.
//!
//! ## Behavioural parity with Python
//!
//! Python's `_CAMEL_RE` uses a lookahead (`(?=[A-Z][a-z])`), which the
//! `regex` crate does not support: its RE2-style, automata-based engine
//! has no look-around. Rather than pulling in `fancy-regex`, this module
//! hand-rolls [`split_camel_segments`] to match the four alternatives
//! Python's regex tries in order:
//!
//! - `[A-Z]+(?=[A-Z][a-z])` — uppercase acronym before a CamelWord
//!   (`HTTP` in `HTTPResponse`).
//! - `[A-Z]?[a-z]+` — optional upper + lower-case run (`Response`,
//!   `get`, `Handler`).
//! - `[A-Z]+` — uppercase-only run (acronym at end of identifier).
//! - `[0-9]+` — digit run.
//!
//! Parity is enforced by the `tokens_*` docstring-example tests and by
//! the fixed-corpus test [`tests::matches_python_docstring_examples`].

use std::sync::OnceLock;

use regex::Regex;

/// Lazily compiled regex for identifier-like tokens:
/// `[a-zA-Z_][a-zA-Z0-9_]*`.
///
/// [`tokenize`] scans arbitrary text with this pattern to pick out the
/// spans that look like programming-language identifiers (snake_case,
/// camelCase, ...). The pattern contains no `.`, so a dotted name such
/// as `MyClass.do_thing` produces one match per component rather than a
/// single match.
///
/// The regex is compiled on first call and the same instance is handed
/// out on every later call.
fn identifier_re() -> &'static Regex {
    const PATTERN: &str = r"[a-zA-Z_][a-zA-Z0-9_]*";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    // The pattern is a fixed literal, so a compile failure is a bug here.
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("identifier regex compiles"))
}

/// Break one identifier into its camelCase / PascalCase segments.
///
/// Lookaround-free equivalent of the four-alternative `_CAMEL_RE` from
/// `tokens.py`. The input is scanned left to right as ASCII bytes; at
/// each position the first byte decides which segment is cut:
///
/// - **Digit** — the whole `[0-9]+` run is one segment.
/// - **Lowercase letter** — the whole `[a-z]+` run is one segment
///   (alternative B without its optional leading capital).
/// - **Uppercase letter** — measure the `[A-Z]+` run, then look at the
///   byte right after it:
///   - one capital followed by a lowercase letter → CamelWord: the
///     capital plus the following lowercase run (alternative B);
///   - two or more capitals followed by a lowercase letter → acronym:
///     everything except the last capital is emitted, and that last
///     capital is left in place so the next iteration reads it as the
///     start of a CamelWord (alternative A's lookahead);
///   - otherwise (digit, end of input, any other byte) → the whole
///     uppercase run is one segment (alternative C).
/// - **Anything else** — the byte is skipped and produces nothing.
///
/// Segments keep their original case. Every cut lands directly before or
/// after an ASCII byte, so the string slices are valid even when the
/// input contains multi-byte characters (whose bytes are simply skipped).
fn split_camel_segments(token: &str) -> Vec<String> {
    /// Exclusive end of the run of bytes satisfying `pred` that begins at
    /// `from` (equals `from` when the run is empty).
    fn run_end(bytes: &[u8], from: usize, pred: fn(&u8) -> bool) -> usize {
        from + bytes[from..].iter().take_while(|&b| pred(b)).count()
    }

    let bytes = token.as_bytes();
    let mut segments = Vec::new();
    let mut pos = 0;
    while pos < bytes.len() {
        let first = bytes[pos];
        let end = if first.is_ascii_digit() {
            run_end(bytes, pos, u8::is_ascii_digit)
        } else if first.is_ascii_lowercase() {
            run_end(bytes, pos, u8::is_ascii_lowercase)
        } else if first.is_ascii_uppercase() {
            let upper_end = run_end(bytes, pos, u8::is_ascii_uppercase);
            let lower_follows = bytes.get(upper_end).is_some_and(u8::is_ascii_lowercase);
            match (upper_end - pos, lower_follows) {
                // Exactly one capital, then lowercase: a CamelWord.
                (1, true) => run_end(bytes, upper_end, u8::is_ascii_lowercase),
                // Acronym running into a CamelWord: hold back the last
                // capital so it opens the next segment.
                (_, true) => upper_end - 1,
                // Capitals followed by a digit, end of input, or another
                // non-letter: the whole run is the segment.
                (_, false) => upper_end,
            }
        } else {
            // Not alphanumeric (e.g. `_` or a non-ASCII byte): drop it.
            pos += 1;
            continue;
        };
        segments.push(token[pos..end].to_string());
        pos = end;
    }
    segments
}

/// Expand one identifier into its lowercased form plus its sub-tokens.
///
/// The first element of the result is always the whole token,
/// ASCII-lowercased. Sub-tokens follow only when the token breaks into
/// at least two parts:
///
/// - If the token contains `_`, the lowercased token is split on `_` and
///   empty pieces are discarded. No camelCase splitting is applied in
///   this case.
/// - Otherwise the token is split on camelCase / PascalCase boundaries
///   and each segment is lowercased.
///
/// # Examples
///
/// - `HandlerStack` → `["handlerstack", "handler", "stack"]`
/// - `getHTTPResponse` → `["gethttpresponse", "get", "http", "response"]`
/// - `my_func` → `["my_func", "my", "func"]`
/// - `XMLParser` → `["xmlparser", "xml", "parser"]`
/// - `simple` → `["simple"]`
///
/// Intended to mirror the Python `split_identifier` function.
#[must_use]
pub fn split_identifier(token: &str) -> Vec<String> {
    let compound = token.to_ascii_lowercase();

    let pieces: Vec<String> = if token.contains('_') {
        // snake_case: split the already-lowercased text on underscores.
        compound
            .split('_')
            .filter(|piece| !piece.is_empty())
            .map(String::from)
            .collect()
    } else {
        // camelCase / PascalCase: segment first, then lowercase in place.
        let mut segments = split_camel_segments(token);
        segments
            .iter_mut()
            .for_each(|segment| segment.make_ascii_lowercase());
        segments
    };

    // Zero or one piece adds nothing beyond the compound itself.
    if pieces.len() < 2 {
        return vec![compound];
    }
    std::iter::once(compound).chain(pieces).collect()
}

/// Turn free text into lowercase identifier tokens for BM25 indexing.
///
/// Every identifier-like span in `text` is expanded with
/// [`split_identifier`], so a compound identifier (camelCase,
/// PascalCase, snake_case) contributes both its full lowercased form —
/// kept for exact-match boosting — and its sub-tokens, which lets partial
/// matches work. Tokens appear in the order their identifiers occur in
/// `text`; anything that is not part of an identifier-like span is
/// ignored.
///
/// # Examples
///
/// `"getHTTPResponse from MyClass"` → tokens include `gethttpresponse`,
/// `get`, `http`, `response`, `from`, `myclass`, `my`, `class`.
#[must_use]
pub fn tokenize(text: &str) -> Vec<String> {
    identifier_re()
        .find_iter(text)
        .flat_map(|ident| split_identifier(ident.as_str()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // The first four tests pin the examples from `tokens.py`'s docstring;
    // each must round-trip exactly. Acceptance ids:
    // `test:tokens-{handlerstack,gethttpresponse,my-func,xmlparser}`.

    /// PascalCase made of two CamelWords.
    #[test]
    fn tokens_handlerstack() {
        let got = split_identifier("HandlerStack");
        assert_eq!(got, vec!["handlerstack", "handler", "stack"]);
    }

    /// Acronym embedded between a lowercase prefix and a CamelWord.
    #[test]
    fn tokens_gethttpresponse() {
        let got = split_identifier("getHTTPResponse");
        assert_eq!(got, vec!["gethttpresponse", "get", "http", "response"]);
    }

    /// snake_case splits on the underscore.
    #[test]
    fn tokens_my_func() {
        let got = split_identifier("my_func");
        assert_eq!(got, vec!["my_func", "my", "func"]);
    }

    /// Leading acronym followed by a CamelWord.
    #[test]
    fn tokens_xmlparser() {
        let got = split_identifier("XMLParser");
        assert_eq!(got, vec!["xmlparser", "xml", "parser"]);
    }

    /// Fixed corpus of `(input, expected)` pairs taken from Python
    /// semble's behaviour. For multi-segment tokens the lowercased input
    /// comes first, followed by the sub-tokens; for single-segment tokens
    /// the lowercased input is the only element.
    ///
    /// Corresponds to acceptance `property:tokens-parity-python-corpus`.
    #[test]
    fn matches_python_docstring_examples() {
        // Pairs lifted from src/semble/tokens.py docstring + comment examples.
        const CORPUS: &[(&str, &[&str])] = &[
            ("HandlerStack", &["handlerstack", "handler", "stack"]),
            (
                "getHTTPResponse",
                &["gethttpresponse", "get", "http", "response"],
            ),
            ("XMLParser", &["xmlparser", "xml", "parser"]),
            ("my_func", &["my_func", "my", "func"]),
            ("simple", &["simple"]),
            ("CamelCase", &["camelcase", "camel", "case"]),
            (
                "snake_case_word",
                &["snake_case_word", "snake", "case", "word"],
            ),
            ("HTML", &["html"]),
            ("parseHTML", &["parsehtml", "parse", "html"]),
            ("PI_VALUE_2", &["pi_value_2", "pi", "value", "2"]),
        ];
        for &(input, expected) in CORPUS {
            let want: Vec<String> = expected.iter().copied().map(String::from).collect();
            assert_eq!(split_identifier(input), want, "split_identifier({input:?})");
        }
    }

    /// `tokenize` finds each identifier-like span and expands it through
    /// `split_identifier`. The phrase mixes punctuation, several
    /// identifiers, and a bare numeric literal.
    #[test]
    fn tokenize_phrase_expands_each_identifier() {
        // Expected expansion, one identifier match at a time:
        //   call            -> call
        //   getHTTPResponse -> gethttpresponse, get, http, response
        //   then            -> then
        //   MyClass         -> myclass, my, class
        //   do_thing        -> do_thing, do, thing
        // The bare `3` has no leading letter or underscore, so the
        // identifier regex never matches it and it contributes no token.
        // This matches Python's `_TOKEN_RE = r"[a-zA-Z_][a-zA-Z0-9_]*"`.
        let expected = vec![
            "call",
            "gethttpresponse",
            "get",
            "http",
            "response",
            "then",
            "myclass",
            "my",
            "class",
            "do_thing",
            "do",
            "thing",
        ];
        let got = tokenize("call getHTTPResponse, then MyClass.do_thing(3)");
        assert_eq!(got, expected);
    }

    /// One-character identifiers yield only the lowercased token, with no
    /// sub-tokens.
    #[test]
    fn single_char_no_fanout() {
        for (input, lowered) in [("x", "x"), ("X", "x"), ("_", "_")] {
            assert_eq!(split_identifier(input), vec![lowered]);
        }
    }
}