Skip to main content

ripvec_core/encoder/ripvec/
tokens.rs

1//! Identifier tokenizer for BM25 indexing.
2//!
3//! Port of `~/src/semble/src/semble/tokens.py`. Splits identifiers on
4//! camelCase / PascalCase / snake_case boundaries and emits sub-tokens
5//! alongside the lowercased compound, so partial matches work in BM25.
6//!
7//! ## Behavioural parity with Python
8//!
9//! Python's `_CAMEL_RE` uses a lookahead (`(?=[A-Z][a-z])`) that the
10//! default `regex` crate's RE2 engine does not support. Rather than
11//! pulling in `fancy-regex`, this module hand-rolls
12//! [`split_camel_segments`] to match the four alternatives Python's
13//! regex tries in order:
14//!
15//! - `[A-Z]+(?=[A-Z][a-z])` — uppercase acronym before a CamelWord
16//!   (`HTTP` in `HTTPResponse`).
17//! - `[A-Z]?[a-z]+` — optional upper + lower-case run (`Response`,
18//!   `get`, `andler`).
19//! - `[A-Z]+` — uppercase-only run (acronym at end of identifier).
20//! - `[0-9]+` — digit run.
21//!
22//! Parity is enforced by [`tests::matches_python_docstring_examples`]
23//! and a corpus property test.
24
25use std::sync::OnceLock;
26
27use regex::Regex;
28
29/// Match an identifier-like token: `[a-zA-Z_][a-zA-Z0-9_]*`.
30///
31/// Pre-filter for [`tokenize`]; identifies the spans inside arbitrary
32/// text that look like programming-language identifiers (snake_case,
33/// camelCase, dotted names, etc.).
34fn identifier_re() -> &'static Regex {
35    static RE: OnceLock<Regex> = OnceLock::new();
36    RE.get_or_init(|| Regex::new(r"[a-zA-Z_][a-zA-Z0-9_]*").expect("identifier regex compiles"))
37}
38
39/// Split a single identifier into camelCase/PascalCase segments.
40///
41/// Implements the four-alternative match from `tokens.py`'s `_CAMEL_RE`
42/// without lookarounds. Walks the input byte-wise (ASCII) and consumes:
43///
44/// - **Digits**: any run of `[0-9]+` becomes one segment.
45/// - **Lowercase-led runs**: a leading lowercase letter consumes the run
46///   of following lowercase letters (alternative B with the optional
47///   leading upper skipped).
48/// - **Uppercase-led runs**:
49///   - Followed immediately by a lowercase letter → CamelWord
50///     (alternative B with the leading upper taken).
51///   - Followed by more uppercase letters → acronym; if the run ends
52///     and the next char is lowercase, the last upper "belongs" to the
53///     next CamelWord (alternative A's lookahead) and is yielded as its
54///     own segment on the next loop.
55///   - All-uppercase to end of string → alternative C.
56///
57/// Non-alphanumeric bytes are skipped (the caller normally pre-filters
58/// via [`identifier_re`]).
59fn split_camel_segments(token: &str) -> Vec<String> {
60    // ASCII only — Python's regex applies to ASCII identifiers and this
61    // module pre-filters via identifier_re. Multibyte UTF-8 cannot
62    // appear in a tree-sitter identifier match.
63    let bytes = token.as_bytes();
64    let n = bytes.len();
65    let mut segments: Vec<String> = Vec::new();
66    let mut i = 0;
67    while i < n {
68        let c = bytes[i];
69        if c.is_ascii_digit() {
70            let start = i;
71            while i < n && bytes[i].is_ascii_digit() {
72                i += 1;
73            }
74            segments.push(token[start..i].to_string());
75        } else if c.is_ascii_alphabetic() {
76            let start = i;
77            if c.is_ascii_uppercase() {
78                i += 1;
79                if i < n && bytes[i].is_ascii_lowercase() {
80                    // CamelWord: upper followed by lowers.
81                    while i < n && bytes[i].is_ascii_lowercase() {
82                        i += 1;
83                    }
84                    segments.push(token[start..i].to_string());
85                } else if i < n && bytes[i].is_ascii_uppercase() {
86                    // Acronym branch: consume uppercase run.
87                    let mut j = i;
88                    while j < n && bytes[j].is_ascii_uppercase() {
89                        j += 1;
90                    }
91                    if j < n && bytes[j].is_ascii_lowercase() && j > start + 1 {
92                        // Last upper (j-1) starts the next CamelWord; the
93                        // acronym is start..j-1.
94                        segments.push(token[start..j - 1].to_string());
95                        i = j - 1;
96                    } else {
97                        // All-uppercase run (alternative C) — push fully.
98                        segments.push(token[start..j].to_string());
99                        i = j;
100                    }
101                } else {
102                    // Single upper followed by digit/EOS/non-alpha.
103                    segments.push(token[start..i].to_string());
104                }
105            } else {
106                // Lowercase-led run (no leading upper to skip).
107                while i < n && bytes[i].is_ascii_lowercase() {
108                    i += 1;
109                }
110                segments.push(token[start..i].to_string());
111            }
112        } else {
113            // Non-alphanumeric — skip (defensive; caller pre-filters).
114            i += 1;
115        }
116    }
117    segments
118}
119
120/// Split a single identifier into sub-tokens via camelCase / snake_case.
121///
122/// Returns the original token (lowered) plus any sub-tokens.
123///
124/// # Examples
125///
126/// - `HandlerStack` → `["handlerstack", "handler", "stack"]`
127/// - `getHTTPResponse` → `["gethttpresponse", "get", "http", "response"]`
128/// - `my_func` → `["my_func", "my", "func"]`
129/// - `XMLParser` → `["xmlparser", "xml", "parser"]`
130/// - `simple` → `["simple"]`
131///
132/// Mirrors the Python `split_identifier` function exactly.
133#[must_use]
134pub fn split_identifier(token: &str) -> Vec<String> {
135    let lower = token.to_ascii_lowercase();
136    let parts: Vec<String> = if token.contains('_') {
137        // snake_case branch.
138        lower
139            .split('_')
140            .filter(|p| !p.is_empty())
141            .map(str::to_string)
142            .collect()
143    } else {
144        // camelCase / PascalCase branch.
145        split_camel_segments(token)
146            .into_iter()
147            .map(|s| s.to_ascii_lowercase())
148            .collect()
149    };
150    if parts.len() >= 2 {
151        let mut out = Vec::with_capacity(parts.len() + 1);
152        out.push(lower);
153        out.extend(parts);
154        out
155    } else {
156        vec![lower]
157    }
158}
159
160/// Split text into lowercase identifier-like tokens for BM25 indexing.
161///
162/// Compound identifiers (camelCase, PascalCase, snake_case) are expanded
163/// into sub-tokens so that partial matches work. The original compound
164/// token is preserved for exact-match boosting.
165///
166/// # Examples
167///
168/// `"getHTTPResponse from MyClass"` → tokens include `gethttpresponse`,
169/// `get`, `http`, `response`, `from`, `myclass`, `my`, `class`.
170#[must_use]
171pub fn tokenize(text: &str) -> Vec<String> {
172    let mut result = Vec::new();
173    for raw in identifier_re().find_iter(text) {
174        result.extend(split_identifier(raw.as_str()));
175    }
176    result
177}
178
179#[cfg(test)]
180mod tests {
181    use super::*;
182
183    /// The four cases in `tokens.py`'s docstring must round-trip exactly.
184    /// Corresponds to acceptance `test:tokens-{handlerstack,gethttpresponse,my-func,xmlparser}`.
185    #[test]
186    fn tokens_handlerstack() {
187        assert_eq!(
188            split_identifier("HandlerStack"),
189            vec!["handlerstack", "handler", "stack"]
190        );
191    }
192
193    #[test]
194    fn tokens_gethttpresponse() {
195        assert_eq!(
196            split_identifier("getHTTPResponse"),
197            vec!["gethttpresponse", "get", "http", "response"]
198        );
199    }
200
201    #[test]
202    fn tokens_my_func() {
203        assert_eq!(split_identifier("my_func"), vec!["my_func", "my", "func"]);
204    }
205
206    #[test]
207    fn tokens_xmlparser() {
208        assert_eq!(
209            split_identifier("XMLParser"),
210            vec!["xmlparser", "xml", "parser"]
211        );
212    }
213
214    /// Property: the input lowered is always the first emitted element
215    /// for multi-segment tokens; for single-segment tokens it is the sole
216    /// element. Cross-checks split_identifier's contract against a fixed
217    /// corpus drawn from Python semble's behaviour.
218    ///
219    /// Corresponds to acceptance `property:tokens-parity-python-corpus`.
220    #[test]
221    fn matches_python_docstring_examples() {
222        // Pairs lifted from src/semble/tokens.py docstring + comment examples.
223        let cases: &[(&str, &[&str])] = &[
224            ("HandlerStack", &["handlerstack", "handler", "stack"]),
225            (
226                "getHTTPResponse",
227                &["gethttpresponse", "get", "http", "response"],
228            ),
229            ("XMLParser", &["xmlparser", "xml", "parser"]),
230            ("my_func", &["my_func", "my", "func"]),
231            ("simple", &["simple"]),
232            ("CamelCase", &["camelcase", "camel", "case"]),
233            (
234                "snake_case_word",
235                &["snake_case_word", "snake", "case", "word"],
236            ),
237            ("HTML", &["html"]),
238            ("parseHTML", &["parsehtml", "parse", "html"]),
239            ("PI_VALUE_2", &["pi_value_2", "pi", "value", "2"]),
240        ];
241        for (input, expected) in cases {
242            let got = split_identifier(input);
243            let want: Vec<String> = expected.iter().map(|s| (*s).to_string()).collect();
244            assert_eq!(got, want, "split_identifier({input:?})");
245        }
246    }
247
248    /// `tokenize` runs the identifier regex over the input then expands
249    /// each match via split_identifier. Verify with a phrase containing
250    /// punctuation, multiple identifiers, and a digit-bearing word.
251    #[test]
252    fn tokenize_phrase_expands_each_identifier() {
253        let got = tokenize("call getHTTPResponse, then MyClass.do_thing(3)");
254        // Identifier matches in order:
255        //   call -> [call]
256        //   getHTTPResponse -> [gethttpresponse, get, http, response]
257        //   then -> [then]
258        //   MyClass -> [myclass, my, class]
259        //   do_thing -> [do_thing, do, thing]
260        // The "3" is matched by the digit-only literal, NOT by the identifier
261        // regex (which requires a leading letter/underscore), so it does not
262        // appear. This matches Python's `_TOKEN_RE = r"[a-zA-Z_][a-zA-Z0-9_]*"`.
263        assert_eq!(
264            got,
265            vec![
266                "call",
267                "gethttpresponse",
268                "get",
269                "http",
270                "response",
271                "then",
272                "myclass",
273                "my",
274                "class",
275                "do_thing",
276                "do",
277                "thing",
278            ]
279        );
280    }
281
282    /// Single character identifiers don't fan out (no sub-tokens beyond
283    /// the lowered compound).
284    #[test]
285    fn single_char_no_fanout() {
286        assert_eq!(split_identifier("x"), vec!["x"]);
287        assert_eq!(split_identifier("X"), vec!["x"]);
288        assert_eq!(split_identifier("_"), vec!["_"]);
289    }
290}