ripvec_core/encoder/ripvec/tokens.rs
1//! Identifier tokenizer for BM25 indexing.
2//!
3//! Port of `~/src/semble/src/semble/tokens.py`. Splits identifiers on
4//! camelCase / PascalCase / snake_case boundaries and emits sub-tokens
5//! alongside the lowercased compound, so partial matches work in BM25.
6//!
7//! ## Behavioural parity with Python
8//!
9//! Python's `_CAMEL_RE` uses a lookahead (`(?=[A-Z][a-z])`) that the
10//! default `regex` crate's RE2 engine does not support. Rather than
11//! pulling in `fancy-regex`, this module hand-rolls
12//! [`split_camel_segments`] to match the four alternatives Python's
13//! regex tries in order:
14//!
15//! - `[A-Z]+(?=[A-Z][a-z])` — uppercase acronym before a CamelWord
16//! (`HTTP` in `HTTPResponse`).
17//! - `[A-Z]?[a-z]+` — optional upper + lower-case run (`Response`,
18//! `get`, `andler`).
19//! - `[A-Z]+` — uppercase-only run (acronym at end of identifier).
20//! - `[0-9]+` — digit run.
21//!
22//! Parity is enforced by [`tests::matches_python_docstring_examples`]
23//! and a corpus property test.
24
25use std::sync::OnceLock;
26
27use regex::Regex;
28
29/// Match an identifier-like token: `[a-zA-Z_][a-zA-Z0-9_]*`.
30///
31/// Pre-filter for [`tokenize`]; identifies the spans inside arbitrary
32/// text that look like programming-language identifiers (snake_case,
33/// camelCase, dotted names, etc.).
34fn identifier_re() -> &'static Regex {
35 static RE: OnceLock<Regex> = OnceLock::new();
36 RE.get_or_init(|| Regex::new(r"[a-zA-Z_][a-zA-Z0-9_]*").expect("identifier regex compiles"))
37}
38
39/// Split a single identifier into camelCase/PascalCase segments.
40///
41/// Implements the four-alternative match from `tokens.py`'s `_CAMEL_RE`
42/// without lookarounds. Walks the input byte-wise (ASCII) and consumes:
43///
44/// - **Digits**: any run of `[0-9]+` becomes one segment.
45/// - **Lowercase-led runs**: a leading lowercase letter consumes the run
46/// of following lowercase letters (alternative B with the optional
47/// leading upper skipped).
48/// - **Uppercase-led runs**:
49/// - Followed immediately by a lowercase letter → CamelWord
50/// (alternative B with the leading upper taken).
51/// - Followed by more uppercase letters → acronym; if the run ends
52/// and the next char is lowercase, the last upper "belongs" to the
53/// next CamelWord (alternative A's lookahead) and is yielded as its
54/// own segment on the next loop.
55/// - All-uppercase to end of string → alternative C.
56///
57/// Non-alphanumeric bytes are skipped (the caller normally pre-filters
58/// via [`identifier_re`]).
59fn split_camel_segments(token: &str) -> Vec<String> {
60 // ASCII only — Python's regex applies to ASCII identifiers and this
61 // module pre-filters via identifier_re. Multibyte UTF-8 cannot
62 // appear in a tree-sitter identifier match.
63 let bytes = token.as_bytes();
64 let n = bytes.len();
65 let mut segments: Vec<String> = Vec::new();
66 let mut i = 0;
67 while i < n {
68 let c = bytes[i];
69 if c.is_ascii_digit() {
70 let start = i;
71 while i < n && bytes[i].is_ascii_digit() {
72 i += 1;
73 }
74 segments.push(token[start..i].to_string());
75 } else if c.is_ascii_alphabetic() {
76 let start = i;
77 if c.is_ascii_uppercase() {
78 i += 1;
79 if i < n && bytes[i].is_ascii_lowercase() {
80 // CamelWord: upper followed by lowers.
81 while i < n && bytes[i].is_ascii_lowercase() {
82 i += 1;
83 }
84 segments.push(token[start..i].to_string());
85 } else if i < n && bytes[i].is_ascii_uppercase() {
86 // Acronym branch: consume uppercase run.
87 let mut j = i;
88 while j < n && bytes[j].is_ascii_uppercase() {
89 j += 1;
90 }
91 if j < n && bytes[j].is_ascii_lowercase() && j > start + 1 {
92 // Last upper (j-1) starts the next CamelWord; the
93 // acronym is start..j-1.
94 segments.push(token[start..j - 1].to_string());
95 i = j - 1;
96 } else {
97 // All-uppercase run (alternative C) — push fully.
98 segments.push(token[start..j].to_string());
99 i = j;
100 }
101 } else {
102 // Single upper followed by digit/EOS/non-alpha.
103 segments.push(token[start..i].to_string());
104 }
105 } else {
106 // Lowercase-led run (no leading upper to skip).
107 while i < n && bytes[i].is_ascii_lowercase() {
108 i += 1;
109 }
110 segments.push(token[start..i].to_string());
111 }
112 } else {
113 // Non-alphanumeric — skip (defensive; caller pre-filters).
114 i += 1;
115 }
116 }
117 segments
118}
119
120/// Split a single identifier into sub-tokens via camelCase / snake_case.
121///
122/// Returns the original token (lowered) plus any sub-tokens.
123///
124/// # Examples
125///
126/// - `HandlerStack` → `["handlerstack", "handler", "stack"]`
127/// - `getHTTPResponse` → `["gethttpresponse", "get", "http", "response"]`
128/// - `my_func` → `["my_func", "my", "func"]`
129/// - `XMLParser` → `["xmlparser", "xml", "parser"]`
130/// - `simple` → `["simple"]`
131///
132/// Mirrors the Python `split_identifier` function exactly.
133#[must_use]
134pub fn split_identifier(token: &str) -> Vec<String> {
135 let lower = token.to_ascii_lowercase();
136 let parts: Vec<String> = if token.contains('_') {
137 // snake_case branch.
138 lower
139 .split('_')
140 .filter(|p| !p.is_empty())
141 .map(str::to_string)
142 .collect()
143 } else {
144 // camelCase / PascalCase branch.
145 split_camel_segments(token)
146 .into_iter()
147 .map(|s| s.to_ascii_lowercase())
148 .collect()
149 };
150 if parts.len() >= 2 {
151 let mut out = Vec::with_capacity(parts.len() + 1);
152 out.push(lower);
153 out.extend(parts);
154 out
155 } else {
156 vec![lower]
157 }
158}
159
160/// Split text into lowercase identifier-like tokens for BM25 indexing.
161///
162/// Compound identifiers (camelCase, PascalCase, snake_case) are expanded
163/// into sub-tokens so that partial matches work. The original compound
164/// token is preserved for exact-match boosting.
165///
166/// # Examples
167///
168/// `"getHTTPResponse from MyClass"` → tokens include `gethttpresponse`,
169/// `get`, `http`, `response`, `from`, `myclass`, `my`, `class`.
170#[must_use]
171pub fn tokenize(text: &str) -> Vec<String> {
172 let mut result = Vec::new();
173 for raw in identifier_re().find_iter(text) {
174 result.extend(split_identifier(raw.as_str()));
175 }
176 result
177}
178
179#[cfg(test)]
180mod tests {
181 use super::*;
182
183 /// The four cases in `tokens.py`'s docstring must round-trip exactly.
184 /// Corresponds to acceptance `test:tokens-{handlerstack,gethttpresponse,my-func,xmlparser}`.
185 #[test]
186 fn tokens_handlerstack() {
187 assert_eq!(
188 split_identifier("HandlerStack"),
189 vec!["handlerstack", "handler", "stack"]
190 );
191 }
192
193 #[test]
194 fn tokens_gethttpresponse() {
195 assert_eq!(
196 split_identifier("getHTTPResponse"),
197 vec!["gethttpresponse", "get", "http", "response"]
198 );
199 }
200
201 #[test]
202 fn tokens_my_func() {
203 assert_eq!(split_identifier("my_func"), vec!["my_func", "my", "func"]);
204 }
205
206 #[test]
207 fn tokens_xmlparser() {
208 assert_eq!(
209 split_identifier("XMLParser"),
210 vec!["xmlparser", "xml", "parser"]
211 );
212 }
213
214 /// Property: the input lowered is always the first emitted element
215 /// for multi-segment tokens; for single-segment tokens it is the sole
216 /// element. Cross-checks split_identifier's contract against a fixed
217 /// corpus drawn from Python semble's behaviour.
218 ///
219 /// Corresponds to acceptance `property:tokens-parity-python-corpus`.
220 #[test]
221 fn matches_python_docstring_examples() {
222 // Pairs lifted from src/semble/tokens.py docstring + comment examples.
223 let cases: &[(&str, &[&str])] = &[
224 ("HandlerStack", &["handlerstack", "handler", "stack"]),
225 (
226 "getHTTPResponse",
227 &["gethttpresponse", "get", "http", "response"],
228 ),
229 ("XMLParser", &["xmlparser", "xml", "parser"]),
230 ("my_func", &["my_func", "my", "func"]),
231 ("simple", &["simple"]),
232 ("CamelCase", &["camelcase", "camel", "case"]),
233 (
234 "snake_case_word",
235 &["snake_case_word", "snake", "case", "word"],
236 ),
237 ("HTML", &["html"]),
238 ("parseHTML", &["parsehtml", "parse", "html"]),
239 ("PI_VALUE_2", &["pi_value_2", "pi", "value", "2"]),
240 ];
241 for (input, expected) in cases {
242 let got = split_identifier(input);
243 let want: Vec<String> = expected.iter().map(|s| (*s).to_string()).collect();
244 assert_eq!(got, want, "split_identifier({input:?})");
245 }
246 }
247
248 /// `tokenize` runs the identifier regex over the input then expands
249 /// each match via split_identifier. Verify with a phrase containing
250 /// punctuation, multiple identifiers, and a digit-bearing word.
251 #[test]
252 fn tokenize_phrase_expands_each_identifier() {
253 let got = tokenize("call getHTTPResponse, then MyClass.do_thing(3)");
254 // Identifier matches in order:
255 // call -> [call]
256 // getHTTPResponse -> [gethttpresponse, get, http, response]
257 // then -> [then]
258 // MyClass -> [myclass, my, class]
259 // do_thing -> [do_thing, do, thing]
260 // The "3" is matched by the digit-only literal, NOT by the identifier
261 // regex (which requires a leading letter/underscore), so it does not
262 // appear. This matches Python's `_TOKEN_RE = r"[a-zA-Z_][a-zA-Z0-9_]*"`.
263 assert_eq!(
264 got,
265 vec![
266 "call",
267 "gethttpresponse",
268 "get",
269 "http",
270 "response",
271 "then",
272 "myclass",
273 "my",
274 "class",
275 "do_thing",
276 "do",
277 "thing",
278 ]
279 );
280 }
281
282 /// Single character identifiers don't fan out (no sub-tokens beyond
283 /// the lowered compound).
284 #[test]
285 fn single_char_no_fanout() {
286 assert_eq!(split_identifier("x"), vec!["x"]);
287 assert_eq!(split_identifier("X"), vec!["x"]);
288 assert_eq!(split_identifier("_"), vec!["_"]);
289 }
290}