veles_core/tokenizer.rs
//! Tokenizer for BM25 indexing — splits identifiers into sub-tokens.
//!
//! Hot path: indexing tokenises every chunk (~50 lines of code each), so a few
//! micro-optimisations matter here:
//! * `tokenize_into` lets callers reuse an output buffer across docs.
//! * `split_identifier_into` only emits the lowercased original when there
//!   are no sub-tokens (no Vec allocation in the simple-identifier case).
//! * ASCII fast-path lowercase avoids the Unicode case-folding allocation
//!   for source code, which is overwhelmingly ASCII.
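//!
//! A minimal sketch of the buffer-reuse pattern (hypothetical call site; the
//! `veles_core::tokenizer` path is an assumption, so the doc-test is `ignore`d):
//!
//! ```ignore
//! use veles_core::tokenizer::tokenize_into;
//!
//! let mut buf: Vec<String> = Vec::new();
//! for doc in ["fn parse_config() {}", "struct HandlerStack;"] {
//!     buf.clear(); // keep the allocation, drop the previous doc's tokens
//!     tokenize_into(doc, &mut buf);
//!     // ... feed `buf` to the BM25 indexer ...
//! }
//! ```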

use regex::Regex;
use std::sync::LazyLock;

// Match identifier-like tokens in source text.
//
// Unicode-aware: any letter (including Cyrillic, Greek, CJK, Arabic, …) starts
// a token; it then continues with letters, digits, or underscores. ASCII is a
// strict subset of `\p{L}`/`\p{N}`, so this stays correct for English source.
//
// Note: camelCase splitting in `split_camel_into` operates on ASCII bytes and
// simply emits the whole-token form for non-ASCII identifiers (we don't try to
// split scripts whose case-folding semantics differ from ASCII).
static TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());

/// Split an ASCII identifier into camelCase/PascalCase sub-tokens.
///
/// Mirrors the Python tokeniser's intent (the equivalent regex used a
/// look-ahead, which the `regex` crate doesn't support):
///
/// `HandlerStack` → `["Handler", "Stack"]`
/// `getHTTPResponse` → `["get", "HTTP", "Response"]`
/// `XMLParser` → `["XML", "Parser"]`
/// `parse2Things` → `["parse", "2", "Things"]`
///
/// Handwritten in plain Rust to avoid both the regex look-around and the
/// per-call allocation of regex iteration.
fn split_camel_into(token: &str, out: &mut Vec<String>) {
    let bytes = token.as_bytes();
    let n = bytes.len();
    if n == 0 {
        return;
    }

    let mut i = 0usize;
    while i < n {
        let c = bytes[i];

        if c.is_ascii_uppercase() {
            // Consume the run of consecutive uppercase letters.
            let run_start = i;
            while i < n && bytes[i].is_ascii_uppercase() {
                i += 1;
            }
            let upper_end = i;
            let run_len = upper_end - run_start;

            if i < n && bytes[i].is_ascii_lowercase() {
                // Uppercase run followed by lowercase letters.
                if run_len >= 2 {
                    // Acronym boundary: the *last* uppercase belongs to the next word.
                    // "HTTPResponse" → "HTTP" + "Response"
                    let acronym_end = upper_end - 1;
                    push_slice(token, run_start, acronym_end, out);
                    let word_start = acronym_end;
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, word_start, i, out);
                } else {
                    // Single uppercase + lowercase: "Handler", "Config".
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, run_start, i, out);
                }
            } else {
                // Pure uppercase block: "HTTP", "XML", or trailing acronym.
                push_slice(token, run_start, upper_end, out);
            }
        } else if c.is_ascii_lowercase() {
            let s = i;
            while i < n && bytes[i].is_ascii_lowercase() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else if c.is_ascii_digit() {
            let s = i;
            while i < n && bytes[i].is_ascii_digit() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else {
            // Anything else: in practice a byte of a non-ASCII letter or
            // digit (tokens containing '_' take the snake_case path in the
            // caller and never reach this function). Skip it.
            i += 1;
        }
    }
}

#[inline]
fn push_slice(token: &str, start: usize, end: usize, out: &mut Vec<String>) {
    // `start..end` always delimits a run of ASCII letters or digits, so both
    // indices fall on char boundaries even when `token` contains non-ASCII
    // bytes elsewhere, and the slice itself is pure ASCII.
    let slice = &token[start..end];
    out.push(lowercase(slice));
}

/// Lowercase an ASCII-mostly string. Falls back to `to_lowercase` if any
/// non-ASCII byte is seen, keeping correctness for source containing Unicode.
#[inline]
fn lowercase(s: &str) -> String {
    if s.is_ascii() {
        // ASCII-only: per-byte lowercase, exactly one allocation.
        s.to_ascii_lowercase()
    } else {
        s.to_lowercase()
    }
}

/// Append sub-tokens of a single identifier (lowercased) to `out`.
///
/// Always emits the lowercased whole token. If the identifier splits into
/// multiple parts (snake_case or camelCase), each sub-token is appended too.
#[inline]
fn split_identifier_into(token: &str, out: &mut Vec<String>) {
    let lower = lowercase(token);

    if token.contains('_') {
        let parts: Vec<&str> = lower.split('_').filter(|p| !p.is_empty()).collect();
        if parts.len() >= 2 {
            // `parts` borrows `lower`, so collect owned sub-tokens before
            // moving `lower` into `out` (whole token first, then the parts).
            let part_strings: Vec<String> = parts.iter().map(|p| (*p).to_string()).collect();
            out.push(lower);
            out.extend(part_strings);
        } else {
            out.push(lower);
        }
        return;
    }

    // camelCase / PascalCase / digit boundaries — handwritten splitter.
    // Split directly into `out`, recording its length first, so we can decide
    // afterwards whether to also emit the whole-token form.
    let len_before = out.len();
    split_camel_into(token, out);
    let n_parts = out.len() - len_before;

    if n_parts >= 2 {
        // Insert the whole token *before* the sub-tokens to match the prior order:
        // `["handlerstack", "handler", "stack"]`.
        out.insert(len_before, lower);
    } else {
        // 0 or 1 sub-tokens — keep just the whole-token form.
        out.truncate(len_before);
        out.push(lower);
    }
}

/// Split a single identifier into sub-tokens.
///
/// Convenience wrapper retained for API compatibility with ranking code.
pub fn split_identifier(token: &str) -> Vec<String> {
    let mut out = Vec::new();
    split_identifier_into(token, &mut out);
    out
}

/// Tokenize text into the provided output buffer.
///
/// The buffer is appended to (not cleared); callers can reuse a single Vec
/// across many documents to amortise allocation.
pub fn tokenize_into(text: &str, out: &mut Vec<String>) {
    for tok in TOKEN_RE.find_iter(text) {
        split_identifier_into(tok.as_str(), out);
    }
}

/// Split text into lowercase identifier-like tokens for BM25 indexing.
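///
/// Expected output for a small input, per the splitting rules above (shown as
/// an `ignore`d doc-test since the crate path is an assumption):
///
/// ```ignore
/// let tokens = tokenize("HandlerStack my_func");
/// assert_eq!(tokens, ["handlerstack", "handler", "stack", "my_func", "my", "func"]);
/// ```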
pub fn tokenize(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    tokenize_into(text, &mut out);
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_identifier_snake_case() {
        let parts = split_identifier("my_func");
        assert_eq!(parts, vec!["my_func", "my", "func"]);
    }
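
    #[test]
    fn test_split_identifier_leading_underscore() {
        // Edge case: TOKEN_RE admits a leading underscore, and the
        // empty-part filter then leaves a single sub-token, so only the
        // whole-token form should be emitted.
        assert_eq!(split_identifier("_private"), vec!["_private"]);
    }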

    #[test]
    fn test_split_identifier_camel_case() {
        let parts = split_identifier("HandlerStack");
        assert_eq!(parts, vec!["handlerstack", "handler", "stack"]);
    }
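
    #[test]
    fn test_split_identifier_acronym() {
        // Acronym boundaries per the `split_camel_into` docs: the last
        // uppercase of a run starts the next word, a trailing run stays
        // whole, and the whole-token form is emitted first.
        assert_eq!(
            split_identifier("getHTTPResponse"),
            vec!["gethttpresponse", "get", "http", "response"]
        );
        assert_eq!(split_identifier("XMLParser"), vec!["xmlparser", "xml", "parser"]);
        assert_eq!(split_identifier("parseHTTP"), vec!["parsehttp", "parse", "http"]);
    }

    #[test]
    fn test_split_identifier_digit_boundary() {
        // Digit runs become their own sub-tokens ("parse2Things" doc example).
        assert_eq!(
            split_identifier("parse2Things"),
            vec!["parse2things", "parse", "2", "things"]
        );
    }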

    #[test]
    fn test_split_identifier_simple() {
        let parts = split_identifier("simple");
        assert_eq!(parts, vec!["simple"]);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize("parseConfig handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }
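
    #[test]
    fn test_tokenize_into_appends() {
        // `tokenize_into` documents append semantics: the buffer is not
        // cleared, so consecutive calls accumulate tokens across documents.
        let mut buf = Vec::new();
        tokenize_into("alpha", &mut buf);
        tokenize_into("beta", &mut buf);
        assert_eq!(buf, vec!["alpha", "beta"]);
    }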

    #[test]
    fn test_tokenize_cyrillic() {
        // Cyrillic words should be captured as whole tokens (no camelCase
        // splitting), and lowercased correctly.
        let tokens = tokenize("Как работи токенизаторът");
        assert!(tokens.contains(&"как".to_string()));
        assert!(tokens.contains(&"работи".to_string()));
        assert!(tokens.contains(&"токенизаторът".to_string()));
    }

    #[test]
    fn test_tokenize_mixed_scripts() {
        // Mixed ASCII and Cyrillic should each be captured as their own token.
        let tokens = tokenize("parseConfig функция handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"функция".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

    #[test]
    fn test_tokenize_cjk() {
        // CJK characters should also be captured (no spaces between chars in
        // Chinese — the regex matches each contiguous run of letters).
        let tokens = tokenize("函数 search 関数");
        assert!(tokens.contains(&"函数".to_string()));
        assert!(tokens.contains(&"search".to_string()));
        assert!(tokens.contains(&"関数".to_string()));
    }

    #[test]
    fn ascii_fast_path_matches_unicode() {
        // Make sure both code paths agree with `str::to_lowercase`, on an
        // ASCII input and on a non-ASCII one.
        let s_ascii = "FooBar123";
        let s_unicode = "FooBär123";
        assert_eq!(lowercase(s_ascii), s_ascii.to_lowercase());
        assert_eq!(lowercase(s_unicode), s_unicode.to_lowercase());
    }
}
257}