veles_core/tokenizer.rs
//! Tokenizer for BM25 indexing — splits identifiers into sub-tokens.
//!
//! Hot path: indexing tokenises every chunk (~50 lines of code each), so a few
//! micro-optimisations matter here:
//! * `tokenize_into` lets callers reuse an output buffer across docs.
//! * `split_identifier_into` only emits the lowercased original when there
//!   are no sub-tokens (no Vec allocation in the simple-identifier case).
//! * ASCII fast-path lowercase avoids the Unicode case-folding allocation
//!   for source code, which is overwhelmingly ASCII.
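//!
//! A minimal sketch of the intended buffer-reuse pattern (`chunks`, `doc_id`
//! and `index_tokens` are hypothetical stand-ins for the indexing caller):
//!
//! ```ignore
//! let mut buf: Vec<String> = Vec::new();
//! for (doc_id, chunk) in chunks {
//!     buf.clear();
//!     tokenize_into(chunk, &mut buf);
//!     index_tokens(doc_id, &buf);
//! }
//! ```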

use regex::Regex;
use std::sync::LazyLock;

// Match identifier-like tokens in source text.
//
// Unicode-aware: any letter (including Cyrillic, Greek, CJK, Arabic, …) starts
// a token; it then continues with letters, digits, or underscores. ASCII is a
// strict subset of `\p{L}`/`\p{N}`, so this stays correct for English source.
//
// Note: camelCase splitting in `split_camel_into` operates on ASCII bytes and
// simply emits the whole-token form for non-ASCII identifiers (we don't try to
// split scripts whose case-folding semantics differ from ASCII).
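//
// For example, `parse_cfg(x2) + 42` yields the matches `parse_cfg` and `x2`;
// the bare `42` produces no match because a token cannot start with a digit.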
static TOKEN_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());

/// Split an ASCII identifier into camelCase/PascalCase sub-tokens.
///
/// Mirrors the Python tokeniser's intent (the equivalent regex used a
/// look-ahead, which the `regex` crate doesn't support). Boundaries fall as:
///
/// `HandlerStack` → `["Handler", "Stack"]`
/// `getHTTPResponse` → `["get", "HTTP", "Response"]`
/// `XMLParser` → `["XML", "Parser"]`
/// `parse2Things` → `["parse", "2", "Things"]`
///
/// (Each sub-token is lowercased by `push_slice` as it is emitted, so the
/// first example actually yields `["handler", "stack"]`.)
///
/// Handwritten in plain Rust to avoid both the regex look-around and the
/// per-call allocation of regex iteration.
fn split_camel_into(token: &str, out: &mut Vec<String>) {
    let bytes = token.as_bytes();
    let n = bytes.len();
    if n == 0 {
        return;
    }

    let mut i = 0usize;
    while i < n {
        let c = bytes[i];

        if c.is_ascii_uppercase() {
            // Consume the run of consecutive uppercase letters.
            let run_start = i;
            while i < n && bytes[i].is_ascii_uppercase() {
                i += 1;
            }
            let upper_end = i;
            let run_len = upper_end - run_start;

            if i < n && bytes[i].is_ascii_lowercase() {
                // Uppercase run followed by lowercase letters.
                if run_len >= 2 {
                    // Acronym boundary: the *last* uppercase belongs to the next word.
                    // "HTTPResponse" → "HTTP" + "Response"
                    let acronym_end = upper_end - 1;
                    push_slice(token, run_start, acronym_end, out);
                    let word_start = acronym_end;
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, word_start, i, out);
                } else {
                    // Single uppercase + lowercase: "Handler", "Config".
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, run_start, i, out);
                }
            } else {
                // Pure uppercase block: "HTTP", "XML", or trailing acronym.
                push_slice(token, run_start, upper_end, out);
            }
        } else if c.is_ascii_lowercase() {
            let s = i;
            while i < n && bytes[i].is_ascii_lowercase() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else if c.is_ascii_digit() {
            let s = i;
            while i < n && bytes[i].is_ascii_digit() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else {
            // Anything else: underscores (split upstream before we get here)
            // and non-ASCII UTF-8 bytes. Advance one byte; pushed slices
            // always start and end on ASCII bytes, so `push_slice` never
            // slices mid-character.
            i += 1;
        }
    }
}

#[inline]
fn push_slice(token: &str, start: usize, end: usize, out: &mut Vec<String>) {
    // `start` sits on an ASCII byte and `end` is either the string length or
    // the position just past an ASCII byte; both are UTF-8 char boundaries, so
    // slicing cannot panic even though TOKEN_RE also matches non-ASCII tokens.
    // The slice itself is always a pure-ASCII run.
    let slice = &token[start..end];
    out.push(lowercase(slice));
}

/// Lowercase an ASCII-mostly string. Falls back to `to_lowercase` if any
/// non-ASCII byte is seen, keeping correctness for source containing Unicode.
#[inline]
fn lowercase(s: &str) -> String {
    if s.is_ascii() {
        // ASCII-only: per-byte lowercase, exactly one allocation.
        s.to_ascii_lowercase()
    } else {
        s.to_lowercase()
    }
}

/// Append sub-tokens of a single identifier (lowercased) to `out`.
///
/// Always emits the lowercased whole token. If the identifier splits into
/// multiple parts (snake_case or camelCase), each sub-token is appended too.
#[inline]
fn split_identifier_into(token: &str, out: &mut Vec<String>) {
    let lower = lowercase(token);

    if token.contains('_') {
        // snake_case: split the lowercased form on underscores, appending
        // parts straight to `out`, then decide whether the whole-token form
        // should be emitted as well.
        let len_before = out.len();
        out.extend(lower.split('_').filter(|p| !p.is_empty()).map(|p| p.to_string()));
        if out.len() - len_before >= 2 {
            // Whole token goes *before* the sub-tokens:
            // `["my_func", "my", "func"]`.
            out.insert(len_before, lower);
        } else {
            // 0 or 1 parts — keep just the whole-token form.
            out.truncate(len_before);
            out.push(lower);
        }
        return;
    }

    // camelCase / PascalCase / digit boundaries — handwritten splitter.
    // Sub-tokens are appended straight to `out`; counting how many were
    // produced tells us whether to also emit the whole-token form.
    let len_before = out.len();
    split_camel_into(token, out);
    let n_parts = out.len() - len_before;

    if n_parts >= 2 {
        // Insert the whole token *before* the sub-tokens to match the prior order:
        // `["handlerstack", "handler", "stack"]`.
        out.insert(len_before, lower);
    } else {
        // 0 or 1 sub-tokens — keep just the whole-token form.
        out.truncate(len_before);
        out.push(lower);
    }
}

/// Split a single identifier into sub-tokens.
///
/// Convenience wrapper retained for API compatibility with ranking code.
pub fn split_identifier(token: &str) -> Vec<String> {
    let mut out = Vec::new();
    split_identifier_into(token, &mut out);
    out
}

/// Tokenize text into the provided output buffer.
///
/// The buffer is appended to (not cleared); callers can reuse a single Vec
/// across many documents to amortise allocation.
pub fn tokenize_into(text: &str, out: &mut Vec<String>) {
    for tok in TOKEN_RE.find_iter(text) {
        split_identifier_into(tok.as_str(), out);
    }
}

/// Split text into lowercase identifier-like tokens for BM25 indexing.
pub fn tokenize(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    tokenize_into(text, &mut out);
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_identifier_snake_case() {
        let parts = split_identifier("my_func");
        assert_eq!(parts, vec!["my_func", "my", "func"]);
    }

    #[test]
    fn test_split_identifier_camel_case() {
        let parts = split_identifier("HandlerStack");
        assert_eq!(parts, vec!["handlerstack", "handler", "stack"]);
    }

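    // Boundary cases taken from `split_camel_into`'s doc examples; note the
    // sub-tokens come out lowercased.
    #[test]
    fn test_split_identifier_acronym() {
        let parts = split_identifier("getHTTPResponse");
        assert_eq!(parts, vec!["gethttpresponse", "get", "http", "response"]);
    }

    #[test]
    fn test_split_identifier_digit_boundary() {
        let parts = split_identifier("parse2Things");
        assert_eq!(parts, vec!["parse2things", "parse", "2", "things"]);
    }
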
    #[test]
    fn test_split_identifier_simple() {
        let parts = split_identifier("simple");
        assert_eq!(parts, vec!["simple"]);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize("parseConfig handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

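    // Pins down the contract documented on `tokenize_into`: the buffer is
    // appended to, never cleared.
    #[test]
    fn test_tokenize_into_appends() {
        let mut out = vec!["seed".to_string()];
        tokenize_into("handler", &mut out);
        assert_eq!(out, vec!["seed", "handler"]);
    }
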
    #[test]
    fn test_tokenize_cyrillic() {
        // Cyrillic words should be captured as whole tokens (no camelCase
        // splitting), and lowercased correctly.
        let tokens = tokenize("Как работи токенизаторът");
        assert!(tokens.contains(&"как".to_string()));
        assert!(tokens.contains(&"работи".to_string()));
        assert!(tokens.contains(&"токенизаторът".to_string()));
    }

    #[test]
    fn test_tokenize_mixed_scripts() {
        // Mixed ASCII and Cyrillic should each be captured as their own token.
        let tokens = tokenize("parseConfig функция handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"функция".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

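    // A mixed-script *single* identifier: the camel splitter skips non-ASCII
    // bytes but must never slice mid-character (see `push_slice`), and the
    // whole lowercased token is still emitted.
    #[test]
    fn test_split_identifier_mixed_script_no_panic() {
        let parts = split_identifier("Fooбar");
        assert!(parts.contains(&"fooбar".to_string()));
    }
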
    #[test]
    fn test_tokenize_cjk() {
        // CJK characters should also be captured (no spaces between chars in
        // Chinese — the regex matches each contiguous run of letters).
        let tokens = tokenize("函数 search 関数");
        assert!(tokens.contains(&"函数".to_string()));
        assert!(tokens.contains(&"search".to_string()));
        assert!(tokens.contains(&"関数".to_string()));
    }

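    // Documents current TOKEN_RE behaviour: identifiers may start with an
    // underscore, and a token can never start with a digit, so a bare number
    // produces no token at all.
    #[test]
    fn test_tokenize_leading_underscore_and_bare_number() {
        let tokens = tokenize("_private 42");
        assert_eq!(tokens, vec!["_private"]);
    }
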
    #[test]
    fn ascii_fast_path_matches_unicode() {
        // Both the ASCII fast path and the Unicode fallback must agree with
        // `str::to_lowercase`.
        let s_ascii = "FooBar123";
        let s_unicode = "FooBär123";
        assert_eq!(lowercase(s_ascii), s_ascii.to_lowercase());
        assert_eq!(lowercase(s_unicode), s_unicode.to_lowercase());
    }
}
258}