veles_core/tokenizer.rs

//! Tokenizer for BM25 indexing — splits identifiers into sub-tokens.
//!
//! Hot path: indexing tokenises every chunk (~50 lines of code each), so a few
//! micro-optimisations matter here:
//!   * `tokenize_into` lets callers reuse an output buffer across docs.
//!   * `split_identifier_into` only emits the lowercased original when there
//!     are no sub-tokens (no Vec allocation in the simple-identifier case).
//!   * ASCII fast-path lowercase avoids the Unicode case-folding allocation
//!     for source code, which is overwhelmingly ASCII.
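//!
//! A typical indexing loop reuses one buffer across documents (a sketch: the
//! `chunks` iterable and the `index` API here are illustrative, not part of
//! this module):
//!
//! ```ignore
//! let mut buf: Vec<String> = Vec::new();
//! for chunk in &chunks {
//!     buf.clear(); // `tokenize_into` appends, so clear between documents
//!     tokenize_into(&chunk.text, &mut buf);
//!     index.add_document(chunk.id, &buf);
//! }
//! ```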

// Match identifier-like tokens in source text.
//
// Unicode-aware: any letter (including Cyrillic, Greek, CJK, Arabic, …) starts
// a token; it then continues with letters, digits, or underscores. ASCII
// letters and digits are a strict subset of `\p{L}`/`\p{N}`, so this stays
// correct for English source.
//
// Note: camelCase splitting in `split_camel_into` operates on ASCII bytes and
// simply emits the whole-token form for non-ASCII identifiers (we don't try to
// split scripts whose case-folding semantics differ from ASCII).
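//
// For example, `TOKEN_RE.find_iter("let x_y = fooBar;")` yields the matches
// "let", "x_y", and "fooBar"; the `=` and `;` never match.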
static TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());

/// Split an ASCII identifier into camelCase/PascalCase sub-tokens.
///
/// Mirrors the Python tokeniser's intent (the equivalent regex used a
/// look-ahead, which the `regex` crate doesn't support). Sub-tokens are
/// pushed lowercased:
///
///   `HandlerStack`     → `["handler", "stack"]`
///   `getHTTPResponse`  → `["get", "http", "response"]`
///   `XMLParser`        → `["xml", "parser"]`
///   `parse2Things`     → `["parse", "2", "things"]`
///
/// Handwritten in plain Rust to avoid both the regex look-around and the
/// per-call allocation of regex iteration.
fn split_camel_into(token: &str, out: &mut Vec<String>) {
    let bytes = token.as_bytes();
    let n = bytes.len();
    if n == 0 {
        return;
    }

    let mut i = 0usize;
    while i < n {
        let c = bytes[i];

        if c.is_ascii_uppercase() {
            // Consume the run of consecutive uppercase letters.
            let run_start = i;
            while i < n && bytes[i].is_ascii_uppercase() {
                i += 1;
            }
            let upper_end = i;
            let run_len = upper_end - run_start;

            if i < n && bytes[i].is_ascii_lowercase() {
                // Uppercase run followed by lowercase letters.
                if run_len >= 2 {
                    // Acronym boundary: the *last* uppercase belongs to the next word.
                    //   "HTTPResponse" → "HTTP" + "Response"
                    let acronym_end = upper_end - 1;
                    push_slice(token, run_start, acronym_end, out);
                    let word_start = acronym_end;
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, word_start, i, out);
                } else {
                    // Single uppercase + lowercase: "Handler", "Config".
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, run_start, i, out);
                }
            } else {
                // Pure uppercase block: "HTTP", "XML", or trailing acronym.
                push_slice(token, run_start, upper_end, out);
            }
        } else if c.is_ascii_lowercase() {
            let s = i;
            while i < n && bytes[i].is_ascii_lowercase() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else if c.is_ascii_digit() {
            let s = i;
            while i < n && bytes[i].is_ascii_digit() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else {
            // Anything else: underscores never reach this function, and the
            // bytes of non-ASCII characters land here. Skip one byte; a
            // non-ASCII identifier thus yields no sub-tokens and falls back
            // to the whole-token form in `split_identifier_into`.
            i += 1;
        }
    }
}

#[inline]
fn push_slice(token: &str, start: usize, end: usize, out: &mut Vec<String>) {
    // Slice boundaries only ever fall on ASCII letter/digit runs, and an
    // ASCII byte is always a valid UTF-8 char boundary, so byte slicing is
    // safe even for tokens that contain non-ASCII letters.
    let slice = &token[start..end];
    out.push(lowercase(slice));
}

/// Lowercase an ASCII-mostly string. Falls back to `to_lowercase` if any
/// non-ASCII byte is seen, keeping correctness for source containing Unicode.
#[inline]
fn lowercase(s: &str) -> String {
    if s.is_ascii() {
        // ASCII-only: per-byte lowercase, exactly one allocation.
        s.to_ascii_lowercase()
    } else {
        s.to_lowercase()
    }
}

/// Append sub-tokens of a single identifier (lowercased) to `out`.
///
/// Always emits the lowercased whole token. If the identifier splits into
/// multiple parts (snake_case or camelCase), each sub-token is appended too.
#[inline]
fn split_identifier_into(token: &str, out: &mut Vec<String>) {
    let lower = lowercase(token);

    if token.contains('_') {
        // Collect owned parts up front: they cannot borrow `lower`, because
        // `lower` is moved into `out` before the parts are appended.
        let parts: Vec<String> = lower
            .split('_')
            .filter(|p| !p.is_empty())
            .map(|p| p.to_owned())
            .collect();
        out.push(lower);
        if parts.len() >= 2 {
            out.extend(parts);
        }
        return;
    }

    // camelCase / PascalCase / digit boundaries — handwritten splitter.
    // Sub-tokens go straight into the tail of `out` (from `len_before`), so
    // we can decide afterwards whether to also emit the whole-token form.
    let len_before = out.len();
    split_camel_into(token, out);
    let n_parts = out.len() - len_before;

    if n_parts >= 2 {
        // Insert the whole token *before* the sub-tokens to match the prior order:
        //   `["handlerstack", "handler", "stack"]`.
        out.insert(len_before, lower);
    } else {
        // 0 or 1 sub-tokens — keep just the whole-token form.
        out.truncate(len_before);
        out.push(lower);
    }
}

/// Split a single identifier into sub-tokens.
///
/// Convenience wrapper retained for API compatibility with ranking code.
pub fn split_identifier(token: &str) -> Vec<String> {
    let mut out = Vec::new();
    split_identifier_into(token, &mut out);
    out
}

/// Tokenize text into the provided output buffer.
///
/// The buffer is appended to (not cleared); callers can reuse a single Vec
/// across many documents to amortise allocation.
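///
/// A minimal sketch of the append semantics (marked `ignore` because the
/// doctest import path for this module is assumed, not guaranteed):
///
/// ```ignore
/// let mut buf = Vec::new();
/// tokenize_into("fooBar", &mut buf);
/// tokenize_into("baz", &mut buf); // appends after the previous tokens
/// assert_eq!(buf, vec!["foobar", "foo", "bar", "baz"]);
/// ```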
pub fn tokenize_into(text: &str, out: &mut Vec<String>) {
    for tok in TOKEN_RE.find_iter(text) {
        split_identifier_into(tok.as_str(), out);
    }
}

/// Split text into lowercase identifier-like tokens for BM25 indexing.
pub fn tokenize(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    tokenize_into(text, &mut out);
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_identifier_snake_case() {
        let parts = split_identifier("my_func");
        assert_eq!(parts, vec!["my_func", "my", "func"]);
    }

    #[test]
    fn test_split_identifier_camel_case() {
        let parts = split_identifier("HandlerStack");
        assert_eq!(parts, vec!["handlerstack", "handler", "stack"]);
    }
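
    // Added check, derived from the doc-comment examples on
    // `split_camel_into`: acronym and digit boundaries, with the whole token
    // inserted first and all output lowercased.
    #[test]
    fn test_split_identifier_acronyms_and_digits() {
        assert_eq!(
            split_identifier("getHTTPResponse"),
            vec!["gethttpresponse", "get", "http", "response"]
        );
        assert_eq!(
            split_identifier("XMLParser"),
            vec!["xmlparser", "xml", "parser"]
        );
        assert_eq!(
            split_identifier("parse2Things"),
            vec!["parse2things", "parse", "2", "things"]
        );
    }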

    #[test]
    fn test_split_identifier_simple() {
        let parts = split_identifier("simple");
        assert_eq!(parts, vec!["simple"]);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize("parseConfig handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

    #[test]
    fn test_tokenize_cyrillic() {
        // Cyrillic words should be captured as whole tokens (no camelCase
        // splitting), and lowercased correctly.
        let tokens = tokenize("Как работи токенизаторът");
        assert!(tokens.contains(&"как".to_string()));
        assert!(tokens.contains(&"работи".to_string()));
        assert!(tokens.contains(&"токенизаторът".to_string()));
    }

    #[test]
    fn test_tokenize_mixed_scripts() {
        // Mixed ASCII and Cyrillic should each be captured as their own token.
        let tokens = tokenize("parseConfig функция handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"функция".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }
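
    // Added check: a single token mixing ASCII and Cyrillic letters must not
    // panic on byte slicing in `split_camel_into`; with fewer than two
    // sub-tokens, only the lowercased whole token is kept.
    #[test]
    fn test_split_identifier_mixed_script_token() {
        assert_eq!(split_identifier("FooБар"), vec!["fooбар"]);
    }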

    #[test]
    fn test_tokenize_cjk() {
        // CJK text has no spaces between words; the regex captures each
        // contiguous run of letters as one token.
        let tokens = tokenize("函数 search 関数");
        assert!(tokens.contains(&"函数".to_string()));
        assert!(tokens.contains(&"search".to_string()));
        assert!(tokens.contains(&"関数".to_string()));
    }

    #[test]
    fn ascii_fast_path_matches_unicode() {
        // Make sure the fast path and the Unicode fallback both agree with
        // `str::to_lowercase`.
        let s_ascii = "FooBar123";
        let s_unicode = "FooBär123";
        assert_eq!(lowercase(s_ascii), s_ascii.to_lowercase());
        assert_eq!(lowercase(s_unicode), s_unicode.to_lowercase());
    }
}