veles_core/tokenizer.rs

//! Tokenizer for BM25 indexing — splits identifiers into sub-tokens.
//!
//! Hot path: indexing tokenises every chunk (~50 lines of code each), so a few
//! micro-optimisations matter here:
//!   * `tokenize_into` lets callers reuse an output buffer across docs.
//!   * `split_identifier_into` only emits the lowercased original when there
//!     are no sub-tokens (no Vec allocation in the simple-identifier case).
//!   * ASCII fast-path lowercase avoids the Unicode case-folding allocation
//!     for source code, which is overwhelmingly ASCII.
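//!
//! A minimal sketch of the intended buffer-reuse pattern (the `docs` iterator
//! and `index` sink are hypothetical, hence `ignore`):
//!
//! ```ignore
//! let mut buf: Vec<String> = Vec::new();
//! for doc in docs {
//!     buf.clear(); // `tokenize_into` appends, so drop the previous doc's tokens
//!     tokenize_into(&doc.text, &mut buf);
//!     index.add(doc.id, &buf); // hypothetical index API
//! }
//! ```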

use regex::Regex;
use std::sync::LazyLock;
13
// Match identifier-like tokens in source text.
//
// Unicode-aware: any letter (including Cyrillic, Greek, CJK, Arabic, …) starts
// a token; it then continues with letters, digits, or underscores. ASCII is a
// strict subset of `\p{L}`/`\p{N}`, so this stays correct for English source.
//
// Note: camelCase splitting in `split_camel_into` operates on ASCII bytes and
// simply emits the whole-token form for non-ASCII identifiers (we don't try to
// split scripts whose case-folding semantics differ from ASCII).
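//
// Illustrative matches: in `"foo_bar Config2 функция 2abc"` the regex finds
// `foo_bar`, `Config2`, `функция`, and `abc` (a digit can't start a token,
// so only the trailing `abc` of `2abc` is captured).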
static TOKEN_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());
25
/// Split an ASCII identifier into camelCase/PascalCase sub-tokens.
///
/// Mirrors the Python tokeniser's intent (the equivalent regex used a
/// look-ahead, which the `regex` crate doesn't support). Sub-tokens are
/// emitted lowercased via `push_slice`:
///
///   `HandlerStack`     → `["handler", "stack"]`
///   `getHTTPResponse`  → `["get", "http", "response"]`
///   `XMLParser`        → `["xml", "parser"]`
///   `parse2Things`     → `["parse", "2", "things"]`
///
/// Handwritten in plain Rust to avoid both the regex look-around and the
/// per-call allocation of regex iteration.
fn split_camel_into(token: &str, out: &mut Vec<String>) {
    let bytes = token.as_bytes();
    let n = bytes.len();
    if n == 0 {
        return;
    }

    let mut i = 0usize;
    while i < n {
        let c = bytes[i];

        if c.is_ascii_uppercase() {
            // Consume the run of consecutive uppercase letters.
            let run_start = i;
            while i < n && bytes[i].is_ascii_uppercase() {
                i += 1;
            }
            let upper_end = i;
            let run_len = upper_end - run_start;

            if i < n && bytes[i].is_ascii_lowercase() {
                // Uppercase run followed by lowercase letters.
                if run_len >= 2 {
                    // Acronym boundary: the *last* uppercase belongs to the next word.
                    //   "HTTPResponse" → "HTTP" + "Response"
                    let acronym_end = upper_end - 1;
                    push_slice(token, run_start, acronym_end, out);
                    let word_start = acronym_end;
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, word_start, i, out);
                } else {
                    // Single uppercase + lowercase: "Handler", "Config".
                    while i < n && bytes[i].is_ascii_lowercase() {
                        i += 1;
                    }
                    push_slice(token, run_start, i, out);
                }
            } else {
                // Pure uppercase block: "HTTP", "XML", or trailing acronym.
                push_slice(token, run_start, upper_end, out);
            }
        } else if c.is_ascii_lowercase() {
            let s = i;
            while i < n && bytes[i].is_ascii_lowercase() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else if c.is_ascii_digit() {
            let s = i;
            while i < n && bytes[i].is_ascii_digit() {
                i += 1;
            }
            push_slice(token, s, i, out);
        } else {
            // Anything else, in practice a non-ASCII byte (underscores never
            // reach this function; `split_identifier_into` filters them): skip it.
            i += 1;
        }
    }
}
99
#[inline]
fn push_slice(token: &str, start: usize, end: usize, out: &mut Vec<String>) {
    // `split_camel_into` only ever passes bounds that delimit runs of ASCII
    // bytes, so both ends fall on char boundaries and byte slicing can't panic,
    // even for tokens that also contain non-ASCII characters.
    let slice = &token[start..end];
    out.push(lowercase(slice));
}
107
/// Lowercase an ASCII-mostly string. Falls back to `to_lowercase` if any
/// non-ASCII byte is seen, keeping correctness for source containing Unicode.
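///
/// ```text
/// lowercase("FooBAR")  → "foobar"    (ASCII fast path, one allocation)
/// lowercase("FooBär")  → "foobär"    (Unicode fallback via `str::to_lowercase`)
/// ```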
#[inline]
fn lowercase(s: &str) -> String {
    if s.bytes().all(|b| b < 0x80) {
        // ASCII-only: simple per-byte lowercase, exactly one allocation.
        let mut out = String::with_capacity(s.len());
        for b in s.bytes() {
            out.push((if b.is_ascii_uppercase() { b + 32 } else { b }) as char);
        }
        out
    } else {
        s.to_lowercase()
    }
}
123
/// Append sub-tokens of a single identifier (lowercased) to `out`.
///
/// Always emits the lowercased whole token. If the identifier splits into
/// multiple parts (snake_case or camelCase), each sub-token is appended too.
#[inline]
fn split_identifier_into(token: &str, out: &mut Vec<String>) {
    let lower = lowercase(token);

    if token.contains('_') {
        // Push the parts straight into `out`, then decide whether the
        // whole-token form is also worth emitting; this avoids the temporary
        // Vecs of the obvious `collect`-then-copy formulation.
        let len_before = out.len();
        out.extend(lower.split('_').filter(|p| !p.is_empty()).map(str::to_string));
        if out.len() - len_before >= 2 {
            // Whole token goes first: `["my_func", "my", "func"]`.
            out.insert(len_before, lower);
        } else {
            // 0 or 1 parts: keep just the whole-token form.
            out.truncate(len_before);
            out.push(lower);
        }
        return;
    }
143
    // camelCase / PascalCase / digit boundaries — handwritten splitter.
    // Split directly into the tail of `out`, then decide from the number of
    // sub-tokens whether to also emit the whole-token form.
    let len_before = out.len();
    split_camel_into(token, out);
    let n_parts = out.len() - len_before;

    if n_parts >= 2 {
        // Insert the whole token *before* the sub-tokens to match the prior order:
        //   `["handlerstack", "handler", "stack"]`.
        out.insert(len_before, lower);
    } else {
        // 0 or 1 sub-tokens — keep just the whole-token form.
        out.truncate(len_before);
        out.push(lower);
    }
}
161
/// Split a single identifier into sub-tokens.
///
/// Convenience wrapper retained for API compatibility with ranking code.
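///
/// For example:
///
/// ```text
/// split_identifier("getHTTPResponse")
///     → ["gethttpresponse", "get", "http", "response"]
/// ```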
pub fn split_identifier(token: &str) -> Vec<String> {
    let mut out = Vec::new();
    split_identifier_into(token, &mut out);
    out
}
170
/// Tokenize text into the provided output buffer.
///
/// The buffer is appended to (not cleared); callers can reuse a single Vec
/// across many documents to amortise allocation.
pub fn tokenize_into(text: &str, out: &mut Vec<String>) {
    for tok in TOKEN_RE.find_iter(text) {
        split_identifier_into(tok.as_str(), out);
    }
}
180
/// Split text into lowercase identifier-like tokens for BM25 indexing.
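///
/// ```text
/// tokenize("HandlerStack my_func")
///     → ["handlerstack", "handler", "stack", "my_func", "my", "func"]
/// ```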
pub fn tokenize(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    tokenize_into(text, &mut out);
    out
}
187
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_identifier_snake_case() {
        let parts = split_identifier("my_func");
        assert_eq!(parts, vec!["my_func", "my", "func"]);
    }

    #[test]
    fn test_split_identifier_camel_case() {
        let parts = split_identifier("HandlerStack");
        assert_eq!(parts, vec!["handlerstack", "handler", "stack"]);
    }

    #[test]
    fn test_split_identifier_simple() {
        let parts = split_identifier("simple");
        assert_eq!(parts, vec!["simple"]);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize("parseConfig handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

    #[test]
    fn test_tokenize_cyrillic() {
        // Cyrillic words should be captured as whole tokens (no camelCase
        // splitting), and lowercased correctly.
        let tokens = tokenize("Как работи токенизаторът");
        assert!(tokens.contains(&"как".to_string()));
        assert!(tokens.contains(&"работи".to_string()));
        assert!(tokens.contains(&"токенизаторът".to_string()));
    }

    #[test]
    fn test_tokenize_mixed_scripts() {
        // Mixed ASCII and Cyrillic should each be captured as their own token.
        let tokens = tokenize("parseConfig функция handler");
        assert!(tokens.contains(&"parseconfig".to_string()));
        assert!(tokens.contains(&"parse".to_string()));
        assert!(tokens.contains(&"config".to_string()));
        assert!(tokens.contains(&"функция".to_string()));
        assert!(tokens.contains(&"handler".to_string()));
    }

    #[test]
    fn test_tokenize_cjk() {
        // CJK characters should also be captured (no spaces between chars in
        // Chinese — the regex matches each contiguous run of letters).
        let tokens = tokenize("函数 search 関数");
        assert!(tokens.contains(&"函数".to_string()));
        assert!(tokens.contains(&"search".to_string()));
        assert!(tokens.contains(&"関数".to_string()));
    }

    #[test]
    fn ascii_fast_path_matches_unicode() {
        // Both `lowercase` paths must agree with `str::to_lowercase`.
        let s_ascii = "FooBar123";
        let s_unicode = "FooBär123";
        assert_eq!(lowercase(s_ascii), s_ascii.to_lowercase());
        assert_eq!(lowercase(s_unicode), s_unicode.to_lowercase());
    }
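
    // Extra coverage for the acronym and digit boundaries documented on
    // `split_camel_into` (expected values traced by hand from the splitter).
    #[test]
    fn test_split_identifier_acronyms_and_digits() {
        assert_eq!(
            split_identifier("getHTTPResponse"),
            vec!["gethttpresponse", "get", "http", "response"]
        );
        assert_eq!(split_identifier("XMLParser"), vec!["xmlparser", "xml", "parser"]);
        assert_eq!(
            split_identifier("parse2Things"),
            vec!["parse2things", "parse", "2", "things"]
        );
    }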
}