use regex::Regex;
use std::sync::LazyLock;
// Matches identifier-like tokens: one Unicode letter (`\p{L}`) or underscore,
// followed by any run of Unicode letters, numbers (`\p{N}`), or underscores —
// e.g. `parseConfig`, `my_func`, `функция`, `函数`. Compiled once, lazily,
// on first use.
static TOKEN_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"[\p{L}_][\p{L}\p{N}_]*").unwrap());
/// Splits an ASCII camelCase/PascalCase token into lowercased segments,
/// appending each to `out`.
///
/// Segments are maximal runs of ASCII lowercase letters, ASCII digits, or
/// ASCII uppercase letters, with one refinement: an uppercase run of length
/// >= 2 that flows into lowercase is treated as an acronym whose last capital
/// starts the following word (`HTTPServer` -> `http`, `server`), while a
/// single capital is glued to its lowercase tail (`FooBar` -> `foo`, `bar`).
/// Non-ASCII bytes act as separators and are never emitted; the caller is
/// responsible for handling tokens that produce fewer than two segments.
fn split_camel_into(token: &str, out: &mut Vec<String>) {
    let b = token.as_bytes();
    let len = b.len();
    let mut pos = 0usize;
    while pos < len {
        match b[pos] {
            c if c.is_ascii_uppercase() => {
                let start = pos;
                pos += 1;
                while pos < len && b[pos].is_ascii_uppercase() {
                    pos += 1;
                }
                if pos < len && b[pos].is_ascii_lowercase() {
                    // The uppercase run flows into a lowercase tail. For an
                    // acronym (run >= 2 capitals), the last capital begins
                    // the next word; a lone capital just heads its word.
                    let word_start = if pos - start >= 2 {
                        push_slice(token, start, pos - 1, out);
                        pos - 1
                    } else {
                        start
                    };
                    while pos < len && b[pos].is_ascii_lowercase() {
                        pos += 1;
                    }
                    push_slice(token, word_start, pos, out);
                } else {
                    // Pure uppercase run (end of token or next byte is not
                    // a lowercase letter): emit it as one segment.
                    push_slice(token, start, pos, out);
                }
            }
            c if c.is_ascii_lowercase() => {
                let start = pos;
                while pos < len && b[pos].is_ascii_lowercase() {
                    pos += 1;
                }
                push_slice(token, start, pos, out);
            }
            c if c.is_ascii_digit() => {
                let start = pos;
                while pos < len && b[pos].is_ascii_digit() {
                    pos += 1;
                }
                push_slice(token, start, pos, out);
            }
            // Non-ASCII (including UTF-8 continuation bytes) and ASCII
            // punctuation: skip one byte. Runs only ever start and stop on
            // ASCII bytes, so every emitted range is a valid char boundary.
            _ => pos += 1,
        }
    }
}
/// Lowercases the byte range `start..end` of `token` and appends it to `out`.
/// Callers must guarantee the range lies on UTF-8 char boundaries.
#[inline]
fn push_slice(token: &str, start: usize, end: usize, out: &mut Vec<String>) {
    out.push(lowercase(&token[start..end]));
}
/// Returns a lowercased copy of `s`.
///
/// Pure-ASCII strings take the cheap `to_ascii_lowercase` path (a byte-wise
/// map, no Unicode tables); anything else falls back to full Unicode
/// `to_lowercase`. The two agree on ASCII input, so the fast path never
/// changes the result — it only avoids the Unicode machinery.
#[inline]
fn lowercase(s: &str) -> String {
    if s.is_ascii() {
        // Replaces a hand-rolled `b + 32` loop: `to_ascii_lowercase` does the
        // same byte-wise mapping via the standard library.
        s.to_ascii_lowercase()
    } else {
        s.to_lowercase()
    }
}
/// Lowercases `token` and appends it to `out`; when the identifier splits
/// into two or more sub-words, those follow the whole token in order.
///
/// Tokens containing `_` are split on underscores; all others go through the
/// camelCase splitter. If splitting yields fewer than two parts, only the
/// whole lowercased token is appended.
#[inline]
fn split_identifier_into(token: &str, out: &mut Vec<String>) {
    let lower = lowercase(token);
    if token.contains('_') {
        // Append the underscore-separated parts straight into `out`, then
        // decide whether to keep them. This mirrors the camel path below and
        // removes the Vec<&str> + cloned Vec<String> pair the naive version
        // needed to appease the borrow of `lower` before moving it.
        let base = out.len();
        out.extend(lower.split('_').filter(|p| !p.is_empty()).map(str::to_owned));
        if out.len() - base >= 2 {
            // Whole token first, then its parts.
            out.insert(base, lower);
        } else {
            // Zero or one parts: emit only the whole lowercased token.
            out.truncate(base);
            out.push(lower);
        }
        return;
    }
    let len_before = out.len();
    split_camel_into(token, out);
    let n_parts = out.len() - len_before;
    if n_parts >= 2 {
        // Whole token first, then its camel-case parts.
        out.insert(len_before, lower);
    } else {
        // One part is just the token itself, and non-ASCII tokens can yield
        // zero parts; either way, emit only the whole lowercased token.
        out.truncate(len_before);
        out.push(lower);
    }
}
/// Convenience wrapper around [`split_identifier_into`] that allocates and
/// returns a fresh vector of parts.
pub fn split_identifier(token: &str) -> Vec<String> {
    let mut parts = Vec::new();
    split_identifier_into(token, &mut parts);
    parts
}
/// Finds every identifier-like token in `text` (per [`struct@TOKEN_RE`]) and
/// appends its lowercased form, plus any sub-word splits, to `out`.
pub fn tokenize_into(text: &str, out: &mut Vec<String>) {
    TOKEN_RE
        .find_iter(text)
        .for_each(|m| split_identifier_into(m.as_str(), out));
}
/// Convenience wrapper around [`tokenize_into`] that allocates and returns a
/// fresh token vector.
pub fn tokenize(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    tokenize_into(text, &mut tokens);
    tokens
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `tokens` contains exactly the word `word`.
    fn has(tokens: &[String], word: &str) -> bool {
        tokens.iter().any(|t| t == word)
    }

    #[test]
    fn test_split_identifier_snake_case() {
        assert_eq!(split_identifier("my_func"), ["my_func", "my", "func"]);
    }

    #[test]
    fn test_split_identifier_camel_case() {
        assert_eq!(
            split_identifier("HandlerStack"),
            ["handlerstack", "handler", "stack"]
        );
    }

    #[test]
    fn test_split_identifier_simple() {
        assert_eq!(split_identifier("simple"), ["simple"]);
    }

    #[test]
    fn test_tokenize_mixed() {
        let tokens = tokenize("parseConfig handler");
        for w in ["parseconfig", "parse", "config", "handler"] {
            assert!(has(&tokens, w), "missing token: {w}");
        }
    }

    #[test]
    fn test_tokenize_cyrillic() {
        let tokens = tokenize("Как работи токенизаторът");
        for w in ["как", "работи", "токенизаторът"] {
            assert!(has(&tokens, w), "missing token: {w}");
        }
    }

    #[test]
    fn test_tokenize_mixed_scripts() {
        let tokens = tokenize("parseConfig функция handler");
        for w in ["parseconfig", "parse", "config", "функция", "handler"] {
            assert!(has(&tokens, w), "missing token: {w}");
        }
    }

    #[test]
    fn test_tokenize_cjk() {
        let tokens = tokenize("函数 search 関数");
        for w in ["函数", "search", "関数"] {
            assert!(has(&tokens, w), "missing token: {w}");
        }
    }

    #[test]
    fn ascii_fast_path_matches_unicode() {
        // The ASCII fast path in `lowercase` must agree with full Unicode
        // lowercasing on both pure-ASCII and mixed input.
        for s in ["FooBar123", "FooBär123"] {
            assert_eq!(lowercase(s), s.to_lowercase());
        }
    }
}