use regex::Regex;
use std::sync::OnceLock;
/// Returns the shared, lazily compiled regex used to recognise identifiers.
///
/// The pattern accepts an ASCII letter or underscore followed by any number of
/// ASCII letters, digits, or underscores. Compilation happens on the first
/// call; later calls hand back the same instance.
fn ident_re() -> &'static Regex {
    const IDENT_PATTERN: &str = r"[a-zA-Z_][a-zA-Z0-9_]*";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| {
        // The pattern is a fixed literal, so a failure to compile it would be
        // a programming error rather than a runtime condition.
        Regex::new(IDENT_PATTERN).unwrap()
    })
}
/// Collects every identifier-shaped substring of `text`, in order of appearance.
///
/// The returned slices borrow from `text`. Matching is unanchored, so a token
/// that begins with digits contributes only the part starting at its first
/// letter or underscore (for example, `2f` yields `f`).
pub fn scan_identifiers(text: &str) -> Vec<&str> {
    let mut found = Vec::new();
    for hit in ident_re().find_iter(text) {
        found.push(hit.as_str());
    }
    found
}
/// Splits a compound identifier into its lowercase component words.
///
/// If `token` contains an underscore it is treated as snake_case: it is cut at
/// every underscore and empty pieces (from leading, trailing, or doubled
/// underscores) are dropped. Otherwise it is segmented with `camel_split`.
///
/// Returns the lowercased words when there are at least two of them, and an
/// empty vector when the identifier does not decompose into multiple words.
pub fn split_identifier(token: &str) -> Vec<String> {
    let is_snake = token.contains('_');
    let mut words: Vec<String> = if is_snake {
        token
            .split('_')
            .filter(|word| !word.is_empty())
            .map(String::from)
            .collect()
    } else {
        camel_split(token)
    };

    // A single word is not a decomposition; report "nothing to add".
    if words.len() < 2 {
        return Vec::new();
    }

    for word in &mut words {
        word.make_ascii_lowercase();
    }
    words
}
/// Turns free-form text into a flat list of lowercase tokens.
///
/// For each identifier found by `scan_identifiers`, the output contains the
/// whole identifier lowercased, immediately followed by whatever component
/// words `split_identifier` produces for it (possibly none).
pub fn tokenize(text: &str) -> Vec<String> {
    scan_identifiers(text)
        .into_iter()
        .flat_map(|ident| {
            let whole = ident.to_ascii_lowercase();
            std::iter::once(whole).chain(split_identifier(ident))
        })
        .collect()
}
/// Cuts a camelCase / PascalCase word into segments, preserving original case.
///
/// Segmentation rules, applied left to right:
/// * a run of ASCII digits is one segment;
/// * a run of ASCII lowercase letters is one segment;
/// * a single uppercase letter plus the lowercase letters after it is one
///   segment (`Parser`);
/// * a run of two or more uppercase letters is one segment (`HTTP`), except
///   that when a lowercase letter follows, the last capital is left to start
///   the next segment (`XMLParser` -> `XML`, `Parser`);
/// * every other character is skipped and ends the current segment.
fn camel_split(s: &str) -> Vec<String> {
    /// Index one past the run of bytes matching `pred` that starts at `from`.
    fn run_end(bytes: &[u8], from: usize, pred: fn(&u8) -> bool) -> usize {
        from + bytes[from..].iter().take_while(|&b| pred(b)).count()
    }

    // Only ASCII classes matter here, so scanning bytes is equivalent to
    // scanning chars; segments always start and end on ASCII bytes, which
    // keeps every slice below on a valid character boundary.
    let bytes = s.as_bytes();
    let len = bytes.len();
    let mut segments = Vec::new();
    let mut pos = 0;

    while pos < len {
        let stop = match bytes[pos] {
            b'0'..=b'9' => run_end(bytes, pos, u8::is_ascii_digit),
            b'a'..=b'z' => run_end(bytes, pos, u8::is_ascii_lowercase),
            b'A'..=b'Z' => {
                let caps_end = run_end(bytes, pos, u8::is_ascii_uppercase);
                let lower_follows = caps_end < len && bytes[caps_end].is_ascii_lowercase();
                match (caps_end - pos, lower_follows) {
                    // Lone capital: take it together with its lowercase tail.
                    (1, _) => run_end(bytes, caps_end, u8::is_ascii_lowercase),
                    // Acronym followed by a word: the final capital belongs to the word.
                    (_, true) => caps_end - 1,
                    // Acronym followed by a digit, separator, or end of input.
                    (_, false) => caps_end,
                }
            }
            _ => {
                pos += 1;
                continue;
            }
        };
        segments.push(s[pos..stop].to_owned());
        pos = stop;
    }

    segments
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- split_identifier ----

    /// PascalCase is cut at each capital and lowercased.
    #[test]
    fn split_handler_stack() {
        let parts = split_identifier("HandlerStack");
        assert_eq!(parts, ["handler", "stack"]);
    }

    /// snake_case is cut at underscores.
    #[test]
    fn split_snake_case() {
        let parts = split_identifier("my_func");
        assert_eq!(parts, ["my", "func"]);
    }

    /// A single-word identifier has no decomposition.
    #[test]
    fn split_simple_returns_empty() {
        assert_eq!(split_identifier("simple"), Vec::<String>::new());
    }

    /// An acronym embedded in camelCase stays together as one word.
    #[test]
    fn split_acronym_in_camel() {
        let parts = split_identifier("getHTTPResponse");
        assert_eq!(parts, ["get", "http", "response"]);
    }

    /// A leading acronym gives its last capital to the following word.
    #[test]
    fn split_pure_acronym_compound() {
        let parts = split_identifier("XMLParser");
        assert_eq!(parts, ["xml", "parser"]);
    }

    /// Digit runs form their own segments.
    #[test]
    fn split_with_digits() {
        let parts = split_identifier("h2o");
        assert_eq!(parts, ["h", "2", "o"]);
    }

    /// Leading and trailing underscores do not produce empty words.
    #[test]
    fn split_leading_trailing_underscore() {
        assert_eq!(split_identifier("_foo_"), Vec::<String>::new());
        assert_eq!(split_identifier("_foo_bar_"), ["foo", "bar"]);
    }

    // ---- tokenize ----

    /// Each identifier is emitted whole, then followed by its parts.
    #[test]
    fn tokenize_full_text() {
        let expected = [
            "def",
            "getuserbyid",
            "get",
            "user",
            "by",
            "id",
            "user_id",
            "user",
            "id",
        ];
        assert_eq!(tokenize("def getUserById(user_id):"), expected);
    }

    /// Operators and bare numbers contribute no tokens.
    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        assert_eq!(tokenize("foo + 42 - bar"), ["foo", "bar"]);
    }

    /// A single-word identifier yields only itself.
    #[test]
    fn tokenize_simple_word() {
        assert_eq!(tokenize("simple"), ["simple"]);
    }

    // ---- scan_identifiers ----

    /// Identifiers are found between punctuation; leading digits are dropped.
    #[test]
    fn scan_identifiers_basic() {
        let found = scan_identifiers("a + b1c (d_e) 2f");
        assert_eq!(found, ["a", "b1c", "d_e", "f"]);
    }

    /// Identifiers may begin with one or more underscores.
    #[test]
    fn scan_identifiers_underscore_start() {
        let found = scan_identifiers("_x __y_z");
        assert_eq!(found, ["_x", "__y_z"]);
    }
}