use std::sync::OnceLock;
use regex::Regex;
/// Returns the process-wide regex used to pick identifier-like words out of text.
///
/// The pattern matches an ASCII letter or underscore followed by any number of
/// ASCII letters, digits, or underscores. It is compiled on first use and the
/// same instance is handed back on every later call.
///
/// # Panics
///
/// Panics if the pattern fails to compile; the pattern is a fixed literal, so
/// that would indicate a programming error.
fn identifier_re() -> &'static Regex {
    static IDENT: OnceLock<Regex> = OnceLock::new();
    IDENT.get_or_init(|| {
        let compiled = Regex::new(r"[a-zA-Z_][a-zA-Z0-9_]*");
        compiled.expect("identifier regex compiles")
    })
}
/// Splits `token` into camel-case segments, preserving the original casing.
///
/// Segment rules (ASCII only):
/// - a run of digits is one segment;
/// - a run of lowercase letters is one segment;
/// - an uppercase letter followed by lowercase letters is one segment (`Response`);
/// - a run of uppercase letters is one segment (`HTML`), except that when the
///   run is directly followed by a lowercase letter its last capital is left
///   to start the next segment (`HTTPResponse` -> `HTTP`, `Response`).
///
/// Every other byte (underscores, punctuation, non-ASCII) acts as a separator
/// and is dropped.
fn split_camel_segments(token: &str) -> Vec<String> {
    /// Index just past the run of bytes satisfying `pred` that begins at `from`.
    fn run_end(bytes: &[u8], from: usize, pred: impl Fn(&u8) -> bool) -> usize {
        bytes[from..]
            .iter()
            .position(|b| !pred(b))
            .map_or(bytes.len(), |offset| from + offset)
    }

    let raw = token.as_bytes();
    let mut pieces = Vec::new();
    let mut pos = 0;

    while let Some(&head) = raw.get(pos) {
        let end = if head.is_ascii_digit() {
            run_end(raw, pos, u8::is_ascii_digit)
        } else if head.is_ascii_lowercase() {
            run_end(raw, pos, u8::is_ascii_lowercase)
        } else if head.is_ascii_uppercase() {
            match raw.get(pos + 1) {
                // Capitalised word: one capital plus its lowercase tail.
                Some(next) if next.is_ascii_lowercase() => {
                    run_end(raw, pos + 1, u8::is_ascii_lowercase)
                }
                // Acronym: consume the capitals, but hand the last one back
                // when it is really the first letter of a following word.
                Some(next) if next.is_ascii_uppercase() => {
                    let caps_end = run_end(raw, pos + 1, u8::is_ascii_uppercase);
                    let starts_word = raw.get(caps_end).is_some_and(u8::is_ascii_lowercase);
                    if starts_word {
                        caps_end - 1
                    } else {
                        caps_end
                    }
                }
                // Lone capital (end of input, or followed by a digit/separator).
                _ => pos + 1,
            }
        } else {
            // Separator byte: skip it without emitting anything.
            pos += 1;
            continue;
        };

        // `pos` and `end` always sit next to ASCII bytes, so both are valid
        // char boundaries for slicing the original string.
        pieces.push(token[pos..end].to_string());
        pos = end;
    }

    pieces
}
/// Expands one identifier into its lowercase search tokens.
///
/// The result always starts with the whole identifier, ASCII-lowercased. If
/// the identifier breaks into two or more parts, those parts follow in order,
/// also lowercased:
///
/// - identifiers containing `_` are split on underscores (empty pieces are
///   discarded) and are *not* additionally split on case;
/// - all other identifiers are split on camel-case boundaries.
///
/// When fewer than two parts result, only the lowercased identifier is
/// returned.
#[must_use]
pub fn split_identifier(token: &str) -> Vec<String> {
    let whole = token.to_ascii_lowercase();

    let pieces: Vec<String> = if token.contains('_') {
        whole
            .split('_')
            .filter(|piece| !piece.is_empty())
            .map(String::from)
            .collect()
    } else {
        let mut segments = split_camel_segments(token);
        for segment in &mut segments {
            segment.make_ascii_lowercase();
        }
        segments
    };

    // A single part would just duplicate the whole identifier.
    if pieces.len() < 2 {
        return vec![whole];
    }

    std::iter::once(whole).chain(pieces).collect()
}
/// Tokenizes free text into lowercase identifier tokens.
///
/// Every identifier-like match in `text` is expanded with
/// [`split_identifier`], and the expansions are concatenated in the order the
/// matches appear. Text between matches is ignored.
#[must_use]
pub fn tokenize(text: &str) -> Vec<String> {
    identifier_re()
        .find_iter(text)
        .flat_map(|found| split_identifier(found.as_str()))
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Camel case with two capitalised words.
    #[test]
    fn tokens_handlerstack() {
        let got = split_identifier("HandlerStack");
        assert_eq!(got, ["handlerstack", "handler", "stack"]);
    }

    // An acronym embedded between a lowercase prefix and a capitalised word.
    #[test]
    fn tokens_gethttpresponse() {
        let got = split_identifier("getHTTPResponse");
        assert_eq!(got, ["gethttpresponse", "get", "http", "response"]);
    }

    // Snake case keeps the underscore in the whole-identifier token.
    #[test]
    fn tokens_my_func() {
        let got = split_identifier("my_func");
        assert_eq!(got, ["my_func", "my", "func"]);
    }

    // A leading acronym followed by a capitalised word.
    #[test]
    fn tokens_xmlparser() {
        let got = split_identifier("XMLParser");
        assert_eq!(got, ["xmlparser", "xml", "parser"]);
    }

    // Table of input/expected pairs mirroring the reference examples.
    #[test]
    fn matches_python_docstring_examples() {
        let table: [(&str, &[&str]); 10] = [
            ("HandlerStack", &["handlerstack", "handler", "stack"]),
            (
                "getHTTPResponse",
                &["gethttpresponse", "get", "http", "response"],
            ),
            ("XMLParser", &["xmlparser", "xml", "parser"]),
            ("my_func", &["my_func", "my", "func"]),
            ("simple", &["simple"]),
            ("CamelCase", &["camelcase", "camel", "case"]),
            (
                "snake_case_word",
                &["snake_case_word", "snake", "case", "word"],
            ),
            ("HTML", &["html"]),
            ("parseHTML", &["parsehtml", "parse", "html"]),
            ("PI_VALUE_2", &["pi_value_2", "pi", "value", "2"]),
        ];
        for (input, expected) in table {
            assert_eq!(
                split_identifier(input),
                expected.to_vec(),
                "split_identifier({input:?})"
            );
        }
    }

    // Punctuation and the bare number are dropped; each identifier is expanded in order.
    #[test]
    fn tokenize_phrase_expands_each_identifier() {
        let expected = [
            "call",
            "gethttpresponse",
            "get",
            "http",
            "response",
            "then",
            "myclass",
            "my",
            "class",
            "do_thing",
            "do",
            "thing",
        ];
        assert_eq!(
            tokenize("call getHTTPResponse, then MyClass.do_thing(3)"),
            expected
        );
    }

    // Inputs that yield fewer than two parts produce only the lowercased token.
    #[test]
    fn single_char_no_fanout() {
        for (input, only) in [("x", "x"), ("X", "x"), ("_", "_")] {
            assert_eq!(split_identifier(input), [only]);
        }
    }
}