/// Expands a single identifier into its ASCII-lowercased form followed by its
/// lowercased sub-words.
///
/// * If `token` contains an underscore it is split on underscores only; empty
///   segments (leading, trailing or doubled underscores) are discarded and the
///   segments are not split any further on case changes.
/// * Otherwise, if it contains an ASCII capital or digit, it is split with
///   `camel_split`.
/// * Otherwise it has no word boundary at all.
///
/// The first element of the result is always the whole lowercased token. The
/// sub-words are appended only when there are at least two of them, so a token
/// that does not actually decompose yields a one-element vector.
pub fn split_identifier(token: &str) -> Vec<String> {
    let whole = token.to_ascii_lowercase();

    let pieces: Vec<String> = if token.contains('_') {
        // Snake case wins over camel case: only underscores act as boundaries.
        whole
            .split('_')
            .filter(|segment| !segment.is_empty())
            .map(String::from)
            .collect()
    } else if token
        .bytes()
        .any(|byte| byte.is_ascii_uppercase() || byte.is_ascii_digit())
    {
        camel_split(token)
            .into_iter()
            .map(|piece| piece.to_ascii_lowercase())
            .collect()
    } else {
        // Nothing that could mark a boundary.
        return vec![whole];
    };

    // A single piece would merely repeat (part of) the whole token.
    if pieces.len() < 2 {
        return vec![whole];
    }

    std::iter::once(whole).chain(pieces).collect()
}
/// Cuts `token` into camel-case pieces, returned as sub-slices of `token` with
/// their original casing.
///
/// A piece is one of:
/// * a run of ASCII lowercase letters (`get`),
/// * a run of ASCII digits (`123`),
/// * one ASCII capital followed by a run of lowercase letters (`Response`),
/// * a run of ASCII capitals (`HTTP`); when such a run of two or more capitals
///   is immediately followed by a lowercase letter, its last capital is left
///   over to start the next piece (`HTTPResponse` -> `HTTP`, `Response`).
///
/// Every other byte is skipped and therefore acts as a separator.
fn camel_split(token: &str) -> Vec<&str> {
    /// Index one past the run of bytes accepted by `in_run` that begins at `from`.
    fn run_end(bytes: &[u8], from: usize, in_run: fn(&u8) -> bool) -> usize {
        bytes[from..]
            .iter()
            .position(|byte| !in_run(byte))
            .map_or(bytes.len(), |offset| from + offset)
    }

    let bytes = token.as_bytes();
    let mut pieces = Vec::new();
    let mut start = 0;

    while let Some(&first) = bytes.get(start) {
        let end = if first.is_ascii_uppercase() {
            let caps_end = run_end(bytes, start, u8::is_ascii_uppercase);
            let lower_follows = bytes
                .get(caps_end)
                .map_or(false, u8::is_ascii_lowercase);
            match (caps_end - start, lower_follows) {
                // "Word": the capital plus the lowercase letters after it.
                (1, true) => run_end(bytes, caps_end, u8::is_ascii_lowercase),
                // "ABCdef": emit "AB"; the "C" opens the next piece.
                (_, true) => caps_end - 1,
                // Capitals followed by a non-lowercase byte or the end of input.
                (_, false) => caps_end,
            }
        } else if first.is_ascii_lowercase() {
            run_end(bytes, start, u8::is_ascii_lowercase)
        } else if first.is_ascii_digit() {
            run_end(bytes, start, u8::is_ascii_digit)
        } else {
            // Neither an ASCII letter nor a digit: skip the byte.
            start += 1;
            continue;
        };

        // Both ends sit next to an ASCII byte, so they are valid `str` boundaries.
        pieces.push(&token[start..end]);
        start = end;
    }

    pieces
}
pub fn tokenize(text: &str) -> Vec<String> {
let mut result = Vec::new();
for token in token_matches(text) {
result.extend(split_identifier(token));
}
result
}
/// Collects the identifier-like tokens of `text`, in order, as sub-slices.
///
/// A token begins at an ASCII letter or underscore and extends over every
/// following ASCII letter, digit or underscore. All other bytes, including
/// digits that are not preceded by a token start, are skipped.
fn token_matches(text: &str) -> Vec<&str> {
    let is_head = |byte: u8| byte.is_ascii_alphabetic() || byte == b'_';
    let is_tail = |byte: u8| byte.is_ascii_alphanumeric() || byte == b'_';

    let bytes = text.as_bytes();
    let mut found = Vec::new();
    let mut cursor = 0;

    // Jump straight to the next byte that can open a token, if there is one.
    while let Some(skipped) = bytes[cursor..].iter().position(|&byte| is_head(byte)) {
        let start = cursor + skipped;
        // The token ends at the first byte that cannot continue it.
        let end = bytes[start + 1..]
            .iter()
            .position(|&byte| !is_tail(byte))
            .map_or(bytes.len(), |len| start + 1 + len);
        found.push(&text[start..end]);
        cursor = end;
    }

    found
}
// Unit tests for `split_identifier` and `tokenize`. In every expected value
// the first element is the whole ASCII-lowercased token; any further elements
// are its lowercased sub-words.
#[cfg(test)]
mod tests {
use super::*;
// Two capitalised words: whole token first, then each word.
#[test]
fn splits_pascal_case() {
assert_eq!(
split_identifier("HandlerStack"),
["handlerstack", "handler", "stack"]
);
}
// The acronym "HTTP" stays in one piece; its final capital is not merged
// into it because "R" starts the following word "Response".
#[test]
fn preserves_runs_of_capitals_as_a_single_sub_token() {
assert_eq!(
split_identifier("getHTTPResponse"),
["gethttpresponse", "get", "http", "response"]
);
}
// Same acronym rule when the run of capitals opens the identifier.
#[test]
fn handles_leading_run_of_capitals() {
assert_eq!(
split_identifier("XMLParser"),
["xmlparser", "xml", "parser"]
);
}
// Underscore-separated words; the whole token keeps its underscore.
#[test]
fn splits_snake_case() {
assert_eq!(split_identifier("my_func"), ["my_func", "my", "func"]);
}
// No underscore, capital or digit: nothing to split on.
#[test]
fn returns_only_lowered_token_when_no_boundary() {
assert_eq!(split_identifier("simple"), ["simple"]);
}
// NOTE(review): despite the name, the input starts with a capital. It forms a
// single camel-case piece, so only the lowered token is returned.
#[test]
fn lowercases_an_already_lowercase_token() {
assert_eq!(split_identifier("Already"), ["already"]);
}
// NOTE(review): the name reads oddly. What is checked is that the empty
// segment between the two underscores is dropped from the sub-words, while
// the whole token keeps both underscores.
#[test]
fn keeps_consecutive_underscores_from_collapsing() {
assert_eq!(split_identifier("foo__bar"), ["foo__bar", "foo", "bar"]);
}
// Only one non-empty segment remains, so no sub-words are appended.
#[test]
fn treats_leading_underscore_as_one_effective_part() {
assert_eq!(split_identifier("_foo"), ["_foo"]);
}
// A digit run is a piece of its own between the surrounding words.
#[test]
fn splits_digit_runs_as_their_own_camel_sub_token() {
assert_eq!(
split_identifier("abc123Def"),
["abc123def", "abc", "123", "def"]
);
}
// Plain lowercase words produce exactly one entry each.
#[test]
fn tokenize_splits_plain_space_separated_words() {
assert_eq!(tokenize("foo bar baz"), ["foo", "bar", "baz"]);
}
// Because the token contains underscores it is split on underscores only, so
// "camelCase" is not broken up further. The standalone "123" is not a token
// because a token cannot start with a digit.
#[test]
fn tokenize_expands_compounds_and_drops_non_identifier_digits() {
assert_eq!(
tokenize("camelCase_snake_case 123"),
["camelcase_snake_case", "camelcase", "snake", "case"]
);
}
// Punctuation, whitespace and bare digits yield no tokens at all.
#[test]
fn tokenize_returns_empty_for_no_identifiers() {
assert_eq!(tokenize(" !!! 123 ???"), Vec::<String>::new());
}
// Each token is expanded independently and the results keep input order.
#[test]
fn tokenize_preserves_multiple_identifiers_and_expands_each() {
assert_eq!(
tokenize("HandlerStack my_func"),
["handlerstack", "handler", "stack", "my_func", "my", "func"]
);
}
}