/// Splits `text` into lowercase tokens made of ASCII letters and digits.
///
/// The input is scanned byte by byte. Every byte that is not an ASCII
/// alphanumeric acts as a separator; this includes each byte of a multi-byte
/// UTF-8 character, so `"café"` yields only `"caf"`. Consecutive separators
/// never produce empty tokens, and ASCII uppercase letters are folded to
/// lowercase.
///
/// Returns the tokens in the order they appear in `text`; the result is empty
/// when `text` contains no ASCII alphanumeric bytes.
pub fn tokenize(text: &str) -> Vec<String> {
    text.as_bytes()
        // Cut the byte stream at every non-alphanumeric byte.
        .split(|byte| !byte.is_ascii_alphanumeric())
        // Adjacent, leading or trailing separators leave empty runs; drop them.
        .filter(|run| !run.is_empty())
        // Each remaining run is pure ASCII, so a byte-to-char conversion is lossless.
        .map(|run| {
            run.iter()
                .map(|byte| char::from(byte.to_ascii_lowercase()))
                .collect::<String>()
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Expected tokens are written as array literals and compared directly
    // against the returned `Vec<String>`.

    /// Inputs with no ASCII alphanumeric bytes produce no tokens.
    #[test]
    fn empty_input_returns_empty_vec() {
        for input in ["", " ", "!!!---???"] {
            assert!(tokenize(input).is_empty());
        }
    }

    /// Punctuation and all kinds of whitespace separate tokens.
    #[test]
    fn splits_on_punctuation_and_whitespace() {
        assert_eq!(tokenize("hello, world!"), ["hello", "world"]);
        assert_eq!(tokenize("a\tb\nc d"), ["a", "b", "c", "d"]);
    }

    /// ASCII uppercase letters are folded to lowercase.
    #[test]
    fn lowercases_ascii_letters() {
        assert_eq!(tokenize("FooBar BAZ"), ["foobar", "baz"]);
    }

    /// Letters and digits share a token; a hyphen splits it.
    #[test]
    fn alphanumeric_runs_stay_together() {
        assert_eq!(tokenize("rust2026"), ["rust2026"]);
        assert_eq!(tokenize("co-op"), ["co", "op"]);
    }

    /// Bytes of multi-byte UTF-8 characters are treated as separators.
    #[test]
    fn non_ascii_bytes_act_as_separators_without_panicking() {
        assert_eq!(tokenize("café"), ["caf"]);
        assert!(tokenize("日本語").is_empty());
    }

    /// Calls the tokenizer through its full `crate::sql::fts` path.
    #[test]
    fn smoke_module_path_reaches_through_lib() {
        assert_eq!(crate::sql::fts::tokenize("Hello, world!"), ["hello", "world"]);
    }
}