/// Splits `input` into tokens.
///
/// A token is either a maximal run of "word" characters (Unicode
/// alphanumerics plus `_`) or a single non-word character. The returned
/// slices borrow from `input`, appear in order, and concatenate back to
/// exactly `input` (no characters are dropped).
pub fn tokenize(input: &str) -> Vec<&str> {
    // Worst case is one token per byte (all-ASCII separators).
    let mut result: Vec<&str> = Vec::with_capacity(input.len());
    // Byte offset where the word currently being scanned began,
    // or `None` when we are not inside a word.
    let mut word_start: Option<usize> = None;
    for (byte_index, character) in input.char_indices() {
        let is_word_part = character == '_' || character.is_alphanumeric();
        if is_word_part {
            // Start a new word here, or keep extending the current one.
            word_start.get_or_insert(byte_index);
        } else {
            // Flush any pending word, then emit the separator itself.
            if let Some(start) = word_start.take() {
                result.push(&input[start..byte_index]);
            }
            result.push(&input[byte_index..byte_index + character.len_utf8()]);
        }
    }
    // Input ended while inside a word: flush the final word token.
    if let Some(start) = word_start {
        result.push(&input[start..]);
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(test)]
    use pretty_assertions::assert_eq;

    // An empty input yields no tokens at all.
    #[test]
    fn test_empty() {
        let expected: Vec<&str> = Vec::new();
        assert_eq!(tokenize(""), expected);
    }

    // Alphabetic runs stay whole; the separating space is its own token.
    #[test]
    fn test_words() {
        assert_eq!(tokenize("word"), vec!["word"]);
        assert_eq!(tokenize("Adam Bea"), vec!["Adam", " ", "Bea"]);
    }

    // Digit runs are treated the same way as alphabetic words.
    #[test]
    fn test_numbers() {
        assert_eq!(tokenize("123"), vec!["123"]);
        assert_eq!(tokenize("123 456"), vec!["123", " ", "456"]);
    }

    // Mixed letters and digits form a single token.
    #[test]
    fn test_alphanumeric() {
        assert_eq!(tokenize("0xC0deCafe"), vec!["0xC0deCafe"]);
    }

    // Punctuation characters each become a one-character token.
    #[test]
    fn test_others() {
        assert_eq!(tokenize("+!,"), vec!["+", "!", ","]);
    }

    // Multi-byte separators (NBSP is 2 bytes in UTF-8) are sliced correctly.
    #[test]
    fn test_non_breaking_space() {
        assert_eq!(tokenize("\u{00a0}"), vec!["\u{00a0}"]);
        assert_eq!(tokenize("\u{00a0}s"), vec!["\u{00a0}", "s"]);
    }
}