/// Splits text into lowercase word tokens and drops common English stop words.
///
/// The stop-word list is fixed when the tokenizer is constructed; see
/// [`Tokenizer::new`].
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Lowercase words that [`Tokenizer::tokenize`] removes from its output.
    stop_words: Vec<String>,
}

impl Tokenizer {
    /// Creates a tokenizer preloaded with the built-in list of ten English
    /// stop words ("the", "is", "in", "and", "to", "a", "of", "that", "it",
    /// "with").
    #[must_use]
    pub fn new() -> Self {
        const DEFAULT_STOP_WORDS: [&str; 10] = [
            "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
        ];

        Tokenizer {
            stop_words: DEFAULT_STOP_WORDS.iter().copied().map(String::from).collect(),
        }
    }

    /// Breaks `text` into lowercase tokens, in order of appearance, with stop
    /// words removed.
    ///
    /// Tokens are delimited by any Unicode whitespace character or any ASCII
    /// punctuation character. Consequences worth knowing:
    ///
    /// * Runs of delimiters never yield empty tokens.
    /// * The apostrophe is ASCII punctuation, so `"don't"` becomes `"don"`
    ///   and `"t"`.
    /// * Non-ASCII punctuation (for example an em dash) is *not* a delimiter
    ///   and stays inside the token.
    ///
    /// Stop words are matched after lowercasing, so the filter is
    /// case-insensitive. Returns an empty vector when `text` is empty or
    /// contains nothing but delimiters and stop words.
    #[must_use]
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            // Adjacent delimiters produce empty slices; discard them.
            .filter(|token| !token.is_empty())
            // Lowercase before the stop-word check so "The" matches "the".
            .map(str::to_lowercase)
            .filter(|token| !self.stop_words.contains(token))
            .collect()
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for `Tokenizer::tokenize`. Expectations are exact token
    //! lists so that ordering, extra tokens, and accidentally empty output
    //! are all caught.

    use super::*;

    #[test]
    fn splits_on_whitespace() {
        let t = Tokenizer::new();
        assert_eq!(t.tokenize("hello world"), vec!["hello", "world"]);
    }

    #[test]
    fn splits_on_punctuation() {
        let t = Tokenizer::new();
        assert_eq!(t.tokenize("hello, world!"), vec!["hello", "world"]);
    }

    #[test]
    fn lowercases_tokens() {
        let t = Tokenizer::new();
        assert_eq!(t.tokenize("Hello WORLD"), vec!["hello", "world"]);
    }

    #[test]
    fn filters_stop_words() {
        let t = Tokenizer::new();
        assert_eq!(
            t.tokenize("the quick brown fox"),
            vec!["quick", "brown", "fox"]
        );
    }

    #[test]
    fn stop_words_are_case_insensitive() {
        let t = Tokenizer::new();
        // An exact comparison, so an empty result cannot pass vacuously.
        assert_eq!(t.tokenize("The dog"), vec!["dog"]);
    }

    #[test]
    fn empty_string_returns_empty() {
        let t = Tokenizer::new();
        assert!(t.tokenize("").is_empty());
    }

    #[test]
    fn only_stop_words_returns_empty() {
        let t = Tokenizer::new();
        assert!(t.tokenize("the and to a").is_empty());
    }

    #[test]
    fn consecutive_delimiters_produce_no_empty_tokens() {
        let t = Tokenizer::new();
        // Leading, trailing, and repeated mixed delimiters: each adjacent
        // pair yields an empty slice from `split` that must be discarded.
        let tokens = t.tokenize("  hello,,  \t\nworld!!  ");
        assert_eq!(tokens, vec!["hello", "world"]);
        assert!(tokens.iter().all(|tok| !tok.is_empty()));
    }
}