/// Splits text into lowercase word tokens and drops common English stop words.
///
/// Build one with [`Tokenizer::new`] (or [`Default::default`]) and call
/// [`Tokenizer::tokenize`] on each piece of text.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Lowercase words that are removed from the output of `tokenize`.
    stop_words: Vec<String>,
}

impl Tokenizer {
    /// Stop words installed by [`Tokenizer::new`]; all lowercase because
    /// tokens are lowercased before they are compared against this list.
    const DEFAULT_STOP_WORDS: [&'static str; 10] = [
        "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
    ];

    /// Creates a tokenizer preloaded with the built-in stop-word list.
    pub fn new() -> Self {
        Self {
            stop_words: Self::DEFAULT_STOP_WORDS
                .iter()
                .map(|word| (*word).to_owned())
                .collect(),
        }
    }

    /// Breaks `text` into lowercase tokens, omitting stop words.
    ///
    /// Separators are Unicode whitespace and ASCII punctuation. The apostrophe,
    /// hyphen and underscore are ASCII punctuation, so `"don't"` yields `"don"`
    /// and `"t"`. Non-ASCII punctuation is not a separator and stays inside the
    /// token it touches.
    ///
    /// Tokens keep their order of appearance; duplicates are preserved. Empty
    /// input, or input made only of separators and stop words, gives an empty
    /// vector.
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            // Adjacent separators produce empty slices; discard them early so
            // they are never allocated.
            .filter(|piece| !piece.is_empty())
            .map(str::to_lowercase)
            // Compare after lowercasing so "The" and "THE" are filtered too.
            .filter(|token| !self.is_stop_word(token))
            .collect()
    }

    /// Returns `true` when `token` exactly matches one of the stop words.
    fn is_stop_word(&self, token: &str) -> bool {
        self.stop_words.iter().any(|word| word == token)
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}