/// Splits text into lowercase word tokens and drops common English stop words.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Lowercase words removed from the output of [`Tokenizer::tokenize`].
    ///
    /// Kept in a hash set so the per-token membership test does not have to
    /// scan the whole list.
    stop_words: std::collections::HashSet<String>,
}

impl Tokenizer {
    /// Built-in stop words. They are all lowercase because tokens are
    /// lowercased before being compared against this list.
    const DEFAULT_STOP_WORDS: [&'static str; 10] = [
        "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
    ];

    /// Creates a tokenizer preloaded with the built-in stop-word list.
    pub fn new() -> Self {
        Tokenizer {
            stop_words: Self::DEFAULT_STOP_WORDS
                .iter()
                .map(|word| word.to_string())
                .collect(),
        }
    }

    /// Breaks `text` into lowercase tokens, omitting stop words.
    ///
    /// Tokens are separated by any whitespace character or any ASCII
    /// punctuation character; runs of separators never yield empty tokens.
    /// Each token is lowercased before the stop-word check, so stop words are
    /// matched regardless of the case used in `text`.
    ///
    /// Returns the surviving tokens in the order they appear in `text`. An
    /// empty input, or one made up solely of separators and stop words, gives
    /// an empty vector.
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            // Adjacent separators produce empty slices; discard them.
            .filter(|piece| !piece.is_empty())
            .map(str::to_lowercase)
            .filter(|token| !self.stop_words.contains(token.as_str()))
            .collect()
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}
33
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through a freshly constructed tokenizer.
    fn tokens(input: &str) -> Vec<String> {
        Tokenizer::new().tokenize(input)
    }

    /// Whitespace alone separates tokens.
    #[test]
    fn splits_on_whitespace() {
        assert_eq!(tokens("hello world"), vec!["hello", "world"]);
    }

    /// Punctuation separates tokens and never shows up in the output.
    #[test]
    fn splits_on_punctuation() {
        // Exact equality also rules out stray empty or punctuation-bearing tokens.
        assert_eq!(tokens("hello, world!"), vec!["hello", "world"]);
    }

    /// Every token comes back lowercased.
    #[test]
    fn lowercases_tokens() {
        assert_eq!(tokens("Hello WORLD"), vec!["hello", "world"]);
    }

    /// Stop words are removed; the other words keep their order.
    #[test]
    fn filters_stop_words() {
        assert_eq!(
            tokens("the quick brown fox"),
            vec!["quick", "brown", "fox"]
        );
    }

    /// A stop word is removed even when the input spells it with capitals.
    #[test]
    fn stop_words_are_case_insensitive() {
        // Checking the full result ensures "dog" survives, not just that "the" is gone.
        assert_eq!(tokens("The dog"), vec!["dog"]);
    }

    /// Empty input yields no tokens.
    #[test]
    fn empty_string_returns_empty() {
        assert!(tokens("").is_empty());
    }

    /// Input made up only of stop words yields no tokens.
    #[test]
    fn only_stop_words_returns_empty() {
        assert!(tokens("the and to a").is_empty());
    }

    /// Runs of mixed separators collapse without producing empty tokens.
    #[test]
    fn consecutive_delimiters_produce_no_empty_tokens() {
        assert_eq!(tokens("hello,,  \t\nworld!!"), vec!["hello", "world"]);
        assert!(tokens(" ,.;  !! ").is_empty());
    }
}