Skip to main content

// ruve/bm25/tokenizer.rs

/// Splits raw text into lowercase terms and discards common English stop words.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Lowercase words removed from the output of [`Tokenizer::tokenize`].
    /// Kept in a hash set so each membership check is a hashed lookup rather
    /// than a linear scan over the whole list.
    stop_words: std::collections::HashSet<String>,
}

impl Tokenizer {
    /// Creates a tokenizer that uses the built-in English stop-word list.
    pub fn new() -> Self {
        // TODO: load the stop-word list from config instead of hard-coding it.
        const DEFAULT_STOP_WORDS: [&str; 10] = [
            "the", "is", "in", "and", "to", "a", "of", "that", "it", "with",
        ];

        Tokenizer {
            stop_words: DEFAULT_STOP_WORDS
                .iter()
                .copied()
                .map(String::from)
                .collect(),
        }
    }

    /// Breaks `text` into lowercase tokens.
    ///
    /// The input is split on every whitespace character and every ASCII
    /// punctuation character. Empty pieces (produced by consecutive
    /// delimiters) are dropped, each remaining piece is lowercased, and any
    /// piece that matches a stop word is removed. Tokens are returned in the
    /// order they appear in `text`.
    ///
    /// Only ASCII punctuation acts as a delimiter; non-ASCII punctuation stays
    /// inside the surrounding token. Because `'` is ASCII punctuation,
    /// contractions are split (`"don't"` yields `"don"` and `"t"`).
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
            .filter(|token| !token.is_empty())
            // Lowercase before the stop-word check so the check is
            // case-insensitive with respect to the input.
            .map(|token| token.to_lowercase())
            .filter(|token| !self.stop_words.contains(token))
            .collect()
    }
}

impl Default for Tokenizer {
    /// Equivalent to [`Tokenizer::new`].
    fn default() -> Self {
        Self::new()
    }
}
33
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `text` with a freshly constructed default tokenizer.
    fn tokens_of(text: &str) -> Vec<String> {
        Tokenizer::new().tokenize(text)
    }

    /// Reports whether `word` appears among `tokens`.
    fn has_token(tokens: &[String], word: &str) -> bool {
        tokens.iter().any(|tok| tok == word)
    }

    #[test]
    fn splits_on_whitespace() {
        assert_eq!(tokens_of("hello world"), ["hello", "world"]);
    }

    #[test]
    fn splits_on_punctuation() {
        let found = tokens_of("hello, world!");
        for expected in ["hello", "world"].iter() {
            assert!(has_token(&found, expected));
        }
    }

    #[test]
    fn lowercases_tokens() {
        assert_eq!(tokens_of("Hello WORLD"), ["hello", "world"]);
    }

    #[test]
    fn filters_stop_words() {
        let found = tokens_of("the quick brown fox");
        // The stop word must be gone while every content word survives.
        assert!(!has_token(&found, "the"));
        for kept in ["quick", "brown", "fox"].iter() {
            assert!(has_token(&found, kept));
        }
    }

    #[test]
    fn stop_words_are_case_insensitive() {
        // "The" is lowercased to "the" before the stop-word check, so it is dropped.
        let found = tokens_of("The dog");
        assert!(!has_token(&found, "the"));
    }

    #[test]
    fn empty_string_returns_empty() {
        assert!(tokens_of("").is_empty());
    }

    #[test]
    fn only_stop_words_returns_empty() {
        assert!(tokens_of("the and to a").is_empty());
    }

    #[test]
    fn consecutive_delimiters_produce_no_empty_tokens() {
        let found = tokens_of("hello   world");
        assert_eq!(found, ["hello", "world"]);
    }
}