bm25_vectorizer/bm25_tokenizer.rs

/// Trait for tokenizing text into individual terms for BM25 processing.
///
/// Implementors of this trait define how input text should be broken down into
/// individual tokens. This is an important step in the BM25 algorithm as it determines
/// how documents are analysed and indexed.
///
/// Common tokenization strategies include:
/// - **Whitespace splitting**: Split on whitespace boundaries
/// - **Stemming/Lemmatization**: Reduce words to their root forms
/// - **N-gram generation**: Create overlapping sequences of words (see the second example below)
/// - **Language-specific processing**: Handle specific language features
///
/// # Examples
///
/// ```rust
/// use bm25_vectorizer::Bm25Tokenizer;
///
/// struct WhitespaceTokenizer;
///
/// impl Bm25Tokenizer for WhitespaceTokenizer {
///     fn tokenize(&self, input_text: &str) -> Vec<String> {
///         input_text
///             .split_whitespace()
///             .map(|token| token.to_lowercase())
///             .collect()
///     }
/// }
///
/// let tokenizer = WhitespaceTokenizer;
/// let tokens = tokenizer.tokenize("Hello World Example");
/// assert_eq!(tokens, vec!["hello", "world", "example"]);
/// ```
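///
/// A sketch of the n-gram strategy using word bigrams; `BigramTokenizer` is a
/// hypothetical example, and a real implementation would likely make `n`
/// configurable:
///
/// ```rust
/// use bm25_vectorizer::Bm25Tokenizer;
///
/// struct BigramTokenizer;
///
/// impl Bm25Tokenizer for BigramTokenizer {
///     fn tokenize(&self, input_text: &str) -> Vec<String> {
///         let words: Vec<&str> = input_text.split_whitespace().collect();
///         // Each overlapping pair of adjacent words becomes one token
///         words
///             .windows(2)
///             .map(|pair| pair.join(" "))
///             .collect()
///     }
/// }
///
/// let tokenizer = BigramTokenizer;
/// let tokens = tokenizer.tokenize("new york city");
/// assert_eq!(tokens, vec!["new york", "york city"]);
/// ```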
pub trait Bm25Tokenizer {
    /// Tokenizes the input text into a vector of string tokens.
    ///
    /// This method takes a string slice and returns a vector of tokens that will
    /// be used for BM25 scoring.
    ///
    /// # Arguments
    ///
    /// * `input_text` - The text to be tokenized
    ///
    /// # Returns
    ///
    /// A vector of string tokens extracted from the input text
    ///
    /// # Examples
    ///
    /// ```rust
    /// use bm25_vectorizer::Bm25Tokenizer;
    ///
    /// struct SimpleTokenizer;
    /// impl Bm25Tokenizer for SimpleTokenizer {
    ///     fn tokenize(&self, input_text: &str) -> Vec<String> {
    ///         input_text.split_whitespace()
    ///                   .map(String::from)
    ///                   .collect()
    ///     }
    /// }
    ///
    /// let tokenizer = SimpleTokenizer;
    /// let tokens = tokenizer.tokenize("rust is awesome");
    /// assert_eq!(tokens, vec!["rust", "is", "awesome"]);
    /// ```
    fn tokenize(&self, input_text: &str) -> Vec<String>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::mocking::{
        MockCasePreservingTokenizer, MockPunctuationTokenizer, MockWhitespaceTokenizer,
    };
    use rust_stemmers::{Algorithm as StemmingAlgorithm, Stemmer};
    use stop_words::{get, LANGUAGE as StopWordLanguage};
    use unicode_segmentation::UnicodeSegmentation;

    // Tests for Bm25Tokenizer trait

    #[test]
    fn test_whitespace_tokenizer_basic() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello world rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_case_normalization() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_empty_string() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("");
        assert_eq!(tokens, Vec::<String>::new());
    }

    #[test]
    fn test_whitespace_tokenizer_single_token() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello");
        assert_eq!(tokens, vec!["hello"]);
    }

    #[test]
    fn test_whitespace_tokenizer_multiple_spaces() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello    world   rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_leading_trailing_spaces() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("  hello world  ");
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    #[test]
    fn test_case_preserving_tokenizer() {
        let tokenizer = MockCasePreservingTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["Hello", "WORLD", "RusT"]);
    }

    #[test]
    fn test_punctuation_tokenizer() {
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("hello, world! rust?");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_punctuation_tokenizer_numbers() {
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("version 2.0 is great!");
        assert_eq!(tokens, vec!["version", "20", "is", "great"]);
    }

    #[test]
    fn test_tokenizer_properties() {
        let tokenizer = MockWhitespaceTokenizer;

        // Property: tokenizing empty string should return empty vector
        assert!(tokenizer.tokenize("").is_empty());

        // Property: tokenizing single word should return vector with one element
        let result = tokenizer.tokenize("word");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "word");

        // Property: all tokens should be lowercase (for this specific tokenizer)
        let result = tokenizer.tokenize("HELLO World");
        for token in &result {
            assert_eq!(
                token.to_lowercase(),
                *token,
                "All tokens should be lowercase"
            );
        }
    }
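
    // Sketch of generic use: because `Bm25Tokenizer` is an ordinary trait,
    // downstream code can stay generic over the tokenization strategy.
    // `count_tokens` is a hypothetical helper written purely for illustration.
    fn count_tokens<T: Bm25Tokenizer>(tokenizer: &T, text: &str) -> usize {
        tokenizer.tokenize(text).len()
    }

    #[test]
    fn test_generic_tokenizer_usage() {
        // Any implementor can be passed where a `Bm25Tokenizer` is expected
        assert_eq!(count_tokens(&MockWhitespaceTokenizer, "hello world"), 2);
        assert_eq!(count_tokens(&MockPunctuationTokenizer, "hello, world!"), 2);
    }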

    /// Example tokenizer.
    /// Performs: Unicode normalisation → lowercase → tokenisation → stop word removal → stemming
    struct SampleNlpTokenizer;

    impl SampleNlpTokenizer {
        fn new() -> Self {
            Self
        }
    }

    impl Bm25Tokenizer for SampleNlpTokenizer {
        /// Tokenizes input text through an NLP pipeline.
        ///
        /// # Processing Steps
        ///
        /// 1. **Unicode Normalisation**: Converts non-ASCII characters to ASCII equivalents
        /// 2. **Lowercase Conversion**: Ensures case-insensitive matching
        /// 3. **Word Segmentation**: Splits text into tokens using Unicode word boundaries
        /// 4. **Stop Word Removal**: Filters out common words (e.g., "the", "is", "at")
        /// 5. **Stemming**: Reduces words to their root form (e.g., "running" → "run")
        fn tokenize(&self, input_text: &str) -> Vec<String> {
            // Step 1: Normalise Unicode to ASCII, substituting U+FFFD (�, the
            // REPLACEMENT CHARACTER) for anything that cannot be transliterated
            let text = deunicode::deunicode_with_tofu_cow(input_text, "�");

            // Step 2: Convert to lowercase for consistent processing
            let text = text.to_lowercase();

            // Step 3: Tokenise into words using Unicode segmentation
            let tokens: Vec<&str> = text
                .unicode_words()
                .filter(|word| !word.is_empty())
                .collect();

            // Step 4 & 5: Remove stop words and apply stemming. Both the stop
            // word list and the stemmer are built on every call for brevity; a
            // production tokenizer would cache them in the struct.
            let stop_words = get(StopWordLanguage::English);
            let stemmer = Stemmer::create(StemmingAlgorithm::English);

            tokens
                .into_iter()
                .filter(|token| !stop_words.iter().any(|stop_word| stop_word == token))
                .map(|token| stemmer.stem(token).to_string())
                .collect()
        }
    }

    #[test]
    fn test_nlp_tokenizer_basic() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog");
        // Expected tokens (from a sample run):
        // ["quick", "brown", "fox", "jump", "lazi"]

        // Should not contain stop words
        assert!(!tokens.contains(&"the".to_string()));
        assert!(!tokens.contains(&"over".to_string()));

        // Should contain stemmed content words
        assert!(tokens.iter().any(|t| t.starts_with("quick")));
        assert!(tokens.iter().any(|t| t.starts_with("jump")));
    }

    #[test]
    fn test_nlp_tokenizer_pipeline() {
        let tokenizer = SampleNlpTokenizer::new();
        let input_text = "Modern computing owes much to the theoretical foundations laid by pioneers in mathematics and logic.";

        let tokens = tokenizer.tokenize(input_text);
        // Expected tokens (from a sample run):
        // ["modern", "comput", "owe", "theoret", "foundat", "laid",
        //  "pioneer", "mathemat", "logic"]

        // Verify tokens are not empty
        assert!(!tokens.is_empty(), "Token list should not be empty");

        // Verify stop words removed
        assert!(
            !tokens.contains(&"to".to_string()),
            "Stop word 'to' should be removed"
        );
        assert!(
            !tokens.contains(&"the".to_string()),
            "Stop word 'the' should be removed"
        );
        assert!(
            !tokens.contains(&"in".to_string()),
            "Stop word 'in' should be removed"
        );

        // Verify stemming applied
        assert!(
            tokens.iter().any(|t| t.starts_with("comput")),
            "Should contain stemmed form of 'computing'"
        );
        assert!(
            tokens.iter().any(|t| t.starts_with("theoret")),
            "Should contain stemmed form of 'theoretical'"
        );
    }

    #[test]
    fn test_nlp_tokenizer_empty_input() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_nlp_tokenizer_unicode() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("café résumé naïve");
        // Expected tokens (from a sample run): ["cafe", "resum", "naiv"]

        // Should handle Unicode normalisation
        assert_eq!(tokens.len(), 3);
        assert!(tokens.contains(&"cafe".to_string()));
        assert!(tokens.contains(&"resum".to_string()));
        assert!(tokens.contains(&"naiv".to_string()));
    }
}