bm25_vectorizer/bm25_tokenizer.rs
/// Trait for tokenizing text into individual terms for BM25 processing.
///
/// Implementors of this trait define how input text should be broken down into
/// individual tokens. This is an important step in the BM25 algorithm as it determines
/// how documents are analysed and indexed.
///
/// Common tokenization strategies include:
/// - **Whitespace splitting**: Split on whitespace boundaries
/// - **Stemming/Lemmatization**: Reduce words to their root forms
/// - **N-gram generation**: Create overlapping sequences of words
/// - **Language-specific processing**: Handle specific language features
///
/// # Examples
///
/// ```rust
/// use bm25_vectorizer::Bm25Tokenizer;
///
/// struct WhitespaceTokenizer;
///
/// impl Bm25Tokenizer for WhitespaceTokenizer {
///     fn tokenize(&self, input_text: &str) -> Vec<String> {
///         input_text
///             .split_whitespace()
///             .map(|token| token.to_lowercase())
///             .collect()
///     }
/// }
///
/// let tokenizer = WhitespaceTokenizer;
/// let tokens = tokenizer.tokenize("Hello World Example");
/// assert_eq!(tokens, vec!["hello", "world", "example"]);
/// ```
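///
/// A minimal sketch of the n-gram strategy listed above; `BigramTokenizer`
/// is illustrative only and not part of this crate:
///
/// ```rust
/// use bm25_vectorizer::Bm25Tokenizer;
///
/// struct BigramTokenizer;
///
/// impl Bm25Tokenizer for BigramTokenizer {
///     fn tokenize(&self, input_text: &str) -> Vec<String> {
///         let words: Vec<&str> = input_text.split_whitespace().collect();
///         // `windows(2)` yields nothing for inputs shorter than two words.
///         words.windows(2).map(|pair| pair.join(" ")).collect()
///     }
/// }
///
/// let tokenizer = BigramTokenizer;
/// let tokens = tokenizer.tokenize("new york city");
/// assert_eq!(tokens, vec!["new york", "york city"]);
/// ```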
pub trait Bm25Tokenizer {
    /// Tokenizes the input text into a vector of string tokens.
    ///
    /// This method takes a string slice and returns a vector of tokens that will
    /// be used for BM25 scoring.
    ///
    /// # Arguments
    ///
    /// * `input_text` - The text to be tokenized
    ///
    /// # Returns
    ///
    /// A vector of string tokens extracted from the input text
    ///
    /// # Examples
    ///
    /// ```rust
    /// use bm25_vectorizer::Bm25Tokenizer;
    ///
    /// struct SimpleTokenizer;
    /// impl Bm25Tokenizer for SimpleTokenizer {
    ///     fn tokenize(&self, input_text: &str) -> Vec<String> {
    ///         input_text.split_whitespace()
    ///             .map(String::from)
    ///             .collect()
    ///     }
    /// }
    ///
    /// let tokenizer = SimpleTokenizer;
    /// let tokens = tokenizer.tokenize("rust is awesome");
    /// assert_eq!(tokens, vec!["rust", "is", "awesome"]);
    /// ```
    fn tokenize(&self, input_text: &str) -> Vec<String>;
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::mocking::{
        MockCasePreservingTokenizer, MockPunctuationTokenizer, MockWhitespaceTokenizer,
    };
    use rust_stemmers::{Algorithm as StemmingAlgorithm, Stemmer};
    use stop_words::{get, LANGUAGE as StopWordLanguage};
    use unicode_segmentation::UnicodeSegmentation;

    // Tests for Bm25Tokenizer trait

    #[test]
    fn test_whitespace_tokenizer_basic() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello world rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_case_normalization() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_empty_string() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("");
        assert_eq!(tokens, Vec::<String>::new());
    }

    #[test]
    fn test_whitespace_tokenizer_single_token() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello");
        assert_eq!(tokens, vec!["hello"]);
    }

    #[test]
    fn test_whitespace_tokenizer_multiple_spaces() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello   world   rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_leading_trailing_spaces() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize(" hello world ");
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    #[test]
    fn test_case_preserving_tokenizer() {
        let tokenizer = MockCasePreservingTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["Hello", "WORLD", "RusT"]);
    }

    #[test]
    fn test_punctuation_tokenizer() {
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("hello, world! rust?");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_punctuation_tokenizer_numbers() {
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("version 2.0 is great!");
        assert_eq!(tokens, vec!["version", "20", "is", "great"]);
    }

    #[test]
    fn test_tokenizer_properties() {
        let tokenizer = MockWhitespaceTokenizer;

        // Property: tokenizing empty string should return empty vector
        assert!(tokenizer.tokenize("").is_empty());

        // Property: tokenizing single word should return vector with one element
        let result = tokenizer.tokenize("word");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "word");

        // Property: all tokens should be lowercase (for this specific tokenizer)
        let result = tokenizer.tokenize("HELLO World");
        for token in &result {
            assert_eq!(
                token.to_lowercase(),
                *token,
                "All tokens should be lowercase"
            );
        }
    }
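
    /// Sketch of tokenizer composition: a hypothetical wrapper (not part of
    /// the crate) that strips stop words from any inner tokenizer's output.
    /// Assumes `MockWhitespaceTokenizer` lowercases and splits on whitespace,
    /// as exercised in the tests above.
    struct StopWordFilter<T: Bm25Tokenizer> {
        inner: T,
        stop_words: Vec<String>,
    }

    impl<T: Bm25Tokenizer> Bm25Tokenizer for StopWordFilter<T> {
        fn tokenize(&self, input_text: &str) -> Vec<String> {
            self.inner
                .tokenize(input_text)
                .into_iter()
                .filter(|token| !self.stop_words.contains(token))
                .collect()
        }
    }

    #[test]
    fn test_stop_word_filter_wrapper() {
        let tokenizer = StopWordFilter {
            inner: MockWhitespaceTokenizer,
            stop_words: get(StopWordLanguage::English),
        };
        let tokens = tokenizer.tokenize("the quick fox");
        assert_eq!(tokens, vec!["quick", "fox"]);
    }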

    /// Example tokenizer.
    /// Performs: Unicode normalisation → lowercase → tokenisation → stop word removal → stemming
    struct SampleNlpTokenizer;

    impl SampleNlpTokenizer {
        fn new() -> Self {
            Self
        }
    }

    impl Bm25Tokenizer for SampleNlpTokenizer {
        /// Tokenizes input text through an NLP pipeline.
        ///
        /// # Processing Steps
        ///
        /// 1. **Unicode Normalisation**: Converts non-ASCII characters to ASCII equivalents
        /// 2. **Lowercase Conversion**: Ensures case-insensitive matching
        /// 3. **Word Segmentation**: Splits text into tokens using Unicode word boundaries
        /// 4. **Stop Word Removal**: Filters out common words (e.g., "the", "is", "at")
        /// 5. **Stemming**: Reduces words to their root form (e.g., "running" → "run")
        fn tokenize(&self, input_text: &str) -> Vec<String> {
            // Step 1: Normalise Unicode characters to ASCII
            // U+FFFD � REPLACEMENT CHARACTER is used to replace any unknown,
            // unrecognised, or unrepresentable character
            let text = deunicode::deunicode_with_tofu_cow(input_text, "�");

            // Step 2: Convert to lowercase for consistent processing
            let text = text.to_lowercase();

            // Step 3: Tokenise into words using Unicode segmentation
            let tokens: Vec<&str> = text
                .unicode_words()
                .filter(|word| !word.is_empty())
                .collect();

            // Step 4 & 5: Remove stop words and apply stemming
            let stop_words = get(StopWordLanguage::English);
            let stemmer = Stemmer::create(StemmingAlgorithm::English);

            tokens
                .into_iter()
                .filter(|token| !stop_words.iter().any(|word| word == token))
                .map(|token| stemmer.stem(token).to_string())
                .collect()
        }
    }

    #[test]
    fn test_nlp_tokenizer_basic() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog");
        // Expected tokens: ["quick", "brown", "fox", "jump", "lazi"]

        // Should not contain stop words
        assert!(!tokens.contains(&"the".to_string()));
        assert!(!tokens.contains(&"over".to_string()));

        // Should contain stemmed content words
        assert!(tokens.iter().any(|t| t.starts_with("quick")));
        assert!(tokens.iter().any(|t| t.starts_with("jump")));
    }

    #[test]
    fn test_nlp_tokenizer_pipeline() {
        let tokenizer = SampleNlpTokenizer::new();
        let input_text = "Modern computing owes much to the theoretical foundations laid by pioneers in mathematics and logic.";

        let tokens = tokenizer.tokenize(input_text);
        // Expected tokens: ["modern", "comput", "owe", "theoret", "foundat",
        //                   "laid", "pioneer", "mathemat", "logic"]

        // Verify tokens are not empty
        assert!(!tokens.is_empty(), "Token list should not be empty");

        // Verify stop words removed
        assert!(
            !tokens.contains(&"to".to_string()),
            "Stop word 'to' should be removed"
        );
        assert!(
            !tokens.contains(&"the".to_string()),
            "Stop word 'the' should be removed"
        );
        assert!(
            !tokens.contains(&"in".to_string()),
            "Stop word 'in' should be removed"
        );

        // Verify stemming applied
        assert!(
            tokens.iter().any(|t| t.starts_with("comput")),
            "Should contain stemmed form of 'computing'"
        );
        assert!(
            tokens.iter().any(|t| t.starts_with("theoret")),
            "Should contain stemmed form of 'theoretical'"
        );
    }
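
    // Property sketch: stemming maps inflected forms to a shared root, which
    // is what lets BM25 match "computing" against "computed". Relies on the
    // same Snowball English stemmer used by SampleNlpTokenizer above.
    #[test]
    fn test_nlp_tokenizer_stem_equivalence() {
        let tokenizer = SampleNlpTokenizer::new();
        let first = tokenizer.tokenize("computing");
        let second = tokenizer.tokenize("computed");
        assert_eq!(first, second, "Inflected forms should share a stem");
    }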

    #[test]
    fn test_nlp_tokenizer_empty_input() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_nlp_tokenizer_unicode() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("café résumé naïve");
        // Expected tokens: ["cafe", "resum", "naiv"]

        // Should handle Unicode normalisation
        assert_eq!(tokens.len(), 3);
        assert!(tokens.contains(&"cafe".to_string()));
        assert!(tokens.contains(&"resum".to_string()));
        assert!(tokens.contains(&"naiv".to_string()));
    }
}
300}