// bm25_vectorizer/bm25_token_indexer.rs

/// Maps a token string to an index used by the BM25 vectorizer.
///
/// Implementations choose their own index representation via the
/// associated type (the test mocks below use `u64` hashes, sequential
/// `usize` dictionary ids, and prefixed `String`s). Implementations are
/// expected to be deterministic: the same token must always map to the
/// same index (see the tests in this file).
pub trait Bm25TokenIndexer {
    /// The type of index produced for each token.
    type Bm25TokenIndex;

    /// Returns the index for `token`.
    ///
    /// NOTE(review): takes `&self`, yet the dictionary-style mock hands out
    /// sequential ids — implementations that build state presumably use
    /// interior mutability; confirm against the mock implementations.
    fn index(&self, token: &str) -> Self::Bm25TokenIndex;
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::mocking::{
        MockDictionaryTokenIndexer, MockHashTokenIndexer, MockStringTokenIndexer,
        MockWhitespaceTokenizer,
    };
    use crate::Bm25Tokenizer;

    /// The hash indexer must map equal tokens to equal indices.
    #[test]
    fn test_hash_token_indexer_deterministic() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("hello");
        assert_eq!(index1, index2, "Same token should produce same index");
    }

    /// Distinct tokens should (modulo hash collisions) get distinct indices.
    #[test]
    fn test_hash_token_indexer_different_tokens() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        assert_ne!(
            index1, index2,
            "Different tokens should produce different indices"
        );
    }

    /// Indexing is case-sensitive: "hello" and "Hello" are different tokens.
    #[test]
    fn test_hash_token_indexer_case_sensitivity() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("Hello");
        assert_ne!(
            index1, index2,
            "Case-different tokens should produce different indices"
        );
    }

    /// The dictionary indexer assigns ids 0, 1, 2, ... in first-seen order.
    #[test]
    fn test_dictionary_token_indexer_sequential() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        let index3 = indexer.index("rust");

        assert_eq!(index1, 0);
        assert_eq!(index2, 1);
        assert_eq!(index3, 2);
    }

    /// Re-indexing a previously seen token returns its original id.
    #[test]
    fn test_dictionary_token_indexer_deterministic() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        let index3 = indexer.index("hello");
        assert_eq!(index1, index3, "Same token should produce same index");
        assert_ne!(
            index1, index2,
            "Different tokens should produce different indices"
        );
    }

    /// The empty string is a legal token and must be indexed consistently.
    #[test]
    fn test_dictionary_token_indexer_empty_string() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("");
        let index2 = indexer.index("");
        assert_eq!(
            index1, index2,
            "Empty string should be handled consistently"
        );
    }

    /// The string indexer prefixes the token with "idx_".
    #[test]
    fn test_string_token_indexer() {
        let indexer = MockStringTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");

        assert_eq!(index1, "idx_hello");
        assert_eq!(index2, "idx_world");
    }

    /// The string indexer is deterministic, like the other indexers.
    #[test]
    fn test_string_token_indexer_deterministic() {
        let indexer = MockStringTokenIndexer;
        let index1 = indexer.index("test");
        let index2 = indexer.index("test");
        assert_eq!(index1, index2, "Same token should produce same index");
    }

    /// End-to-end: tokenize a sentence, then index each token, and check
    /// that repeats collapse to the same index while distinct tokens differ.
    #[test]
    fn test_tokenizer_indexer_integration() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockHashTokenIndexer;

        let text = "hello world hello rust";
        let tokens = tokenizer.tokenize(text);
        let indices: Vec<u64> = tokens.iter().map(|token| indexer.index(token)).collect();

        assert_eq!(indices.len(), 4);

        // tokens[0] and tokens[2] are both "hello".
        assert_eq!(
            indices[0], indices[2],
            "Repeated token 'hello' should have same index"
        );

        // All pairwise-distinct tokens must get distinct indices.
        assert_ne!(
            indices[0], indices[1],
            "'hello' and 'world' should have different indices"
        );
        assert_ne!(
            indices[1], indices[3],
            "'world' and 'rust' should have different indices"
        );
        assert_ne!(
            indices[0], indices[3],
            "'hello' and 'rust' should have different indices"
        );
    }

    /// End-to-end with the dictionary indexer: a repeated word ("the")
    /// maps to the same dictionary id at every occurrence.
    #[test]
    fn test_dictionary_indexer_with_tokenizer() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockDictionaryTokenIndexer::new();

        let text = "the quick brown fox jumps over the lazy dog";
        let tokens = tokenizer.tokenize(text);
        let indices: Vec<usize> = tokens.iter().map(|token| indexer.index(token)).collect();

        assert_eq!(indices.len(), 9);

        // Looking "the" up again must return the id it was first assigned.
        let the_index = indexer.index("the");
        assert_eq!(indices[0], the_index);
        assert_eq!(indices[6], the_index);
        assert_eq!(
            indices[0], indices[6],
            "Repeated token 'the' should have same index"
        );
    }

    /// Edge cases: whitespace-only input, a single-character token, and a
    /// very long token.
    #[test]
    fn test_edge_cases() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockHashTokenIndexer;

        let tokens = tokenizer.tokenize("   \t  \n  ");
        assert!(
            tokens.is_empty(),
            "Whitespace-only string should produce no tokens"
        );

        let tokens = tokenizer.tokenize("a");
        assert_eq!(tokens, vec!["a"]);
        let index = indexer.index(&tokens[0]);
        // NOTE(review): a hash value of 0 is technically possible; this
        // assertion relies on the mock never hashing "a" to 0.
        assert!(index > 0, "Single character should produce valid index");

        let long_token = "a".repeat(1000);
        let index1 = indexer.index(&long_token);
        let index2 = indexer.index(&long_token);
        assert_eq!(index1, index2, "Long token should be handled consistently");
    }

    /// General indexer properties: determinism and discrimination between
    /// different tokens.
    #[test]
    fn test_indexer_properties() {
        let indexer = MockHashTokenIndexer;

        let token = "consistent";
        let index1 = indexer.index(token);
        let index2 = indexer.index(token);
        assert_eq!(index1, index2, "Indexer should be deterministic");

        let index_a = indexer.index("a");
        let index_b = indexer.index("b");
        assert_ne!(
            index_a, index_b,
            "Different tokens should generally have different indices"
        );
    }
}