#[cfg(any(feature = "native", feature = "wasm"))]
mod hf_tokenizer;

#[cfg(feature = "native")]
mod idf_weights;

#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};

#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};

#[cfg(feature = "native")]
pub use idf_weights::{IdfWeights, IdfWeightsCache, idf_weights_cache};

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

/// A single token produced by a [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// Token text after any normalization (lowercasing, stemming, ...).
    pub text: String,
    /// Zero-based position of the token in the input.
    pub position: u32,
    /// Byte offset where the token starts in the original text.
    pub offset_from: usize,
    /// Byte offset where the token ends in the original text.
    pub offset_to: usize,
}

impl Token {
    /// Creates a new token.
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

/// Splits text into a sequence of [`Token`]s.
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Tokenizes `text`, returning tokens with positions and byte offsets.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

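/// Whitespace tokenizer that keeps each word exactly as it appears in the input.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest, and it
/// assumes the items of this module are in scope):
///
/// ```rust,ignore
/// let tokens = Tokenizer::tokenize(&SimpleTokenizer, "hello world");
/// assert_eq!(tokens.len(), 2);
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[0].offset_from, 0);
/// assert_eq!(tokens[0].offset_to, 5);
/// assert_eq!(tokens[1].position, 1);
/// ```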
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

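/// Tokenizer that lowercases words and strips non-alphanumeric characters.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let tokens = Tokenizer::tokenize(&LowercaseTokenizer, "Hello, World!");
/// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert_eq!(texts, ["hello", "world"]);
/// ```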
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        tokenize_and_clean(text, |s| s.to_string())
    }
}

/// Splits `text` on whitespace, lowercases each word, strips non-alphanumeric
/// characters, and applies `transform` to the cleaned word.
fn tokenize_and_clean(text: &str, transform: impl Fn(&str) -> String) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut position = 0u32;
    for (offset, word) in split_whitespace_with_offsets(text) {
        if !word.is_empty() {
            let cleaned: String = word
                .chars()
                .filter(|c| c.is_alphanumeric())
                .flat_map(|c| c.to_lowercase())
                .collect();
            if !cleaned.is_empty() {
                tokens.push(Token::new(
                    transform(&cleaned),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }
    }
    tokens
}

/// Iterates over whitespace-separated words together with their starting byte
/// offsets in the original text.
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    text.split_whitespace().map(move |word| {
        let word_start = text[offset..].find(word).unwrap() + offset;
        offset = word_start + word.len();
        (word_start, word)
    })
}

/// Languages supported by the stemming and stop-word tokenizers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[allow(missing_docs)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    /// Maps this language to the corresponding Snowball stemming algorithm.
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    /// Maps this language to the corresponding stop-word list.
    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
            // No Tamil stop-word list is wired up here; fall back to English.
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

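/// Wraps another tokenizer and filters out stop words for a given language.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
/// let tokens = Tokenizer::tokenize(&tokenizer, "the quick fox");
/// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert_eq!(texts, ["quick", "fox"]); // "the" is filtered out
/// ```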
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    /// Wraps `inner` with the stop-word list for `language`.
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    /// Wraps `inner` with the English stop-word list.
    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    /// Wraps `inner` with a caller-provided stop-word set.
    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    /// Returns `true` if `word` is in the stop-word set.
    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(&token.text))
            .collect()
    }
}

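/// Tokenizer that lowercases, cleans, and Snowball-stems each word.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let tokenizer = StemmerTokenizer::english();
/// let tokens = Tokenizer::tokenize(&tokenizer, "running dogs");
/// assert_eq!(tokens[0].text, "run");
/// assert_eq!(tokens[1].text, "dog");
/// ```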
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Creates a stemming tokenizer for `language`.
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Creates an English stemming tokenizer.
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // The stemmer is created per call and applied to each cleaned word.
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(s).into_owned())
    }
}

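/// Stemmer that can stem with a per-call language while keeping a default.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let stemmer = MultiLanguageStemmer::new(Language::English);
/// let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// assert_eq!(tokens[1].text, "buch");
/// ```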
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Creates a stemmer that falls back to `default_language`.
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenizes `text`, stemming with the given `language`.
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(s).into_owned())
    }

    /// Returns the language used when no per-call language is supplied.
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

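/// Tokenizer that resolves a per-document language hint through a caller-supplied
/// selector function and stems accordingly.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
/// assert_eq!(tokens[0].text, "run");
/// // Without a hint, `Tokenizer::tokenize` falls back to English stemming.
/// ```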
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    /// Creates a tokenizer that resolves language hints through `language_selector`.
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenizes `text`, resolving `language_hint` through the selector.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Without a hint there is nothing to feed the selector, so default to English.
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

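/// Parses a two-letter code or English language name into a [`Language`],
/// falling back to English for unrecognized values.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("german"), Language::German);
/// assert_eq!(parse_language("??"), Language::English); // unknown -> English
/// ```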
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English,
    }
}

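/// Type-erased, clonable tokenizer, used by [`TokenizerRegistry`] to store
/// tokenizers of different concrete types side by side.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let boxed: BoxedTokenizer = Box::new(LowercaseTokenizer);
/// let copy = boxed.clone(); // clones through `clone_box`
/// let tokens = copy.tokenize("Hello World");
/// assert_eq!(tokens[0].text, "hello");
/// ```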
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// Object-safe companion to [`Tokenizer`], enabling type-erased storage.
pub trait TokenizerClone: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<Token>;
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

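/// Thread-safe, shareable registry mapping names to tokenizers. Cloning the
/// registry clones the inner `Arc`, so all clones see the same registrations.
///
/// Minimal usage sketch (illustrative only; not compiled as a doctest):
///
/// ```rust,ignore
/// let registry = TokenizerRegistry::new();
/// let tokenizer = registry.get("en_stem").unwrap();
/// let tokens = tokenizer.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
///
/// // Custom tokenizers can be registered under any name.
/// registry.register("my_tokenizer", LowercaseTokenizer);
/// assert!(registry.contains("my_tokenizer"));
/// ```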
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    /// Registers the built-in tokenizers under their conventional names.
    fn register_defaults(&self) {
        // Basic tokenizers.
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemmers.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Other per-language stemmers.
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        // Stop-word filtering on top of lowercasing.
        self.register(
            "en_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
        );

        // Stop-word filtering on top of stemming.
        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }

    /// Registers `tokenizer` under `name`, replacing any existing entry.
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Returns a clone of the tokenizer registered under `name`, if any.
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Returns `true` if a tokenizer is registered under `name`.
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// Returns the names of all registered tokenizers.
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}