use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

/// A single token produced by a [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// The token text (possibly normalized, e.g. lowercased or stemmed).
    pub text: String,
    /// Zero-based position of the token within the input text.
    pub position: u32,
    /// Byte offset of the start of the original word in the input text.
    pub offset_from: usize,
    /// Byte offset of the end of the original word in the input text.
    pub offset_to: usize,
}

impl Token {
    /// Creates a new token from its text, position, and byte offsets.
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

/// Converts raw text into a sequence of [`Token`]s.
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Tokenizes `text`, returning tokens with positions and byte offsets.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

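/// A whitespace tokenizer that keeps each word verbatim, including its casing and punctuation.
///
/// A minimal usage sketch, mirroring the unit tests below (marked `ignore` because the
/// crate path is not assumed here):
///
/// ```ignore
/// let tokens = Tokenizer::tokenize(&SimpleTokenizer, "hello world");
/// assert_eq!(tokens.len(), 2);
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].position, 1);
/// ```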
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

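/// A whitespace tokenizer that lowercases each word and strips non-alphanumeric characters.
///
/// Illustrative sketch, mirroring the unit tests below (`ignore`d since the crate path is
/// not assumed):
///
/// ```ignore
/// let tokens = Tokenizer::tokenize(&LowercaseTokenizer, "Hello, World!");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].text, "world");
/// ```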
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                // Keep only alphanumeric characters and lowercase them.
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    tokens.push(Token::new(cleaned, position, offset, offset + word.len()));
                    position += 1;
                }
            }
        }

        tokens
    }
}

/// Splits `text` on whitespace, yielding `(byte_offset, word)` pairs.
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    text.split_whitespace().map(move |word| {
        // Words are yielded in order, so searching from `offset` always finds the
        // current word and the `unwrap` cannot fail.
        let word_start = text[offset..].find(word).unwrap() + offset;
        offset = word_start + word.len();
        (word_start, word)
    })
}

/// Languages supported by the stemming and stop-word tokenizers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[allow(missing_docs)]
#[derive(Default)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    /// Maps this language to the corresponding Snowball stemming algorithm.
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    /// Maps this language to the stop-word list identifier of the `stop_words` crate.
    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
            // Tamil falls back to the English stop-word list.
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

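/// Wraps another tokenizer and drops any tokens found in a stop-word list.
///
/// Matching is an exact string comparison against the inner tokenizer's output, so the
/// inner tokenizer is typically one that lowercases (as in the registry defaults).
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
/// let tokens = Tokenizer::tokenize(&tokenizer, "The quick fox");
/// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert!(!texts.contains(&"the"));
/// assert!(texts.contains(&"quick"));
/// ```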
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    /// Creates a stop-word filter around `inner` using the list for `language`.
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    /// Creates a stop-word filter using the English stop-word list.
    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    /// Creates a stop-word filter with a caller-supplied stop-word set.
    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    /// Returns `true` if `word` is in the stop-word set.
    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(&token.text))
            .collect()
    }
}

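/// A tokenizer that lowercases words, strips non-alphanumeric characters, and applies
/// Snowball stemming for the configured [`Language`].
///
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::english();
/// let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");
/// assert_eq!(tokens[0].text, "run");
/// assert_eq!(tokens[1].text, "dog");
/// ```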
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Creates a stemming tokenizer for the given language.
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Creates an English stemming tokenizer.
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                // Lowercase and strip non-alphanumeric characters before stemming.
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }
}

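/// A stemming tokenizer whose language can be chosen per call; the [`Tokenizer`] impl
/// falls back to the configured default language.
///
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// let stemmer = MultiLanguageStemmer::new(Language::English);
/// let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// ```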
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Creates a stemmer that uses `default_language` when no language is specified.
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenizes and stems `text` using the given `language`.
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }

    /// Returns the language used by the [`Tokenizer`] impl.
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

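/// A tokenizer that derives the stemming language from a caller-supplied hint via a
/// selector function such as [`parse_language`].
///
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
/// assert_eq!(tokens[0].text, "haus");
/// ```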
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    /// Creates a tokenizer that resolves language hints through `language_selector`.
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenizes `text`, stemming in the language resolved from `language_hint`.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Without a language hint, fall back to English stemming.
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

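/// Parses a language hint (two-letter code or English name, case-insensitive) into a
/// [`Language`], falling back to English for unrecognized values.
///
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("unknown"), Language::English);
/// ```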
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        // Unrecognized hints fall back to English.
        _ => Language::English,
    }
}

/// A boxed, cloneable tokenizer trait object, as stored in [`TokenizerRegistry`].
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// An object-safe companion to [`Tokenizer`] that supports cloning behind a `Box`.
pub trait TokenizerClone: Send + Sync {
    /// Tokenizes `text`; see [`Tokenizer::tokenize`].
    fn tokenize(&self, text: &str) -> Vec<Token>;
    /// Clones this tokenizer into a new boxed trait object.
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

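/// A thread-safe, named collection of boxed tokenizers, pre-populated with defaults such
/// as `"default"`, `"en_stem"`, and `"de_stop"`.
///
/// Sketch mirroring the unit tests below (`ignore`d since the crate path is not assumed):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
/// registry.register("my_tokenizer", LowercaseTokenizer);
/// let tokenizer = registry.get("en_stem").unwrap();
/// assert_eq!(tokenizer.tokenize("running dogs")[0].text, "run");
/// ```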
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    fn register_defaults(&self) {
        // Basic tokenizers.
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemming.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Stemming for the other supported languages.
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        // Stop-word filtering on top of lowercasing.
        self.register(
            "en_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
        );

        // Stemming combined with stop-word filtering.
        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }

    /// Registers `tokenizer` under `name`, replacing any existing entry.
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Returns a clone of the tokenizer registered under `name`, if any.
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Returns `true` if a tokenizer is registered under `name`.
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// Returns the names of all registered tokenizers.
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

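    // A BoxedTokenizer obtained from the registry can be cloned through the blanket
    // `TokenizerClone` impl and behaves like the original.
    #[test]
    fn test_boxed_tokenizer_clone() {
        let registry = TokenizerRegistry::new();
        let tokenizer = registry.get("lowercase").unwrap();
        let cloned = tokenizer.clone();
        let tokens = cloned.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }
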
    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

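    // Positions are assigned by the inner tokenizer before filtering, so removed stop
    // words leave gaps in the position sequence rather than being renumbered.
    #[test]
    fn test_stop_word_tokenizer_keeps_inner_positions() {
        let custom_stops: HashSet<String> = ["foo"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo bar");

        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "bar");
        assert_eq!(tokens[0].position, 1);
    }
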
    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}