#[cfg(any(feature = "native", feature = "wasm"))]
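/// Hugging Face (`hf`) tokenizer support.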
mod hf_tokenizer;

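/// Inverse document frequency (IDF) weights and their cache.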
#[cfg(feature = "native")]
mod idf_weights;

#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};

#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};

#[cfg(feature = "native")]
pub use idf_weights::{IdfWeights, IdfWeightsCache, idf_weights_cache};

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

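/// A single token produced by a [`Tokenizer`]: the token text, its position in the
/// token stream, and its byte offsets into the original input.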
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    pub text: String,
    pub position: u32,
    pub offset_from: usize,
    pub offset_to: usize,
}

impl Token {
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

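/// Splits text into [`Token`]s. Implementors must be `Send + Sync + Clone + 'static` so
/// they can be shared across threads and boxed as a [`BoxedTokenizer`].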
pub trait Tokenizer: Send + Sync + Clone + 'static {
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

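/// Whitespace tokenizer that keeps each word exactly as it appears in the input.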
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

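/// Whitespace tokenizer that lowercases each word and strips non-alphanumeric characters.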
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        tokenize_and_clean(text, |s| s.to_string())
    }
}

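/// Lowercases a word and removes non-alphanumeric characters, with a fast path for
/// pure-ASCII input that avoids Unicode case mapping.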
#[inline]
fn clean_word(word: &str) -> String {
    if word.is_ascii() {
        let mut result = String::with_capacity(word.len());
        for &b in word.as_bytes() {
            if b.is_ascii_alphanumeric() {
                result.push(b.to_ascii_lowercase() as char);
            }
        }
        result
    } else {
        word.chars()
            .filter(|c| c.is_alphanumeric())
            .flat_map(|c| c.to_lowercase())
            .collect()
    }
}

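/// Shared helper for the cleaning tokenizers: splits on whitespace, cleans each word with
/// [`clean_word`], and applies `transform` (identity, stemming, ...) to the cleaned text
/// while keeping the byte offsets of the original word.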
fn tokenize_and_clean(text: &str, transform: impl Fn(&str) -> String) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut position = 0u32;
    for (offset, word) in split_whitespace_with_offsets(text) {
        if !word.is_empty() {
            let cleaned = clean_word(word);
            if !cleaned.is_empty() {
                tokens.push(Token::new(
                    transform(&cleaned),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }
    }
    tokens
}

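/// Like `str::split_whitespace`, but also yields each word's byte offset within `text`,
/// computed from the word slice's pointer relative to the start of the string.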
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let base = text.as_ptr() as usize;
    text.split_whitespace()
        .map(move |word| (word.as_ptr() as usize - base, word))
}

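/// Languages supported by the stemming and stop-word tokenizers. Defaults to English.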
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[allow(missing_docs)]
#[derive(Default)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
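            // No Tamil stop-word list is available, so Tamil falls back to English.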
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

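/// Wraps another tokenizer and removes stop words for the configured language after the
/// inner tokenizer has run.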
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(&token.text))
            .collect()
    }
}

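/// Tokenizer that lowercases and cleans each word, then applies a Snowball stemmer
/// (via `rust_stemmers`) for the configured language.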
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(s).into_owned())
    }
}

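/// Stemming tokenizer that can stem in any supported [`Language`] at call time through
/// `tokenize_with_language`, with a configurable default used by plain `tokenize`.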
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(s).into_owned())
    }

    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

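/// Tokenizer that chooses a stemming [`Language`] from a caller-supplied hint via its
/// `language_selector`. The plain `tokenize` path, which has no hint, stems as English.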
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
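    /// Creates a tokenizer whose `language_selector` maps a language hint (for example an
    /// ISO code such as "de") to a [`Language`], e.g. `LanguageAwareTokenizer::new(parse_language)`.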
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

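/// Parses an ISO 639-1 code or an English language name (case-insensitive). Unknown values
/// fall back to [`Language::English`].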
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English,
    }
}

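/// Type-erased, clonable tokenizer handle, as stored by [`TokenizerRegistry`].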
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

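/// Object-safe companion to [`Tokenizer`]: every `Tokenizer` gets a blanket implementation,
/// and `clone_box` allows boxed tokenizers to be cloned.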
pub trait TokenizerClone: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<Token>;
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

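/// Thread-safe, clonable registry that maps tokenizer names to boxed tokenizers.
/// `TokenizerRegistry::new` pre-registers defaults: "default", "simple", "lowercase", "raw",
/// per-language stemmers ("en_stem", "german", ...), stop-word filters ("en_stop", ...),
/// and combined stem-plus-stop-word variants ("en_stem_stop", ...).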
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    fn register_defaults(&self) {
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        self.register(
            "en_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
        );

        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }

    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}