#[cfg(any(feature = "native", feature = "wasm"))]
mod hf_tokenizer;

#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};

#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

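/// A single token produced by a [`Tokenizer`]: the token text, its ordinal
/// position in the token stream, and its byte offsets in the original text.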
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    pub text: String,
    pub position: u32,
    pub offset_from: usize,
    pub offset_to: usize,
}

impl Token {
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

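/// Core tokenizer trait. Implementors must be `Clone + Send + Sync + 'static` so
/// they can be boxed and stored in a [`TokenizerRegistry`].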
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Splits `text` into a sequence of [`Token`]s.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

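/// Tokenizer that splits on whitespace and keeps each word exactly as written.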
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

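/// Tokenizer that splits on whitespace, strips non-alphanumeric characters, and
/// lowercases each word.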
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    tokens.push(Token::new(cleaned, position, offset, offset + word.len()));
                    position += 1;
                }
            }
        }

        tokens
    }
}

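/// Yields each whitespace-separated word in `text` together with the byte offset
/// at which it starts.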
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    text.split_whitespace().map(move |word| {
        let word_start = text[offset..].find(word).unwrap() + offset;
        offset = word_start + word.len();
        (word_start, word)
    })
}

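/// Languages supported by the stemming and stop-word tokenizers.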
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[allow(missing_docs)]
#[derive(Default)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
            // No Tamil stop-word list is used here; fall back to English.
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

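/// Tokenizer adapter that runs an inner [`Tokenizer`] and filters out tokens whose
/// text appears in a stop-word list for the chosen [`Language`] (or in a custom set).
///
/// A minimal usage sketch, mirroring the unit tests below (marked `ignore` because
/// the crate path is not shown in this module):
///
/// ```ignore
/// let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
/// let tokens = Tokenizer::tokenize(&tokenizer, "The quick fox");
/// // "the" is a stop word and is dropped; "quick" and "fox" remain.
/// assert_eq!(tokens.len(), 2);
/// ```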
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    /// Wraps `inner` and filters out the stop words for `language`.
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    /// Convenience constructor using the English stop-word list.
    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    /// Wraps `inner` with a caller-supplied stop-word set.
    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    /// Returns `true` if `word` is in this tokenizer's stop-word set.
    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(&token.text))
            .collect()
    }
}

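/// Tokenizer that lowercases, strips non-alphanumeric characters, and then stems
/// each word with a `rust_stemmers::Stemmer` for the configured [`Language`].
///
/// A minimal sketch based on the unit tests below (marked `ignore`):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::english();
/// let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");
/// assert_eq!(tokens[0].text, "run");
/// assert_eq!(tokens[1].text, "dog");
/// ```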
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Creates a stemming tokenizer for `language`.
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Creates an English stemming tokenizer.
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Build a stemmer for the configured language for this call.
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }
}

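/// Stemming tokenizer whose language can be chosen per call via
/// [`MultiLanguageStemmer::tokenize_with_language`]; the plain [`Tokenizer`] impl
/// uses the configured default language.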
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Creates a stemmer that defaults to `default_language`.
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenizes `text`, stemming each word with the stemmer for `language`.
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }

    /// Returns the language used when no explicit language is given.
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

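/// Tokenizer that picks a stemming [`Language`] from a caller-provided hint string
/// using the `language_selector` closure; see
/// [`LanguageAwareTokenizer::tokenize_with_hint`]. The plain [`Tokenizer`] impl,
/// which has no hint to work with, stems as English.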
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    /// Creates a tokenizer that maps language hints to [`Language`]s with `language_selector`.
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenizes `text`, resolving the stemming language from `language_hint`.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

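/// Parses an ISO 639-1 code (`"en"`, `"de"`, ...) or a full language name
/// (`"english"`, `"German"`, ...) into a [`Language`], case-insensitively.
/// Unknown inputs fall back to [`Language::English`].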
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English,
    }
}

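/// Boxed, dynamically dispatched tokenizer, as stored by [`TokenizerRegistry`].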
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

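/// Object-safe companion to [`Tokenizer`] that also supports cloning through a box.
/// Every [`Tokenizer`] gets this trait via the blanket impl below.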
pub trait TokenizerClone: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<Token>;
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

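/// Thread-safe registry that maps tokenizer names (such as `"default"`, `"en_stem"`,
/// or `"de_stop"`) to boxed tokenizers. A new registry is pre-populated by
/// `register_defaults`.
///
/// A minimal usage sketch, following the unit tests below (marked `ignore` because
/// the crate path is not shown in this module):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
///
/// // Look up a pre-registered stemming tokenizer by name.
/// let en_stem = registry.get("en_stem").expect("registered by default");
/// let tokens = en_stem.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
///
/// // Register a custom tokenizer under a new name.
/// registry.register("my_lowercase", LowercaseTokenizer);
/// assert!(registry.contains("my_lowercase"));
/// ```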
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    fn register_defaults(&self) {
        // Basic tokenizers.
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemming.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Stemming for the other supported languages.
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        // Stop-word filtering on top of lowercasing.
        self.register(
            "en_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
        );

        // Stemming combined with stop-word filtering.
        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }

    /// Registers (or replaces) a tokenizer under `name`.
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Returns a clone of the tokenizer registered under `name`, if any.
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Returns `true` if a tokenizer is registered under `name`.
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// Returns the names of all registered tokenizers.
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}