#[cfg(any(feature = "native", feature = "wasm"))]
mod hf_tokenizer;

#[cfg(feature = "native")]
mod idf_weights;

#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};

#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};

#[cfg(feature = "native")]
pub use idf_weights::{IdfWeights, IdfWeightsCache, idf_weights_cache};

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

/// A single token produced by a [`Tokenizer`], carrying the processed text
/// together with its position and byte offsets in the original input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// Processed token text (lowercased, stemmed, etc., depending on the tokenizer).
    pub text: String,
    /// Zero-based position of the token in the token stream.
    pub position: u32,
    /// Byte offset of the start of the original word in the input text.
    pub offset_from: usize,
    /// Byte offset just past the end of the original word in the input text.
    pub offset_to: usize,
}

impl Token {
    /// Creates a new token from its text, position, and byte offsets.
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

/// A tokenizer splits text into a sequence of [`Token`]s.
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Tokenizes the given text.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

/// Whitespace tokenizer that keeps words exactly as they appear in the input.
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Rough capacity guess: assume an average word length of about five bytes.
        let mut tokens = Vec::with_capacity(text.len() / 5);
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

/// Whitespace tokenizer that lowercases words and strips non-alphanumeric characters.
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        tokenize_and_clean(text, std::convert::identity)
    }
}

/// Lowercases a word and strips non-alphanumeric characters.
///
/// Uses a byte-wise fast path for ASCII input and falls back to a
/// character-wise path for other Unicode text.
#[inline]
fn clean_word(word: &str) -> String {
    if word.is_ascii() {
        let bytes = word.as_bytes();
        // Fast path: the word is already lowercase alphanumeric ASCII.
        if bytes
            .iter()
            .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit())
        {
            return word.to_string();
        }
        // Otherwise lowercase and filter byte by byte.
        let mut result = String::with_capacity(bytes.len());
        for &b in bytes {
            if b.is_ascii_alphanumeric() {
                result.push(b.to_ascii_lowercase() as char);
            }
        }
        result
    } else {
        word.chars()
            .filter(|c| c.is_alphanumeric())
            .flat_map(|c| c.to_lowercase())
            .collect()
    }
}

/// Shared tokenization loop: splits on whitespace, cleans each word with
/// [`clean_word`], applies `transform` to the cleaned word, and records the
/// byte offsets of the original (uncleaned) word.
fn tokenize_and_clean(text: &str, transform: impl Fn(String) -> String) -> Vec<Token> {
    let mut tokens = Vec::with_capacity(text.len() / 5);
    let mut position = 0u32;
    for (offset, word) in split_whitespace_with_offsets(text) {
        if !word.is_empty() {
            let cleaned = clean_word(word);
            if !cleaned.is_empty() {
                tokens.push(Token::new(
                    transform(cleaned),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }
    }
    tokens
}

/// Like `str::split_whitespace`, but also yields the byte offset of each word.
///
/// Each yielded word is a subslice of `text`, so its start offset is its
/// pointer minus the base pointer of `text`.
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let base = text.as_ptr() as usize;
    text.split_whitespace()
        .map(move |word| (word.as_ptr() as usize - base, word))
}

/// Languages supported by the stemming and stop-word tokenizers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
#[allow(missing_docs)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
            // No Tamil stop-word list is mapped here; fall back to English.
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

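/// Tokenizer wrapper that removes stop words from the output of an inner tokenizer.
///
/// The stop-word check compares the *processed* token text, so the inner tokenizer
/// should lowercase (and, if desired, stem) before filtering.
///
/// Minimal usage sketch (doctest marked `ignore`; the import path depends on how
/// this module is re-exported from the crate root):
///
/// ```ignore
/// let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::English);
/// let tokens = tokenizer.tokenize("The quick fox");
/// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert_eq!(texts, vec!["quick", "fox"]);
/// ```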
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    /// Wraps `inner` with the stop-word list for `language`.
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    /// Wraps `inner` with the English stop-word list.
    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    /// Wraps `inner` with a caller-provided set of stop words.
    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    /// Returns `true` if `word` is in the stop-word list.
    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(token.text.as_str()))
            .collect()
    }
}

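/// Tokenizer that lowercases, strips punctuation, and applies Snowball stemming
/// (via the `rust-stemmers` crate) for a fixed [`Language`].
///
/// Minimal usage sketch (doctest marked `ignore`; the import path depends on how
/// this module is re-exported from the crate root):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::new(Language::English);
/// let tokens = tokenizer.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
/// assert_eq!(tokens[1].text, "dog");
/// ```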
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Creates a stemming tokenizer for the given language.
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Creates an English stemming tokenizer.
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(&s).into_owned())
    }
}

/// Stemmer that can stem text in any supported [`Language`], chosen per call,
/// with a configurable default for the plain [`Tokenizer::tokenize`] path.
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Creates a multi-language stemmer with the given default language.
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenizes and stems `text` using the given language.
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(&s).into_owned())
    }

    /// Returns the default language used by [`Tokenizer::tokenize`].
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

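/// Tokenizer that picks a stemming language from a caller-supplied hint.
///
/// The selector maps a hint string (e.g. an ISO code such as `"de"`) to a
/// [`Language`]; `tokenize_with_hint` then stems with that language, while the
/// plain [`Tokenizer::tokenize`] call falls back to English.
///
/// Minimal usage sketch (doctest marked `ignore`; the import path depends on how
/// this module is re-exported from the crate root):
///
/// ```ignore
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
/// assert_eq!(tokens[0].text, "haus");
/// assert_eq!(tokens[1].text, "buch");
/// ```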
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    /// Creates a language-aware tokenizer from a hint-to-language selector.
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenizes `text`, choosing the stemming language from `language_hint`.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Without a language hint, fall back to English.
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

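/// Parses a language name or ISO 639-1 code (case-insensitive) into a [`Language`],
/// defaulting to English for unrecognized input.
///
/// Minimal usage sketch (doctest marked `ignore`; the import path depends on how
/// this module is re-exported from the crate root):
///
/// ```ignore
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("German"), Language::German);
/// assert_eq!(parse_language("klingon"), Language::English);
/// ```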
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        // Unrecognized input defaults to English.
        _ => Language::English,
    }
}

/// A heap-allocated, clonable tokenizer trait object.
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// Object-safe companion to [`Tokenizer`] that supports cloning through a trait object.
pub trait TokenizerClone: Send + Sync {
    /// Tokenizes the given text.
    fn tokenize(&self, text: &str) -> Vec<Token>;
    /// Clones this tokenizer into a new boxed trait object.
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

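/// Thread-safe registry mapping tokenizer names to boxed tokenizers.
///
/// The map is held behind an `Arc<RwLock<..>>`, so clones of the registry share
/// the same underlying set of tokenizers. [`TokenizerRegistry::new`] pre-registers
/// the default tokenizers (e.g. `"simple"`, `"lowercase"`, `"en_stem"`, `"en_stop"`).
///
/// Minimal usage sketch (doctest marked `ignore`; the import path depends on how
/// this module is re-exported from the crate root):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
/// let tokenizer = registry.get("en_stem").expect("registered by default");
/// let tokens = tokenizer.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
/// ```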
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    /// Registers the built-in tokenizers under their default names.
    fn register_defaults(&self) {
        // Basic tokenizers.
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemming tokenizers.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Stemming tokenizers for the other supported languages.
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        // Lowercase tokenizers with stop-word filtering.
        self.register(
            "en_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
        );

        // Stemming tokenizers with stop-word filtering.
        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }

    /// Registers `tokenizer` under `name`, replacing any existing entry.
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Returns a clone of the tokenizer registered under `name`, if any.
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Returns `true` if a tokenizer is registered under `name`.
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// Returns the names of all registered tokenizers.
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog"); // "Dogs" -> "dog"
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run"); // "running" -> "run"
        assert_eq!(tokens[3].text, "quick"); // "quickly" -> "quick"
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7); // "Running" spans bytes 0..7
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12); // "dogs" spans bytes 8..12
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus"); // "Häuser" -> "haus"
        assert_eq!(tokens[1].text, "buch"); // "Bücher" -> "buch"
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ"); // "бегущие" -> "бегущ"
        assert_eq!(tokens[1].text, "собак"); // "собаки" -> "собак"
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        // Stop-word filtering runs on the stemmed output of the inner tokenizer.
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph")); // "elephants" -> "eleph"
        assert!(texts.contains(&"galaxi")); // "galaxies" -> "galaxi"
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}