#[cfg(any(feature = "native", feature = "wasm"))]
mod hf_tokenizer;

#[cfg(feature = "native")]
mod idf_weights;

#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};

#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};

#[cfg(feature = "native")]
pub use idf_weights::{IdfWeights, IdfWeightsCache, idf_weights_cache};

use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;

#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
/// A single token produced by a [`Tokenizer`], with its position in the token
/// stream and its byte offsets in the source text.
pub struct Token {
    /// The (possibly normalized) token text.
    pub text: String,
    /// Position of the token in the token stream.
    pub position: u32,
    /// Byte offset at which the token starts in the source text.
    pub offset_from: usize,
    /// Byte offset at which the token ends (exclusive).
    pub offset_to: usize,
}

impl Token {
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

/// Turns raw text into a sequence of [`Token`]s.
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Splits `text` into tokens.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

#[derive(Debug, Clone, Default)]
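/// Whitespace tokenizer that lowercases each word and strips non-alphanumeric
/// characters.
///
/// # Example
///
/// A minimal usage sketch added for illustration; it mirrors the unit tests at
/// the bottom of this file and is marked `ignore` so it is not compiled as a
/// doctest:
///
/// ```ignore
/// let tokens = Tokenizer::tokenize(&SimpleTokenizer, "Hello, World!");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].text, "world");
/// ```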
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        tokenize_and_clean(text, std::convert::identity)
    }
}

#[derive(Debug, Clone, Default)]
/// Tokenizer that emits the entire trimmed input as a single token,
/// preserving case and punctuation.
pub struct RawTokenizer;

impl Tokenizer for RawTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return Vec::new();
        }
        let offset = text.as_ptr() as usize;
        let trimmed_offset = trimmed.as_ptr() as usize - offset;
        vec![Token::new(
            trimmed.to_string(),
            0,
            trimmed_offset,
            trimmed_offset + trimmed.len(),
        )]
    }
}

#[derive(Debug, Clone, Default)]
/// Case-insensitive variant of [`RawTokenizer`]: emits the entire trimmed
/// input as a single lowercased token.
pub struct RawCiTokenizer;

impl Tokenizer for RawCiTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return Vec::new();
        }
        let offset = text.as_ptr() as usize;
        let trimmed_offset = trimmed.as_ptr() as usize - offset;
        vec![Token::new(
            lowercase_word(trimmed),
            0,
            trimmed_offset,
            trimmed_offset + trimmed.len(),
        )]
    }
}

#[inline]
/// Lowercases `word`; ASCII input takes a cheaper byte-wise path.
fn lowercase_word(word: &str) -> String {
    if word.is_ascii() {
        if word.bytes().all(|b| !b.is_ascii_uppercase()) {
            return word.to_string();
        }
        let mut s = word.to_string();
        s.make_ascii_lowercase();
        s
    } else {
        word.chars().flat_map(|c| c.to_lowercase()).collect()
    }
}

#[inline]
/// Lowercases `word` and drops non-alphanumeric characters; ASCII input takes
/// a byte-wise fast path.
fn clean_word(word: &str) -> String {
    if word.is_ascii() {
        let bytes = word.as_bytes();
        if bytes
            .iter()
            .all(|&b| b.is_ascii_lowercase() || b.is_ascii_digit())
        {
            return word.to_string();
        }
        let mut result = String::with_capacity(bytes.len());
        for &b in bytes {
            if b.is_ascii_alphanumeric() {
                result.push(b.to_ascii_lowercase() as char);
            }
        }
        result
    } else {
        word.chars()
            .filter(|c| c.is_alphanumeric())
            .flat_map(|c| c.to_lowercase())
            .collect()
    }
}

/// Splits `text` on whitespace, cleans each word with [`clean_word`], applies
/// `transform`, and assigns sequential positions and byte offsets.
fn tokenize_and_clean(text: &str, transform: impl Fn(String) -> String) -> Vec<Token> {
    let mut tokens = Vec::with_capacity(text.len() / 5);
    let mut position = 0u32;
    for (offset, word) in split_whitespace_with_offsets(text) {
        if !word.is_empty() {
            let cleaned = clean_word(word);
            if !cleaned.is_empty() {
                tokens.push(Token::new(
                    transform(cleaned),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }
    }
    tokens
}

/// Splits `text` on whitespace, yielding each word together with its byte
/// offset into `text`.
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let base = text.as_ptr() as usize;
    text.split_whitespace()
        .map(move |word| (word.as_ptr() as usize - base, word))
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
/// Languages supported by the stemming and stop-word tokenizers.
#[allow(missing_docs)]
#[derive(Default)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }

    fn to_stop_words_language(self) -> LANGUAGE {
        match self {
            Language::Arabic => LANGUAGE::Arabic,
            Language::Danish => LANGUAGE::Danish,
            Language::Dutch => LANGUAGE::Dutch,
            Language::English => LANGUAGE::English,
            Language::Finnish => LANGUAGE::Finnish,
            Language::French => LANGUAGE::French,
            Language::German => LANGUAGE::German,
            Language::Greek => LANGUAGE::Greek,
            Language::Hungarian => LANGUAGE::Hungarian,
            Language::Italian => LANGUAGE::Italian,
            Language::Norwegian => LANGUAGE::Norwegian,
            Language::Portuguese => LANGUAGE::Portuguese,
            Language::Romanian => LANGUAGE::Romanian,
            Language::Russian => LANGUAGE::Russian,
            Language::Spanish => LANGUAGE::Spanish,
            Language::Swedish => LANGUAGE::Swedish,
            // No Tamil stop-word list is wired up here, so fall back to English.
            Language::Tamil => LANGUAGE::English,
            Language::Turkish => LANGUAGE::Turkish,
        }
    }
}

#[derive(Debug, Clone)]
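/// Wraps another tokenizer and filters out stop words for a given language.
///
/// # Example
///
/// A short sketch added for illustration (it mirrors the stop-word unit tests
/// below; `ignore` keeps it out of the doctest run):
///
/// ```ignore
/// let tokenizer = StopWordTokenizer::new(SimpleTokenizer, Language::English);
/// let tokens = Tokenizer::tokenize(&tokenizer, "The quick fox");
/// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert!(!texts.contains(&"the"));
/// assert!(texts.contains(&"quick"));
/// ```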
pub struct StopWordTokenizer<T: Tokenizer> {
    inner: T,
    stop_words: HashSet<String>,
}

impl<T: Tokenizer> StopWordTokenizer<T> {
    /// Wraps `inner`, filtering out the built-in stop-word list for `language`.
    pub fn new(inner: T, language: Language) -> Self {
        let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
            .into_iter()
            .map(|s| s.to_string())
            .collect();
        Self { inner, stop_words }
    }

    /// Convenience constructor using the English stop-word list.
    pub fn english(inner: T) -> Self {
        Self::new(inner, Language::English)
    }

    /// Wraps `inner`, filtering out a caller-supplied stop-word set.
    pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
        Self { inner, stop_words }
    }

    /// Returns `true` if `word` is in the stop-word set.
    pub fn is_stop_word(&self, word: &str) -> bool {
        self.stop_words.contains(word)
    }
}

impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.inner
            .tokenize(text)
            .into_iter()
            .filter(|token| !self.stop_words.contains(token.text.as_str()))
            .collect()
    }
}

#[derive(Debug, Clone)]
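/// Tokenizer that cleans words like [`SimpleTokenizer`] and then stems them
/// with `rust_stemmers` for the configured language.
///
/// # Example
///
/// A minimal sketch added for illustration (it mirrors
/// `test_stemmer_tokenizer_english` below; `ignore` keeps it out of the
/// doctest run):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::new(Language::English);
/// let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");
/// assert_eq!(tokens[0].text, "dog");
/// assert_eq!(tokens[2].text, "run");
/// ```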
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(&s).into_owned())
    }
}

#[derive(Debug, Clone)]
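/// Stemming tokenizer that can stem with a caller-supplied language per call,
/// falling back to its default language in the plain [`Tokenizer`] impl.
///
/// # Example
///
/// A sketch based on `test_multi_language_stemmer` below (`ignore` keeps it
/// out of the doctest run):
///
/// ```ignore
/// let stemmer = MultiLanguageStemmer::new(Language::English);
/// let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// ```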
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        tokenize_and_clean(text, |s| stemmer.stem(&s).into_owned())
    }

    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

#[derive(Clone)]
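/// Tokenizer that chooses the stemming language from a caller-provided hint
/// via a selector function; without a hint it falls back to English.
///
/// # Example
///
/// A sketch based on `test_language_aware_tokenizer` below (`ignore` keeps it
/// out of the doctest run):
///
/// ```ignore
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
/// assert_eq!(tokens[0].text, "haus");
/// ```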
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Without a language hint, fall back to English.
        self.stemmer.tokenize_with_language(text, Language::English)
    }
}

/// Parses a two-letter code (e.g. "en") or a full language name into a
/// [`Language`], defaulting to English for unrecognized input.
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English,
    }
}

/// A boxed, object-safe tokenizer.
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// Object-safe companion trait to [`Tokenizer`] that supports cloning through
/// a `Box<dyn TokenizerClone>`.
pub trait TokenizerClone: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<Token>;
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

#[derive(Clone)]
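/// Thread-safe, name-keyed registry of tokenizers, pre-populated with the
/// defaults added by `register_defaults`.
///
/// # Example
///
/// A sketch based on `test_tokenizer_registry_get` below (`ignore` keeps it
/// out of the doctest run):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
/// let tokenizer = registry.get("en_stem").unwrap();
/// let tokens = tokenizer.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
/// ```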
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    fn register_defaults(&self) {
        self.register("simple", SimpleTokenizer);
        self.register("raw", RawTokenizer);
        self.register("raw_ci", RawCiTokenizer);

        // Stemming tokenizers, under both short and full-name keys.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));

        // Stop-word filtering over the simple tokenizer.
        self.register(
            "en_stop",
            StopWordTokenizer::new(SimpleTokenizer, Language::English),
        );
        self.register(
            "de_stop",
            StopWordTokenizer::new(SimpleTokenizer, Language::German),
        );
        self.register(
            "fr_stop",
            StopWordTokenizer::new(SimpleTokenizer, Language::French),
        );
        self.register(
            "ru_stop",
            StopWordTokenizer::new(SimpleTokenizer, Language::Russian),
        );
        self.register(
            "es_stop",
            StopWordTokenizer::new(SimpleTokenizer, Language::Spanish),
        );

        // Stemming combined with stop-word filtering.
        self.register(
            "en_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
        );
        self.register(
            "de_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
        );
        self.register(
            "fr_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
        );
        self.register(
            "ru_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
        );
        self.register(
            "es_stem_stop",
            StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
        );
    }
612
613 pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
615 let mut tokenizers = self.tokenizers.write();
616 tokenizers.insert(name.to_string(), Box::new(tokenizer));
617 }
618
619 pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
621 let tokenizers = self.tokenizers.read();
622 tokenizers.get(name).cloned()
623 }
624
625 pub fn contains(&self, name: &str) -> bool {
627 let tokenizers = self.tokenizers.read();
628 tokenizers.contains_key(name)
629 }
630
631 pub fn names(&self) -> Vec<String> {
633 let tokenizers = self.tokenizers.read();
634 tokenizers.keys().cloned().collect()
635 }
636}
637
638impl Default for TokenizerRegistry {
639 fn default() -> Self {
640 Self::new()
641 }
642}
643
644#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello World");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_raw_tokenizer() {
        let tokenizer = RawTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "Hello, World!");
        assert_eq!(tokens[0].position, 0);
    }

    #[test]
    fn test_raw_tokenizer_trims() {
        let tokenizer = RawTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "  spaced  ");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "spaced");
        assert_eq!(tokens[0].offset_from, 2);
    }

    #[test]
    fn test_raw_tokenizer_empty() {
        let tokenizer = RawTokenizer;
        assert!(Tokenizer::tokenize(&tokenizer, "").is_empty());
        assert!(Tokenizer::tokenize(&tokenizer, " ").is_empty());
    }

    #[test]
    fn test_raw_ci_tokenizer() {
        let tokenizer = RawCiTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello, world!");
        assert_eq!(tokens[0].position, 0);
    }

    #[test]
    fn test_raw_ci_tokenizer_preserves_structure() {
        let tokenizer = RawCiTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "HTTPS://Example.COM/Page");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "https://example.com/page");
    }

    #[test]
    fn test_simple_tokenizer_strips_punctuation() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("simple"));
        assert!(registry.contains("raw"));
        assert!(registry.contains("raw_ci"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", SimpleTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }

    #[test]
    fn test_stop_word_tokenizer_english() {
        let tokenizer = StopWordTokenizer::english(SimpleTokenizer);
        let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"over"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"brown"));
        assert!(texts.contains(&"fox"));
        assert!(texts.contains(&"jumps"));
        assert!(texts.contains(&"lazy"));
        assert!(texts.contains(&"dog"));
    }

    #[test]
    fn test_stop_word_tokenizer_with_stemmer() {
        let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
        let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
        assert!(texts.contains(&"quantum"));
    }

    #[test]
    fn test_stop_word_tokenizer_german() {
        let tokenizer = StopWordTokenizer::new(SimpleTokenizer, Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"der"));
        assert!(!texts.contains(&"und"));
        assert!(!texts.contains(&"die"));
        assert!(texts.contains(&"hund"));
        assert!(texts.contains(&"katze"));
    }

    #[test]
    fn test_stop_word_tokenizer_custom() {
        let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        let tokenizer = StopWordTokenizer::with_custom_stop_words(SimpleTokenizer, custom_stops);
        let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");

        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"foo"));
        assert!(!texts.contains(&"bar"));
        assert!(texts.contains(&"baz"));
        assert!(texts.contains(&"qux"));
    }

    #[test]
    fn test_stop_word_tokenizer_is_stop_word() {
        let tokenizer = StopWordTokenizer::english(SimpleTokenizer);
        assert!(tokenizer.is_stop_word("the"));
        assert!(tokenizer.is_stop_word("and"));
        assert!(tokenizer.is_stop_word("is"));
        assert!(!tokenizer.is_stop_word("elephant"));
        assert!(!tokenizer.is_stop_word("quantum"));
    }

    #[test]
    fn test_tokenizer_registry_stop_word_tokenizers() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("en_stop"));
        assert!(registry.contains("en_stem_stop"));
        assert!(registry.contains("de_stop"));
        assert!(registry.contains("ru_stop"));

        let tokenizer = registry.get("en_stop").unwrap();
        let tokens = tokenizer.tokenize("The quick fox");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fox"));

        let tokenizer = registry.get("en_stem_stop").unwrap();
        let tokens = tokenizer.tokenize("elephants galaxies");
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"eleph"));
        assert!(texts.contains(&"galaxi"));
    }
}