use std::collections::HashMap;
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};

/// A single token produced by a tokenizer.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// The token text (possibly lowercased and/or stemmed).
    pub text: String,
    /// Position of the token in the token stream, starting at 0.
    pub position: u32,
    /// Byte offset of the start of the original word in the input text.
    pub offset_from: usize,
    /// Byte offset just past the end of the original word.
    pub offset_to: usize,
}

impl Token {
    /// Creates a new token.
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

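/// Converts text into a stream of [`Token`]s.
///
/// A minimal usage sketch (marked `ignore` because the `crate::tokenizer`
/// module path is an assumption, not part of this file):
///
/// ```ignore
/// use crate::tokenizer::{SimpleTokenizer, Tokenizer};
///
/// let tokens = SimpleTokenizer.tokenize("hello world");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].position, 1);
/// ```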
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Splits `text` into tokens.
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

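/// Whitespace tokenizer that keeps each word exactly as written,
/// including punctuation and case.
///
/// A sketch of the expected behavior (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{SimpleTokenizer, Tokenizer};
///
/// let tokens = SimpleTokenizer.tokenize("Hello, World!");
/// assert_eq!(tokens[0].text, "Hello,"); // punctuation preserved
/// assert_eq!(tokens[1].text, "World!");
/// ```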
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

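/// Whitespace tokenizer that lowercases words and strips
/// non-alphanumeric characters.
///
/// A sketch of the expected behavior (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{LowercaseTokenizer, Tokenizer};
///
/// let tokens = LowercaseTokenizer.tokenize("Hello, World!");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].text, "world");
/// ```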
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                // Keep only alphanumeric characters, lowercased; the reported
                // offsets still span the original (uncleaned) word.
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    tokens.push(Token::new(cleaned, position, offset, offset + word.len()));
                    position += 1;
                }
            }
        }

        tokens
    }
}

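/// Yields `(byte_offset, word)` pairs for each whitespace-separated word.
///
/// A sketch of the expected output (the function is private, so the
/// example is not compiled):
///
/// ```ignore
/// let pairs: Vec<_> = split_whitespace_with_offsets("ab  cd").collect();
/// assert_eq!(pairs, vec![(0, "ab"), (4, "cd")]);
/// ```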
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    text.split_whitespace().map(move |word| {
        // `split_whitespace` yields words in input order, so searching from
        // the current offset always finds this word; the unwrap cannot fail.
        let word_start = text[offset..].find(word).unwrap() + offset;
        offset = word_start + word.len();
        (word_start, word)
    })
}

/// Languages supported by the Snowball stemmers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[allow(missing_docs)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }
}

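/// Tokenizer that lowercases and strips non-alphanumerics like
/// [`LowercaseTokenizer`], then applies a Snowball stemmer for the
/// configured language.
///
/// A sketch matching the behavior exercised in the tests below (assumed
/// `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{StemmerTokenizer, Tokenizer};
///
/// let tokens = StemmerTokenizer::english().tokenize("Dogs are running");
/// assert_eq!(tokens[0].text, "dog");
/// assert_eq!(tokens[2].text, "run");
/// ```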
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Creates a stemming tokenizer for the given language.
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Creates an English stemming tokenizer.
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Create the stemmer per call; `Stemmer::create` is cheap.
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }
}

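/// Stemmer that can tokenize in any supported language at call time,
/// falling back to a default language via the [`Tokenizer`] impl.
///
/// A sketch matching the tests below (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{Language, MultiLanguageStemmer};
///
/// let stemmer = MultiLanguageStemmer::new(Language::English);
/// let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// ```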
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Creates a stemmer with the given default language.
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenizes `text`, stemming with the given language.
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }

    /// Returns the default language used by [`Tokenizer::tokenize`].
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

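/// Tokenizer that picks the stemming language from a caller-supplied
/// hint via a selector function.
///
/// A sketch matching the tests below (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{parse_language, LanguageAwareTokenizer};
///
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
/// assert_eq!(tokens[0].text, "haus");
/// ```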
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
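    /// Creates a tokenizer from a selector that maps hint strings
    /// (e.g. "en", "german") to a [`Language`].
    ///
    /// A sketch of typical usage with [`parse_language`] as the selector
    /// (assumed `crate::tokenizer` path):
    ///
    /// ```ignore
    /// use crate::tokenizer::{parse_language, LanguageAwareTokenizer};
    ///
    /// let tokenizer = LanguageAwareTokenizer::new(parse_language);
    /// let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
    /// assert_eq!(tokens[0].text, "run");
    /// ```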
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenizes `text`, choosing the stemming language by passing
    /// `language_hint` through the selector.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // Without a hint, fall back to the stemmer's default language.
        self.stemmer
            .tokenize_with_language(text, self.stemmer.default_language())
    }
}

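/// Parses a language name or ISO 639-1 code, falling back to English
/// for unrecognized input.
///
/// A sketch matching the tests below (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{parse_language, Language};
///
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("German"), Language::German); // case-insensitive
/// assert_eq!(parse_language("unknown"), Language::English); // fallback
/// ```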
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English,
    }
}

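/// A type-erased, clonable tokenizer.
///
/// A sketch of boxing and cloning (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{BoxedTokenizer, LowercaseTokenizer};
///
/// let boxed: BoxedTokenizer = Box::new(LowercaseTokenizer);
/// let copy = boxed.clone(); // Clone goes through TokenizerClone::clone_box
/// assert_eq!(copy.tokenize("Hello")[0].text, "hello");
/// ```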
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// Object-safe companion to [`Tokenizer`]: the `Clone` supertrait makes
/// `Tokenizer` itself unusable as a trait object, so boxed tokenizers go
/// through this trait and `clone_box` instead.
pub trait TokenizerClone: Send + Sync {
    /// Splits `text` into tokens.
    fn tokenize(&self, text: &str) -> Vec<Token>;
    /// Clones the tokenizer behind the box.
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

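/// Thread-safe, clonable registry mapping names to tokenizers.
///
/// A sketch matching the tests below (assumed `crate::tokenizer` path):
///
/// ```ignore
/// use crate::tokenizer::{LowercaseTokenizer, TokenizerRegistry};
///
/// let registry = TokenizerRegistry::new();
/// let stemmer = registry.get("en_stem").unwrap();
/// assert_eq!(stemmer.tokenize("running dogs")[0].text, "run");
///
/// registry.register("custom", LowercaseTokenizer);
/// assert!(registry.contains("custom"));
/// ```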
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Creates a registry pre-populated with the default tokenizers.
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    fn register_defaults(&self) {
        // Plain (non-stemming) tokenizers.
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemmer.
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Stemmers for the remaining supported languages.
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));
    }

    /// Registers (or replaces) a tokenizer under `name`.
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Returns a clone of the tokenizer registered under `name`, if any.
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Returns `true` if a tokenizer is registered under `name`.
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// Returns the names of all registered tokenizers, in arbitrary order.
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}


#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }
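
    // Sketch documenting assumed-intentional behavior: SimpleTokenizer
    // splits on whitespace only, so punctuation stays attached to tokens
    // and offsets span the full word.
    #[test]
    fn test_simple_tokenizer_keeps_punctuation() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "Hello,");
        assert_eq!(tokens[1].text, "World!");
        assert_eq!(tokens[1].offset_from, 7);
        assert_eq!(tokens[1].offset_to, 13);
    }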

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog");
        assert_eq!(tokens[1].text, "are");
        assert_eq!(tokens[2].text, "run");
        assert_eq!(tokens[3].text, "quick");
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7);
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12);
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English);
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }
}