hermes_core/tokenizer/
mod.rs

//! Tokenizer API for text processing

use std::collections::HashMap;
use std::sync::Arc;

use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};

/// A token produced by tokenization
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// The text content of the token
    pub text: String,
    /// Position in the token stream (0-indexed)
    pub position: u32,
    /// Byte offset from start of original text
    pub offset_from: usize,
    /// Byte offset to end of token in original text
    pub offset_to: usize,
}

impl Token {
    /// Create a new token from its text, position, and byte offsets
    pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
        Self {
            text,
            position,
            offset_from,
            offset_to,
        }
    }
}

/// Trait for tokenizers
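///
/// Implementors must be cheap to clone and safe to share across threads.
///
/// # Example
///
/// A minimal custom implementation sketch (`CommaTokenizer` is illustrative,
/// not part of this crate):
///
/// ```ignore
/// #[derive(Debug, Clone, Default)]
/// struct CommaTokenizer;
///
/// impl Tokenizer for CommaTokenizer {
///     fn tokenize(&self, text: &str) -> Vec<Token> {
///         let mut offset = 0;
///         text.split(',')
///             .enumerate()
///             .map(|(i, part)| {
///                 let start = offset;
///                 offset += part.len() + 1; // +1 skips the consumed comma
///                 Token::new(part.trim().to_string(), i as u32, start, start + part.len())
///             })
///             .collect()
///     }
/// }
/// ```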
pub trait Tokenizer: Send + Sync + Clone + 'static {
    /// Tokenize the input text into a vector of tokens
    fn tokenize(&self, text: &str) -> Vec<Token>;
}

/// Simple whitespace tokenizer
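///
/// Splits on Unicode whitespace and keeps each word verbatim.
///
/// # Example
///
/// A minimal usage sketch:
///
/// ```ignore
/// let tokens = Tokenizer::tokenize(&SimpleTokenizer, "hello world");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!((tokens[1].offset_from, tokens[1].offset_to), (6, 11));
/// ```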
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;

impl Tokenizer for SimpleTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                tokens.push(Token::new(
                    word.to_string(),
                    position,
                    offset,
                    offset + word.len(),
                ));
                position += 1;
            }
        }

        tokens
    }
}

/// Lowercase tokenizer - splits on whitespace, drops non-alphanumeric characters, and lowercases
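///
/// # Example
///
/// A minimal sketch showing case folding and punctuation removal:
///
/// ```ignore
/// let tokens = Tokenizer::tokenize(&LowercaseTokenizer, "Hello, World!");
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[1].text, "world");
/// ```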
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                // Remove non-alphanumeric characters and lowercase
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    // Offsets still cover the original word, punctuation included
                    tokens.push(Token::new(cleaned, position, offset, offset + word.len()));
                    position += 1;
                }
            }
        }

        tokens
    }
}

/// Split text on whitespace, returning (offset, word) pairs
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    text.split_whitespace().map(move |word| {
        // Only whitespace separates consecutive words, so searching from
        // `offset` always finds this word's actual start; the unwrap cannot fail
        let word_start = text[offset..].find(word).unwrap() + offset;
        offset = word_start + word.len();
        (word_start, word)
    })
}

/// Supported stemmer languages
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
#[allow(missing_docs)]
pub enum Language {
    Arabic,
    Danish,
    Dutch,
    #[default]
    English,
    Finnish,
    French,
    German,
    Greek,
    Hungarian,
    Italian,
    Norwegian,
    Portuguese,
    Romanian,
    Russian,
    Spanish,
    Swedish,
    Tamil,
    Turkish,
}

impl Language {
    fn to_algorithm(self) -> Algorithm {
        match self {
            Language::Arabic => Algorithm::Arabic,
            Language::Danish => Algorithm::Danish,
            Language::Dutch => Algorithm::Dutch,
            Language::English => Algorithm::English,
            Language::Finnish => Algorithm::Finnish,
            Language::French => Algorithm::French,
            Language::German => Algorithm::German,
            Language::Greek => Algorithm::Greek,
            Language::Hungarian => Algorithm::Hungarian,
            Language::Italian => Algorithm::Italian,
            Language::Norwegian => Algorithm::Norwegian,
            Language::Portuguese => Algorithm::Portuguese,
            Language::Romanian => Algorithm::Romanian,
            Language::Russian => Algorithm::Russian,
            Language::Spanish => Algorithm::Spanish,
            Language::Swedish => Algorithm::Swedish,
            Language::Tamil => Algorithm::Tamil,
            Language::Turkish => Algorithm::Turkish,
        }
    }
}

/// Stemming tokenizer - splits on whitespace, drops non-alphanumeric characters,
/// lowercases, and applies stemming
///
/// Uses the Snowball stemming algorithm via rust-stemmers.
/// Supports multiple languages including English, German, French, Spanish, etc.
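///
/// # Example
///
/// A minimal sketch of English stemming (mirrors the unit tests):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::english();
/// let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");
/// assert_eq!(tokens[0].text, "dog");   // dogs -> dog
/// assert_eq!(tokens[2].text, "run");   // running -> run
/// assert_eq!(tokens[3].text, "quick"); // quickly -> quick
/// ```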
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
    language: Language,
}

impl StemmerTokenizer {
    /// Create a new stemmer tokenizer for the given language
    pub fn new(language: Language) -> Self {
        Self { language }
    }

    /// Create a new English stemmer tokenizer
    pub fn english() -> Self {
        Self::new(Language::English)
    }
}

impl Default for StemmerTokenizer {
    fn default() -> Self {
        Self::english()
    }
}

impl Tokenizer for StemmerTokenizer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // The stemmer is built per call; the tokenizer stores only the
        // language, which keeps it trivially Clone
        let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                // Remove non-alphanumeric characters and lowercase
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    // Apply stemming
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }
}

/// Multi-language stemmer that can select its language dynamically
///
/// This tokenizer builds a stemmer on demand for any supported language
/// selected at runtime, falling back to a configurable default language.
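///
/// # Example
///
/// A minimal sketch of switching languages at call time (mirrors the unit
/// tests):
///
/// ```ignore
/// let stemmer = MultiLanguageStemmer::new(Language::English);
///
/// let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
/// assert_eq!(tokens[0].text, "run");
///
/// let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// ```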
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
    default_language: Language,
}

impl MultiLanguageStemmer {
    /// Create a new multi-language stemmer with the given default language
    pub fn new(default_language: Language) -> Self {
        Self { default_language }
    }

    /// Tokenize text using a specific language
    pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        let mut tokens = Vec::new();
        let mut position = 0u32;

        for (offset, word) in split_whitespace_with_offsets(text) {
            if !word.is_empty() {
                let cleaned: String = word
                    .chars()
                    .filter(|c| c.is_alphanumeric())
                    .flat_map(|c| c.to_lowercase())
                    .collect();

                if !cleaned.is_empty() {
                    let stemmed = stemmer.stem(&cleaned);
                    tokens.push(Token::new(
                        stemmed.into_owned(),
                        position,
                        offset,
                        offset + word.len(),
                    ));
                    position += 1;
                }
            }
        }

        tokens
    }

    /// Get the default language
    pub fn default_language(&self) -> Language {
        self.default_language
    }
}

impl Default for MultiLanguageStemmer {
    fn default() -> Self {
        Self::new(Language::English)
    }
}

impl Tokenizer for MultiLanguageStemmer {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        self.tokenize_with_language(text, self.default_language)
    }
}

/// Language-aware tokenizer that can be configured per-field
///
/// This allows selecting the stemmer language based on document metadata,
/// such as a "language" field in the document.
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    language_selector: F,
    stemmer: MultiLanguageStemmer,
}

impl<F> LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    /// Create a new language-aware tokenizer with a custom language selector
    ///
    /// The selector function receives a language hint (e.g., from a document field)
    /// and returns the appropriate Language to use for stemming.
    ///
    /// # Example
    /// ```ignore
    /// let tokenizer = LanguageAwareTokenizer::new(|hint| {
    ///     match hint {
    ///         "en" | "english" => Language::English,
    ///         "de" | "german" => Language::German,
    ///         "ru" | "russian" => Language::Russian,
    ///         _ => Language::English,
    ///     }
    /// });
    /// ```
    pub fn new(language_selector: F) -> Self {
        Self {
            language_selector,
            stemmer: MultiLanguageStemmer::default(),
        }
    }

    /// Tokenize text with a language hint
    ///
    /// The hint is passed to the language selector to determine which stemmer to use.
    pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
        let language = (self.language_selector)(language_hint);
        self.stemmer.tokenize_with_language(text, language)
    }
}

impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
    F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
    fn tokenize(&self, text: &str) -> Vec<Token> {
        // With no hint available, fall back to the stemmer's default language
        self.stemmer
            .tokenize_with_language(text, self.stemmer.default_language())
    }
}

/// Parse a language string into a Language enum
///
/// Supports common language codes and names.
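///
/// # Example
///
/// A minimal sketch of code/name matching and the English fallback:
///
/// ```ignore
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("German"), Language::German); // case-insensitive
/// assert_eq!(parse_language("klingon"), Language::English); // fallback
/// ```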
pub fn parse_language(s: &str) -> Language {
    match s.to_lowercase().as_str() {
        "ar" | "arabic" => Language::Arabic,
        "da" | "danish" => Language::Danish,
        "nl" | "dutch" => Language::Dutch,
        "en" | "english" => Language::English,
        "fi" | "finnish" => Language::Finnish,
        "fr" | "french" => Language::French,
        "de" | "german" => Language::German,
        "el" | "greek" => Language::Greek,
        "hu" | "hungarian" => Language::Hungarian,
        "it" | "italian" => Language::Italian,
        "no" | "norwegian" => Language::Norwegian,
        "pt" | "portuguese" => Language::Portuguese,
        "ro" | "romanian" => Language::Romanian,
        "ru" | "russian" => Language::Russian,
        "es" | "spanish" => Language::Spanish,
        "sv" | "swedish" => Language::Swedish,
        "ta" | "tamil" => Language::Tamil,
        "tr" | "turkish" => Language::Turkish,
        _ => Language::English, // Default fallback
    }
}

/// Boxed tokenizer for dynamic dispatch
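///
/// # Example
///
/// A minimal sketch of holding heterogeneous tokenizers behind one type:
///
/// ```ignore
/// let tokenizers: Vec<BoxedTokenizer> = vec![
///     Box::new(SimpleTokenizer),
///     Box::new(StemmerTokenizer::english()),
/// ];
/// for tokenizer in &tokenizers {
///     let _tokens = tokenizer.tokenize("running dogs");
/// }
/// ```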
pub type BoxedTokenizer = Box<dyn TokenizerClone>;

/// Object-safe companion trait to [`Tokenizer`], used for dynamic dispatch
pub trait TokenizerClone: Send + Sync {
    /// Tokenize the input text into a vector of tokens
    fn tokenize(&self, text: &str) -> Vec<Token>;
    /// Clone this tokenizer into a fresh box
    fn clone_box(&self) -> BoxedTokenizer;
}

impl<T: Tokenizer> TokenizerClone for T {
    fn tokenize(&self, text: &str) -> Vec<Token> {
        Tokenizer::tokenize(self, text)
    }

    fn clone_box(&self) -> BoxedTokenizer {
        Box::new(self.clone())
    }
}

impl Clone for BoxedTokenizer {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

/// Registry for named tokenizers
///
/// Allows registering tokenizers by name and retrieving them for use during indexing.
/// Pre-registers common tokenizers: "default", "simple", "lowercase", "en_stem", etc.
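///
/// # Example
///
/// A minimal sketch of lookup and custom registration (mirrors the unit
/// tests):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
///
/// let tokenizer = registry.get("en_stem").unwrap();
/// assert_eq!(tokenizer.tokenize("running dogs")[0].text, "run");
///
/// registry.register("my_tokenizer", LowercaseTokenizer);
/// assert!(registry.contains("my_tokenizer"));
/// ```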
#[derive(Clone)]
pub struct TokenizerRegistry {
    tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}

impl TokenizerRegistry {
    /// Create a new tokenizer registry with default tokenizers registered
    pub fn new() -> Self {
        let registry = Self {
            tokenizers: Arc::new(RwLock::new(HashMap::new())),
        };
        registry.register_defaults();
        registry
    }

    /// Register default tokenizers
    fn register_defaults(&self) {
        // Basic tokenizers ("raw" is registered as the plain whitespace tokenizer)
        self.register("default", LowercaseTokenizer);
        self.register("simple", SimpleTokenizer);
        self.register("lowercase", LowercaseTokenizer);
        self.register("raw", SimpleTokenizer);

        // English stemmer variants
        self.register("en_stem", StemmerTokenizer::new(Language::English));
        self.register("english", StemmerTokenizer::new(Language::English));

        // Other language stemmers
        self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
        self.register("arabic", StemmerTokenizer::new(Language::Arabic));
        self.register("da_stem", StemmerTokenizer::new(Language::Danish));
        self.register("danish", StemmerTokenizer::new(Language::Danish));
        self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
        self.register("dutch", StemmerTokenizer::new(Language::Dutch));
        self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
        self.register("finnish", StemmerTokenizer::new(Language::Finnish));
        self.register("fr_stem", StemmerTokenizer::new(Language::French));
        self.register("french", StemmerTokenizer::new(Language::French));
        self.register("de_stem", StemmerTokenizer::new(Language::German));
        self.register("german", StemmerTokenizer::new(Language::German));
        self.register("el_stem", StemmerTokenizer::new(Language::Greek));
        self.register("greek", StemmerTokenizer::new(Language::Greek));
        self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
        self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
        self.register("it_stem", StemmerTokenizer::new(Language::Italian));
        self.register("italian", StemmerTokenizer::new(Language::Italian));
        self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
        self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
        self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
        self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
        self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
        self.register("romanian", StemmerTokenizer::new(Language::Romanian));
        self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
        self.register("russian", StemmerTokenizer::new(Language::Russian));
        self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
        self.register("spanish", StemmerTokenizer::new(Language::Spanish));
        self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
        self.register("swedish", StemmerTokenizer::new(Language::Swedish));
        self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
        self.register("tamil", StemmerTokenizer::new(Language::Tamil));
        self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
        self.register("turkish", StemmerTokenizer::new(Language::Turkish));
    }

    /// Register a tokenizer with a name
    pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
        let mut tokenizers = self.tokenizers.write();
        tokenizers.insert(name.to_string(), Box::new(tokenizer));
    }

    /// Get a tokenizer by name
    pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
        let tokenizers = self.tokenizers.read();
        tokenizers.get(name).cloned()
    }

    /// Check if a tokenizer is registered
    pub fn contains(&self, name: &str) -> bool {
        let tokenizers = self.tokenizers.read();
        tokenizers.contains_key(name)
    }

    /// List all registered tokenizer names
    pub fn names(&self) -> Vec<String> {
        let tokenizers = self.tokenizers.read();
        tokenizers.keys().cloned().collect()
    }
}

impl Default for TokenizerRegistry {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_tokenizer() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "hello world");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn test_lowercase_tokenizer() {
        let tokenizer = LowercaseTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_empty_text() {
        let tokenizer = SimpleTokenizer;
        let tokens = Tokenizer::tokenize(&tokenizer, "");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_stemmer_tokenizer_english() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");

        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "dog"); // dogs -> dog
        assert_eq!(tokens[1].text, "are"); // are -> are
        assert_eq!(tokens[2].text, "run"); // running -> run
        assert_eq!(tokens[3].text, "quick"); // quickly -> quick
    }

    #[test]
    fn test_stemmer_tokenizer_preserves_offsets() {
        let tokenizer = StemmerTokenizer::english();
        let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 7); // "Running" is 7 chars
        assert_eq!(tokens[1].text, "dog");
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 12); // "dogs" is 4 chars
    }

    #[test]
    fn test_stemmer_tokenizer_german() {
        let tokenizer = StemmerTokenizer::new(Language::German);
        let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");

        assert_eq!(tokens.len(), 2);
        // German stemmer should stem these plural forms
        assert_eq!(tokens[0].text, "haus"); // häuser -> haus
        assert_eq!(tokens[1].text, "buch"); // bücher -> buch
    }

    #[test]
    fn test_stemmer_tokenizer_russian() {
        let tokenizer = StemmerTokenizer::new(Language::Russian);
        let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");

        assert_eq!(tokens.len(), 2);
        // Russian stemmer should stem these
        assert_eq!(tokens[0].text, "бегущ"); // бегущие -> бегущ
        assert_eq!(tokens[1].text, "собак"); // собаки -> собак
    }

    #[test]
    fn test_multi_language_stemmer() {
        let stemmer = MultiLanguageStemmer::new(Language::English);

        // Test with English
        let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        // Test with German
        let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        // Test with Russian
        let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_language_aware_tokenizer() {
        let tokenizer = LanguageAwareTokenizer::new(parse_language);

        // English hint
        let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        // German hint
        let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");

        // Russian hint
        let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
        assert_eq!(tokens[0].text, "бегущ");
        assert_eq!(tokens[1].text, "собак");
    }

    #[test]
    fn test_parse_language() {
        assert_eq!(parse_language("en"), Language::English);
        assert_eq!(parse_language("english"), Language::English);
        assert_eq!(parse_language("English"), Language::English);
        assert_eq!(parse_language("de"), Language::German);
        assert_eq!(parse_language("german"), Language::German);
        assert_eq!(parse_language("ru"), Language::Russian);
        assert_eq!(parse_language("russian"), Language::Russian);
        assert_eq!(parse_language("unknown"), Language::English); // fallback
    }

    #[test]
    fn test_tokenizer_registry_defaults() {
        let registry = TokenizerRegistry::new();

        // Check default tokenizers are registered
        assert!(registry.contains("default"));
        assert!(registry.contains("simple"));
        assert!(registry.contains("lowercase"));
        assert!(registry.contains("en_stem"));
        assert!(registry.contains("german"));
        assert!(registry.contains("russian"));
    }

    #[test]
    fn test_tokenizer_registry_get() {
        let registry = TokenizerRegistry::new();

        // Get and use a tokenizer
        let tokenizer = registry.get("en_stem").unwrap();
        let tokens = tokenizer.tokenize("running dogs");
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "dog");

        // Get German stemmer
        let tokenizer = registry.get("german").unwrap();
        let tokens = tokenizer.tokenize("Häuser Bücher");
        assert_eq!(tokens[0].text, "haus");
        assert_eq!(tokens[1].text, "buch");
    }

    #[test]
    fn test_tokenizer_registry_custom() {
        let registry = TokenizerRegistry::new();

        // Register a custom tokenizer
        registry.register("my_tokenizer", LowercaseTokenizer);

        assert!(registry.contains("my_tokenizer"));
        let tokenizer = registry.get("my_tokenizer").unwrap();
        let tokens = tokenizer.tokenize("Hello World");
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn test_tokenizer_registry_nonexistent() {
        let registry = TokenizerRegistry::new();
        assert!(registry.get("nonexistent").is_none());
    }
}