1use crate::error::{Result, TextError};
7use std::collections::HashMap;
8
/// Languages supported by the detection and processing pipeline.
///
/// `Unknown` is the fallback both for unrecognized ISO codes and for text
/// whose language cannot be determined.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    Spanish,
    French,
    German,
    Italian,
    Portuguese,
    Dutch,
    Russian,
    Chinese,
    Japanese,
    Korean,
    Arabic,
    Unknown,
}

/// Single source of truth tying each variant to its ISO 639-1 code and its
/// English display name ("und" is the ISO code for "undetermined").
const LANGUAGE_TABLE: &[(Language, &str, &str)] = &[
    (Language::English, "en", "English"),
    (Language::Spanish, "es", "Spanish"),
    (Language::French, "fr", "French"),
    (Language::German, "de", "German"),
    (Language::Italian, "it", "Italian"),
    (Language::Portuguese, "pt", "Portuguese"),
    (Language::Dutch, "nl", "Dutch"),
    (Language::Russian, "ru", "Russian"),
    (Language::Chinese, "zh", "Chinese"),
    (Language::Japanese, "ja", "Japanese"),
    (Language::Korean, "ko", "Korean"),
    (Language::Arabic, "ar", "Arabic"),
    (Language::Unknown, "und", "Unknown"),
];

impl Language {
    /// Returns the two-letter ISO 639-1 code ("und" for `Unknown`).
    pub fn iso_code(&self) -> &'static str {
        LANGUAGE_TABLE
            .iter()
            .find(|(lang, _, _)| lang == self)
            .map(|(_, code, _)| *code)
            .unwrap_or("und")
    }

    /// Parses an ISO 639-1 code (case-insensitive); unrecognized codes map
    /// to `Language::Unknown`.
    pub fn from_iso_code(code: &str) -> Self {
        let lowered = code.to_lowercase();
        LANGUAGE_TABLE
            .iter()
            .find(|(_, code, _)| *code == lowered)
            .map(|(lang, _, _)| *lang)
            .unwrap_or(Language::Unknown)
    }

    /// Returns the English display name of the language.
    pub fn name(&self) -> &'static str {
        LANGUAGE_TABLE
            .iter()
            .find(|(lang, _, _)| lang == self)
            .map(|(_, _, name)| *name)
            .unwrap_or("Unknown")
    }
}
98
/// Outcome of a [`LanguageDetector::detect`] call.
#[derive(Debug, Clone)]
pub struct LanguageDetectionResult {
    /// Most likely language of the analyzed text.
    pub language: Language,
    /// Relative margin of the best score over the runner-up, clamped to [0.0, 1.0].
    pub confidence: f64,
    /// Runner-up languages paired with their raw similarity scores, best first.
    pub alternatives: Vec<(Language, f64)>,
}
109
/// Character n-gram frequency based language detector.
pub struct LanguageDetector {
    /// Per-language reference profiles: n-gram -> relative frequency.
    /// Spaces inside n-grams are encoded as '_'.
    profiles: HashMap<Language, HashMap<String, f64>>,
    /// Character n-gram length used for profiling (validated to 1..=5; default 3).
    n_gram_size: usize,
}
117
118impl LanguageDetector {
119 pub fn new() -> Self {
121 let mut detector = Self {
122 profiles: HashMap::new(),
123 n_gram_size: 3,
124 };
125 detector.initialize_default_profiles();
126 detector
127 }
128
129 pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
131 if !(1..=5).contains(&n_gramsize) {
132 return Err(TextError::InvalidInput(
133 "N-gram size must be between 1 and 5".to_string(),
134 ));
135 }
136 let mut detector = Self {
137 profiles: HashMap::new(),
138 n_gram_size: n_gramsize,
139 };
140 detector.initialize_default_profiles();
141 Ok(detector)
142 }
143
144 fn initialize_default_profiles(&mut self) {
146 let mut english_profile = HashMap::new();
148 for (ngram, freq) in &[
149 ("the", 0.05),
150 ("and", 0.03),
151 ("ing", 0.025),
152 ("ion", 0.02),
153 ("tio", 0.018),
154 ("ent", 0.015),
155 ("ati", 0.013),
156 ("her", 0.012),
157 ("for", 0.011),
158 ("ter", 0.01),
159 ("hat", 0.009),
160 ("tha", 0.009),
161 ("ere", 0.008),
162 ("ate", 0.008),
163 ("ver", 0.007),
164 ("his", 0.007),
165 ] {
166 english_profile.insert(ngram.to_string(), *freq);
167 }
168 self.profiles.insert(Language::English, english_profile);
169
170 let mut spanish_profile = HashMap::new();
172 for (ngram, freq) in &[
173 ("que", 0.04),
174 ("de_", 0.035),
175 ("la_", 0.03),
176 ("el_", 0.025),
177 ("es_", 0.02),
178 ("los", 0.018),
179 ("las", 0.015),
180 ("ión", 0.013),
181 ("ado", 0.012),
182 ("nte", 0.011),
183 ("con", 0.01),
184 ("par", 0.009),
185 ("ara", 0.008),
186 ("una", 0.008),
187 ("por", 0.007),
188 ("est", 0.007),
189 ] {
190 spanish_profile.insert(ngram.to_string(), *freq);
191 }
192 self.profiles.insert(Language::Spanish, spanish_profile);
193
194 let mut french_profile = HashMap::new();
196 for (ngram, freq) in &[
197 ("de_", 0.05),
198 ("le_", 0.04),
199 ("que", 0.03),
200 ("les", 0.025),
201 ("la_", 0.02),
202 ("des", 0.018),
203 ("ent", 0.015),
204 ("ion", 0.013),
205 ("est", 0.012),
206 ("ait", 0.011),
207 ("pour", 0.01),
208 ("ais", 0.009),
209 ("ans", 0.008),
210 ("ont", 0.008),
211 ("une", 0.007),
212 ("qui", 0.007),
213 ] {
214 french_profile.insert(ngram.to_string(), *freq);
215 }
216 self.profiles.insert(Language::French, french_profile);
217
218 let mut german_profile = HashMap::new();
220 for (ngram, freq) in &[
221 ("der", 0.05),
222 ("die", 0.04),
223 ("und", 0.03),
224 ("den", 0.025),
225 ("das", 0.02),
226 ("ein", 0.018),
227 ("ich", 0.015),
228 ("ist", 0.013),
229 ("sch", 0.012),
230 ("cht", 0.011),
231 ("ung", 0.01),
232 ("gen", 0.009),
233 ("eit", 0.008),
234 ("ver", 0.008),
235 ("ber", 0.007),
236 ("ten", 0.007),
237 ] {
238 german_profile.insert(ngram.to_string(), *freq);
239 }
240 self.profiles.insert(Language::German, german_profile);
241
242 let mut italian_profile = HashMap::new();
244 for (ngram, freq) in &[
245 ("che", 0.05),
246 ("la_", 0.04),
247 ("il_", 0.03),
248 ("di_", 0.025),
249 ("del", 0.02),
250 ("le_", 0.018),
251 ("lla", 0.015),
252 ("per", 0.013),
253 ("ato", 0.012),
254 ("gli", 0.011),
255 ("sta", 0.01),
256 ("con", 0.009),
257 ("ent", 0.008),
258 ("ion", 0.008),
259 ("are", 0.007),
260 ("una", 0.007),
261 ] {
262 italian_profile.insert(ngram.to_string(), *freq);
263 }
264 self.profiles.insert(Language::Italian, italian_profile);
265
266 let mut portuguese_profile = HashMap::new();
268 for (ngram, freq) in &[
269 ("que", 0.05),
270 ("de_", 0.04),
271 ("os_", 0.03),
272 ("as_", 0.025),
273 ("da_", 0.02),
274 ("do_", 0.018),
275 ("ão_", 0.015),
276 ("ent", 0.013),
277 ("com", 0.012),
278 ("para", 0.011),
279 ("uma", 0.01),
280 ("est", 0.009),
281 ("nte", 0.008),
282 ("ção", 0.008),
283 ("por", 0.007),
284 ("não", 0.007),
285 ] {
286 portuguese_profile.insert(ngram.to_string(), *freq);
287 }
288 self.profiles
289 .insert(Language::Portuguese, portuguese_profile);
290
291 let mut dutch_profile = HashMap::new();
293 for (ngram, freq) in &[
294 ("de_", 0.05),
295 ("het", 0.04),
296 ("een", 0.03),
297 ("van", 0.025),
298 ("en_", 0.02),
299 ("dat", 0.018),
300 ("te_", 0.015),
301 ("op_", 0.013),
302 ("aar", 0.012),
303 ("oor", 0.011),
304 ("eer", 0.01),
305 ("sch", 0.009),
306 ("ver", 0.008),
307 ("ing", 0.008),
308 ("cht", 0.007),
309 ("ter", 0.007),
310 ] {
311 dutch_profile.insert(ngram.to_string(), *freq);
312 }
313 self.profiles.insert(Language::Dutch, dutch_profile);
314
315 let mut russian_profile = HashMap::new();
317 for (ngram, freq) in &[
318 ("что", 0.05),
319 ("ого", 0.04),
320 ("как", 0.03),
321 ("это", 0.025),
322 ("все", 0.02),
323 ("был", 0.018),
324 ("ени", 0.015),
325 ("ост", 0.013),
326 ("ова", 0.012),
327 ("про", 0.011),
328 ("сто", 0.01),
329 ("ого", 0.009),
330 ("при", 0.008),
331 ("ени", 0.008),
332 ("ать", 0.007),
333 ("ный", 0.007),
334 ] {
335 russian_profile.insert(ngram.to_string(), *freq);
336 }
337 self.profiles.insert(Language::Russian, russian_profile);
338
339 let mut chinese_profile = HashMap::new();
341 for (ngram, freq) in &[
342 ("的_", 0.06),
343 ("是_", 0.045),
344 ("了_", 0.035),
345 ("在_", 0.03),
346 ("和_", 0.025),
347 ("有_", 0.022),
348 ("我_", 0.02),
349 ("他_", 0.018),
350 ("不_", 0.016),
351 ("为_", 0.014),
352 ("这_", 0.013),
353 ("个_", 0.012),
354 ("们_", 0.011),
355 ("人_", 0.01),
356 ("要_", 0.009),
357 ("会_", 0.008),
358 ] {
359 chinese_profile.insert(ngram.to_string(), *freq);
360 }
361 self.profiles.insert(Language::Chinese, chinese_profile);
362
363 let mut japanese_profile = HashMap::new();
365 for (ngram, freq) in &[
366 ("の_", 0.05),
367 ("に_", 0.04),
368 ("は_", 0.035),
369 ("を_", 0.03),
370 ("た_", 0.025),
371 ("と_", 0.022),
372 ("が_", 0.02),
373 ("で_", 0.018),
374 ("る_", 0.016),
375 ("す_", 0.014),
376 ("い_", 0.013),
377 ("ます", 0.012),
378 ("した", 0.011),
379 ("して", 0.01),
380 ("です", 0.009),
381 ("ない", 0.008),
382 ] {
383 japanese_profile.insert(ngram.to_string(), *freq);
384 }
385 self.profiles.insert(Language::Japanese, japanese_profile);
386
387 let mut korean_profile = HashMap::new();
389 for (ngram, freq) in &[
390 ("의_", 0.05),
391 ("이_", 0.04),
392 ("가_", 0.035),
393 ("을_", 0.03),
394 ("는_", 0.025),
395 ("에_", 0.022),
396 ("하_", 0.02),
397 ("고_", 0.018),
398 ("다_", 0.016),
399 ("지_", 0.014),
400 ("한_", 0.013),
401 ("로_", 0.012),
402 ("서_", 0.011),
403 ("도_", 0.01),
404 ("와_", 0.009),
405 ("니_", 0.008),
406 ] {
407 korean_profile.insert(ngram.to_string(), *freq);
408 }
409 self.profiles.insert(Language::Korean, korean_profile);
410
411 let mut arabic_profile = HashMap::new();
413 for (ngram, freq) in &[
414 ("ال_", 0.06),
415 ("في_", 0.045),
416 ("من_", 0.035),
417 ("على", 0.03),
418 ("إلى", 0.025),
419 ("ها_", 0.022),
420 ("أن_", 0.02),
421 ("ما_", 0.018),
422 ("هو_", 0.016),
423 ("كان", 0.014),
424 ("هذا", 0.013),
425 ("عن_", 0.012),
426 ("بين", 0.011),
427 ("لا_", 0.01),
428 ("قد_", 0.009),
429 ("كل_", 0.008),
430 ] {
431 arabic_profile.insert(ngram.to_string(), *freq);
432 }
433 self.profiles.insert(Language::Arabic, arabic_profile);
434 }
435
436 pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
438 if text.trim().is_empty() {
439 return Err(TextError::InvalidInput(
440 "Cannot detect language of empty text".to_string(),
441 ));
442 }
443
444 let text_profile = self.createtext_profile(text);
446
447 let mut scores: Vec<(Language, f64)> = self
449 .profiles
450 .iter()
451 .map(|(lang, profile)| {
452 let score = self.calculate_similarity(&text_profile, profile);
453 (*lang, score)
454 })
455 .collect();
456
457 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
459
460 if scores.is_empty() {
461 return Ok(LanguageDetectionResult {
462 language: Language::Unknown,
463 confidence: 0.0,
464 alternatives: vec![],
465 });
466 }
467
468 let best_score = scores[0].1;
469 let best_language = scores[0].0;
470
471 let confidence = if scores.len() > 1 {
473 let second_score = scores[1].1;
474 let diff = best_score - second_score;
475 (diff / best_score).clamp(0.0, 1.0)
477 } else {
478 best_score
479 };
480
481 Ok(LanguageDetectionResult {
482 language: best_language,
483 confidence,
484 alternatives: scores.into_iter().skip(1).take(3).collect(),
485 })
486 }
487
488 fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
490 let mut profile = HashMap::new();
491 let text_lower = text.to_lowercase();
492 let chars: Vec<char> = text_lower.chars().collect();
493 let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;
494
495 if total_ngrams <= 0.0 {
496 return profile;
497 }
498
499 let mut ngram_counts: HashMap<String, usize> = HashMap::new();
501 for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
502 let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
503 let ngram = ngram.replace(' ', "_");
505 *ngram_counts.entry(ngram).or_insert(0) += 1;
506 }
507
508 for (ngram, count) in ngram_counts {
510 profile.insert(ngram, count as f64 / total_ngrams);
511 }
512
513 profile
514 }
515
516 fn calculate_similarity(
518 &self,
519 profile1: &HashMap<String, f64>,
520 profile2: &HashMap<String, f64>,
521 ) -> f64 {
522 let mut similarity = 0.0;
523 let mut total_weight = 0.0;
524
525 for (ngram, freq1) in profile1 {
527 if let Some(freq2) = profile2.get(ngram) {
528 similarity += freq1 * freq2;
529 }
530 total_weight += freq1 * freq1;
531 }
532
533 if total_weight > 0.0 {
534 similarity / total_weight.sqrt()
535 } else {
536 0.0
537 }
538 }
539
540 pub fn supported_languages(&self) -> Vec<Language> {
542 self.profiles.keys().copied().collect()
543 }
544}
545
546impl Default for LanguageDetector {
547 fn default() -> Self {
548 Self::new()
549 }
550}
551
/// Per-language stop-word lists (English, Spanish, and French built in).
pub struct StopWords {
    /// Lowercase stop words keyed by language.
    stop_words: HashMap<Language, Vec<String>>,
}
557
558impl StopWords {
559 pub fn new() -> Self {
561 let mut stop_words = HashMap::new();
562
563 stop_words.insert(
565 Language::English,
566 vec![
567 "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
568 "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
569 "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
570 "when", "where", "who", "which", "their", "them", "these", "those", "there",
571 "here", "than",
572 ]
573 .iter()
574 .map(|s| s.to_string())
575 .collect(),
576 );
577
578 stop_words.insert(
580 Language::Spanish,
581 vec![
582 "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
583 "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
584 "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
585 "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
586 "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
587 "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
588 "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
589 "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
590 "una", "uno", "unos", "y", "ya", "yo",
591 ]
592 .iter()
593 .map(|s| s.to_string())
594 .collect(),
595 );
596
597 stop_words.insert(
599 Language::French,
600 vec![
601 "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
602 "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
603 "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
604 "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
605 "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
606 ]
607 .iter()
608 .map(|s| s.to_string())
609 .collect(),
610 );
611
612 Self { stop_words }
613 }
614
615 pub fn get(&self, language: Language) -> Option<&Vec<String>> {
617 self.stop_words.get(&language)
618 }
619
620 pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
622 if let Some(words) = self.stop_words.get(&language) {
623 words.iter().any(|sw| sw == &word.to_lowercase())
624 } else {
625 false
626 }
627 }
628
629 pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
631 tokens
632 .iter()
633 .filter(|token| !self.is_stop_word(token, language))
634 .cloned()
635 .collect()
636 }
637}
638
639impl Default for StopWords {
640 fn default() -> Self {
641 Self::new()
642 }
643}
644
/// Combines language detection with stop-word filtering in one pipeline.
pub struct MultilingualProcessor {
    /// Detector used to guess the input language.
    detector: LanguageDetector,
    /// Stop-word lists applied after detection.
    stop_words: StopWords,
}
652
653impl MultilingualProcessor {
654 pub fn new() -> Self {
656 Self {
657 detector: LanguageDetector::new(),
658 stop_words: StopWords::new(),
659 }
660 }
661
662 pub fn process(&self, text: &str) -> Result<ProcessedText> {
664 let detection = self.detector.detect(text)?;
666
667 let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();
669
670 let filtered_tokens = self
672 .stop_words
673 .remove_stop_words(&tokens, detection.language);
674
675 Ok(ProcessedText {
676 original: text.to_string(),
677 language: detection.language,
678 confidence: detection.confidence,
679 tokens,
680 filtered_tokens,
681 })
682 }
683}
684
685impl Default for MultilingualProcessor {
686 fn default() -> Self {
687 Self::new()
688 }
689}
690
/// Result of [`MultilingualProcessor::process`].
#[derive(Debug, Clone)]
pub struct ProcessedText {
    /// The input text, unmodified.
    pub original: String,
    /// Language reported by the detector.
    pub language: Language,
    /// Detection confidence (see [`LanguageDetectionResult::confidence`]).
    pub confidence: f64,
    /// Whitespace-split tokens of the original text.
    pub tokens: Vec<String>,
    /// `tokens` with stop words of the detected language removed.
    pub filtered_tokens: Vec<String>,
}
705
#[cfg(test)]
mod tests {
    use super::*;

    /// ISO-code and display-name round-trips for the `Language` enum.
    #[test]
    fn test_language_enum() {
        assert_eq!(Language::English.iso_code(), "en");
        assert_eq!(Language::Spanish.name(), "Spanish");
        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    /// English prose should be detected; empty input must be an error.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        let result = detector.detect("The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.").expect("Operation failed");
        assert_eq!(result.language, Language::English);

        let empty_result = detector.detect("");
        assert!(empty_result.is_err());
    }

    /// Membership checks and filtering against the English stop-word list.
    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        assert!(stop_words.is_stop_word("the", Language::English));
        assert!(stop_words.is_stop_word("and", Language::English));
        assert!(!stop_words.is_stop_word("hello", Language::English));

        let tokens = vec![
            "the".to_string(),
            "cat".to_string(),
            "is".to_string(),
            "happy".to_string(),
        ];
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    /// End-to-end pipeline: detection, tokenization, stop-word filtering.
    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();

        let result = processor.process("The quick brown fox jumps over the lazy dog. This sentence has many English words.").expect("Operation failed");
        assert_eq!(result.language, Language::English);
        assert!(!result.tokens.is_empty());
        assert!(result.filtered_tokens.len() < result.tokens.len());
    }

    /// Trigram profiles of short text contain the expected trigrams.
    #[test]
    fn test_createtext_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.createtext_profile("hello world");

        assert!(!profile.is_empty());
        assert!(profile.contains_key("hel") || profile.contains_key("llo"));
    }
}
771
/// Coarse writing-system families recognized by `UnicodeTokenizer::detect_script`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScriptFamily {
    /// ASCII letters plus the U+00C0..=U+024F Latin extension ranges.
    Latin,
    /// Han ideographs (see `is_cjk_char`); kana and hangul are not counted here.
    Cjk,
    /// U+0400..=U+04FF.
    Cyrillic,
    /// U+0600..=U+06FF.
    Arabic,
    /// U+0900..=U+097F.
    Devanagari,
    /// Anything else (digits, symbols, unrecognized scripts).
    Other,
}
794
/// Options controlling [`UnicodeTokenizer::tokenize`].
#[derive(Debug, Clone)]
pub struct UnicodeTokenizerConfig {
    /// Lowercase the text before splitting.
    pub lowercase: bool,
    /// Remove diacritics (via `Transliterator::strip_accents`) before splitting.
    pub strip_accents: bool,
    /// Emit punctuation characters as their own single-character tokens.
    pub split_on_punctuation: bool,
    /// Split tokens on ASCII whitespace and NBSP.
    pub split_on_whitespace: bool,
    /// Truncate tokens longer than this many characters (`None` = unlimited).
    pub max_token_length: Option<usize>,
}
812
813impl Default for UnicodeTokenizerConfig {
814 fn default() -> Self {
815 UnicodeTokenizerConfig {
816 lowercase: true,
817 strip_accents: true,
818 split_on_punctuation: true,
819 split_on_whitespace: true,
820 max_token_length: None,
821 }
822 }
823}
824
/// Configurable Unicode-aware tokenizer with CJK isolation and script detection.
pub struct UnicodeTokenizer {
    /// Options applied by [`UnicodeTokenizer::tokenize`].
    config: UnicodeTokenizerConfig,
}
839
840impl UnicodeTokenizer {
841 pub fn new(config: UnicodeTokenizerConfig) -> Self {
843 UnicodeTokenizer { config }
844 }
845
846 pub fn default_tokenizer() -> Self {
849 Self::new(UnicodeTokenizerConfig::default())
850 }
851
852 pub fn tokenize(&self, text: &str) -> Vec<String> {
863 let spaced = self.insert_cjk_spaces(text);
865
866 let lowered = if self.config.lowercase {
868 spaced.to_lowercase()
869 } else {
870 spaced
871 };
872
873 let stripped = if self.config.strip_accents {
875 Transliterator::strip_accents(&lowered)
876 } else {
877 lowered
878 };
879
880 let mut tokens: Vec<String> = Vec::new();
882 let mut current = String::new();
883
884 for ch in stripped.chars() {
885 let is_ws = ch.is_ascii_whitespace() || ch == '\u{00A0}';
886 let is_punct = self.config.split_on_punctuation && is_unicode_punctuation(ch);
887
888 if (self.config.split_on_whitespace && is_ws) || is_punct {
889 if !current.is_empty() {
890 tokens.push(current.clone());
891 current.clear();
892 }
893 if is_punct {
894 tokens.push(ch.to_string());
895 }
896 } else {
897 current.push(ch);
898 }
899 }
900 if !current.is_empty() {
901 tokens.push(current);
902 }
903
904 tokens.retain(|t| !t.is_empty());
906 if let Some(max_len) = self.config.max_token_length {
907 tokens.iter_mut().for_each(|t| {
908 let char_count = t.chars().count();
909 if char_count > max_len {
910 *t = t.chars().take(max_len).collect();
911 }
912 });
913 }
914
915 tokens
916 }
917
918 pub fn detect_script(&self, text: &str) -> ScriptFamily {
923 let mut latin = 0usize;
924 let mut cjk = 0usize;
925 let mut cyrillic = 0usize;
926 let mut arabic = 0usize;
927 let mut devanagari = 0usize;
928 let mut other = 0usize;
929
930 for ch in text.chars() {
931 if ch.is_whitespace() {
932 continue;
933 }
934 if is_cjk_char(ch) {
935 cjk += 1;
936 } else if is_cyrillic(ch) {
937 cyrillic += 1;
938 } else if is_arabic(ch) {
939 arabic += 1;
940 } else if is_devanagari(ch) {
941 devanagari += 1;
942 } else if ch.is_ascii_alphabetic() || (ch as u32 >= 0x00C0 && ch as u32 <= 0x024F) {
943 latin += 1;
944 } else {
945 other += 1;
946 }
947 }
948
949 let max = [latin, cjk, cyrillic, arabic, devanagari, other]
950 .into_iter()
951 .max()
952 .unwrap_or(0);
953
954 if max == 0 {
955 return ScriptFamily::Other;
956 }
957 if max == cjk {
958 ScriptFamily::Cjk
959 } else if max == cyrillic {
960 ScriptFamily::Cyrillic
961 } else if max == arabic {
962 ScriptFamily::Arabic
963 } else if max == devanagari {
964 ScriptFamily::Devanagari
965 } else if max == latin {
966 ScriptFamily::Latin
967 } else {
968 ScriptFamily::Other
969 }
970 }
971
972 pub fn tokenize_cjk(&self, text: &str) -> Vec<String> {
977 let spaced = self.insert_cjk_spaces(text);
978 spaced
979 .split_whitespace()
980 .map(|s| s.to_string())
981 .filter(|s| !s.is_empty())
982 .collect()
983 }
984
985 fn insert_cjk_spaces(&self, text: &str) -> String {
988 let mut out = String::with_capacity(text.len() + text.chars().count());
989 for ch in text.chars() {
990 if is_cjk_char(ch) {
991 out.push(' ');
992 out.push(ch);
993 out.push(' ');
994 } else {
995 out.push(ch);
996 }
997 }
998 out
999 }
1000}
1001
1002impl Default for UnicodeTokenizer {
1003 fn default() -> Self {
1004 Self::default_tokenizer()
1005 }
1006}
1007
/// Namespace for stateless transliteration and normalization helpers.
pub struct Transliterator;
1012
1013impl Transliterator {
1014 pub fn cjk_to_latin(text: &str) -> String {
1019 text.chars()
1020 .map(|c| {
1021 if let Some(roman) = cjk_pinyin_lookup(c) {
1022 roman.to_string()
1023 } else {
1024 c.to_string()
1025 }
1026 })
1027 .collect::<Vec<_>>()
1028 .join("")
1029 }
1030
    /// Transliterates Cyrillic text to a Latin approximation.
    ///
    /// Each character is looked up in lowercase form; if the source char was
    /// uppercase, only the first letter of the (possibly multi-letter)
    /// romanization is re-uppercased (e.g. 'Щ' -> "Shch"). Characters
    /// without a mapping are copied through unchanged.
    pub fn cyrillic_to_latin(text: &str) -> String {
        let mut out = String::with_capacity(text.len() * 2);
        for ch in text.chars() {
            // NOTE(review): to_lowercase().next() keeps only the first char of
            // a multi-char lowercase expansion; Cyrillic maps 1:1 so this is fine.
            let lower = ch.to_lowercase().next().unwrap_or(ch);
            if let Some(roman) = cyrillic_lookup(lower) {
                if ch.is_uppercase() {
                    // Capitalize the first letter of the romanization; the
                    // hard/soft signs map to "" and contribute nothing.
                    let mut chars = roman.chars();
                    if let Some(first) = chars.next() {
                        for c in first.to_uppercase() {
                            out.push(c);
                        }
                        out.push_str(chars.as_str());
                    }
                } else {
                    out.push_str(roman);
                }
            } else {
                out.push(ch);
            }
        }
        out
    }
1059
1060 pub fn strip_accents(text: &str) -> String {
1066 text.chars()
1067 .flat_map(nfd_decompose)
1068 .filter(|&c| !is_combining_mark(c))
1069 .collect()
1070 }
1071
1072 pub fn normalize(text: &str) -> String {
1075 let lowered = text.to_lowercase();
1076 lowered.split_whitespace().collect::<Vec<_>>().join(" ")
1077 }
1078}
1079
/// True for Han ideographs: the CJK Unified blocks (base plus Extensions
/// A and B) and the compatibility ideograph blocks. Kana and Hangul are
/// deliberately NOT included.
pub fn is_cjk_char(c: char) -> bool {
    matches!(
        c as u32,
        0x4E00..=0x9FFF          // CJK Unified Ideographs
            | 0x3400..=0x4DBF    // Extension A
            | 0x20000..=0x2A6DF  // Extension B
            | 0xF900..=0xFAFF    // Compatibility Ideographs
            | 0x2F800..=0x2FA1F  // Compatibility Supplement
    )
}
1096
/// True for characters in the basic Cyrillic block (U+0400..=U+04FF).
pub fn is_cyrillic(c: char) -> bool {
    ('\u{0400}'..='\u{04FF}').contains(&c)
}
1102
/// True for the basic Arabic block (U+0600..=U+06FF).
/// NOTE(review): supplements and presentation forms are not covered — confirm
/// that is acceptable for the detector's purposes.
fn is_arabic(c: char) -> bool {
    ('\u{0600}'..='\u{06FF}').contains(&c)
}
1108
/// True for the basic Devanagari block (U+0900..=U+097F).
fn is_devanagari(c: char) -> bool {
    ('\u{0900}'..='\u{097F}').contains(&c)
}
1114
/// True for combining diacritical marks: the main block, its supplement,
/// combining marks for symbols, and combining half marks.
pub fn is_combining_mark(c: char) -> bool {
    matches!(
        c as u32,
        0x0300..=0x036F | 0x1DC0..=0x1DFF | 0x20D0..=0x20FF | 0xFE20..=0xFE2F
    )
}
1126
/// True for the ASCII/typographic punctuation this tokenizer splits on,
/// plus anything in the Unicode General Punctuation block (U+2000..=U+206F).
fn is_unicode_punctuation(c: char) -> bool {
    // The explicit typographic characters (…, —, –, curly quotes) also fall
    // inside the General Punctuation range; they are kept here for clarity.
    const PUNCTUATION: &str =
        "!\"#%&'()*,-./:;?@[\\]_{}~·…—–\u{2018}\u{2019}\u{201C}\u{201D}";
    PUNCTUATION.contains(c) || ('\u{2000}'..='\u{206F}').contains(&c)
}
1164
/// Lowercase Cyrillic -> Latin romanization table. The hard and soft signs
/// ('ъ', 'ь') map to the empty string: they have no Latin counterpart.
const CYRILLIC_ROMAN: &[(char, &str)] = &[
    ('а', "a"), ('б', "b"), ('в', "v"), ('г', "g"), ('д', "d"), ('е', "ye"),
    ('ё', "yo"), ('ж', "zh"), ('з', "z"), ('и', "i"), ('й', "y"), ('к', "k"),
    ('л', "l"), ('м', "m"), ('н', "n"), ('о', "o"), ('п', "p"), ('р', "r"),
    ('с', "s"), ('т', "t"), ('у', "u"), ('ф', "f"), ('х', "kh"), ('ц', "ts"),
    ('ч', "ch"), ('ш', "sh"), ('щ', "shch"), ('ъ', ""), ('ы', "y"), ('ь', ""),
    ('э', "e"), ('ю', "yu"), ('я', "ya"),
];

/// Looks up the romanization for a lowercase Cyrillic letter; `None` for
/// anything not in the table.
fn cyrillic_lookup(c: char) -> Option<&'static str> {
    CYRILLIC_ROMAN
        .iter()
        .find(|&&(cyr, _)| cyr == c)
        .map(|&(_, roman)| roman)
}
1206
/// Decomposes a precomposed Western European Latin letter into its base
/// character followed by a combining mark (a small hard-coded subset of
/// Unicode NFD). Characters without an entry are yielded unchanged.
fn nfd_decompose(c: char) -> impl Iterator<Item = char> {
    // Cleanup: the original modeled an impossible `Some((base, None))` case
    // the table never produced; entries are always base + combining pairs.
    let decomp: Option<[char; 2]> = match c {
        'À' => Some(['A', '\u{0300}']),
        'Á' => Some(['A', '\u{0301}']),
        'Â' => Some(['A', '\u{0302}']),
        'Ã' => Some(['A', '\u{0303}']),
        'Ä' => Some(['A', '\u{0308}']),
        'Å' => Some(['A', '\u{030A}']),
        'à' => Some(['a', '\u{0300}']),
        'á' => Some(['a', '\u{0301}']),
        'â' => Some(['a', '\u{0302}']),
        'ã' => Some(['a', '\u{0303}']),
        'ä' => Some(['a', '\u{0308}']),
        'å' => Some(['a', '\u{030A}']),
        'È' => Some(['E', '\u{0300}']),
        'É' => Some(['E', '\u{0301}']),
        'Ê' => Some(['E', '\u{0302}']),
        'Ë' => Some(['E', '\u{0308}']),
        'è' => Some(['e', '\u{0300}']),
        'é' => Some(['e', '\u{0301}']),
        'ê' => Some(['e', '\u{0302}']),
        'ë' => Some(['e', '\u{0308}']),
        'Ì' => Some(['I', '\u{0300}']),
        'Í' => Some(['I', '\u{0301}']),
        'Î' => Some(['I', '\u{0302}']),
        'Ï' => Some(['I', '\u{0308}']),
        'ì' => Some(['i', '\u{0300}']),
        'í' => Some(['i', '\u{0301}']),
        'î' => Some(['i', '\u{0302}']),
        'ï' => Some(['i', '\u{0308}']),
        'Ò' => Some(['O', '\u{0300}']),
        'Ó' => Some(['O', '\u{0301}']),
        'Ô' => Some(['O', '\u{0302}']),
        'Õ' => Some(['O', '\u{0303}']),
        'Ö' => Some(['O', '\u{0308}']),
        'ò' => Some(['o', '\u{0300}']),
        'ó' => Some(['o', '\u{0301}']),
        'ô' => Some(['o', '\u{0302}']),
        'õ' => Some(['o', '\u{0303}']),
        'ö' => Some(['o', '\u{0308}']),
        'Ù' => Some(['U', '\u{0300}']),
        'Ú' => Some(['U', '\u{0301}']),
        'Û' => Some(['U', '\u{0302}']),
        'Ü' => Some(['U', '\u{0308}']),
        'ù' => Some(['u', '\u{0300}']),
        'ú' => Some(['u', '\u{0301}']),
        'û' => Some(['u', '\u{0302}']),
        'ü' => Some(['u', '\u{0308}']),
        'Ñ' => Some(['N', '\u{0303}']),
        'ñ' => Some(['n', '\u{0303}']),
        'Ç' => Some(['C', '\u{0327}']),
        'ç' => Some(['c', '\u{0327}']),
        'Ý' => Some(['Y', '\u{0301}']),
        'ý' => Some(['y', '\u{0301}']),
        'ÿ' => Some(['y', '\u{0308}']),
        _ => None,
    };

    // Both arms produce `std::vec::IntoIter<char>`, keeping one return type.
    match decomp {
        Some(pair) => pair.to_vec().into_iter(),
        None => vec![c].into_iter(),
    }
}
1287
/// Toneless pinyin for a small set of very common Chinese characters;
/// `None` for anything not in the table.
///
/// NOTE(review): characters with multiple readings (e.g. 的, 得, 着) use a
/// single fixed reading here — acceptable only for rough transliteration.
fn cjk_pinyin_lookup(c: char) -> Option<&'static str> {
    match c {
        '的' => Some("de"),
        '一' => Some("yi"),
        '是' => Some("shi"),
        '不' => Some("bu"),
        '了' => Some("le"),
        '人' => Some("ren"),
        '我' => Some("wo"),
        '在' => Some("zai"),
        '有' => Some("you"),
        '他' => Some("ta"),
        '这' => Some("zhe"),
        '中' => Some("zhong"),
        '大' => Some("da"),
        '来' => Some("lai"),
        '上' => Some("shang"),
        '国' => Some("guo"),
        '个' => Some("ge"),
        '到' => Some("dao"),
        '说' => Some("shuo"),
        '们' => Some("men"),
        '为' => Some("wei"),
        '子' => Some("zi"),
        '和' => Some("he"),
        '你' => Some("ni"),
        '地' => Some("di"),
        '出' => Some("chu"),
        '道' => Some("dao"),
        '也' => Some("ye"),
        '时' => Some("shi"),
        '年' => Some("nian"),
        '得' => Some("de"),
        '就' => Some("jiu"),
        '那' => Some("na"),
        '要' => Some("yao"),
        '下' => Some("xia"),
        '以' => Some("yi"),
        '生' => Some("sheng"),
        '会' => Some("hui"),
        '自' => Some("zi"),
        '着' => Some("zhe"),
        '去' => Some("qu"),
        '之' => Some("zhi"),
        '过' => Some("guo"),
        '家' => Some("jia"),
        '学' => Some("xue"),
        '对' => Some("dui"),
        '可' => Some("ke"),
        '她' => Some("ta"),
        '里' => Some("li"),
        '后' => Some("hou"),
        '小' => Some("xiao"),
        '么' => Some("me"),
        '心' => Some("xin"),
        '多' => Some("duo"),
        '天' => Some("tian"),
        '而' => Some("er"),
        '能' => Some("neng"),
        '好' => Some("hao"),
        '都' => Some("dou"),
        '然' => Some("ran"),
        _ => None,
    }
}
1357
#[cfg(test)]
mod unicode_tests {
    use super::*;

    /// Whitespace-only splitting on plain ASCII.
    #[test]
    fn tokenize_splits_simple_english() {
        let tok = UnicodeTokenizer::new(UnicodeTokenizerConfig {
            lowercase: true,
            strip_accents: false,
            split_on_punctuation: false,
            split_on_whitespace: true,
            max_token_length: None,
        });
        let tokens = tok.tokenize("hello world");
        assert_eq!(
            tokens,
            vec!["hello", "world"],
            "simple English sentence must split on whitespace"
        );
    }

    /// CJK ideographs become individual tokens; Latin words stay whole.
    #[test]
    fn tokenize_cjk_each_char_is_token() {
        let tok = UnicodeTokenizer::default();
        let tokens = tok.tokenize_cjk("中文 hello");
        assert!(
            tokens.contains(&"中".to_string()),
            "CJK char '中' must be a token"
        );
        assert!(
            tokens.contains(&"文".to_string()),
            "CJK char '文' must be a token"
        );
        assert!(
            tokens.contains(&"hello".to_string()),
            "'hello' must be a token"
        );
    }

    #[test]
    fn detect_script_latin() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("hello world"),
            ScriptFamily::Latin,
            "ASCII text must detect as Latin"
        );
    }

    #[test]
    fn detect_script_cyrillic() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("привет мир"),
            ScriptFamily::Cyrillic,
            "Cyrillic text must detect as Cyrillic"
        );
    }

    #[test]
    fn detect_script_cjk() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("中文"),
            ScriptFamily::Cjk,
            "CJK text must detect as Cjk"
        );
    }

    /// Punctuation splitting keeps the word tokens intact.
    #[test]
    fn tokenize_with_punctuation_split() {
        let tok = UnicodeTokenizer::default();
        let tokens = tok.tokenize("hello, world!");
        assert!(
            tokens.contains(&"hello".to_string()),
            "must contain 'hello'"
        );
        assert!(
            tokens.contains(&"world".to_string()),
            "must contain 'world'"
        );
    }

    /// Tokens longer than the cap are truncated, not dropped.
    #[test]
    fn tokenize_max_token_length() {
        let tok = UnicodeTokenizer::new(UnicodeTokenizerConfig {
            lowercase: false,
            strip_accents: false,
            split_on_punctuation: false,
            split_on_whitespace: true,
            max_token_length: Some(3),
        });
        let tokens = tok.tokenize("hello world");
        for t in &tokens {
            assert!(
                t.chars().count() <= 3,
                "token '{t}' exceeds max_token_length=3"
            );
        }
    }

    /// 'п','р','и','в' romanize to "priv" (the 'е' -> "ye" mapping follows).
    #[test]
    fn cyrillic_to_latin_privet() {
        let result = Transliterator::cyrillic_to_latin("привет");
        assert!(
            result.starts_with("priv"),
            "transliteration of 'привет' must start with 'priv', got '{result}'"
        );
    }

    #[test]
    fn cyrillic_to_latin_basic_letters() {
        assert_eq!(Transliterator::cyrillic_to_latin("а"), "a");
        assert_eq!(Transliterator::cyrillic_to_latin("б"), "b");
        assert_eq!(Transliterator::cyrillic_to_latin("с"), "s");
        assert_eq!(Transliterator::cyrillic_to_latin("т"), "t");
    }

    #[test]
    fn strip_accents_cafe() {
        let result = Transliterator::strip_accents("café");
        assert_eq!(
            result, "cafe",
            "strip_accents('café') must return 'cafe', got '{result}'"
        );
    }

    #[test]
    fn strip_accents_no_accents_unchanged() {
        let result = Transliterator::strip_accents("hello");
        assert_eq!(result, "hello", "plain ASCII must be unchanged");
    }

    #[test]
    fn transliterator_normalize_collapses_whitespace() {
        let result = Transliterator::normalize("  Hello   World  ");
        assert_eq!(
            result, "hello world",
            "normalize must trim and collapse spaces"
        );
    }

    #[test]
    fn strip_accents_german_umlaut() {
        let result = Transliterator::strip_accents("über");
        assert_eq!(result, "uber", "ü must become u after accent stripping");
    }
}