1use crate::error::{Result, TextError};
7use std::collections::HashMap;
8
/// Languages supported by the detection and processing pipeline.
///
/// `Unknown` is the fallback both for unrecognized ISO codes and for text
/// whose language cannot be determined.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    English,
    Spanish,
    French,
    German,
    Italian,
    Portuguese,
    Dutch,
    Russian,
    Chinese,
    Japanese,
    Korean,
    Arabic,
    Unknown,
}

/// Single source of truth tying each variant to its ISO 639-1 code and its
/// English display name ("und" is the ISO code for "undetermined").
const LANGUAGE_TABLE: &[(Language, &str, &str)] = &[
    (Language::English, "en", "English"),
    (Language::Spanish, "es", "Spanish"),
    (Language::French, "fr", "French"),
    (Language::German, "de", "German"),
    (Language::Italian, "it", "Italian"),
    (Language::Portuguese, "pt", "Portuguese"),
    (Language::Dutch, "nl", "Dutch"),
    (Language::Russian, "ru", "Russian"),
    (Language::Chinese, "zh", "Chinese"),
    (Language::Japanese, "ja", "Japanese"),
    (Language::Korean, "ko", "Korean"),
    (Language::Arabic, "ar", "Arabic"),
    (Language::Unknown, "und", "Unknown"),
];

impl Language {
    /// Returns the two-letter ISO 639-1 code ("und" for `Unknown`).
    pub fn iso_code(&self) -> &'static str {
        LANGUAGE_TABLE
            .iter()
            .find(|(lang, _, _)| lang == self)
            .map(|(_, code, _)| *code)
            .unwrap_or("und")
    }

    /// Parses an ISO 639-1 code (case-insensitive); unrecognized codes map
    /// to `Language::Unknown`.
    pub fn from_iso_code(code: &str) -> Self {
        let lowered = code.to_lowercase();
        LANGUAGE_TABLE
            .iter()
            .find(|(_, code, _)| *code == lowered)
            .map(|(lang, _, _)| *lang)
            .unwrap_or(Language::Unknown)
    }

    /// Returns the English display name of the language.
    pub fn name(&self) -> &'static str {
        LANGUAGE_TABLE
            .iter()
            .find(|(lang, _, _)| lang == self)
            .map(|(_, _, name)| *name)
            .unwrap_or("Unknown")
    }
}
98
/// Outcome of a [`LanguageDetector::detect`] call.
#[derive(Debug, Clone)]
pub struct LanguageDetectionResult {
    /// Most likely language of the analyzed text.
    pub language: Language,
    /// Relative margin of the best score over the runner-up, clamped to [0.0, 1.0].
    pub confidence: f64,
    /// Runner-up languages paired with their raw similarity scores, best first.
    pub alternatives: Vec<(Language, f64)>,
}
109
/// Character n-gram frequency based language detector.
pub struct LanguageDetector {
    /// Per-language reference profiles: n-gram -> relative frequency.
    /// Spaces inside n-grams are encoded as '_'.
    profiles: HashMap<Language, HashMap<String, f64>>,
    /// Character n-gram length used for profiling (validated to 1..=5; default 3).
    n_gram_size: usize,
}
117
118impl LanguageDetector {
119 pub fn new() -> Self {
121 let mut detector = Self {
122 profiles: HashMap::new(),
123 n_gram_size: 3,
124 };
125 detector.initialize_default_profiles();
126 detector
127 }
128
129 pub fn with_ngram_size(n_gramsize: usize) -> Result<Self> {
131 if !(1..=5).contains(&n_gramsize) {
132 return Err(TextError::InvalidInput(
133 "N-gram size must be between 1 and 5".to_string(),
134 ));
135 }
136 let mut detector = Self {
137 profiles: HashMap::new(),
138 n_gram_size: n_gramsize,
139 };
140 detector.initialize_default_profiles();
141 Ok(detector)
142 }
143
144 fn initialize_default_profiles(&mut self) {
146 let mut english_profile = HashMap::new();
148 for (ngram, freq) in &[
149 ("the", 0.05),
150 ("and", 0.03),
151 ("ing", 0.025),
152 ("ion", 0.02),
153 ("tio", 0.018),
154 ("ent", 0.015),
155 ("ati", 0.013),
156 ("her", 0.012),
157 ("for", 0.011),
158 ("ter", 0.01),
159 ("hat", 0.009),
160 ("tha", 0.009),
161 ("ere", 0.008),
162 ("ate", 0.008),
163 ("ver", 0.007),
164 ("his", 0.007),
165 ] {
166 english_profile.insert(ngram.to_string(), *freq);
167 }
168 self.profiles.insert(Language::English, english_profile);
169
170 let mut spanish_profile = HashMap::new();
172 for (ngram, freq) in &[
173 ("que", 0.04),
174 ("de_", 0.035),
175 ("la_", 0.03),
176 ("el_", 0.025),
177 ("es_", 0.02),
178 ("los", 0.018),
179 ("las", 0.015),
180 ("ión", 0.013),
181 ("ado", 0.012),
182 ("nte", 0.011),
183 ("con", 0.01),
184 ("par", 0.009),
185 ("ara", 0.008),
186 ("una", 0.008),
187 ("por", 0.007),
188 ("est", 0.007),
189 ] {
190 spanish_profile.insert(ngram.to_string(), *freq);
191 }
192 self.profiles.insert(Language::Spanish, spanish_profile);
193
194 let mut french_profile = HashMap::new();
196 for (ngram, freq) in &[
197 ("de_", 0.05),
198 ("le_", 0.04),
199 ("que", 0.03),
200 ("les", 0.025),
201 ("la_", 0.02),
202 ("des", 0.018),
203 ("ent", 0.015),
204 ("ion", 0.013),
205 ("est", 0.012),
206 ("ait", 0.011),
207 ("pour", 0.01),
208 ("ais", 0.009),
209 ("ans", 0.008),
210 ("ont", 0.008),
211 ("une", 0.007),
212 ("qui", 0.007),
213 ] {
214 french_profile.insert(ngram.to_string(), *freq);
215 }
216 self.profiles.insert(Language::French, french_profile);
217
218 let mut german_profile = HashMap::new();
220 for (ngram, freq) in &[
221 ("der", 0.05),
222 ("die", 0.04),
223 ("und", 0.03),
224 ("den", 0.025),
225 ("das", 0.02),
226 ("ein", 0.018),
227 ("ich", 0.015),
228 ("ist", 0.013),
229 ("sch", 0.012),
230 ("cht", 0.011),
231 ("ung", 0.01),
232 ("gen", 0.009),
233 ("eit", 0.008),
234 ("ver", 0.008),
235 ("ber", 0.007),
236 ("ten", 0.007),
237 ] {
238 german_profile.insert(ngram.to_string(), *freq);
239 }
240 self.profiles.insert(Language::German, german_profile);
241
242 let mut italian_profile = HashMap::new();
244 for (ngram, freq) in &[
245 ("che", 0.05),
246 ("la_", 0.04),
247 ("il_", 0.03),
248 ("di_", 0.025),
249 ("del", 0.02),
250 ("le_", 0.018),
251 ("lla", 0.015),
252 ("per", 0.013),
253 ("ato", 0.012),
254 ("gli", 0.011),
255 ("sta", 0.01),
256 ("con", 0.009),
257 ("ent", 0.008),
258 ("ion", 0.008),
259 ("are", 0.007),
260 ("una", 0.007),
261 ] {
262 italian_profile.insert(ngram.to_string(), *freq);
263 }
264 self.profiles.insert(Language::Italian, italian_profile);
265
266 let mut portuguese_profile = HashMap::new();
268 for (ngram, freq) in &[
269 ("que", 0.05),
270 ("de_", 0.04),
271 ("os_", 0.03),
272 ("as_", 0.025),
273 ("da_", 0.02),
274 ("do_", 0.018),
275 ("ão_", 0.015),
276 ("ent", 0.013),
277 ("com", 0.012),
278 ("para", 0.011),
279 ("uma", 0.01),
280 ("est", 0.009),
281 ("nte", 0.008),
282 ("ção", 0.008),
283 ("por", 0.007),
284 ("não", 0.007),
285 ] {
286 portuguese_profile.insert(ngram.to_string(), *freq);
287 }
288 self.profiles
289 .insert(Language::Portuguese, portuguese_profile);
290
291 let mut dutch_profile = HashMap::new();
293 for (ngram, freq) in &[
294 ("de_", 0.05),
295 ("het", 0.04),
296 ("een", 0.03),
297 ("van", 0.025),
298 ("en_", 0.02),
299 ("dat", 0.018),
300 ("te_", 0.015),
301 ("op_", 0.013),
302 ("aar", 0.012),
303 ("oor", 0.011),
304 ("eer", 0.01),
305 ("sch", 0.009),
306 ("ver", 0.008),
307 ("ing", 0.008),
308 ("cht", 0.007),
309 ("ter", 0.007),
310 ] {
311 dutch_profile.insert(ngram.to_string(), *freq);
312 }
313 self.profiles.insert(Language::Dutch, dutch_profile);
314
315 let mut russian_profile = HashMap::new();
317 for (ngram, freq) in &[
318 ("что", 0.05),
319 ("ого", 0.04),
320 ("как", 0.03),
321 ("это", 0.025),
322 ("все", 0.02),
323 ("был", 0.018),
324 ("ени", 0.015),
325 ("ост", 0.013),
326 ("ова", 0.012),
327 ("про", 0.011),
328 ("сто", 0.01),
329 ("ого", 0.009),
330 ("при", 0.008),
331 ("ени", 0.008),
332 ("ать", 0.007),
333 ("ный", 0.007),
334 ] {
335 russian_profile.insert(ngram.to_string(), *freq);
336 }
337 self.profiles.insert(Language::Russian, russian_profile);
338
339 let mut chinese_profile = HashMap::new();
341 for (ngram, freq) in &[
342 ("的_", 0.06),
343 ("是_", 0.045),
344 ("了_", 0.035),
345 ("在_", 0.03),
346 ("和_", 0.025),
347 ("有_", 0.022),
348 ("我_", 0.02),
349 ("他_", 0.018),
350 ("不_", 0.016),
351 ("为_", 0.014),
352 ("这_", 0.013),
353 ("个_", 0.012),
354 ("们_", 0.011),
355 ("人_", 0.01),
356 ("要_", 0.009),
357 ("会_", 0.008),
358 ] {
359 chinese_profile.insert(ngram.to_string(), *freq);
360 }
361 self.profiles.insert(Language::Chinese, chinese_profile);
362
363 let mut japanese_profile = HashMap::new();
365 for (ngram, freq) in &[
366 ("の_", 0.05),
367 ("に_", 0.04),
368 ("は_", 0.035),
369 ("を_", 0.03),
370 ("た_", 0.025),
371 ("と_", 0.022),
372 ("が_", 0.02),
373 ("で_", 0.018),
374 ("る_", 0.016),
375 ("す_", 0.014),
376 ("い_", 0.013),
377 ("ます", 0.012),
378 ("した", 0.011),
379 ("して", 0.01),
380 ("です", 0.009),
381 ("ない", 0.008),
382 ] {
383 japanese_profile.insert(ngram.to_string(), *freq);
384 }
385 self.profiles.insert(Language::Japanese, japanese_profile);
386
387 let mut korean_profile = HashMap::new();
389 for (ngram, freq) in &[
390 ("의_", 0.05),
391 ("이_", 0.04),
392 ("가_", 0.035),
393 ("을_", 0.03),
394 ("는_", 0.025),
395 ("에_", 0.022),
396 ("하_", 0.02),
397 ("고_", 0.018),
398 ("다_", 0.016),
399 ("지_", 0.014),
400 ("한_", 0.013),
401 ("로_", 0.012),
402 ("서_", 0.011),
403 ("도_", 0.01),
404 ("와_", 0.009),
405 ("니_", 0.008),
406 ] {
407 korean_profile.insert(ngram.to_string(), *freq);
408 }
409 self.profiles.insert(Language::Korean, korean_profile);
410
411 let mut arabic_profile = HashMap::new();
413 for (ngram, freq) in &[
414 ("ال_", 0.06),
415 ("في_", 0.045),
416 ("من_", 0.035),
417 ("على", 0.03),
418 ("إلى", 0.025),
419 ("ها_", 0.022),
420 ("أن_", 0.02),
421 ("ما_", 0.018),
422 ("هو_", 0.016),
423 ("كان", 0.014),
424 ("هذا", 0.013),
425 ("عن_", 0.012),
426 ("بين", 0.011),
427 ("لا_", 0.01),
428 ("قد_", 0.009),
429 ("كل_", 0.008),
430 ] {
431 arabic_profile.insert(ngram.to_string(), *freq);
432 }
433 self.profiles.insert(Language::Arabic, arabic_profile);
434 }
435
436 pub fn detect(&self, text: &str) -> Result<LanguageDetectionResult> {
438 if text.trim().is_empty() {
439 return Err(TextError::InvalidInput(
440 "Cannot detect language of empty text".to_string(),
441 ));
442 }
443
444 let text_profile = self.createtext_profile(text);
446
447 let mut scores: Vec<(Language, f64)> = self
449 .profiles
450 .iter()
451 .map(|(lang, profile)| {
452 let score = self.calculate_similarity(&text_profile, profile);
453 (*lang, score)
454 })
455 .collect();
456
457 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
459
460 if scores.is_empty() {
461 return Ok(LanguageDetectionResult {
462 language: Language::Unknown,
463 confidence: 0.0,
464 alternatives: vec![],
465 });
466 }
467
468 let best_score = scores[0].1;
469 let best_language = scores[0].0;
470
471 let confidence = if scores.len() > 1 {
473 let second_score = scores[1].1;
474 let diff = best_score - second_score;
475 (diff / best_score).clamp(0.0, 1.0)
477 } else {
478 best_score
479 };
480
481 Ok(LanguageDetectionResult {
482 language: best_language,
483 confidence,
484 alternatives: scores.into_iter().skip(1).take(3).collect(),
485 })
486 }
487
488 fn createtext_profile(&self, text: &str) -> HashMap<String, f64> {
490 let mut profile = HashMap::new();
491 let text_lower = text.to_lowercase();
492 let chars: Vec<char> = text_lower.chars().collect();
493 let total_ngrams = chars.len().saturating_sub(self.n_gram_size - 1) as f64;
494
495 if total_ngrams <= 0.0 {
496 return profile;
497 }
498
499 let mut ngram_counts: HashMap<String, usize> = HashMap::new();
501 for i in 0..=chars.len().saturating_sub(self.n_gram_size) {
502 let ngram: String = chars[i..i + self.n_gram_size].iter().collect();
503 let ngram = ngram.replace(' ', "_");
505 *ngram_counts.entry(ngram).or_insert(0) += 1;
506 }
507
508 for (ngram, count) in ngram_counts {
510 profile.insert(ngram, count as f64 / total_ngrams);
511 }
512
513 profile
514 }
515
516 fn calculate_similarity(
518 &self,
519 profile1: &HashMap<String, f64>,
520 profile2: &HashMap<String, f64>,
521 ) -> f64 {
522 let mut similarity = 0.0;
523 let mut total_weight = 0.0;
524
525 for (ngram, freq1) in profile1 {
527 if let Some(freq2) = profile2.get(ngram) {
528 similarity += freq1 * freq2;
529 }
530 total_weight += freq1 * freq1;
531 }
532
533 if total_weight > 0.0 {
534 similarity / total_weight.sqrt()
535 } else {
536 0.0
537 }
538 }
539
540 pub fn supported_languages(&self) -> Vec<Language> {
542 self.profiles.keys().copied().collect()
543 }
544}
545
546impl Default for LanguageDetector {
547 fn default() -> Self {
548 Self::new()
549 }
550}
551
/// Per-language stop-word lists (English, Spanish, and French built in).
pub struct StopWords {
    /// Lowercase stop words keyed by language.
    stop_words: HashMap<Language, Vec<String>>,
}
557
558impl StopWords {
559 pub fn new() -> Self {
561 let mut stop_words = HashMap::new();
562
563 stop_words.insert(
565 Language::English,
566 vec![
567 "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in",
568 "is", "it", "its", "of", "on", "that", "the", "to", "was", "will", "with", "you",
569 "your", "this", "have", "had", "been", "but", "not", "they", "were", "what",
570 "when", "where", "who", "which", "their", "them", "these", "those", "there",
571 "here", "than",
572 ]
573 .iter()
574 .map(|s| s.to_string())
575 .collect(),
576 );
577
578 stop_words.insert(
580 Language::Spanish,
581 vec![
582 "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con", "contra",
583 "cual", "cuando", "de", "del", "desde", "donde", "durante", "e", "el", "ella",
584 "ellas", "ellos", "en", "entre", "era", "erais", "eran", "eras", "eres", "es",
585 "esa", "esas", "ese", "eso", "esos", "esta", "estas", "este", "esto", "estos",
586 "fue", "fueron", "fui", "la", "las", "lo", "los", "más", "mi", "mis", "mucho",
587 "muchos", "muy", "ni", "no", "nos", "nosotras", "nosotros", "o", "otra", "otras",
588 "otro", "otros", "para", "pero", "por", "porque", "que", "quien", "quienes", "se",
589 "si", "sin", "sobre", "su", "sus", "también", "tanto", "te", "tu", "tus", "un",
590 "una", "uno", "unos", "y", "ya", "yo",
591 ]
592 .iter()
593 .map(|s| s.to_string())
594 .collect(),
595 );
596
597 stop_words.insert(
599 Language::French,
600 vec![
601 "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et",
602 "eux", "il", "je", "la", "le", "les", "leur", "lui", "ma", "mais", "me", "même",
603 "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas",
604 "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes",
605 "toi", "ton", "tu", "un", "une", "vos", "votre", "vous",
606 ]
607 .iter()
608 .map(|s| s.to_string())
609 .collect(),
610 );
611
612 Self { stop_words }
613 }
614
615 pub fn get(&self, language: Language) -> Option<&Vec<String>> {
617 self.stop_words.get(&language)
618 }
619
620 pub fn is_stop_word(&self, word: &str, language: Language) -> bool {
622 if let Some(words) = self.stop_words.get(&language) {
623 words.iter().any(|sw| sw == &word.to_lowercase())
624 } else {
625 false
626 }
627 }
628
629 pub fn remove_stop_words(&self, tokens: &[String], language: Language) -> Vec<String> {
631 tokens
632 .iter()
633 .filter(|token| !self.is_stop_word(token, language))
634 .cloned()
635 .collect()
636 }
637}
638
639impl Default for StopWords {
640 fn default() -> Self {
641 Self::new()
642 }
643}
644
/// Combines language detection with stop-word filtering in one pipeline.
pub struct MultilingualProcessor {
    /// Detector used to guess the input language.
    detector: LanguageDetector,
    /// Stop-word lists applied after detection.
    stop_words: StopWords,
}
652
653impl MultilingualProcessor {
654 pub fn new() -> Self {
656 Self {
657 detector: LanguageDetector::new(),
658 stop_words: StopWords::new(),
659 }
660 }
661
662 pub fn process(&self, text: &str) -> Result<ProcessedText> {
664 let detection = self.detector.detect(text)?;
666
667 let tokens: Vec<String> = text.split_whitespace().map(|s| s.to_string()).collect();
669
670 let filtered_tokens = self
672 .stop_words
673 .remove_stop_words(&tokens, detection.language);
674
675 Ok(ProcessedText {
676 original: text.to_string(),
677 language: detection.language,
678 confidence: detection.confidence,
679 tokens,
680 filtered_tokens,
681 })
682 }
683}
684
685impl Default for MultilingualProcessor {
686 fn default() -> Self {
687 Self::new()
688 }
689}
690
/// Result of [`MultilingualProcessor::process`].
#[derive(Debug, Clone)]
pub struct ProcessedText {
    /// The input text, unmodified.
    pub original: String,
    /// Language reported by the detector.
    pub language: Language,
    /// Detection confidence (see [`LanguageDetectionResult::confidence`]).
    pub confidence: f64,
    /// Whitespace-split tokens of the original text.
    pub tokens: Vec<String>,
    /// `tokens` with stop words of the detected language removed.
    pub filtered_tokens: Vec<String>,
}
705
#[cfg(test)]
mod tests {
    use super::*;

    /// ISO-code and display-name round-trips for the `Language` enum.
    #[test]
    fn test_language_enum() {
        assert_eq!(Language::English.iso_code(), "en");
        assert_eq!(Language::Spanish.name(), "Spanish");
        assert_eq!(Language::from_iso_code("fr"), Language::French);
        assert_eq!(Language::from_iso_code("unknown"), Language::Unknown);
    }

    /// English prose should be detected; empty input must be an error.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        let result = detector.detect("The quick brown fox jumps over the lazy dog. This is definitely an English sentence with many common words.").expect("Operation failed");
        assert_eq!(result.language, Language::English);

        let empty_result = detector.detect("");
        assert!(empty_result.is_err());
    }

    /// Membership checks and filtering against the English stop-word list.
    #[test]
    fn test_stop_words() {
        let stop_words = StopWords::new();

        assert!(stop_words.is_stop_word("the", Language::English));
        assert!(stop_words.is_stop_word("and", Language::English));
        assert!(!stop_words.is_stop_word("hello", Language::English));

        let tokens = vec![
            "the".to_string(),
            "cat".to_string(),
            "is".to_string(),
            "happy".to_string(),
        ];
        let filtered = stop_words.remove_stop_words(&tokens, Language::English);
        assert_eq!(filtered, vec!["cat", "happy"]);
    }

    /// End-to-end pipeline: detection, tokenization, stop-word filtering.
    #[test]
    fn test_multilingual_processor() {
        let processor = MultilingualProcessor::new();

        let result = processor.process("The quick brown fox jumps over the lazy dog. This sentence has many English words.").expect("Operation failed");
        assert_eq!(result.language, Language::English);
        assert!(!result.tokens.is_empty());
        assert!(result.filtered_tokens.len() < result.tokens.len());
    }

    /// Trigram profiles of short text contain the expected trigrams.
    #[test]
    fn test_createtext_profile() {
        let detector = LanguageDetector::new();
        let profile = detector.createtext_profile("hello world");

        assert!(!profile.is_empty());
        assert!(profile.contains_key("hel") || profile.contains_key("llo"));
    }
}
771
/// Coarse writing-system families recognized by `UnicodeTokenizer::detect_script`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScriptFamily {
    /// ASCII letters plus the U+00C0..=U+024F Latin extension ranges.
    Latin,
    /// Han ideographs (see `is_cjk_char`); kana and hangul are not counted here.
    Cjk,
    /// U+0400..=U+04FF.
    Cyrillic,
    /// U+0600..=U+06FF.
    Arabic,
    /// U+0900..=U+097F.
    Devanagari,
    /// Anything else (digits, symbols, unrecognized scripts).
    Other,
}
794
/// Options controlling [`UnicodeTokenizer::tokenize`].
#[derive(Debug, Clone)]
pub struct UnicodeTokenizerConfig {
    /// Lowercase the text before splitting.
    pub lowercase: bool,
    /// Remove diacritics (via `Transliterator::strip_accents`) before splitting.
    pub strip_accents: bool,
    /// Emit punctuation characters as their own single-character tokens.
    pub split_on_punctuation: bool,
    /// Split tokens on ASCII whitespace and NBSP.
    pub split_on_whitespace: bool,
    /// Truncate tokens longer than this many characters (`None` = unlimited).
    pub max_token_length: Option<usize>,
}
812
813impl Default for UnicodeTokenizerConfig {
814 fn default() -> Self {
815 UnicodeTokenizerConfig {
816 lowercase: true,
817 strip_accents: true,
818 split_on_punctuation: true,
819 split_on_whitespace: true,
820 max_token_length: None,
821 }
822 }
823}
824
/// Configurable Unicode-aware tokenizer with CJK isolation and script detection.
pub struct UnicodeTokenizer {
    /// Options applied by [`UnicodeTokenizer::tokenize`].
    config: UnicodeTokenizerConfig,
}
839
840impl UnicodeTokenizer {
841 pub fn new(config: UnicodeTokenizerConfig) -> Self {
843 UnicodeTokenizer { config }
844 }
845
846 pub fn default_tokenizer() -> Self {
849 Self::new(UnicodeTokenizerConfig::default())
850 }
851
852 pub fn tokenize(&self, text: &str) -> Vec<String> {
863 let spaced = self.insert_cjk_spaces(text);
865
866 let lowered = if self.config.lowercase {
868 spaced.to_lowercase()
869 } else {
870 spaced
871 };
872
873 let stripped = if self.config.strip_accents {
875 Transliterator::strip_accents(&lowered)
876 } else {
877 lowered
878 };
879
880 let mut tokens: Vec<String> = Vec::new();
882 let mut current = String::new();
883
884 for ch in stripped.chars() {
885 let is_ws = ch.is_ascii_whitespace() || ch == '\u{00A0}';
886 let is_punct = self.config.split_on_punctuation && is_unicode_punctuation(ch);
887
888 if (self.config.split_on_whitespace && is_ws) || is_punct {
889 if !current.is_empty() {
890 tokens.push(current.clone());
891 current.clear();
892 }
893 if is_punct {
894 tokens.push(ch.to_string());
895 }
896 } else {
897 current.push(ch);
898 }
899 }
900 if !current.is_empty() {
901 tokens.push(current);
902 }
903
904 tokens.retain(|t| !t.is_empty());
906 if let Some(max_len) = self.config.max_token_length {
907 tokens.iter_mut().for_each(|t| {
908 let char_count = t.chars().count();
909 if char_count > max_len {
910 *t = t.chars().take(max_len).collect();
911 }
912 });
913 }
914
915 tokens
916 }
917
918 pub fn detect_script(&self, text: &str) -> ScriptFamily {
923 let mut latin = 0usize;
924 let mut cjk = 0usize;
925 let mut cyrillic = 0usize;
926 let mut arabic = 0usize;
927 let mut devanagari = 0usize;
928 let mut other = 0usize;
929
930 for ch in text.chars() {
931 if ch.is_whitespace() {
932 continue;
933 }
934 if is_cjk_char(ch) {
935 cjk += 1;
936 } else if is_cyrillic(ch) {
937 cyrillic += 1;
938 } else if is_arabic(ch) {
939 arabic += 1;
940 } else if is_devanagari(ch) {
941 devanagari += 1;
942 } else if ch.is_ascii_alphabetic() || (ch as u32 >= 0x00C0 && ch as u32 <= 0x024F) {
943 latin += 1;
944 } else {
945 other += 1;
946 }
947 }
948
949 let max = [latin, cjk, cyrillic, arabic, devanagari, other]
950 .into_iter()
951 .max()
952 .unwrap_or(0);
953
954 if max == 0 {
955 return ScriptFamily::Other;
956 }
957 if max == cjk {
958 ScriptFamily::Cjk
959 } else if max == cyrillic {
960 ScriptFamily::Cyrillic
961 } else if max == arabic {
962 ScriptFamily::Arabic
963 } else if max == devanagari {
964 ScriptFamily::Devanagari
965 } else if max == latin {
966 ScriptFamily::Latin
967 } else {
968 ScriptFamily::Other
969 }
970 }
971
972 pub fn tokenize_cjk(&self, text: &str) -> Vec<String> {
977 let spaced = self.insert_cjk_spaces(text);
978 spaced
979 .split_whitespace()
980 .map(|s| s.to_string())
981 .filter(|s| !s.is_empty())
982 .collect()
983 }
984
985 fn insert_cjk_spaces(&self, text: &str) -> String {
988 let mut out = String::with_capacity(text.len() + text.chars().count());
989 for ch in text.chars() {
990 if is_cjk_char(ch) {
991 out.push(' ');
992 out.push(ch);
993 out.push(' ');
994 } else {
995 out.push(ch);
996 }
997 }
998 out
999 }
1000}
1001
1002impl Default for UnicodeTokenizer {
1003 fn default() -> Self {
1004 Self::default_tokenizer()
1005 }
1006}
1007
/// Namespace for stateless transliteration and normalization helpers.
pub struct Transliterator;
1012
1013impl Transliterator {
1014 pub fn cjk_to_latin(text: &str) -> String {
1019 text.chars()
1020 .map(|c| {
1021 if let Some(roman) = cjk_pinyin_lookup(c) {
1022 roman.to_string()
1023 } else {
1024 c.to_string()
1025 }
1026 })
1027 .collect::<Vec<_>>()
1028 .join("")
1029 }
1030
    /// Transliterates Cyrillic text to a Latin approximation.
    ///
    /// Each character is looked up in lowercase form; if the source char was
    /// uppercase, only the first letter of the (possibly multi-letter)
    /// romanization is re-uppercased (e.g. 'Щ' -> "Shch"). Characters
    /// without a mapping are copied through unchanged.
    pub fn cyrillic_to_latin(text: &str) -> String {
        let mut out = String::with_capacity(text.len() * 2);
        for ch in text.chars() {
            // NOTE(review): to_lowercase().next() keeps only the first char of
            // a multi-char lowercase expansion; Cyrillic maps 1:1 so this is fine.
            let lower = ch.to_lowercase().next().unwrap_or(ch);
            if let Some(roman) = cyrillic_lookup(lower) {
                if ch.is_uppercase() {
                    // Capitalize the first letter of the romanization; the
                    // hard/soft signs map to "" and contribute nothing.
                    let mut chars = roman.chars();
                    if let Some(first) = chars.next() {
                        for c in first.to_uppercase() {
                            out.push(c);
                        }
                        out.push_str(chars.as_str());
                    }
                } else {
                    out.push_str(roman);
                }
            } else {
                out.push(ch);
            }
        }
        out
    }
1059
1060 pub fn strip_accents(text: &str) -> String {
1066 text.chars()
1067 .flat_map(nfd_decompose)
1068 .filter(|&c| !is_combining_mark(c))
1069 .collect()
1070 }
1071
1072 pub fn normalize(text: &str) -> String {
1075 let lowered = text.to_lowercase();
1076 lowered.split_whitespace().collect::<Vec<_>>().join(" ")
1077 }
1078}
1079
/// True for Han ideographs: the CJK Unified blocks (base plus Extensions
/// A and B) and the compatibility ideograph blocks. Kana and Hangul are
/// deliberately NOT included.
pub fn is_cjk_char(c: char) -> bool {
    matches!(
        c as u32,
        0x4E00..=0x9FFF          // CJK Unified Ideographs
            | 0x3400..=0x4DBF    // Extension A
            | 0x20000..=0x2A6DF  // Extension B
            | 0xF900..=0xFAFF    // Compatibility Ideographs
            | 0x2F800..=0x2FA1F  // Compatibility Supplement
    )
}
1096
/// True for characters in the basic Cyrillic block (U+0400..=U+04FF).
pub fn is_cyrillic(c: char) -> bool {
    ('\u{0400}'..='\u{04FF}').contains(&c)
}
1102
/// True for the basic Arabic block (U+0600..=U+06FF).
/// NOTE(review): supplements and presentation forms are not covered — confirm
/// that is acceptable for the detector's purposes.
fn is_arabic(c: char) -> bool {
    ('\u{0600}'..='\u{06FF}').contains(&c)
}
1108
/// True for the basic Devanagari block (U+0900..=U+097F).
fn is_devanagari(c: char) -> bool {
    ('\u{0900}'..='\u{097F}').contains(&c)
}
1114
/// True for combining diacritical marks: the main block, its supplement,
/// combining marks for symbols, and combining half marks.
pub fn is_combining_mark(c: char) -> bool {
    matches!(
        c as u32,
        0x0300..=0x036F | 0x1DC0..=0x1DFF | 0x20D0..=0x20FF | 0xFE20..=0xFE2F
    )
}
1126
/// True for the ASCII/typographic punctuation this tokenizer splits on,
/// plus anything in the Unicode General Punctuation block (U+2000..=U+206F).
fn is_unicode_punctuation(c: char) -> bool {
    // The explicit typographic characters (…, —, –, curly quotes) also fall
    // inside the General Punctuation range; they are kept here for clarity.
    const PUNCTUATION: &str =
        "!\"#%&'()*,-./:;?@[\\]_{}~·…—–\u{2018}\u{2019}\u{201C}\u{201D}";
    PUNCTUATION.contains(c) || ('\u{2000}'..='\u{206F}').contains(&c)
}
1164
/// Lowercase Cyrillic -> Latin romanization table. The hard and soft signs
/// ('ъ', 'ь') map to the empty string: they have no Latin counterpart.
const CYRILLIC_ROMAN: &[(char, &str)] = &[
    ('а', "a"), ('б', "b"), ('в', "v"), ('г', "g"), ('д', "d"), ('е', "ye"),
    ('ё', "yo"), ('ж', "zh"), ('з', "z"), ('и', "i"), ('й', "y"), ('к', "k"),
    ('л', "l"), ('м', "m"), ('н', "n"), ('о', "o"), ('п', "p"), ('р', "r"),
    ('с', "s"), ('т', "t"), ('у', "u"), ('ф', "f"), ('х', "kh"), ('ц', "ts"),
    ('ч', "ch"), ('ш', "sh"), ('щ', "shch"), ('ъ', ""), ('ы', "y"), ('ь', ""),
    ('э', "e"), ('ю', "yu"), ('я', "ya"),
];

/// Looks up the romanization for a lowercase Cyrillic letter; `None` for
/// anything not in the table.
fn cyrillic_lookup(c: char) -> Option<&'static str> {
    CYRILLIC_ROMAN
        .iter()
        .find(|&&(cyr, _)| cyr == c)
        .map(|&(_, roman)| roman)
}
1206
/// Decomposes a precomposed Western European Latin letter into its base
/// character followed by a combining mark (a small hard-coded subset of
/// Unicode NFD). Characters without an entry are yielded unchanged.
fn nfd_decompose(c: char) -> impl Iterator<Item = char> {
    // Cleanup: the original modeled an impossible `Some((base, None))` case
    // the table never produced; entries are always base + combining pairs.
    let decomp: Option<[char; 2]> = match c {
        'À' => Some(['A', '\u{0300}']),
        'Á' => Some(['A', '\u{0301}']),
        'Â' => Some(['A', '\u{0302}']),
        'Ã' => Some(['A', '\u{0303}']),
        'Ä' => Some(['A', '\u{0308}']),
        'Å' => Some(['A', '\u{030A}']),
        'à' => Some(['a', '\u{0300}']),
        'á' => Some(['a', '\u{0301}']),
        'â' => Some(['a', '\u{0302}']),
        'ã' => Some(['a', '\u{0303}']),
        'ä' => Some(['a', '\u{0308}']),
        'å' => Some(['a', '\u{030A}']),
        'È' => Some(['E', '\u{0300}']),
        'É' => Some(['E', '\u{0301}']),
        'Ê' => Some(['E', '\u{0302}']),
        'Ë' => Some(['E', '\u{0308}']),
        'è' => Some(['e', '\u{0300}']),
        'é' => Some(['e', '\u{0301}']),
        'ê' => Some(['e', '\u{0302}']),
        'ë' => Some(['e', '\u{0308}']),
        'Ì' => Some(['I', '\u{0300}']),
        'Í' => Some(['I', '\u{0301}']),
        'Î' => Some(['I', '\u{0302}']),
        'Ï' => Some(['I', '\u{0308}']),
        'ì' => Some(['i', '\u{0300}']),
        'í' => Some(['i', '\u{0301}']),
        'î' => Some(['i', '\u{0302}']),
        'ï' => Some(['i', '\u{0308}']),
        'Ò' => Some(['O', '\u{0300}']),
        'Ó' => Some(['O', '\u{0301}']),
        'Ô' => Some(['O', '\u{0302}']),
        'Õ' => Some(['O', '\u{0303}']),
        'Ö' => Some(['O', '\u{0308}']),
        'ò' => Some(['o', '\u{0300}']),
        'ó' => Some(['o', '\u{0301}']),
        'ô' => Some(['o', '\u{0302}']),
        'õ' => Some(['o', '\u{0303}']),
        'ö' => Some(['o', '\u{0308}']),
        'Ù' => Some(['U', '\u{0300}']),
        'Ú' => Some(['U', '\u{0301}']),
        'Û' => Some(['U', '\u{0302}']),
        'Ü' => Some(['U', '\u{0308}']),
        'ù' => Some(['u', '\u{0300}']),
        'ú' => Some(['u', '\u{0301}']),
        'û' => Some(['u', '\u{0302}']),
        'ü' => Some(['u', '\u{0308}']),
        'Ñ' => Some(['N', '\u{0303}']),
        'ñ' => Some(['n', '\u{0303}']),
        'Ç' => Some(['C', '\u{0327}']),
        'ç' => Some(['c', '\u{0327}']),
        'Ý' => Some(['Y', '\u{0301}']),
        'ý' => Some(['y', '\u{0301}']),
        'ÿ' => Some(['y', '\u{0308}']),
        _ => None,
    };

    // Both arms produce `std::vec::IntoIter<char>`, keeping one return type.
    match decomp {
        Some(pair) => pair.to_vec().into_iter(),
        None => vec![c].into_iter(),
    }
}
1287
/// Toneless pinyin for a small set of very common Chinese characters;
/// `None` for anything not in the table.
///
/// NOTE(review): characters with multiple readings (e.g. 的, 得, 着) use a
/// single fixed reading here — acceptable only for rough transliteration.
fn cjk_pinyin_lookup(c: char) -> Option<&'static str> {
    match c {
        '的' => Some("de"),
        '一' => Some("yi"),
        '是' => Some("shi"),
        '不' => Some("bu"),
        '了' => Some("le"),
        '人' => Some("ren"),
        '我' => Some("wo"),
        '在' => Some("zai"),
        '有' => Some("you"),
        '他' => Some("ta"),
        '这' => Some("zhe"),
        '中' => Some("zhong"),
        '大' => Some("da"),
        '来' => Some("lai"),
        '上' => Some("shang"),
        '国' => Some("guo"),
        '个' => Some("ge"),
        '到' => Some("dao"),
        '说' => Some("shuo"),
        '们' => Some("men"),
        '为' => Some("wei"),
        '子' => Some("zi"),
        '和' => Some("he"),
        '你' => Some("ni"),
        '地' => Some("di"),
        '出' => Some("chu"),
        '道' => Some("dao"),
        '也' => Some("ye"),
        '时' => Some("shi"),
        '年' => Some("nian"),
        '得' => Some("de"),
        '就' => Some("jiu"),
        '那' => Some("na"),
        '要' => Some("yao"),
        '下' => Some("xia"),
        '以' => Some("yi"),
        '生' => Some("sheng"),
        '会' => Some("hui"),
        '自' => Some("zi"),
        '着' => Some("zhe"),
        '去' => Some("qu"),
        '之' => Some("zhi"),
        '过' => Some("guo"),
        '家' => Some("jia"),
        '学' => Some("xue"),
        '对' => Some("dui"),
        '可' => Some("ke"),
        '她' => Some("ta"),
        '里' => Some("li"),
        '后' => Some("hou"),
        '小' => Some("xiao"),
        '么' => Some("me"),
        '心' => Some("xin"),
        '多' => Some("duo"),
        '天' => Some("tian"),
        '而' => Some("er"),
        '能' => Some("neng"),
        '好' => Some("hao"),
        '都' => Some("dou"),
        '然' => Some("ran"),
        _ => None,
    }
}
1357
#[cfg(test)]
mod unicode_tests {
    use super::*;

    /// Whitespace-only splitting on plain ASCII.
    #[test]
    fn tokenize_splits_simple_english() {
        let tok = UnicodeTokenizer::new(UnicodeTokenizerConfig {
            lowercase: true,
            strip_accents: false,
            split_on_punctuation: false,
            split_on_whitespace: true,
            max_token_length: None,
        });
        let tokens = tok.tokenize("hello world");
        assert_eq!(
            tokens,
            vec!["hello", "world"],
            "simple English sentence must split on whitespace"
        );
    }

    /// CJK ideographs become individual tokens; Latin words stay whole.
    #[test]
    fn tokenize_cjk_each_char_is_token() {
        let tok = UnicodeTokenizer::default();
        let tokens = tok.tokenize_cjk("中文 hello");
        assert!(
            tokens.contains(&"中".to_string()),
            "CJK char '中' must be a token"
        );
        assert!(
            tokens.contains(&"文".to_string()),
            "CJK char '文' must be a token"
        );
        assert!(
            tokens.contains(&"hello".to_string()),
            "'hello' must be a token"
        );
    }

    #[test]
    fn detect_script_latin() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("hello world"),
            ScriptFamily::Latin,
            "ASCII text must detect as Latin"
        );
    }

    #[test]
    fn detect_script_cyrillic() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("привет мир"),
            ScriptFamily::Cyrillic,
            "Cyrillic text must detect as Cyrillic"
        );
    }

    #[test]
    fn detect_script_cjk() {
        let tok = UnicodeTokenizer::default();
        assert_eq!(
            tok.detect_script("中文"),
            ScriptFamily::Cjk,
            "CJK text must detect as Cjk"
        );
    }

    /// Punctuation splitting keeps the word tokens intact.
    #[test]
    fn tokenize_with_punctuation_split() {
        let tok = UnicodeTokenizer::default();
        let tokens = tok.tokenize("hello, world!");
        assert!(
            tokens.contains(&"hello".to_string()),
            "must contain 'hello'"
        );
        assert!(
            tokens.contains(&"world".to_string()),
            "must contain 'world'"
        );
    }

    /// Tokens longer than the cap are truncated, not dropped.
    #[test]
    fn tokenize_max_token_length() {
        let tok = UnicodeTokenizer::new(UnicodeTokenizerConfig {
            lowercase: false,
            strip_accents: false,
            split_on_punctuation: false,
            split_on_whitespace: true,
            max_token_length: Some(3),
        });
        let tokens = tok.tokenize("hello world");
        for t in &tokens {
            assert!(
                t.chars().count() <= 3,
                "token '{t}' exceeds max_token_length=3"
            );
        }
    }

    /// 'п','р','и','в' romanize to "priv" (the 'е' -> "ye" mapping follows).
    #[test]
    fn cyrillic_to_latin_privet() {
        let result = Transliterator::cyrillic_to_latin("привет");
        assert!(
            result.starts_with("priv"),
            "transliteration of 'привет' must start with 'priv', got '{result}'"
        );
    }

    #[test]
    fn cyrillic_to_latin_basic_letters() {
        assert_eq!(Transliterator::cyrillic_to_latin("а"), "a");
        assert_eq!(Transliterator::cyrillic_to_latin("б"), "b");
        assert_eq!(Transliterator::cyrillic_to_latin("с"), "s");
        assert_eq!(Transliterator::cyrillic_to_latin("т"), "t");
    }

    #[test]
    fn strip_accents_cafe() {
        let result = Transliterator::strip_accents("café");
        assert_eq!(
            result, "cafe",
            "strip_accents('café') must return 'cafe', got '{result}'"
        );
    }

    #[test]
    fn strip_accents_no_accents_unchanged() {
        let result = Transliterator::strip_accents("hello");
        assert_eq!(result, "hello", "plain ASCII must be unchanged");
    }

    #[test]
    fn transliterator_normalize_collapses_whitespace() {
        let result = Transliterator::normalize("  Hello   World  ");
        assert_eq!(
            result, "hello world",
            "normalize must trim and collapse spaces"
        );
    }

    #[test]
    fn strip_accents_german_umlaut() {
        let result = Transliterator::strip_accents("über");
        assert_eq!(result, "uber", "ü must become u after accent stripping");
    }
}