1use unicode_normalization::UnicodeNormalization;
46
47pub mod cyrillic;
48pub mod hepburn;
49pub mod pinyin;
50
51pub use cyrillic::{CyrillicScheme, CyrillicTransliterator};
52pub use hepburn::HepburnTransliterator;
53pub use pinyin::{PinyinStyle, PinyinTransliterator};
54
/// A type that can turn a borrowed string into a transliterated, owned one.
///
/// The trait has a single required method and takes `&self`, so it can be
/// used behind a `dyn Transliterator` reference as well as generically.
pub trait Transliterator {
    /// Returns the transliteration of `input` as a newly allocated `String`.
    ///
    /// `input` is only borrowed; it is never modified.
    fn transliterate(&self, input: &str) -> String;
}
66
/// The written forms of Japanese distinguished by [`Script::Japanese`].
///
/// Marked `#[non_exhaustive]`, so code outside this crate must include a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum JapaneseScript {
    /// Hiragana; `ScriptTransliterator::transliterate` looks these characters
    /// up in `HIRAGANA_TO_ROMAJI`.
    Hiragana,
    /// Katakana; `ScriptTransliterator::transliterate` looks these characters
    /// up in `KATAKANA_TO_ROMAJI`.
    Katakana,
    /// Already-romanized text; `ScriptTransliterator::transliterate` handles
    /// it exactly like `Script::Latin`.
    Romaji,
}
80
/// Romanization systems that can be attached to [`Script::Chinese`].
///
/// Marked `#[non_exhaustive]`, so code outside this crate must include a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ChineseSystem {
    /// The variant `ScriptTransliterator::detect_script` reports for CJK
    /// ideographs.
    Pinyin,
    // Silences the naming lint triggered by the underscore in the variant
    // name; the name itself is part of the public API and is kept as-is.
    #[allow(non_camel_case_types)]
    /// Wade–Giles variant. Nothing in this file constructs or matches it
    /// explicitly.
    Wade_Giles,
}
91
92#[derive(Debug, Clone, PartialEq, Eq)]
94#[non_exhaustive]
95pub enum Script {
96 Cyrillic,
98 Greek,
100 Arabic,
102 Hebrew,
104 Japanese(JapaneseScript),
106 Korean,
108 Chinese(ChineseSystem),
110 Latin,
112}
113
/// Options accepted by `ScriptTransliterator::new`.
#[derive(Debug, Clone)]
pub struct TranslitConfig {
    /// Case-preservation switch; `true` by default.
    ///
    /// NOTE(review): nothing in this file reads this flag — presumably it is
    /// consumed by the scheme-specific submodules; confirm.
    pub preserve_case: bool,
    /// When `true`, `ScriptTransliterator::transliterate` passes its output
    /// through `strip_diacritics`; `false` by default.
    pub strip_diacritics: bool,
}

impl Default for TranslitConfig {
    /// Keeps case and keeps diacritics.
    fn default() -> Self {
        TranslitConfig {
            preserve_case: true,
            strip_diacritics: false,
        }
    }
}
131
/// Cyrillic-to-Latin lookup table as `(source, replacement)` pairs.
///
/// Holds the 33 lowercase letters followed by their 33 uppercase
/// counterparts. Uppercase letters with a multi-letter replacement are
/// title-cased (`"Zh"`, `"Shh"`). The hard sign maps to U+2033 DOUBLE PRIME
/// and the soft sign to U+2032 PRIME, in both cases.
///
/// `ScriptTransliterator::transliterate` scans this table linearly, so entry
/// order is significant.
#[rustfmt::skip]
pub static CYRILLIC_TO_LATIN: &[(&str, &str)] = &[
    // Lowercase.
    ("а", "a"),  ("б", "b"),  ("в", "v"),  ("г", "g"),  ("д", "d"),
    ("е", "je"), ("ё", "jo"), ("ж", "zh"), ("з", "z"),  ("и", "i"),
    ("й", "j"),  ("к", "k"),  ("л", "l"),  ("м", "m"),  ("н", "n"),
    ("о", "o"),  ("п", "p"),  ("р", "r"),  ("с", "s"),  ("т", "t"),
    ("у", "u"),  ("ф", "f"),  ("х", "h"),  ("ц", "c"),  ("ч", "ch"),
    ("ш", "sh"), ("щ", "shh"),
    ("ъ", "\u{2033}"), ("ы", "y"),
    ("ь", "\u{2032}"), ("э", "eh"),
    ("ю", "ju"), ("я", "ja"),
    // Uppercase.
    ("А", "A"),  ("Б", "B"),  ("В", "V"),  ("Г", "G"),  ("Д", "D"),
    ("Е", "Je"), ("Ё", "Jo"), ("Ж", "Zh"), ("З", "Z"),  ("И", "I"),
    ("Й", "J"),  ("К", "K"),  ("Л", "L"),  ("М", "M"),  ("Н", "N"),
    ("О", "O"),  ("П", "P"),  ("Р", "R"),  ("С", "S"),  ("Т", "T"),
    ("У", "U"),  ("Ф", "F"),  ("Х", "H"),  ("Ц", "C"),  ("Ч", "Ch"),
    ("Ш", "Sh"), ("Щ", "Shh"),
    ("Ъ", "\u{2033}"), ("Ы", "Y"),
    ("Ь", "\u{2032}"), ("Э", "Eh"),
    ("Ю", "Ju"), ("Я", "Ja"),
];
205
/// Greek-to-Latin lookup table as `(source, replacement)` pairs.
///
/// Holds the 24 lowercase letters plus final sigma (`ς`), followed by the 24
/// uppercase letters. Eta and omega map to macron vowels (`ē`/`Ē`, `ō`/`Ō`),
/// written here as `\u{…}` escapes. Only the bare letters are listed;
/// precomposed accented forms have no entry in this table.
///
/// `ScriptTransliterator::transliterate` scans this table linearly, so entry
/// order is significant.
#[rustfmt::skip]
pub static GREEK_TO_LATIN: &[(&str, &str)] = &[
    // Lowercase.
    ("α", "a"),  ("β", "b"),  ("γ", "g"),  ("δ", "d"),  ("ε", "e"),  ("ζ", "z"),
    ("η", "\u{0113}"), ("θ", "th"),
    ("ι", "i"),  ("κ", "k"),  ("λ", "l"),  ("μ", "m"),  ("ν", "n"),  ("ξ", "x"),
    ("ο", "o"),  ("π", "p"),  ("ρ", "r"),  ("σ", "s"),
    ("ς", "s"),  ("τ", "t"),
    ("υ", "y"),  ("φ", "ph"), ("χ", "ch"), ("ψ", "ps"),
    ("ω", "\u{014D}"),
    // Uppercase.
    ("Α", "A"),  ("Β", "B"),  ("Γ", "G"),  ("Δ", "D"),  ("Ε", "E"),  ("Ζ", "Z"),
    ("Η", "\u{0112}"), ("Θ", "Th"),
    ("Ι", "I"),  ("Κ", "K"),  ("Λ", "L"),  ("Μ", "M"),  ("Ν", "N"),  ("Ξ", "X"),
    ("Ο", "O"),  ("Π", "P"),  ("Ρ", "R"),  ("Σ", "S"),  ("Τ", "T"),
    ("Υ", "Y"),  ("Φ", "Ph"), ("Χ", "Ch"), ("Ψ", "Ps"),
    ("Ω", "\u{014C}"),
];
259
/// Hiragana-to-romaji lookup table as `(kana, romaji)` pairs.
///
/// Rows appear in this order: the 46 basic kana, the voiced g/z/d/b rows,
/// the p row, and finally the five small vowels, which are spelled with an
/// `x` prefix (`"xa"` … `"xo"`). Every key is a single kana; there are no
/// entries for two-kana combinations.
///
/// NOTE(review): `ぢ` and `づ` are spelled "di" and "du" while the rest of the
/// table uses shi/chi/tsu/fu/ji-style spellings — confirm this mix is
/// intentional.
///
/// `ScriptTransliterator::transliterate` scans this table linearly, so entry
/// order is significant.
#[rustfmt::skip]
pub static HIRAGANA_TO_ROMAJI: &[(&str, &str)] = &[
    // Basic kana.
    ("あ", "a"),  ("い", "i"),   ("う", "u"),   ("え", "e"),  ("お", "o"),
    ("か", "ka"), ("き", "ki"),  ("く", "ku"),  ("け", "ke"), ("こ", "ko"),
    ("さ", "sa"), ("し", "shi"), ("す", "su"),  ("せ", "se"), ("そ", "so"),
    ("た", "ta"), ("ち", "chi"), ("つ", "tsu"), ("て", "te"), ("と", "to"),
    ("な", "na"), ("に", "ni"),  ("ぬ", "nu"),  ("ね", "ne"), ("の", "no"),
    ("は", "ha"), ("ひ", "hi"),  ("ふ", "fu"),  ("へ", "he"), ("ほ", "ho"),
    ("ま", "ma"), ("み", "mi"),  ("む", "mu"),  ("め", "me"), ("も", "mo"),
    ("や", "ya"), ("ゆ", "yu"),  ("よ", "yo"),
    ("ら", "ra"), ("り", "ri"),  ("る", "ru"),  ("れ", "re"), ("ろ", "ro"),
    ("わ", "wa"), ("を", "wo"),
    ("ん", "n"),
    // Voiced rows.
    ("が", "ga"), ("ぎ", "gi"),  ("ぐ", "gu"),  ("げ", "ge"), ("ご", "go"),
    ("ざ", "za"), ("じ", "ji"),  ("ず", "zu"),  ("ぜ", "ze"), ("ぞ", "zo"),
    ("だ", "da"), ("ぢ", "di"),  ("づ", "du"),  ("で", "de"), ("ど", "do"),
    ("ば", "ba"), ("び", "bi"),  ("ぶ", "bu"),  ("べ", "be"), ("ぼ", "bo"),
    // P row.
    ("ぱ", "pa"), ("ぴ", "pi"),  ("ぷ", "pu"),  ("ぺ", "pe"), ("ぽ", "po"),
    // Small vowels.
    ("ぁ", "xa"), ("ぃ", "xi"),  ("ぅ", "xu"),  ("ぇ", "xe"), ("ぉ", "xo"),
];
342
/// Katakana-to-romaji lookup table as `(kana, romaji)` pairs.
///
/// Rows appear in this order: the 46 basic kana, the voiced g/z/d/b rows,
/// and the p row. Unlike `HIRAGANA_TO_ROMAJI`, this table has no entries for
/// the small vowels. Every key is a single kana; there are no entries for
/// two-kana combinations.
///
/// NOTE(review): `ヂ` and `ヅ` are spelled "di" and "du" while the rest of the
/// table uses shi/chi/tsu/fu/ji-style spellings — confirm this mix is
/// intentional.
///
/// `ScriptTransliterator::transliterate` scans this table linearly, so entry
/// order is significant.
#[rustfmt::skip]
pub static KATAKANA_TO_ROMAJI: &[(&str, &str)] = &[
    // Basic kana.
    ("ア", "a"),  ("イ", "i"),   ("ウ", "u"),   ("エ", "e"),  ("オ", "o"),
    ("カ", "ka"), ("キ", "ki"),  ("ク", "ku"),  ("ケ", "ke"), ("コ", "ko"),
    ("サ", "sa"), ("シ", "shi"), ("ス", "su"),  ("セ", "se"), ("ソ", "so"),
    ("タ", "ta"), ("チ", "chi"), ("ツ", "tsu"), ("テ", "te"), ("ト", "to"),
    ("ナ", "na"), ("ニ", "ni"),  ("ヌ", "nu"),  ("ネ", "ne"), ("ノ", "no"),
    ("ハ", "ha"), ("ヒ", "hi"),  ("フ", "fu"),  ("ヘ", "he"), ("ホ", "ho"),
    ("マ", "ma"), ("ミ", "mi"),  ("ム", "mu"),  ("メ", "me"), ("モ", "mo"),
    ("ヤ", "ya"), ("ユ", "yu"),  ("ヨ", "yo"),
    ("ラ", "ra"), ("リ", "ri"),  ("ル", "ru"),  ("レ", "re"), ("ロ", "ro"),
    ("ワ", "wa"), ("ヲ", "wo"),
    ("ン", "n"),
    // Voiced rows.
    ("ガ", "ga"), ("ギ", "gi"),  ("グ", "gu"),  ("ゲ", "ge"), ("ゴ", "go"),
    ("ザ", "za"), ("ジ", "ji"),  ("ズ", "zu"),  ("ゼ", "ze"), ("ゾ", "zo"),
    ("ダ", "da"), ("ヂ", "di"),  ("ヅ", "du"),  ("デ", "de"), ("ド", "do"),
    ("バ", "ba"), ("ビ", "bi"),  ("ブ", "bu"),  ("ベ", "be"), ("ボ", "bo"),
    // P row.
    ("パ", "pa"), ("ピ", "pi"),  ("プ", "pu"),  ("ペ", "pe"), ("ポ", "po"),
];
419
420pub struct ScriptTransliterator {
427 config: TranslitConfig,
428}
429
430impl ScriptTransliterator {
431 pub fn new(config: TranslitConfig) -> Self {
433 Self { config }
434 }
435
436 pub fn transliterate(&self, text: &str, from: &Script) -> String {
440 let table: &[(&str, &str)] = match from {
441 Script::Cyrillic => CYRILLIC_TO_LATIN,
442 Script::Greek => GREEK_TO_LATIN,
443 Script::Japanese(JapaneseScript::Hiragana) => HIRAGANA_TO_ROMAJI,
444 Script::Japanese(JapaneseScript::Katakana) => KATAKANA_TO_ROMAJI,
445 Script::Japanese(JapaneseScript::Romaji) | Script::Latin => {
446 return if self.config.strip_diacritics {
448 strip_diacritics(text)
449 } else {
450 text.to_string()
451 };
452 }
453 _ => {
454 return text.to_string();
456 }
457 };
458
459 let mut result = String::with_capacity(text.len() * 2);
460 let chars: Vec<char> = text.chars().collect();
461 let mut i = 0;
462 'outer: while i < chars.len() {
463 let mut candidate = String::new();
466 for &ch in &chars[i..] {
467 candidate.push(ch);
468 let any_prefix = table
470 .iter()
471 .any(|(src, _)| src.starts_with(candidate.as_str()));
472 if !any_prefix {
473 break;
474 }
475 }
476 let remaining: String = chars[i..].iter().collect();
478 for (src, dst) in table.iter() {
479 if remaining.starts_with(src) {
480 result.push_str(dst);
481 i += src.chars().count();
482 continue 'outer;
483 }
484 }
485 result.push(chars[i]);
487 i += 1;
488 }
489
490 if self.config.strip_diacritics {
491 strip_diacritics(&result)
492 } else {
493 result
494 }
495 }
496
497 pub fn detect_script(text: &str) -> Script {
501 let mut cyrillic = 0usize;
502 let mut greek = 0usize;
503 let mut arabic = 0usize;
504 let mut hebrew = 0usize;
505 let mut hiragana = 0usize;
506 let mut katakana = 0usize;
507 let mut hangul = 0usize;
508 let mut cjk = 0usize;
509
510 for ch in text.chars() {
511 let cp = ch as u32;
512 if (0x0400..=0x04FF).contains(&cp) {
513 cyrillic += 1;
514 } else if (0x0370..=0x03FF).contains(&cp) {
515 greek += 1;
516 } else if (0x0600..=0x06FF).contains(&cp) {
517 arabic += 1;
518 } else if (0x0590..=0x05FF).contains(&cp) {
519 hebrew += 1;
520 } else if (0x3040..=0x309F).contains(&cp) {
521 hiragana += 1;
522 } else if (0x30A0..=0x30FF).contains(&cp) {
523 katakana += 1;
524 } else if (0xAC00..=0xD7AF).contains(&cp) {
525 hangul += 1;
526 } else if (0x4E00..=0x9FFF).contains(&cp) {
527 cjk += 1;
528 }
529 }
530
531 let scores: [(usize, fn() -> Script); 8] = [
533 (cyrillic, || Script::Cyrillic),
534 (greek, || Script::Greek),
535 (arabic, || Script::Arabic),
536 (hebrew, || Script::Hebrew),
537 (hiragana, || Script::Japanese(JapaneseScript::Hiragana)),
538 (katakana, || Script::Japanese(JapaneseScript::Katakana)),
539 (hangul, || Script::Korean),
540 (cjk, || Script::Chinese(ChineseSystem::Pinyin)),
541 ];
542
543 let best = scores.iter().max_by_key(|(count, _)| *count);
544
545 match best {
546 Some((count, make_script)) if *count > 0 => make_script(),
547 _ => Script::Latin,
548 }
549 }
550}
551
552pub fn strip_diacritics(s: &str) -> String {
557 s.nfd()
558 .filter(|ch| {
559 let cp = *ch as u32;
560 !(0x0300..=0x036F).contains(&cp)
561 })
562 .nfc()
563 .collect()
564}
565
/// Unit tests for script detection, table-driven transliteration and
/// diacritic stripping.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a transliterator with `TranslitConfig::default()`.
    fn default_translit() -> ScriptTransliterator {
        ScriptTransliterator::new(TranslitConfig::default())
    }

    /// Shorthand for the associated detection function.
    fn detect(text: &str) -> Script {
        ScriptTransliterator::detect_script(text)
    }

    #[test]
    fn test_detect_cyrillic() {
        assert_eq!(detect("Привет"), Script::Cyrillic);
    }

    #[test]
    fn test_detect_greek() {
        assert_eq!(detect("αβγδ"), Script::Greek);
    }

    #[test]
    fn test_detect_hiragana() {
        assert_eq!(
            detect("あいうえお"),
            Script::Japanese(JapaneseScript::Hiragana)
        );
    }

    #[test]
    fn test_detect_katakana() {
        assert_eq!(
            detect("アイウエオ"),
            Script::Japanese(JapaneseScript::Katakana)
        );
    }

    #[test]
    fn test_detect_latin_fallback() {
        assert_eq!(detect("hello world"), Script::Latin);
    }

    #[test]
    fn test_transliterate_cyrillic() {
        let romanized = default_translit().transliterate("привет", &Script::Cyrillic);
        // The prime characters are what the table emits for soft/hard signs.
        let latin_like = |c: char| c.is_ascii() || c == '\u{2032}' || c == '\u{2033}';
        assert!(
            romanized.chars().all(latin_like),
            "Cyrillic should transliterate to Latin-like chars, got: {}",
            romanized
        );
        assert!(!romanized.is_empty());
    }

    #[test]
    fn test_transliterate_cyrillic_known() {
        let translit = default_translit();
        for (input, expected) in [("а", "a"), ("б", "b"), ("ш", "sh")] {
            assert_eq!(translit.transliterate(input, &Script::Cyrillic), expected);
        }
    }

    #[test]
    fn test_transliterate_hiragana_aiu() {
        let script = Script::Japanese(JapaneseScript::Hiragana);
        assert_eq!(default_translit().transliterate("あいう", &script), "aiu");
    }

    #[test]
    fn test_transliterate_hiragana_full_word() {
        let script = Script::Japanese(JapaneseScript::Hiragana);
        assert_eq!(default_translit().transliterate("さくら", &script), "sakura");
    }

    #[test]
    fn test_transliterate_katakana() {
        let script = Script::Japanese(JapaneseScript::Katakana);
        assert_eq!(default_translit().transliterate("アイウ", &script), "aiu");
    }

    #[test]
    fn test_transliterate_greek() {
        assert_eq!(default_translit().transliterate("αβγ", &Script::Greek), "abg");
    }

    #[test]
    fn test_strip_diacritics() {
        assert_eq!(strip_diacritics("café"), "cafe");
    }

    #[test]
    fn test_strip_diacritics_config() {
        let config = TranslitConfig {
            strip_diacritics: true,
            ..Default::default()
        };
        let translit = ScriptTransliterator::new(config);
        // Eta maps to a macron vowel, which stripping reduces to plain "e".
        assert_eq!(translit.transliterate("η", &Script::Greek), "e");
    }

    #[test]
    fn test_no_match_passthrough() {
        // Latin letters have no entry in the Cyrillic table.
        assert_eq!(
            default_translit().transliterate("abc", &Script::Cyrillic),
            "abc"
        );
    }
}