// scirs2_text/transliteration/mod.rs

//! Transliteration utilities: convert non-Latin scripts to Latin characters.
//!
//! # Overview
//!
//! This module provides two tiers of API:
//!
//! ## Trait-based API (new)
//! High-level, script-specific transliterators each implementing the
//! [`Transliterator`] trait:
//!
//! | Type | Script | Notes |
//! |------|--------|-------|
//! | [`HepburnTransliterator`] | Japanese kana | Modified Hepburn, yōon, long vowels |
//! | [`CyrillicTransliterator`] | Cyrillic | GOST 2005, BGN/PCGN 1947, ALA-LC |
//! | [`PinyinTransliterator`] | Simplified Chinese | HSK 1-6, tone marks or numbered |
//!
//! ## Struct-based API (legacy)
//! [`ScriptTransliterator`] accepts a [`Script`] enum at call time and dispatches
//! to embedded tables for Cyrillic (ISO 9), Greek (ALA-LC), Hiragana and Katakana.
//! This API is kept for backwards-compatibility.
//!
//! # Examples
//!
//! ```rust
//! use scirs2_text::transliteration::{
//!     Transliterator, HepburnTransliterator, CyrillicTransliterator,
//!     CyrillicScheme, PinyinTransliterator, PinyinStyle,
//! };
//!
//! // Japanese kana → romaji
//! let hepburn = HepburnTransliterator::new();
//! assert_eq!(hepburn.transliterate("さくら"), "sakura");
//!
//! // Cyrillic → Latin (BGN/PCGN)
//! let cyrillic = CyrillicTransliterator::new(CyrillicScheme::BgnPcgn);
//! let r = cyrillic.transliterate("Москва");
//! assert!(r.to_lowercase().contains("moskva"));
//!
//! // Chinese → Pinyin
//! let pinyin = PinyinTransliterator::new(PinyinStyle::WithToneMarks);
//! let r = pinyin.transliterate("你好");
//! assert!(r.contains("nǐ") || r.contains("ni"));
//! ```

use unicode_normalization::UnicodeNormalization;

pub mod cyrillic;
pub mod hepburn;
pub mod pinyin;

pub use cyrillic::{CyrillicScheme, CyrillicTransliterator};
pub use hepburn::HepburnTransliterator;
pub use pinyin::{PinyinStyle, PinyinTransliterator};

// ─── Core trait ───────────────────────────────────────────────────────────────

/// Common interface for all script-specific transliterators.
///
/// Each implementation converts its source script to a Latin representation.
/// Characters that are not part of the source script (e.g. Latin letters,
/// digits, punctuation) are passed through unchanged.
///
/// The trait has a single `&self` method with no generic parameters, so it can
/// be used as a trait object (`Box<dyn Transliterator>`, `&dyn Transliterator`).
pub trait Transliterator {
    /// Transliterate `input` and return the Latin representation.
    ///
    /// The input is only borrowed; a freshly allocated `String` is returned.
    fn transliterate(&self, input: &str) -> String;
}

// ─── Legacy struct-based API ──────────────────────────────────────────────────

/// Japanese writing system variant.
///
/// Used as the payload of the `Japanese` script identifier to select which
/// kana table (if any) applies.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum JapaneseScript {
    /// Hiragana syllabary (U+3040–U+309F).
    Hiragana,
    /// Katakana syllabary (U+30A0–U+30FF).
    Katakana,
    /// Latin romaji representation.
    Romaji,
}

/// Chinese romanisation system.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ChineseSystem {
    /// Hanyu Pinyin (mainland standard).
    Pinyin,
    /// Wade-Giles system.
    // The underscore in the variant name is part of the public API, so the
    // naming lint is silenced here rather than renaming the variant.
    #[allow(non_camel_case_types)]
    Wade_Giles,
}

92/// Writing script identifier.
93#[derive(Debug, Clone, PartialEq, Eq)]
94#[non_exhaustive]
95pub enum Script {
96    /// Cyrillic script (U+0400–U+04FF).
97    Cyrillic,
98    /// Greek script (U+0370–U+03FF).
99    Greek,
100    /// Arabic script (U+0600–U+06FF).
101    Arabic,
102    /// Hebrew script (U+0590–U+05FF).
103    Hebrew,
104    /// Japanese (Hiragana / Katakana).
105    Japanese(JapaneseScript),
106    /// Korean Hangul (U+AC00–U+D7AF).
107    Korean,
108    /// Chinese characters (U+4E00–U+9FFF).
109    Chinese(ChineseSystem),
110    /// Latin / ASCII script.
111    Latin,
112}
113
/// Configuration for the [`ScriptTransliterator`].
///
/// The [`Default`] value enables `preserve_case` and disables
/// `strip_diacritics`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TranslitConfig {
    /// If `true`, preserve case through mapping of uppercase source characters.
    ///
    /// NOTE(review): no code in this file reads this flag; the legacy tables
    /// always map uppercase source letters to capitalised output. Confirm the
    /// intended behaviour for `false` before relying on it.
    pub preserve_case: bool,
    /// If `true`, strip diacritics from the output string.
    pub strip_diacritics: bool,
}

impl Default for TranslitConfig {
    /// Case-preserving configuration that keeps diacritics in the output.
    fn default() -> Self {
        Self {
            preserve_case: true,
            strip_diacritics: false,
        }
    }
}

// ─── Static transliteration tables (legacy) ───────────────────────────────────

/// Cyrillic → Latin (ISO 9).
///
/// Each entry is `(source, replacement)`. Every source key is a single
/// Cyrillic letter; lowercase and uppercase letters are listed separately and
/// uppercase letters map to a capitalised replacement (e.g. `"Ш"` → `"Sh"`).
/// The hard and soft signs map to the double prime (U+2033) and prime
/// (U+2032) in both cases.
///
/// NOTE(review): several replacements are ASCII digraphs/trigraphs
/// (`"ж"` → `"zh"`, `"щ"` → `"shh"`, `"е"` → `"je"`) rather than single
/// letters with diacritics; confirm that the "ISO 9" label matches the scheme
/// actually intended. The data is left unchanged for backwards-compatibility.
pub static CYRILLIC_TO_LATIN: &[(&str, &str)] = &[
    // Lowercase
    ("а", "a"),
    ("б", "b"),
    ("в", "v"),
    ("г", "g"),
    ("д", "d"),
    ("е", "je"),
    ("ё", "jo"),
    ("ж", "zh"),
    ("з", "z"),
    ("и", "i"),
    ("й", "j"),
    ("к", "k"),
    ("л", "l"),
    ("м", "m"),
    ("н", "n"),
    ("о", "o"),
    ("п", "p"),
    ("р", "r"),
    ("с", "s"),
    ("т", "t"),
    ("у", "u"),
    ("ф", "f"),
    ("х", "h"),
    ("ц", "c"),
    ("ч", "ch"),
    ("ш", "sh"),
    ("щ", "shh"),
    ("ъ", "\u{2033}"), // double prime ″
    ("ы", "y"),
    ("ь", "\u{2032}"), // prime ′
    ("э", "eh"),
    ("ю", "ju"),
    ("я", "ja"),
    // Uppercase
    ("А", "A"),
    ("Б", "B"),
    ("В", "V"),
    ("Г", "G"),
    ("Д", "D"),
    ("Е", "Je"),
    ("Ё", "Jo"),
    ("Ж", "Zh"),
    ("З", "Z"),
    ("И", "I"),
    ("Й", "J"),
    ("К", "K"),
    ("Л", "L"),
    ("М", "M"),
    ("Н", "N"),
    ("О", "O"),
    ("П", "P"),
    ("Р", "R"),
    ("С", "S"),
    ("Т", "T"),
    ("У", "U"),
    ("Ф", "F"),
    ("Х", "H"),
    ("Ц", "C"),
    ("Ч", "Ch"),
    ("Ш", "Sh"),
    ("Щ", "Shh"),
    ("Ъ", "\u{2033}"),
    ("Ы", "Y"),
    ("Ь", "\u{2032}"),
    ("Э", "Eh"),
    ("Ю", "Ju"),
    ("Я", "Ja"),
];

/// Greek → Latin (ALA-LC).
///
/// Each entry is `(source, replacement)`. Every source key is a single
/// unaccented Greek letter. Both sigma forms (`σ` and final `ς`) map to
/// `"s"`; eta and omega map to vowels with a macron (`ē`, `ō`). Accented
/// (tonos / dialytika) letters have no entry in this table.
pub static GREEK_TO_LATIN: &[(&str, &str)] = &[
    // Lowercase
    ("α", "a"),
    ("β", "b"),
    ("γ", "g"),
    ("δ", "d"),
    ("ε", "e"),
    ("ζ", "z"),
    ("η", "\u{0113}"), // ē
    ("θ", "th"),
    ("ι", "i"),
    ("κ", "k"),
    ("λ", "l"),
    ("μ", "m"),
    ("ν", "n"),
    ("ξ", "x"),
    ("ο", "o"),
    ("π", "p"),
    ("ρ", "r"),
    ("σ", "s"),
    ("ς", "s"), // final sigma
    ("τ", "t"),
    ("υ", "y"),
    ("φ", "ph"),
    ("χ", "ch"),
    ("ψ", "ps"),
    ("ω", "\u{014D}"), // ō
    // Uppercase
    ("Α", "A"),
    ("Β", "B"),
    ("Γ", "G"),
    ("Δ", "D"),
    ("Ε", "E"),
    ("Ζ", "Z"),
    ("Η", "\u{0112}"), // Ē
    ("Θ", "Th"),
    ("Ι", "I"),
    ("Κ", "K"),
    ("Λ", "L"),
    ("Μ", "M"),
    ("Ν", "N"),
    ("Ξ", "X"),
    ("Ο", "O"),
    ("Π", "P"),
    ("Ρ", "R"),
    ("Σ", "S"),
    ("Τ", "T"),
    ("Υ", "Y"),
    ("Φ", "Ph"),
    ("Χ", "Ch"),
    ("Ψ", "Ps"),
    ("Ω", "\u{014C}"), // Ō
];

/// Hiragana → Romaji (Hepburn).
///
/// Each entry is `(source, replacement)`. Every source key is a single kana.
/// The table covers the basic syllabary, voiced (dakuten) and semi-voiced
/// (handakuten) kana, and the five small vowels. It has no entries for the
/// small `っ`, the small `ゃ`/`ゅ`/`ょ`, or multi-kana combinations.
///
/// NOTE(review): `"ぢ"` → `"di"`, `"づ"` → `"du"` and the `x`-prefixed small
/// vowels (`"ぁ"` → `"xa"`) differ from standard Hepburn output (`ji`, `zu`);
/// they look like keyboard-input spellings. Confirm whether this is
/// intentional. The data is left unchanged for backwards-compatibility.
pub static HIRAGANA_TO_ROMAJI: &[(&str, &str)] = &[
    ("あ", "a"),
    ("い", "i"),
    ("う", "u"),
    ("え", "e"),
    ("お", "o"),
    ("か", "ka"),
    ("き", "ki"),
    ("く", "ku"),
    ("け", "ke"),
    ("こ", "ko"),
    ("さ", "sa"),
    ("し", "shi"),
    ("す", "su"),
    ("せ", "se"),
    ("そ", "so"),
    ("た", "ta"),
    ("ち", "chi"),
    ("つ", "tsu"),
    ("て", "te"),
    ("と", "to"),
    ("な", "na"),
    ("に", "ni"),
    ("ぬ", "nu"),
    ("ね", "ne"),
    ("の", "no"),
    ("は", "ha"),
    ("ひ", "hi"),
    ("ふ", "fu"),
    ("へ", "he"),
    ("ほ", "ho"),
    ("ま", "ma"),
    ("み", "mi"),
    ("む", "mu"),
    ("め", "me"),
    ("も", "mo"),
    ("や", "ya"),
    ("ゆ", "yu"),
    ("よ", "yo"),
    ("ら", "ra"),
    ("り", "ri"),
    ("る", "ru"),
    ("れ", "re"),
    ("ろ", "ro"),
    ("わ", "wa"),
    ("を", "wo"),
    ("ん", "n"),
    // Voiced consonants
    ("が", "ga"),
    ("ぎ", "gi"),
    ("ぐ", "gu"),
    ("げ", "ge"),
    ("ご", "go"),
    ("ざ", "za"),
    ("じ", "ji"),
    ("ず", "zu"),
    ("ぜ", "ze"),
    ("ぞ", "zo"),
    ("だ", "da"),
    ("ぢ", "di"),
    ("づ", "du"),
    ("で", "de"),
    ("ど", "do"),
    ("ば", "ba"),
    ("び", "bi"),
    ("ぶ", "bu"),
    ("べ", "be"),
    ("ぼ", "bo"),
    // Semi-voiced
    ("ぱ", "pa"),
    ("ぴ", "pi"),
    ("ぷ", "pu"),
    ("ぺ", "pe"),
    ("ぽ", "po"),
    // Small vowels / combination starters
    ("ぁ", "xa"),
    ("ぃ", "xi"),
    ("ぅ", "xu"),
    ("ぇ", "xe"),
    ("ぉ", "xo"),
];

/// Katakana → Romaji (Hepburn).
///
/// Each entry is `(source, replacement)`. Every source key is a single kana.
/// The table covers the basic syllabary plus voiced (dakuten) and semi-voiced
/// (handakuten) kana. It has no entries for small kana or the long-vowel mark
/// `ー`.
///
/// NOTE(review): `"ヂ"` → `"di"` and `"ヅ"` → `"du"` differ from standard
/// Hepburn output (`ji`, `zu`); confirm whether this is intentional. The data
/// is left unchanged for backwards-compatibility.
pub static KATAKANA_TO_ROMAJI: &[(&str, &str)] = &[
    ("ア", "a"),
    ("イ", "i"),
    ("ウ", "u"),
    ("エ", "e"),
    ("オ", "o"),
    ("カ", "ka"),
    ("キ", "ki"),
    ("ク", "ku"),
    ("ケ", "ke"),
    ("コ", "ko"),
    ("サ", "sa"),
    ("シ", "shi"),
    ("ス", "su"),
    ("セ", "se"),
    ("ソ", "so"),
    ("タ", "ta"),
    ("チ", "chi"),
    ("ツ", "tsu"),
    ("テ", "te"),
    ("ト", "to"),
    ("ナ", "na"),
    ("ニ", "ni"),
    ("ヌ", "nu"),
    ("ネ", "ne"),
    ("ノ", "no"),
    ("ハ", "ha"),
    ("ヒ", "hi"),
    ("フ", "fu"),
    ("ヘ", "he"),
    ("ホ", "ho"),
    ("マ", "ma"),
    ("ミ", "mi"),
    ("ム", "mu"),
    ("メ", "me"),
    ("モ", "mo"),
    ("ヤ", "ya"),
    ("ユ", "yu"),
    ("ヨ", "yo"),
    ("ラ", "ra"),
    ("リ", "ri"),
    ("ル", "ru"),
    ("レ", "re"),
    ("ロ", "ro"),
    ("ワ", "wa"),
    ("ヲ", "wo"),
    ("ン", "n"),
    // Voiced
    ("ガ", "ga"),
    ("ギ", "gi"),
    ("グ", "gu"),
    ("ゲ", "ge"),
    ("ゴ", "go"),
    ("ザ", "za"),
    ("ジ", "ji"),
    ("ズ", "zu"),
    ("ゼ", "ze"),
    ("ゾ", "zo"),
    ("ダ", "da"),
    ("ヂ", "di"),
    ("ヅ", "du"),
    ("デ", "de"),
    ("ド", "do"),
    ("バ", "ba"),
    ("ビ", "bi"),
    ("ブ", "bu"),
    ("ベ", "be"),
    ("ボ", "bo"),
    // Semi-voiced
    ("パ", "pa"),
    ("ピ", "pi"),
    ("プ", "pu"),
    ("ペ", "pe"),
    ("ポ", "po"),
];

// ─── ScriptTransliterator (legacy, struct-based) ──────────────────────────────

422/// Stateful transliterator (legacy struct-based API).
423///
424/// For new code, prefer the trait-based API with [`HepburnTransliterator`],
425/// [`CyrillicTransliterator`], or [`PinyinTransliterator`].
426pub struct ScriptTransliterator {
427    config: TranslitConfig,
428}
429
430impl ScriptTransliterator {
431    /// Create a new `ScriptTransliterator` with the given configuration.
432    pub fn new(config: TranslitConfig) -> Self {
433        Self { config }
434    }
435
436    /// Transliterate `text` from `from` script to Latin.
437    ///
438    /// Characters that have no entry in the table are passed through unchanged.
439    pub fn transliterate(&self, text: &str, from: &Script) -> String {
440        let table: &[(&str, &str)] = match from {
441            Script::Cyrillic => CYRILLIC_TO_LATIN,
442            Script::Greek => GREEK_TO_LATIN,
443            Script::Japanese(JapaneseScript::Hiragana) => HIRAGANA_TO_ROMAJI,
444            Script::Japanese(JapaneseScript::Katakana) => KATAKANA_TO_ROMAJI,
445            Script::Japanese(JapaneseScript::Romaji) | Script::Latin => {
446                // Already Latin — just pass through (optionally strip diacritics).
447                return if self.config.strip_diacritics {
448                    strip_diacritics(text)
449                } else {
450                    text.to_string()
451                };
452            }
453            _ => {
454                // Arabic, Hebrew, Korean, Chinese, etc. — no table yet, return as-is.
455                return text.to_string();
456            }
457        };
458
459        let mut result = String::with_capacity(text.len() * 2);
460        let chars: Vec<char> = text.chars().collect();
461        let mut i = 0;
462        'outer: while i < chars.len() {
463            // Try to match the longest table entry starting at position i.
464            // Build a candidate string starting at chars[i].
465            let mut candidate = String::new();
466            for &ch in &chars[i..] {
467                candidate.push(ch);
468                // Check if any table entry starts with this prefix.
469                let any_prefix = table
470                    .iter()
471                    .any(|(src, _)| src.starts_with(candidate.as_str()));
472                if !any_prefix {
473                    break;
474                }
475            }
476            // Now try decreasing lengths to find an exact match.
477            let remaining: String = chars[i..].iter().collect();
478            for (src, dst) in table.iter() {
479                if remaining.starts_with(src) {
480                    result.push_str(dst);
481                    i += src.chars().count();
482                    continue 'outer;
483                }
484            }
485            // No match: emit the character unchanged.
486            result.push(chars[i]);
487            i += 1;
488        }
489
490        if self.config.strip_diacritics {
491            strip_diacritics(&result)
492        } else {
493            result
494        }
495    }
496
497    /// Detect the predominant writing script of `text` based on Unicode block ranges.
498    ///
499    /// Returns `Script::Latin` if no recognised non-Latin characters are found.
500    pub fn detect_script(text: &str) -> Script {
501        let mut cyrillic = 0usize;
502        let mut greek = 0usize;
503        let mut arabic = 0usize;
504        let mut hebrew = 0usize;
505        let mut hiragana = 0usize;
506        let mut katakana = 0usize;
507        let mut hangul = 0usize;
508        let mut cjk = 0usize;
509
510        for ch in text.chars() {
511            let cp = ch as u32;
512            if (0x0400..=0x04FF).contains(&cp) {
513                cyrillic += 1;
514            } else if (0x0370..=0x03FF).contains(&cp) {
515                greek += 1;
516            } else if (0x0600..=0x06FF).contains(&cp) {
517                arabic += 1;
518            } else if (0x0590..=0x05FF).contains(&cp) {
519                hebrew += 1;
520            } else if (0x3040..=0x309F).contains(&cp) {
521                hiragana += 1;
522            } else if (0x30A0..=0x30FF).contains(&cp) {
523                katakana += 1;
524            } else if (0xAC00..=0xD7AF).contains(&cp) {
525                hangul += 1;
526            } else if (0x4E00..=0x9FFF).contains(&cp) {
527                cjk += 1;
528            }
529        }
530
531        // Return the script with the most characters; fall back to Latin if none.
532        let scores: [(usize, fn() -> Script); 8] = [
533            (cyrillic, || Script::Cyrillic),
534            (greek, || Script::Greek),
535            (arabic, || Script::Arabic),
536            (hebrew, || Script::Hebrew),
537            (hiragana, || Script::Japanese(JapaneseScript::Hiragana)),
538            (katakana, || Script::Japanese(JapaneseScript::Katakana)),
539            (hangul, || Script::Korean),
540            (cjk, || Script::Chinese(ChineseSystem::Pinyin)),
541        ];
542
543        let best = scores.iter().max_by_key(|(count, _)| *count);
544
545        match best {
546            Some((count, make_script)) if *count > 0 => make_script(),
547            _ => Script::Latin,
548        }
549    }
550}
551
552/// Strip diacritical combining marks (U+0300–U+036F) from a string.
553///
554/// The string is first NFD-decomposed, then all combining characters in the
555/// diacritics range are removed, and the result is NFC-recomposed.
556pub fn strip_diacritics(s: &str) -> String {
557    s.nfd()
558        .filter(|ch| {
559            let cp = *ch as u32;
560            !(0x0300..=0x036F).contains(&cp)
561        })
562        .nfc()
563        .collect()
564}
565
// ─── Tests (legacy API) ────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    //! Unit tests for the legacy struct-based API and `strip_diacritics`.

    use super::*;

    #[test]
    fn test_detect_cyrillic() {
        assert_eq!(
            ScriptTransliterator::detect_script("Привет"),
            Script::Cyrillic
        );
    }

    #[test]
    fn test_detect_greek() {
        assert_eq!(ScriptTransliterator::detect_script("αβγδ"), Script::Greek);
    }

    #[test]
    fn test_detect_hiragana() {
        let s = ScriptTransliterator::detect_script("あいうえお");
        assert_eq!(s, Script::Japanese(JapaneseScript::Hiragana));
    }

    #[test]
    fn test_detect_katakana() {
        let s = ScriptTransliterator::detect_script("アイウエオ");
        assert_eq!(s, Script::Japanese(JapaneseScript::Katakana));
    }

    #[test]
    fn test_detect_latin_fallback() {
        assert_eq!(
            ScriptTransliterator::detect_script("hello world"),
            Script::Latin
        );
    }

    #[test]
    fn test_detect_empty_is_latin() {
        assert_eq!(ScriptTransliterator::detect_script(""), Script::Latin);
    }

    #[test]
    fn test_transliterate_cyrillic() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        let result = t.transliterate("привет", &Script::Cyrillic);
        // "привет" → "p"+"r"+"i"+"v"+"je"+"t" = "privjet"
        assert!(
            result
                .chars()
                .all(|c| c.is_ascii() || c == '\u{2032}' || c == '\u{2033}'),
            "Cyrillic should transliterate to Latin-like chars, got: {}",
            result
        );
        assert!(!result.is_empty());
        assert_eq!(result, "privjet");
    }

    #[test]
    fn test_transliterate_cyrillic_known() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        assert_eq!(t.transliterate("а", &Script::Cyrillic), "a");
        assert_eq!(t.transliterate("б", &Script::Cyrillic), "b");
        assert_eq!(t.transliterate("ш", &Script::Cyrillic), "sh");
    }

    #[test]
    fn test_transliterate_cyrillic_mixed_passthrough() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        // Punctuation, spaces and ASCII letters are not in the table.
        assert_eq!(t.transliterate("да, ok!", &Script::Cyrillic), "da, ok!");
    }

    #[test]
    fn test_transliterate_hiragana_aiu() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        let result = t.transliterate("あいう", &Script::Japanese(JapaneseScript::Hiragana));
        assert_eq!(result, "aiu");
    }

    #[test]
    fn test_transliterate_hiragana_full_word() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        // "さくら" (sakura)
        let result = t.transliterate("さくら", &Script::Japanese(JapaneseScript::Hiragana));
        assert_eq!(result, "sakura");
    }

    #[test]
    fn test_transliterate_katakana() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        let result = t.transliterate("アイウ", &Script::Japanese(JapaneseScript::Katakana));
        assert_eq!(result, "aiu");
    }

    #[test]
    fn test_transliterate_greek() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        let result = t.transliterate("αβγ", &Script::Greek);
        assert_eq!(result, "abg");
    }

    #[test]
    fn test_transliterate_empty() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        assert_eq!(t.transliterate("", &Script::Greek), "");
    }

    #[test]
    fn test_strip_diacritics() {
        // "café" → "cafe"
        let s = strip_diacritics("café");
        assert_eq!(s, "cafe");
    }

    #[test]
    fn test_strip_diacritics_config() {
        let t = ScriptTransliterator::new(TranslitConfig {
            strip_diacritics: true,
            ..Default::default()
        });
        // Greek η transliterates to ē (with macron); with strip_diacritics it becomes e.
        let result = t.transliterate("η", &Script::Greek);
        assert_eq!(result, "e");
    }

    #[test]
    fn test_no_match_passthrough() {
        let t = ScriptTransliterator::new(TranslitConfig::default());
        // ASCII should pass through unchanged for Cyrillic transliterator.
        let result = t.transliterate("abc", &Script::Cyrillic);
        assert_eq!(result, "abc");
    }
}