use super::Transliterator;
/// Two-character hiragana combinations (a kana followed by small ゃ/ゅ/ょ) and
/// their romanization.
///
/// The transliterator consults this table before the single-character table so
/// that a combination is romanized as one syllable. Each key appears exactly
/// once; lookups return the first match.
static HIRAGANA_YOON: &[(&str, &str)] = &[
    // k-
    ("きゃ", "kya"),
    ("きゅ", "kyu"),
    ("きょ", "kyo"),
    // sh-
    ("しゃ", "sha"),
    ("しゅ", "shu"),
    ("しょ", "sho"),
    // ch-
    ("ちゃ", "cha"),
    ("ちゅ", "chu"),
    ("ちょ", "cho"),
    // n-
    ("にゃ", "nya"),
    ("にゅ", "nyu"),
    ("にょ", "nyo"),
    // h-
    ("ひゃ", "hya"),
    ("ひゅ", "hyu"),
    ("ひょ", "hyo"),
    // m-
    ("みゃ", "mya"),
    ("みゅ", "myu"),
    ("みょ", "myo"),
    // r-
    ("りゃ", "rya"),
    ("りゅ", "ryu"),
    ("りょ", "ryo"),
    // g-
    ("ぎゃ", "gya"),
    ("ぎゅ", "gyu"),
    ("ぎょ", "gyo"),
    // j-
    ("じゃ", "ja"),
    ("じゅ", "ju"),
    ("じょ", "jo"),
    // b-
    ("びゃ", "bya"),
    ("びゅ", "byu"),
    ("びょ", "byo"),
    // p-
    ("ぴゃ", "pya"),
    ("ぴゅ", "pyu"),
    ("ぴょ", "pyo"),
];
/// Single-character hiragana and their romanization.
///
/// Consulted only after the two-character combination table has failed to
/// match at the current position.
static HIRAGANA_BASE: &[(&str, &str)] = &[
// Plain vowels.
("あ", "a"),
("い", "i"),
("う", "u"),
("え", "e"),
("お", "o"),
// k-row.
("か", "ka"),
("き", "ki"),
("く", "ku"),
("け", "ke"),
("こ", "ko"),
// s-row (し is "shi").
("さ", "sa"),
("し", "shi"),
("す", "su"),
("せ", "se"),
("そ", "so"),
// t-row (ち is "chi", つ is "tsu").
("た", "ta"),
("ち", "chi"),
("つ", "tsu"),
("て", "te"),
("と", "to"),
// n-row.
("な", "na"),
("に", "ni"),
("ぬ", "nu"),
("ね", "ne"),
("の", "no"),
// h-row (ふ is "fu").
("は", "ha"),
("ひ", "hi"),
("ふ", "fu"),
("へ", "he"),
("ほ", "ho"),
// m-row.
("ま", "ma"),
("み", "mi"),
("む", "mu"),
("め", "me"),
("も", "mo"),
// y-row.
("や", "ya"),
("ゆ", "yu"),
("よ", "yo"),
// r-row.
("ら", "ra"),
("り", "ri"),
("る", "ru"),
("れ", "re"),
("ろ", "ro"),
// w-row; ゐ and ゑ are romanized as the plain vowels "i" and "e".
("わ", "wa"),
("ゐ", "i"),
("ゑ", "e"),
("を", "wo"),
// Syllabic n.
("ん", "n"),
// Voiced g-row.
("が", "ga"),
("ぎ", "gi"),
("ぐ", "gu"),
("げ", "ge"),
("ご", "go"),
// Voiced z-row (じ is "ji").
("ざ", "za"),
("じ", "ji"),
("ず", "zu"),
("ぜ", "ze"),
("ぞ", "zo"),
// Voiced d-row; ぢ and づ share the romanization of じ and ず.
("だ", "da"),
("ぢ", "ji"),
("づ", "zu"),
("で", "de"),
("ど", "do"),
// Voiced b-row.
("ば", "ba"),
("び", "bi"),
("ぶ", "bu"),
("べ", "be"),
("ぼ", "bo"),
// p-row.
("ぱ", "pa"),
("ぴ", "pi"),
("ぷ", "pu"),
("ぺ", "pe"),
("ぽ", "po"),
// Small kana that were not consumed as part of a two-character combination
// are romanized like their full-size counterparts.
("ぁ", "a"),
("ぃ", "i"),
("ぅ", "u"),
("ぇ", "e"),
("ぉ", "o"),
("ゃ", "ya"),
("ゅ", "yu"),
("ょ", "yo"),
("ゎ", "wa"),
// Voiced u.
("ゔ", "vu"),
];
/// Two-character katakana combinations and their romanization.
///
/// Consulted before the single-character katakana table so that a combination
/// is romanized as one syllable.
static KATAKANA_YOON: &[(&str, &str)] = &[
// Combinations with small ャ/ュ/ョ, mirroring the hiragana combination table.
("キャ", "kya"),
("キュ", "kyu"),
("キョ", "kyo"),
("シャ", "sha"),
("シュ", "shu"),
("ショ", "sho"),
("チャ", "cha"),
("チュ", "chu"),
("チョ", "cho"),
("ニャ", "nya"),
("ニュ", "nyu"),
("ニョ", "nyo"),
("ヒャ", "hya"),
("ヒュ", "hyu"),
("ヒョ", "hyo"),
("ミャ", "mya"),
("ミュ", "myu"),
("ミョ", "myo"),
("リャ", "rya"),
("リュ", "ryu"),
("リョ", "ryo"),
("ギャ", "gya"),
("ギュ", "gyu"),
("ギョ", "gyo"),
("ジャ", "ja"),
("ジュ", "ju"),
("ジョ", "jo"),
("ビャ", "bya"),
("ビュ", "byu"),
("ビョ", "byo"),
("ピャ", "pya"),
("ピュ", "pyu"),
("ピョ", "pyo"),
// Combinations with a small vowel kana (ァ/ィ/ェ/ォ); these have no
// counterpart in the hiragana combination table.
// f- sounds.
("ファ", "fa"),
("フィ", "fi"),
("フェ", "fe"),
("フォ", "fo"),
// w- sounds.
("ウィ", "wi"),
("ウェ", "we"),
("ウォ", "wo"),
// ti / di.
("ティ", "ti"),
("ディ", "di"),
// ts- sounds.
("ツァ", "tsa"),
("ツェ", "tse"),
("ツォ", "tso"),
// che / je / she / ye.
("チェ", "che"),
("ジェ", "je"),
("シェ", "she"),
("イェ", "ye"),
// v- sounds.
("ヴァ", "va"),
("ヴィ", "vi"),
("ヴェ", "ve"),
("ヴォ", "vo"),
];
/// Single-character katakana and their romanization.
///
/// Consulted only after the two-character combination tables and the
/// single-character hiragana table have failed to match. The entries parallel
/// the hiragana table one-for-one, including the w-row kana ヰ and ヱ, which
/// are romanized as the plain vowels "i" and "e" like their hiragana forms.
static KATAKANA_BASE: &[(&str, &str)] = &[
    // Plain vowels.
    ("ア", "a"),
    ("イ", "i"),
    ("ウ", "u"),
    ("エ", "e"),
    ("オ", "o"),
    // k-row.
    ("カ", "ka"),
    ("キ", "ki"),
    ("ク", "ku"),
    ("ケ", "ke"),
    ("コ", "ko"),
    // s-row.
    ("サ", "sa"),
    ("シ", "shi"),
    ("ス", "su"),
    ("セ", "se"),
    ("ソ", "so"),
    // t-row.
    ("タ", "ta"),
    ("チ", "chi"),
    ("ツ", "tsu"),
    ("テ", "te"),
    ("ト", "to"),
    // n-row.
    ("ナ", "na"),
    ("ニ", "ni"),
    ("ヌ", "nu"),
    ("ネ", "ne"),
    ("ノ", "no"),
    // h-row.
    ("ハ", "ha"),
    ("ヒ", "hi"),
    ("フ", "fu"),
    ("ヘ", "he"),
    ("ホ", "ho"),
    // m-row.
    ("マ", "ma"),
    ("ミ", "mi"),
    ("ム", "mu"),
    ("メ", "me"),
    ("モ", "mo"),
    // y-row.
    ("ヤ", "ya"),
    ("ユ", "yu"),
    ("ヨ", "yo"),
    // r-row.
    ("ラ", "ra"),
    ("リ", "ri"),
    ("ル", "ru"),
    ("レ", "re"),
    ("ロ", "ro"),
    // w-row, kept in step with the hiragana entries わ/ゐ/ゑ/を.
    ("ワ", "wa"),
    ("ヰ", "i"),
    ("ヱ", "e"),
    ("ヲ", "wo"),
    // Syllabic n.
    ("ン", "n"),
    // Voiced g-row.
    ("ガ", "ga"),
    ("ギ", "gi"),
    ("グ", "gu"),
    ("ゲ", "ge"),
    ("ゴ", "go"),
    // Voiced z-row.
    ("ザ", "za"),
    ("ジ", "ji"),
    ("ズ", "zu"),
    ("ゼ", "ze"),
    ("ゾ", "zo"),
    // Voiced d-row; ヂ and ヅ share the romanization of ジ and ズ.
    ("ダ", "da"),
    ("ヂ", "ji"),
    ("ヅ", "zu"),
    ("デ", "de"),
    ("ド", "do"),
    // Voiced b-row.
    ("バ", "ba"),
    ("ビ", "bi"),
    ("ブ", "bu"),
    ("ベ", "be"),
    ("ボ", "bo"),
    // p-row.
    ("パ", "pa"),
    ("ピ", "pi"),
    ("プ", "pu"),
    ("ペ", "pe"),
    ("ポ", "po"),
    // Small kana that were not consumed as part of a two-character
    // combination are romanized like their full-size counterparts.
    ("ァ", "a"),
    ("ィ", "i"),
    ("ゥ", "u"),
    ("ェ", "e"),
    ("ォ", "o"),
    ("ャ", "ya"),
    ("ュ", "yu"),
    ("ョ", "yo"),
    ("ヮ", "wa"),
    // Voiced u.
    ("ヴ", "vu"),
];
/// Hiragana small tsu. It is absent from the lookup tables and is handled
/// specially by the transliterator based on the syllable that follows it.
const HIRAGANA_SMALL_TSU: char = 'っ';
/// Katakana small tsu; treated the same way as its hiragana counterpart.
const KATAKANA_SMALL_TSU: char = 'ッ';
/// Long-vowel mark. It is absent from the lookup tables; the transliterator
/// lengthens the vowel that was emitted immediately before it.
const KATAKANA_LONG_VOWEL: char = 'ー';
/// Maps a single lowercase ASCII vowel to its macron-bearing long form.
///
/// Returns `None` for anything that is not exactly one of `a`, `i`, `u`, `e`
/// or `o` (including the empty string and multi-byte input).
const fn long_vowel_macron(romaji: &str) -> Option<&'static str> {
    // Compare raw bytes: a one-element slice pattern both checks the length
    // and picks out the vowel in a single step.
    match romaji.as_bytes() {
        [b'a'] => Some("ā"),
        [b'i'] => Some("ī"),
        [b'u'] => Some("ū"),
        [b'e'] => Some("ē"),
        [b'o'] => Some("ō"),
        _ => None,
    }
}
/// Kana-to-Latin transliterator producing Hepburn-style romaji.
#[derive(Debug, Clone)]
pub struct HepburnTransliterator {
    /// How the long-vowel mark `ー` is rendered after a vowel: `true` folds it
    /// into a macron vowel (e.g. `ō`), `false` repeats the vowel (e.g. `oo`).
    use_macron: bool,
}

impl HepburnTransliterator {
    /// Creates a transliterator that renders long vowels with macrons.
    pub fn new() -> Self {
        Self::with_macron(true)
    }

    /// Creates a transliterator with an explicit long-vowel rendering choice.
    pub fn with_macron(use_macron: bool) -> Self {
        HepburnTransliterator { use_macron }
    }
}

impl Default for HepburnTransliterator {
    /// Same configuration as [`HepburnTransliterator::new`]: macrons enabled.
    fn default() -> Self {
        Self::with_macron(true)
    }
}
impl Transliterator for HepburnTransliterator {
fn transliterate(&self, input: &str) -> String {
let chars: Vec<char> = input.chars().collect();
let mut result = String::with_capacity(input.len() * 2);
let mut i = 0;
while i < chars.len() {
let ch = chars[i];
if ch == HIRAGANA_SMALL_TSU || ch == KATAKANA_SMALL_TSU {
if i + 1 < chars.len() {
let next_romaji = transliterate_single_char_or_combo(&chars, i + 1);
if let Some(first_byte) = next_romaji
.chars()
.next()
.filter(|c| c.is_ascii_alphabetic())
{
result.push(first_byte);
i += 1;
continue;
}
}
i += 1;
continue;
}
if ch == KATAKANA_LONG_VOWEL {
if self.use_macron {
let prev_vowel = get_last_vowel_char(&result);
if let Some(v) = prev_vowel.and_then(long_vowel_macron) {
replace_last_vowel_with_macron(&mut result, v);
} else {
result.push(ch); }
} else {
if let Some(v) = get_last_vowel_char(&result) {
let v_char = v.chars().next().unwrap_or('ー');
result.push(v_char);
} else {
result.push(ch);
}
}
i += 1;
continue;
}
if i + 1 < chars.len() {
let two_char: String = chars[i..=i + 1].iter().collect();
if let Some(romaji) = lookup_table(HIRAGANA_YOON, &two_char)
.or_else(|| lookup_table(KATAKANA_YOON, &two_char))
{
result.push_str(romaji);
i += 2;
continue;
}
}
let one_char: String = std::iter::once(ch).collect();
if let Some(romaji) = lookup_table(HIRAGANA_BASE, &one_char)
.or_else(|| lookup_table(KATAKANA_BASE, &one_char))
{
result.push_str(romaji);
i += 1;
continue;
}
result.push(ch);
i += 1;
}
result
}
}
/// Returns the romanization paired with `key` in `table`, or `None` when the
/// key is absent. If a key occurs more than once, the first entry wins.
fn lookup_table<'a>(table: &'a [(&str, &str)], key: &str) -> Option<&'a str> {
    for &(kana, romaji) in table {
        if kana == key {
            return Some(romaji);
        }
    }
    None
}
/// Romanizes the syllable that starts at `chars[idx]`.
///
/// A two-character combination starting at `idx` is preferred; otherwise the
/// single character is looked up. Returns an empty string when `idx` is out of
/// range or the character has no entry in any table.
fn transliterate_single_char_or_combo(chars: &[char], idx: usize) -> String {
    let current = match chars.get(idx) {
        Some(&c) => c,
        None => return String::new(),
    };

    // Try the character together with its successor first.
    if let Some(&following) = chars.get(idx + 1) {
        let pair: String = [current, following].iter().collect();
        let combo =
            lookup_table(HIRAGANA_YOON, &pair).or_else(|| lookup_table(KATAKANA_YOON, &pair));
        if let Some(romaji) = combo {
            return romaji.to_string();
        }
    }

    // Fall back to the single-character tables.
    let single = current.to_string();
    match lookup_table(HIRAGANA_BASE, &single).or_else(|| lookup_table(KATAKANA_BASE, &single)) {
        Some(romaji) => romaji.to_string(),
        None => String::new(),
    }
}
/// Returns the base vowel (`"a"`, `"i"`, `"u"`, `"e"` or `"o"`) of the final
/// character of `s`.
///
/// Both plain vowels and their macron forms are recognized. Returns `None`
/// when `s` is empty or ends in any other character.
fn get_last_vowel_char(s: &str) -> Option<&'static str> {
    s.chars().next_back().and_then(|tail| match tail {
        'a' | 'ā' => Some("a"),
        'i' | 'ī' => Some("i"),
        'u' | 'ū' => Some("u"),
        'e' | 'ē' => Some("e"),
        'o' | 'ō' => Some("o"),
        _ => None,
    })
}
/// Replaces the final character of `s` with `macron`.
///
/// When `s` is empty there is nothing to replace and `macron` is simply
/// appended.
fn replace_last_vowel_with_macron(s: &mut String, macron: &str) {
    // `pop` removes one whole character (however many bytes it occupies) and
    // is a no-op on an empty string, which covers both cases at once.
    s.pop();
    s.push_str(macron);
}
#[cfg(test)]
mod tests {
    //! Unit tests for the Hepburn transliterator.

    use super::*;
    use crate::transliteration::Transliterator;

    #[test]
    fn test_hiragana_basic() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("あいうえお"), "aiueo");
    }

    #[test]
    fn test_hiragana_sakura() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("さくら"), "sakura");
    }

    #[test]
    fn test_hiragana_konnichiwa() {
        // The particle は is romanized by spelling ("ha"), not pronunciation.
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("こんにちは"), "konnichiha");
    }

    #[test]
    fn test_hiragana_yoon() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("きゃ"), "kya");
        assert_eq!(t.transliterate("しゃ"), "sha");
        assert_eq!(t.transliterate("ちゃ"), "cha");
    }

    #[test]
    fn test_katakana_yoon() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("キャ"), "kya");
        assert_eq!(t.transliterate("シュ"), "shu");
        assert_eq!(t.transliterate("ジョ"), "jo");
    }

    #[test]
    fn test_small_tsu_doubled_consonant() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("っき"), "kki");
        assert_eq!(t.transliterate("にっき"), "nikki");
    }

    #[test]
    fn test_small_tsu_at_end_is_dropped() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("あっ"), "a");
    }

    #[test]
    fn test_katakana_basic() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("アイウエオ"), "aiueo");
    }

    #[test]
    fn test_katakana_long_vowel_macron() {
        // With macrons enabled the output is fully determined, so pin it
        // exactly instead of accepting either long-vowel spelling.
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("トーキョー"), "tōkyō");
    }

    #[test]
    fn test_katakana_long_vowel_no_macron() {
        let t = HepburnTransliterator::with_macron(false);
        let result = t.transliterate("トー");
        assert_eq!(result, "too");
    }

    #[test]
    fn test_long_vowel_mark_without_preceding_vowel() {
        // With nothing to lengthen, the mark is copied through in both modes.
        assert_eq!(HepburnTransliterator::new().transliterate("ー"), "ー");
        assert_eq!(
            HepburnTransliterator::with_macron(false).transliterate("ー"),
            "ー"
        );
    }

    #[test]
    fn test_empty_input() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate(""), "");
    }

    #[test]
    fn test_latin_passthrough() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("Hello"), "Hello");
    }

    #[test]
    fn test_mixed_latin_hiragana() {
        let t = HepburnTransliterator::new();
        let result = t.transliterate("Hello こんにちは World");
        assert!(result.contains("Hello"), "got: {result}");
        assert!(result.contains("World"), "got: {result}");
        assert!(result.contains("konnichiha"), "got: {result}");
    }

    #[test]
    fn test_dakuten_voiced() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("が"), "ga");
        assert_eq!(t.transliterate("じ"), "ji");
        assert_eq!(t.transliterate("ぼ"), "bo");
    }

    #[test]
    fn test_handakuten_semivoiced() {
        let t = HepburnTransliterator::new();
        assert_eq!(t.transliterate("ぱ"), "pa");
        assert_eq!(t.transliterate("ぽ"), "po");
    }
}