any-tts 0.1.1 - Docs.rs

use std::error::Error;
use std::fmt;
use std::sync::{Mutex, OnceLock};

use super::english_g2p;
use kana2phone::kana2phone;
use lindera::dictionary::load_dictionary;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;
use pinyin::ToPinyin;

pub type ESpeakResult<T> = Result<T, ESpeakError>;

#[derive(Debug, Clone)]
pub struct ESpeakError(pub String);

impl fmt::Display for ESpeakError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "pure Rust eSpeak compatibility error: {}", self.0)
    }
}

impl Error for ESpeakError {}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SupportedLanguage {
    EnglishUs,
    EnglishGb,
    Japanese,
    Chinese,
    Korean,
    French,
    German,
    Italian,
    Portuguese,
    Spanish,
    Hindi,
}

pub fn text_to_phonemes(
    text: &str,
    language: &str,
    phoneme_separator: Option<char>,
    remove_lang_switch_flags: bool,
    remove_stress: bool,
) -> ESpeakResult<Vec<String>> {
    let language = parse_language(language).ok_or_else(|| {
        ESpeakError(format!(
            "unsupported Kokoro language '{language}' for pure-Rust espeak compatibility"
        ))
    })?;

    let mut sentences = Vec::new();
    for line in text.lines() {
        for (body, terminator) in split_sentences(line) {
            let mut phonemes = phonemize_body(&body, language)?;
            if remove_lang_switch_flags {
                phonemes = strip_lang_switch_flags(&phonemes);
            }
            if remove_stress {
                phonemes = phonemes
                    .chars()
                    .filter(|ch| !matches!(ch, 'ˈ' | 'ˌ'))
                    .collect();
            }

            phonemes = normalize_output(&phonemes);
            phonemes.push(terminator);

            if let Some(separator) = phoneme_separator {
                phonemes = apply_phoneme_separator(&phonemes, separator);
            }

            sentences.push(phonemes);
        }
    }

    Ok(sentences)
}

fn parse_language(language: &str) -> Option<SupportedLanguage> {
    match language.to_ascii_lowercase().as_str() {
        "en" | "en-us" | "en_us" | "english" | "a" => Some(SupportedLanguage::EnglishUs),
        "en-gb" | "en_uk" | "en-gb-x-rp" | "british" | "b" => Some(SupportedLanguage::EnglishGb),
        "ja" | "jp" | "japanese" | "j" => Some(SupportedLanguage::Japanese),
        "zh" | "zh-cn" | "cmn" | "mandarin" | "z" => Some(SupportedLanguage::Chinese),
        "ko" | "korean" | "k" => Some(SupportedLanguage::Korean),
        "fr" | "french" | "f" => Some(SupportedLanguage::French),
        "de" | "german" | "d" => Some(SupportedLanguage::German),
        "it" | "italian" | "i" => Some(SupportedLanguage::Italian),
        "pt" | "pt-br" | "pt-pt" | "portuguese" | "p" => Some(SupportedLanguage::Portuguese),
        "es" | "spanish" | "e" => Some(SupportedLanguage::Spanish),
        "hi" | "hindi" | "h" => Some(SupportedLanguage::Hindi),
        _ => None,
    }
}

fn split_sentences(line: &str) -> Vec<(String, char)> {
    let mut sentences = Vec::new();
    let mut current = String::new();

    for ch in line.chars() {
        if let Some(punctuation) = normalize_punctuation(ch) {
            if matches!(punctuation, '.' | '!' | '?') {
                if !current.trim().is_empty() {
                    sentences.push((current.trim().to_string(), punctuation));
                    current.clear();
                }
            } else {
                current.push(punctuation);
            }
        } else {
            current.push(ch);
        }
    }

    if !current.trim().is_empty() {
        sentences.push((current.trim().to_string(), '.'));
    }

    sentences
}

fn phonemize_body(text: &str, language: SupportedLanguage) -> ESpeakResult<String> {
    match language {
        SupportedLanguage::EnglishUs => phonemize_english(text, false),
        SupportedLanguage::EnglishGb => phonemize_english(text, true),
        SupportedLanguage::Japanese => phonemize_by_runs(text, phonemize_japanese_run),
        SupportedLanguage::Chinese => phonemize_by_runs(text, phonemize_chinese_run),
        SupportedLanguage::Korean => phonemize_by_runs(text, phonemize_korean_run),
        SupportedLanguage::French => Ok(phonemize_french_phrase(text)),
        SupportedLanguage::German => phonemize_by_runs(text, |run| Ok(phonemize_german_word(run))),
        SupportedLanguage::Italian => {
            phonemize_by_runs(text, |run| Ok(phonemize_italian_word(run)))
        }
        SupportedLanguage::Portuguese => {
            phonemize_by_runs(text, |run| Ok(phonemize_portuguese_word(run)))
        }
        SupportedLanguage::Spanish => {
            phonemize_by_runs(text, |run| Ok(phonemize_spanish_word(run)))
        }
        SupportedLanguage::Hindi => phonemize_by_runs(text, phonemize_hindi_run),
    }
}

fn phonemize_english(text: &str, british: bool) -> ESpeakResult<String> {
    let normalized = normalize_inline_punctuation(text);
    let mut output = String::new();
    let mut current = String::new();

    for ch in normalized.chars() {
        if matches!(ch, ',' | ';' | ':' | '"' | '(' | ')' | '—' | '…') {
            if !current.trim().is_empty() {
                output.push_str(&phonemize_english_clause(&current, british)?);
                current.clear();
            }
            output.push(ch);
        } else {
            current.push(ch);
        }
    }

    if !current.trim().is_empty() {
        output.push_str(&phonemize_english_clause(&current, british)?);
    }

    Ok(output)
}

fn phonemize_english_clause(text: &str, british: bool) -> ESpeakResult<String> {
    Ok(english_g2p::phonemize_clause(text, british))
}

#[cfg(test)]
fn normalize_english_text(text: &str) -> String {
    english_g2p::normalize_text(text)
}

fn phonemize_japanese_run(text: &str) -> ESpeakResult<String> {
    let tokenizer = japanese_tokenizer()?;
    let tokenizer = tokenizer
        .lock()
        .map_err(|err| ESpeakError(format!("Japanese tokenizer mutex poisoned: {err}")))?;
    let mut tokens = tokenizer
        .tokenize(text)
        .map_err(|err| ESpeakError(format!("Japanese tokenization failed: {err}")))?;

    let mut output = String::new();
    for token in tokens.iter_mut() {
        let surface = token.surface.to_string();
        if surface.trim().is_empty() {
            if !output.ends_with(' ') {
                output.push(' ');
            }
            continue;
        }

        if is_ascii_word(&surface) {
            output.push_str(&phonemize_japanese_ascii_token(&surface));
            output.push(' ');
            continue;
        }

        let details = token.details();
        let reading = details
            .get(8)
            .copied()
            .filter(|value| *value != "*" && *value != "UNK")
            .or_else(|| {
                details
                    .get(7)
                    .copied()
                    .filter(|value| *value != "*" && *value != "UNK")
            })
            .map(str::to_string)
            .unwrap_or_else(|| to_katakana(&surface));

        let phones = kana2phone(&reading);
        let mapped = phones
            .split_whitespace()
            .map(map_japanese_phone)
            .collect::<String>();

        output.push_str(&mapped);
        output.push(' ');
    }

    Ok(output)
}

fn is_ascii_word(text: &str) -> bool {
    !text.is_empty() && text.chars().all(|ch| ch.is_ascii_alphanumeric())
}

fn phonemize_japanese_ascii_token(token: &str) -> String {
    let lower = token.to_ascii_lowercase();
    if let Some(phonemes) = lookup_word_list(lower.as_str(), JAPANESE_ASCII_WORD_LIST) {
        return phonemes.to_string();
    }

    if token.chars().all(|ch| ch.is_ascii_uppercase()) {
        return english_g2p::phonemize_clause(token, false);
    }

    english_g2p::phonemize_clause(token, false)
}

fn japanese_tokenizer() -> ESpeakResult<&'static Mutex<Tokenizer>> {
    static TOKENIZER: OnceLock<ESpeakResult<Mutex<Tokenizer>>> = OnceLock::new();

    let state = TOKENIZER.get_or_init(|| {
        let dictionary = load_dictionary("embedded://ipadic").map_err(|err| {
            ESpeakError(format!("failed to load embedded Lindera dictionary: {err}"))
        })?;
        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
        Ok(Mutex::new(Tokenizer::new(segmenter)))
    });

    match state {
        Ok(tokenizer) => Ok(tokenizer),
        Err(err) => Err(err.clone()),
    }
}

fn phonemize_chinese_run(text: &str) -> ESpeakResult<String> {
    let mut units = chinese_units(text);
    apply_chinese_tone_sandhi(&mut units);

    let mut output = String::new();

    for unit in units {
        match unit {
            ChineseUnit::Syllable { base, tone } => {
                output.push_str(&map_pinyin_base_and_tone(&base, tone));
            }
            ChineseUnit::Literal(text) => {
                output.push_str(&phonemize_chinese_literal(&text)?);
            }
        }
        if !output.ends_with(' ') {
            output.push(' ');
        }
    }

    Ok(output)
}

#[derive(Debug, Clone)]
enum ChineseUnit {
    Syllable { base: String, tone: u8 },
    Literal(String),
}

fn chinese_units(text: &str) -> Vec<ChineseUnit> {
    let mut units = Vec::new();
    let mut literal = String::new();

    let mut cursor = 0;
    while cursor < text.len() {
        let rest = &text[cursor..];
        if let Some((phrase, syllables)) = chinese_phrase_override(rest) {
            if !literal.is_empty() {
                units.push(ChineseUnit::Literal(std::mem::take(&mut literal)));
            }
            push_chinese_phrase_units(&mut units, syllables);
            cursor += phrase.len();
            continue;
        }

        let Some(ch) = rest.chars().next() else {
            break;
        };
        if let Some(pinyin) = ch.to_pinyin() {
            if !literal.is_empty() {
                units.push(ChineseUnit::Literal(std::mem::take(&mut literal)));
            }
            let (base, tone) = split_tone_number(pinyin.with_tone_num_end());
            units.push(ChineseUnit::Syllable { base, tone });
        } else if ch.is_ascii_alphanumeric() {
            literal.push(ch);
        } else if !literal.is_empty() {
            units.push(ChineseUnit::Literal(std::mem::take(&mut literal)));
        }

        cursor += ch.len_utf8();
    }

    if !literal.is_empty() {
        units.push(ChineseUnit::Literal(literal));
    }

    units
}

fn chinese_phrase_override(text: &str) -> Option<(&'static str, &'static [&'static str])> {
    CHINESE_PHRASE_PINYIN
        .iter()
        .filter(|(phrase, _)| text.starts_with(*phrase))
        .max_by_key(|(phrase, _)| phrase.chars().count())
        .map(|(phrase, syllables)| (*phrase, *syllables))
}

fn push_chinese_phrase_units(units: &mut Vec<ChineseUnit>, syllables: &[&str]) {
    for syllable in syllables {
        let (base, tone) = split_tone_number(syllable);
        units.push(ChineseUnit::Syllable { base, tone });
    }
}

fn phonemize_chinese_literal(text: &str) -> ESpeakResult<String> {
    if text.trim().is_empty() {
        return Ok(String::new());
    }

    phonemize_english_clause(text, false)
}

fn apply_chinese_tone_sandhi(units: &mut [ChineseUnit]) {
    let mut start = 0;

    while start < units.len() {
        while start < units.len() && matches!(units[start], ChineseUnit::Literal(_)) {
            start += 1;
        }

        let mut end = start;
        while end < units.len() && matches!(units[end], ChineseUnit::Syllable { .. }) {
            end += 1;
        }

        if start == end {
            continue;
        }

        apply_bu_yi_sandhi(&mut units[start..end]);
        apply_third_tone_sandhi(&mut units[start..end]);
        start = end;
    }
}

fn apply_bu_yi_sandhi(units: &mut [ChineseUnit]) {
    for index in 0..units.len().saturating_sub(1) {
        let next_tone = chinese_tone(&units[index + 1]).unwrap_or(5);

        let Some(base) = chinese_base(&units[index]).map(str::to_string) else {
            continue;
        };

        let Some(current_tone) = chinese_tone(&units[index]) else {
            continue;
        };

        let replacement = if base == "yi" && current_tone == 1 {
            if next_tone == 4 {
                Some(2)
            } else if next_tone != 5 {
                Some(4)
            } else {
                None
            }
        } else if base == "bu" && current_tone == 4 && next_tone == 4 {
            Some(2)
        } else {
            None
        };

        if let Some(tone) = replacement {
            set_chinese_tone(&mut units[index], tone);
        }
    }
}

fn apply_third_tone_sandhi(units: &mut [ChineseUnit]) {
    let mut index = 0;

    while index < units.len() {
        if chinese_tone(&units[index]) != Some(3) {
            index += 1;
            continue;
        }

        let mut end = index + 1;
        while end < units.len() && chinese_tone(&units[end]) == Some(3) {
            end += 1;
        }

        if end - index > 1 {
            for syllable in &mut units[index..end - 1] {
                set_chinese_tone(syllable, 2);
            }
        }

        index = end;
    }
}

fn chinese_base(unit: &ChineseUnit) -> Option<&str> {
    match unit {
        ChineseUnit::Syllable { base, .. } => Some(base.as_str()),
        ChineseUnit::Literal(_) => None,
    }
}

fn chinese_tone(unit: &ChineseUnit) -> Option<u8> {
    match unit {
        ChineseUnit::Syllable { tone, .. } => Some(*tone),
        ChineseUnit::Literal(_) => None,
    }
}

fn set_chinese_tone(unit: &mut ChineseUnit, tone: u8) {
    if let ChineseUnit::Syllable { tone: current, .. } = unit {
        *current = tone;
    }
}

fn phonemize_korean_run(text: &str) -> ESpeakResult<String> {
    let mut output = String::new();
    let mut ascii = String::new();

    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() {
            ascii.push(ch);
            continue;
        }

        if !ascii.is_empty() {
            output.push_str(&phonemize_english_clause(&ascii, false)?);
            ascii.clear();
        }

        if let Some((onset, vowel, coda)) = decompose_hangul_syllable(ch) {
            output.push_str(KOREAN_ONSETS[onset]);
            output.push_str(KOREAN_VOWELS[vowel]);
            output.push_str(KOREAN_CODAS[coda]);
        }
    }

    if !ascii.is_empty() {
        output.push_str(&phonemize_english_clause(&ascii, false)?);
    }

    Ok(output)
}

fn phonemize_hindi_run(text: &str) -> ESpeakResult<String> {
    if let Some(exception) = lookup_word_list(text, HINDI_WORD_LIST) {
        return Ok(exception.to_string());
    }

    if is_ascii_word(text) {
        return phonemize_english_clause(text, false);
    }

    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        let ch = chars[index];

        if let Some(vowel) = hindi_independent_vowel(ch) {
            output.push_str(vowel);
            index += 1;
            continue;
        }

        if let Some(consonant) = hindi_consonant(ch) {
            output.push_str(consonant);

            let mut consumed = 0usize;
            let mut vowel = "ə";
            if let Some(next) = chars.get(index + 1).copied() {
                if next == '़' {
                    consumed += 1;
                }

                if let Some(marker) = chars.get(index + 1 + consumed).copied() {
                    if let Some(matra) = hindi_matra(marker) {
                        vowel = matra;
                        consumed += 1;
                    } else if marker == '्' {
                        vowel = "";
                        consumed += 1;
                    }
                }
            }

            let is_word_end = index + consumed + 1 >= chars.len();
            if is_word_end && vowel == "ə" {
                vowel = "";
            }

            output.push_str(vowel);
            index += consumed + 1;
            continue;
        }

        match ch {
            'ं' => output.push('ŋ'),
            'ँ' => output.push('\u{0303}'),
            'ः' => output.push('h'),
            _ if ch.is_ascii_alphanumeric() => {
                let mut ascii = String::new();
                ascii.push(ch);
                index += 1;
                while let Some(next) = chars.get(index).copied() {
                    if !next.is_ascii_alphanumeric() {
                        break;
                    }
                    ascii.push(next);
                    index += 1;
                }
                output.push_str(&phonemize_english_clause(&ascii, false)?);
                continue;
            }
            _ => {}
        }

        index += 1;
    }

    Ok(output)
}

fn phonemize_by_runs<F>(text: &str, mut phonemize_run: F) -> ESpeakResult<String>
where
    F: FnMut(&str) -> ESpeakResult<String>,
{
    let mut output = String::new();
    let mut current = String::new();

    for ch in text.chars() {
        if is_run_char(ch) {
            current.push(ch);
            continue;
        }

        if !current.is_empty() {
            output.push_str(&phonemize_run(&current)?);
            current.clear();
        }

        if ch.is_whitespace() {
            output.push(' ');
        } else if let Some(punctuation) = normalize_punctuation(ch) {
            output.push(punctuation);
        }
    }

    if !current.is_empty() {
        output.push_str(&phonemize_run(&current)?);
    }

    Ok(output)
}

fn is_run_char(ch: char) -> bool {
    !ch.is_whitespace() && normalize_punctuation(ch).is_none()
}

fn normalize_punctuation(ch: char) -> Option<char> {
    match ch {
        '.' | '。' | '।' | '॥' => Some('.'),
        ',' | '，' | '、' => Some(','),
        '¡' => Some('!'),
        '!' | '！' => Some('!'),
        '¿' => Some('?'),
        '?' | '？' => Some('?'),
        ';' | '；' => Some(';'),
        ':' | '：' => Some(':'),
        '"' | '“' | '”' | '「' | '」' | '『' | '』' => Some('"'),
        '(' | '（' => Some('('),
        ')' | '）' => Some(')'),
        '—' | '–' => Some('—'),
        '…' => Some('…'),
        _ => None,
    }
}

fn normalize_inline_punctuation(text: &str) -> String {
    text.chars()
        .map(|ch| normalize_punctuation(ch).unwrap_or(ch))
        .collect()
}

fn normalize_output(text: &str) -> String {
    let mut normalized = collapse_spaces(text);
    for punctuation in [",", ".", "!", "?", ";", ":", ")", "”"] {
        normalized = normalized.replace(&format!(" {punctuation}"), punctuation);
    }
    normalized = normalized.replace("( ", "(");
    normalized = normalized.replace("“ ", "“");
    normalized.trim().to_string()
}

#[derive(Debug, Clone)]
enum PhraseToken {
    Run(String),
    Space,
    Separator(char),
}

fn split_phrase_tokens(text: &str) -> Vec<PhraseToken> {
    let mut tokens = Vec::new();
    let mut current = String::new();

    for ch in text.chars() {
        if is_run_char(ch) {
            current.push(ch);
            continue;
        }

        if !current.is_empty() {
            tokens.push(PhraseToken::Run(std::mem::take(&mut current)));
        }

        if ch.is_whitespace() {
            if !matches!(tokens.last(), Some(PhraseToken::Space)) {
                tokens.push(PhraseToken::Space);
            }
        } else if let Some(punctuation) = normalize_punctuation(ch) {
            tokens.push(PhraseToken::Separator(punctuation));
        }
    }

    if !current.is_empty() {
        tokens.push(PhraseToken::Run(current));
    }

    tokens
}

fn next_phrase_run(tokens: &[PhraseToken], index: usize) -> Option<&str> {
    let mut cursor = index + 1;
    let mut saw_space = false;

    while let Some(token) = tokens.get(cursor) {
        match token {
            PhraseToken::Space => {
                saw_space = true;
            }
            PhraseToken::Separator(_) => return None,
            PhraseToken::Run(run) => return saw_space.then_some(run.as_str()),
        }
        cursor += 1;
    }

    None
}

fn collapse_spaces(text: &str) -> String {
    let mut output = String::new();
    let mut last_space = false;

    for ch in text.chars() {
        if ch.is_whitespace() {
            if !last_space && !output.is_empty() {
                output.push(' ');
            }
            last_space = true;
        } else {
            output.push(ch);
            last_space = false;
        }
    }

    output.trim().to_string()
}

fn apply_phoneme_separator(text: &str, separator: char) -> String {
    let units = split_phoneme_units(text);
    let mut output = String::new();
    let mut previous_was_phoneme = false;

    for unit in units {
        let is_punctuation = unit
            .chars()
            .all(|ch| ch == ' ' || normalize_punctuation(ch).is_some());
        if is_punctuation {
            output.push_str(&unit);
            previous_was_phoneme = false;
        } else {
            if previous_was_phoneme {
                output.push(separator);
            }
            output.push_str(&unit);
            previous_was_phoneme = true;
        }
    }

    output
}

fn split_phoneme_units(text: &str) -> Vec<String> {
    let mut units = Vec::new();
    let mut current = String::new();

    for ch in text.chars() {
        if ch == ' ' || normalize_punctuation(ch).is_some() {
            if !current.is_empty() {
                units.push(current.clone());
                current.clear();
            }
            units.push(ch.to_string());
            continue;
        }

        if matches!(ch, 'ˈ' | 'ˌ') {
            if !current.is_empty() {
                units.push(current.clone());
                current.clear();
            }
            current.push(ch);
            continue;
        }

        if matches!(ch, '\u{0303}' | 'ː' | 'ʰ' | 'ʲ') {
            current.push(ch);
            continue;
        }

        if current.starts_with('ˈ') || current.starts_with('ˌ') {
            current.push(ch);
            units.push(current.clone());
            current.clear();
            continue;
        }

        if !current.is_empty() {
            units.push(current.clone());
            current.clear();
        }

        current.push(ch);
    }

    if !current.is_empty() {
        units.push(current);
    }

    units
}

fn strip_lang_switch_flags(text: &str) -> String {
    let mut output = String::new();
    let mut chars = text.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '(' {
            output.push(ch);
            continue;
        }

        let mut flag = String::new();
        let mut valid = true;
        for next in chars.by_ref() {
            if next == ')' {
                break;
            }

            flag.push(next);
            if !next.is_ascii_alphabetic() && next != '-' {
                valid = false;
            }
        }

        if !valid || flag.is_empty() {
            output.push('(');
            output.push_str(&flag);
            output.push(')');
        }
    }

    output
}

fn phonemize_french_phrase(text: &str) -> String {
    let tokens = split_phrase_tokens(text);
    let mut output = String::new();

    for (index, token) in tokens.iter().enumerate() {
        match token {
            PhraseToken::Run(run) => {
                let lower = run.to_lowercase();
                let mut phonemes = phonemize_french_word(run);

                if let Some(next_word) = next_phrase_run(&tokens, index) {
                    if let Some(liaison) = french_liaison_sound(&lower, next_word) {
                        phonemes.push_str(liaison);
                    }
                }

                output.push_str(&phonemes);
            }
            PhraseToken::Space => output.push(' '),
            PhraseToken::Separator(ch) => output.push(*ch),
        }
    }

    output
}

fn french_liaison_sound(current_word: &str, next_word: &str) -> Option<&'static str> {
    if !french_starts_with_vowel_sound(next_word) {
        return None;
    }

    match current_word {
        "est" | "petit" | "grand" => Some("t"),
        "êtes" => Some("z"),
        "les" | "des" | "mes" | "tes" | "ses" | "nos" | "vos" | "deux" | "trois" | "sont"
        | "vous" | "nous" => Some("z"),
        "un" | "mon" | "ton" | "son" | "bon" => Some("n"),
        "leurs" => Some("ʁ"),
        _ => None,
    }
}

fn french_starts_with_vowel_sound(word: &str) -> bool {
    word.chars().next().is_some_and(|ch| {
        matches!(
            ch,
            'a' | 'e'
                | 'h'
                | 'i'
                | 'o'
                | 'u'
                | 'y'
                | 'à'
                | 'â'
                | 'ä'
                | 'é'
                | 'è'
                | 'ê'
                | 'ë'
                | 'î'
                | 'ï'
                | 'ô'
                | 'ö'
                | 'ù'
                | 'û'
                | 'ü'
                | 'œ'
        )
    })
}

fn to_katakana(text: &str) -> String {
    text.chars()
        .map(|ch| {
            if ('ぁ'..='ゖ').contains(&ch) {
                char::from_u32(ch as u32 + 0x60).unwrap_or(ch)
            } else {
                ch
            }
        })
        .collect()
}

fn map_japanese_phone(phone: &str) -> String {
    if let Some(base) = phone.strip_suffix(':') {
        return format!("{}ː", map_japanese_phone(base));
    }

    match phone {
        "a" => "a".into(),
        "i" => "i".into(),
        "u" => "ɯ".into(),
        "e" => "e".into(),
        "o" => "o".into(),
        "N" => "ɴ".into(),
        "q" => "q".into(),
        "ch" => "ʧ".into(),
        "sh" => "ʃ".into(),
        "ts" => "ʦ".into(),
        "j" => "ʥ".into(),
        "r" => "ɾ".into(),
        "ry" => "ɾj".into(),
        "ny" => "ɲ".into(),
        "hy" => "ç".into(),
        "by" => "bj".into(),
        "py" => "pj".into(),
        "my" => "mj".into(),
        "gy" => "gj".into(),
        "ky" => "kj".into(),
        "f" => "ɸ".into(),
        "y" => "j".into(),
        _ => phone.to_string(),
    }
}

fn map_pinyin_base_and_tone(base: &str, tone: u8) -> String {
    let base = base.replace('ü', "v").replace("u:", "v");

    let syllable = match base.as_str() {
        "zhi" => "ʧɻ".to_string(),
        "chi" => "ʧʰɻ".to_string(),
        "shi" => "ʃɻ".to_string(),
        "ri" => "ɻ".to_string(),
        "zi" => "ʦz".to_string(),
        "ci" => "ʦʰz".to_string(),
        "si" => "sz".to_string(),
        "yi" => "i".to_string(),
        "ya" => "ja".to_string(),
        "yan" => "jɛn".to_string(),
        "yang" => "jɑŋ".to_string(),
        "yao" => "jau".to_string(),
        "ye" => "jɛ".to_string(),
        "yin" => "in".to_string(),
        "ying" => "iŋ".to_string(),
        "yong" => "jʊŋ".to_string(),
        "you" => "jou".to_string(),
        "yu" => "y".to_string(),
        "yue" => "yɛ".to_string(),
        "yuan" => "yɛn".to_string(),
        "yun" => "yn".to_string(),
        "wu" => "u".to_string(),
        "wa" => "wa".to_string(),
        "wai" => "wai".to_string(),
        "wan" => "wan".to_string(),
        "wang" => "wɑŋ".to_string(),
        "wei" => "wei".to_string(),
        "wen" => "wən".to_string(),
        "weng" => "wəŋ".to_string(),
        "wo" => "wo".to_string(),
        _ => {
            let (initial, final_part) = split_pinyin_initial(&base);
            let onset = map_pinyin_initial(initial);
            let rhyme = map_pinyin_final(final_part);
            format!("{onset}{rhyme}")
        }
    };

    format!("{syllable}{}", chinese_tone_marker(tone))
}

fn split_tone_number(pinyin: &str) -> (String, u8) {
    match pinyin.chars().last() {
        Some(ch) if ch.is_ascii_digit() => {
            let tone = ch.to_digit(10).unwrap_or(5) as u8;
            (
                pinyin[..pinyin.len() - ch.len_utf8()].to_ascii_lowercase(),
                tone,
            )
        }
        _ => (pinyin.to_ascii_lowercase(), 5),
    }
}

fn split_pinyin_initial(pinyin: &str) -> (&str, &str) {
    for initial in [
        "zh", "ch", "sh", "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x",
        "r", "z", "c", "s",
    ] {
        if let Some(rest) = pinyin.strip_prefix(initial) {
            return (initial, rest);
        }
    }

    ("", pinyin)
}

fn map_pinyin_initial(initial: &str) -> &'static str {
    match initial {
        "b" => "p",
        "p" => "pʰ",
        "m" => "m",
        "f" => "f",
        "d" => "t",
        "t" => "tʰ",
        "n" => "n",
        "l" => "l",
        "g" => "k",
        "k" => "kʰ",
        "h" => "χ",
        "j" => "ʨ",
        "q" => "ʨʰ",
        "x" => "ɕ",
        "zh" => "ʧ",
        "ch" => "ʧʰ",
        "sh" => "ʃ",
        "r" => "ɻ",
        "z" => "ʦ",
        "c" => "ʦʰ",
        "s" => "s",
        _ => "",
    }
}

fn map_pinyin_final(final_part: &str) -> String {
    match final_part {
        "a" => "a".into(),
        "ai" => "ai".into(),
        "an" => "an".into(),
        "ang" => "ɑŋ".into(),
        "ao" => "au".into(),
        "e" => "ɤ".into(),
        "ei" => "ei".into(),
        "en" => "ən".into(),
        "eng" => "əŋ".into(),
        "er" => "ɚ".into(),
        "i" => "i".into(),
        "ia" => "ja".into(),
        "ian" => "jɛn".into(),
        "iang" => "jɑŋ".into(),
        "iao" => "jau".into(),
        "ie" => "jɛ".into(),
        "in" => "in".into(),
        "ing" => "iŋ".into(),
        "iong" => "jʊŋ".into(),
        "iu" => "jou".into(),
        "o" => "o".into(),
        "ong" => "ʊŋ".into(),
        "ou" => "ou".into(),
        "u" => "u".into(),
        "ua" => "wa".into(),
        "uai" => "wai".into(),
        "uan" => "wan".into(),
        "uang" => "wɑŋ".into(),
        "ui" => "wei".into(),
        "un" => "wən".into(),
        "uo" => "wo".into(),
        "v" => "y".into(),
        "ve" => "yɛ".into(),
        "van" => "yɛn".into(),
        "vn" => "yn".into(),
        _ => final_part.to_string(),
    }
}

fn chinese_tone_marker(tone: u8) -> &'static str {
    match tone {
        1 => "→",
        2 => "↗",
        3 => "↓",
        4 => "↘",
        _ => "",
    }
}

const S_BASE: u32 = 0xAC00;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

const KOREAN_ONSETS: [&str; 19] = [
    "k", "k", "n", "t", "t", "ɾ", "m", "p", "p", "s", "s", "", "ʧ", "ʧ", "ʧʰ", "kʰ", "tʰ", "pʰ",
    "h",
];

const KOREAN_VOWELS: [&str; 21] = [
    "a", "ɛ", "ja", "jɛ", "ʌ", "e", "jʌ", "je", "o", "wa", "wɛ", "we", "jo", "u", "wʌ", "we", "wi",
    "ju", "ɯ", "ɰi", "i",
];

const KOREAN_CODAS: [&str; 28] = [
    "", "k", "k", "k", "n", "n", "n", "t", "l", "k", "m", "p", "l", "l", "p", "l", "m", "p", "p",
    "t", "t", "ŋ", "t", "t", "k", "t", "p", "t",
];

const CHINESE_PHRASE_PINYIN: &[(&str, &[&str])] = &[
    ("重庆", &["chong2", "qing4"]),
    ("银行", &["yin2", "hang2"]),
    ("音乐", &["yin1", "yue4"]),
    ("重要", &["zhong4", "yao4"]),
    ("长大", &["zhang3", "da4"]),
    ("长城", &["chang2", "cheng2"]),
];

const GERMAN_WORD_LIST: &[(&str, &str)] = &[
    ("hallo", "hˈaloː"),
    ("dies", "diːs"),
    ("ist", "ɪst"),
    ("in", "ɪn"),
    ("ein", "aɪn"),
    ("der", "dɛɾ"),
    ("und", "ʊnt"),
    ("gut", "ɡˈuːt"),
    ("bleiben", "blˈaɪbən"),
    ("bleibt", "blˈaɪpt"),
    ("sprach", "ʃpʁaːχ"),
    ("überraschend", "ˌyːbɜrˈaʃənt"),
    ("test", "tˈɛst"),
    ("schön", "ʃøːn"),
    ("kokoro", "koːkˈoːroː"),
    ("rust", "rˈʊst"),
    ("synthese", "zyntˈeːzə"),
    ("sport", "ʃpɔʁt"),
    ("vollständig", "fˈɔlʃtˌɛndɪç"),
    ("implementiert", "ˌɪmpleːməntˈiːɾt"),
];

const JAPANESE_ASCII_WORD_LIST: &[(&str, &str)] = &[("kokoro", "kokoɾo"), ("rust", "ɾasɯto")];

const FRENCH_WORD_LIST: &[(&str, &str)] = &[
    ("ami", "amˈi"),
    ("bonjour", "bɔ̃ʒˈuʁ"),
    ("ceci", "səsˌi"),
    ("est", "ɛ"),
    ("un", "œ̃"),
    ("test", "tˈɛst"),
    ("de", "də"),
    ("la", "la"),
    ("kokoro", "kokoʁˈo"),
    ("rust", "ɹˈʌst"),
    ("vous", "vu"),
    ("nous", "nu"),
    ("êtes", "ɛt"),
    ("très", "tʁɛ"),
    ("synthèse", "sɛ̃tˈɛz"),
    ("vocale", "vokˈal"),
    ("entièrement", "ɑ̃tjɛʁmˈɑ̃"),
    ("en", "ɑ̃"),
];

const SPANISH_WORD_LIST: &[(&str, &str)] = &[
    ("clara", "klˈaɾa"),
    ("de", "ðe"),
    ("es", "ˈes"),
    ("esta", "ˈesta"),
    ("hola", "ˈola"),
    ("implementada", "ˌimplementˈaða"),
    ("kokoro", "kokˈoɾo"),
    ("prueba", "pɾuˈeβa"),
    ("quiero", "kjˈeɾo"),
    ("rust", "rˈust"),
    ("síntesis", "sˈintesis"),
    ("una", "ˈuna"),
    ("voz", "βˈoθ"),
    ("yo", "ʝˈo"),
];

const ITALIAN_WORD_LIST: &[(&str, &str)] = &[
    ("chiara", "kjˈaɾa"),
    ("ciao", "ʧˈao"),
    ("della", "dˌɛlla"),
    ("interamente", "interamˈente"),
    ("italiana", "italiˈana"),
    ("kokoro", "kokˈɔro"),
    ("questo", "kwˈesto"),
    ("resta", "rˈɛsta"),
    ("rust", "rˈust"),
    ("sciarpa", "ʃˈarpa"),
    ("sintesi", "sˈintezi"),
    ("test", "tˈɛst"),
    ("un", "ʊn"),
    ("vocale", "vokˈale"),
    ("è", "ˈɛ"),
];

const PORTUGUESE_WORD_LIST: &[(&str, &str)] = &[
    ("a", "ɐ"),
    ("casa", "kˈazɐ"),
    ("da", "dɐ"),
    ("de", "dɨ"),
    ("do", "dʊ"),
    ("é", "ɛ"),
    ("em", "ẽ"),
    ("essa", "ˈɛsɐ"),
    ("este", "ˈɛʃtɨ"),
    ("fala", "fˈalɐ"),
    ("implementada", "ˌimplementˈadɐ"),
    ("inteiramente", "ĩteɾamˈẽtɨ"),
    ("kokoro", "kokˈɔɾo"),
    ("mar", "mˈaɹ"),
    ("não", "nˈɐ̃w"),
    ("olá", "ɔlˈa"),
    ("rust", "ʁˈuʃt"),
    ("síntese", "sˈiŋtɨzɨ"),
    ("teste", "tˈɛʃtɨ"),
    ("um", "ũŋ"),
    ("usa", "ˈuzɐ"),
];

const HINDI_WORD_LIST: &[(&str, &str)] = &[
    ("का", "kaː"),
    ("कर", "kˈʌɾ"),
    ("गए", "ɡˈʌeː"),
    ("ध्वनि", "dʰʋˈʌnɪ"),
    ("नमस्ते", "nəmˈʌsteː"),
    ("परीक्षण", "pəɾˈiːkʃəɳ"),
    ("पाठ-से-भाषण", "pˈaːʈʰseːbʰˈaːʂəɳ"),
    ("में", "mẽː"),
    ("यह", "jˌəh"),
    ("रहे", "ɾˌəheː"),
    ("लिखे", "lˈɪkʰeː"),
    ("संश्लेषण", "sənʃlˈeːʂəɳ"),
    ("हम", "hˌəm"),
    ("है", "hɛː"),
    ("हैं", "hɛ̃"),
];

const GERMAN_COMPOUND_TAILS: &[&str] = &["synthese", "sport", "werk", "art", "frau"];

const GERMAN_PREFIXES: &[(&str, &str)] = &[
    ("über", "yːbɐ"),
    ("ver", "fɛɐ"),
    ("ent", "ɛnt"),
    ("zer", "ʦɛɐ"),
    ("miss", "mɪs"),
];

const GERMAN_SUFFIXES: &[(&str, &str)] = &[
    ("schaft", "ʃaft"),
    ("keit", "kaɪt"),
    ("heit", "haɪt"),
    ("lich", "lɪç"),
    ("ung", "ʊŋ"),
    ("nis", "nɪs"),
];

fn lookup_word_list<'a>(word: &str, entries: &'a [(&'a str, &'a str)]) -> Option<&'a str> {
    entries
        .iter()
        .find_map(|(spelling, phonemes)| (*spelling == word).then_some(*phonemes))
}

fn lookup_latin_exception(
    word: &str,
    entries: &'static [(&'static str, &'static str)],
) -> Option<&'static str> {
    lookup_word_list(word, entries)
}

fn decompose_hangul_syllable(ch: char) -> Option<(usize, usize, usize)> {
    let code = ch as u32;
    if !(S_BASE..S_BASE + S_COUNT).contains(&code) {
        return None;
    }

    let s_index = code - S_BASE;
    let onset = (s_index / N_COUNT) as usize;
    let vowel = ((s_index % N_COUNT) / T_COUNT) as usize;
    let coda = (s_index % T_COUNT) as usize;
    Some((onset, vowel, coda))
}

fn phonemize_spanish_word(word: &str) -> String {
    let lower = word.to_lowercase();
    if let Some(exception) = lookup_latin_exception(&lower, SPANISH_WORD_LIST) {
        return exception.to_string();
    }

    let chars: Vec<char> = lower.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        if starts_with(&chars, index, "gue") || starts_with(&chars, index, "gui") {
            output.push('g');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ch") {
            output.push('ʧ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ll") {
            output.push('ʝ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "rr") {
            output.push('r');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "qu") {
            output.push('k');
            index += 2;
            continue;
        }

        let ch = chars[index];
        let previous = if index == 0 {
            None
        } else {
            chars.get(index - 1).copied()
        };
        let next = chars.get(index + 1).copied();
        match ch {
            'a' => output.push('a'),
            'á' => push_stressed_phoneme(&mut output, "a"),
            'e' => output.push('e'),
            'é' => push_stressed_phoneme(&mut output, "e"),
            'i' => output.push('i'),
            'í' => push_stressed_phoneme(&mut output, "i"),
            'o' => output.push('o'),
            'ó' => push_stressed_phoneme(&mut output, "o"),
            'u' | 'ü' => output.push('u'),
            'ú' => push_stressed_phoneme(&mut output, "u"),
            'b' | 'v' => output.push(if is_word_final(index, &chars) {
                'p'
            } else {
                'b'
            }),
            'c' => {
                if matches!(next, Some('e' | 'é' | 'i' | 'í')) {
                    output.push('s');
                } else {
                    output.push('k');
                }
            }
            'd' => output.push(if is_word_final(index, &chars) {
                't'
            } else {
                'd'
            }),
            'f' => output.push('f'),
            'g' => {
                if matches!(next, Some('e' | 'é' | 'i' | 'í')) {
                    output.push('χ');
                } else {
                    output.push(if is_word_final(index, &chars) {
                        'k'
                    } else {
                        'g'
                    });
                }
            }
            'h' => {}
            'j' => output.push('χ'),
            'k' => output.push('k'),
            'l' => output.push('l'),
            'm' => output.push('m'),
            'n' => output.push('n'),
            'ñ' => output.push('ɲ'),
            'p' => output.push('p'),
            'q' => output.push('k'),
            'r' => {
                let previous = if index == 0 {
                    None
                } else {
                    chars.get(index - 1).copied()
                };
                if index == 0 || previous.map(|value| !is_vowel(value)).unwrap_or(false) {
                    output.push('r');
                } else {
                    output.push('ɾ');
                }
            }
            's' => output.push('s'),
            't' => output.push('t'),
            'w' => output.push('w'),
            'x' => output.push_str("ks"),
            'y' => {
                if is_word_final(index, &chars) {
                    output.push('i');
                } else if previous.map(is_vowel).unwrap_or(false)
                    || next.map(is_vowel).unwrap_or(false)
                {
                    output.push('j');
                } else {
                    output.push('i');
                }
            }
            'z' => output.push('s'),
            '\'' | '’' => {}
            _ => output.push(ch),
        }

        index += 1;
    }

    output
}

fn phonemize_italian_word(word: &str) -> String {
    let lower = word.to_lowercase();
    if let Some(exception) = lookup_latin_exception(&lower, ITALIAN_WORD_LIST) {
        return exception.to_string();
    }

    let chars: Vec<char> = lower.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        if starts_with(&chars, index, "qu") {
            output.push_str("kw");
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "gli") {
            output.push('ʎ');
            index += 3;
            continue;
        }
        if starts_with(&chars, index, "gn") {
            output.push('ɲ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ch") {
            output.push('k');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "gh") {
            output.push('g');
            index += 2;
            continue;
        }
        if chars.get(index + 1) == Some(&chars[index])
            && matches!(chars[index], 'l' | 'm' | 'n' | 'p' | 'r' | 's' | 't' | 'z')
        {
            let consonant = match chars[index] {
                'r' => 'r',
                'z' => 'ʦ',
                other => other,
            };
            output.push(consonant);
            output.push('ː');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "sci")
            && chars.get(index + 3).copied().map(is_vowel).unwrap_or(false)
        {
            output.push('ʃ');
            index += 3;
            continue;
        }
        if starts_with(&chars, index, "sc") {
            let next = chars.get(index + 2).copied();
            if matches!(next, Some('e' | 'i')) {
                output.push('ʃ');
                index += 2;
                continue;
            }
        }

        let ch = chars[index];
        let next = chars.get(index + 1).copied();
        match ch {
            'a' => output.push('a'),
            'à' => push_stressed_phoneme(&mut output, "a"),
            'e' => output.push('e'),
            'è' | 'é' => push_stressed_phoneme(&mut output, "e"),
            'i' => output.push('i'),
            'ì' | 'í' => push_stressed_phoneme(&mut output, "i"),
            'o' => output.push('o'),
            'ò' | 'ó' => push_stressed_phoneme(&mut output, "o"),
            'u' => output.push('u'),
            'ù' | 'ú' => push_stressed_phoneme(&mut output, "u"),
            'c' => {
                if matches!(next, Some('e' | 'i')) {
                    output.push('ʧ');
                } else {
                    output.push('k');
                }
            }
            'g' => {
                if matches!(next, Some('e' | 'i')) {
                    output.push('ʤ');
                } else {
                    output.push('g');
                }
            }
            'h' => {}
            'j' => output.push('j'),
            'q' => output.push('k'),
            'r' => {
                let previous = if index == 0 {
                    None
                } else {
                    chars.get(index - 1).copied()
                };
                if index == 0 || previous.map(|value| !is_vowel(value)).unwrap_or(false) {
                    output.push('r');
                } else {
                    output.push('ɾ');
                }
            }
            's' => output.push('s'),
            't' => output.push('t'),
            'v' => output.push('v'),
            'z' => output.push('ʦ'),
            '\'' | '’' => {}
            _ => output.push(ch),
        }

        index += 1;
    }

    output
}

fn phonemize_portuguese_word(word: &str) -> String {
    let lower = word.to_lowercase();
    if let Some(exception) = lookup_latin_exception(&lower, PORTUGUESE_WORD_LIST) {
        return exception.to_string();
    }

    let chars: Vec<char> = lower.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        if starts_with(&chars, index, "ão") && index + 2 == chars.len() {
            push_stressed_phoneme(&mut output, "ɐ");
            output.push('\u{0303}');
            output.push('w');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "nh") {
            output.push('ɲ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "lh") {
            output.push('ʎ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ch") {
            output.push('ʃ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "rr") {
            output.push('ʁ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ss") {
            output.push('s');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "qu") {
            output.push('k');
            index += 2;
            continue;
        }

        let ch = chars[index];
        let previous = if index == 0 {
            None
        } else {
            chars.get(index - 1).copied()
        };
        let next = chars.get(index + 1).copied();
        if matches!(
            ch,
            'a' | 'e' | 'i' | 'o' | 'u' | 'á' | 'â' | 'ã' | 'é' | 'ê' | 'í' | 'ó' | 'ô' | 'õ' | 'ú'
        ) && matches!(next, Some('m' | 'n'))
            && chars
                .get(index + 2)
                .map(|ch| !is_vowel(*ch))
                .unwrap_or(true)
        {
            output.push_str(match ch {
                'a' | 'á' | 'â' => "a",
                'ã' => "ɐ",
                'e' | 'é' | 'ê' => "e",
                'i' | 'í' => "i",
                'o' | 'ó' | 'ô' | 'õ' => "o",
                _ => "u",
            });
            output.push('\u{0303}');
            index += 2;
            continue;
        }

        match ch {
            'a' | 'â' => output.push('a'),
            'á' => push_stressed_phoneme(&mut output, "a"),
            'ã' => {
                push_stressed_phoneme(&mut output, "ɐ");
                output.push('\u{0303}');
            }
            'e' => output.push('e'),
            'é' | 'ê' => push_stressed_phoneme(&mut output, "e"),
            'i' => output.push('i'),
            'í' => push_stressed_phoneme(&mut output, "i"),
            'o' => output.push('o'),
            'ó' | 'ô' => push_stressed_phoneme(&mut output, "o"),
            'õ' => {
                push_stressed_phoneme(&mut output, "o");
                output.push('\u{0303}');
            }
            'u' => output.push('u'),
            'ú' => push_stressed_phoneme(&mut output, "u"),
            'b' => output.push('b'),
            'c' => {
                if matches!(next, Some('e' | 'é' | 'ê' | 'i' | 'í')) {
                    output.push('s');
                } else {
                    output.push('k');
                }
            }
            'k' | 'q' => output.push('k'),
            'ç' => output.push('s'),
            'd' => output.push('d'),
            'f' => output.push('f'),
            'g' => {
                if matches!(next, Some('e' | 'é' | 'ê' | 'i' | 'í')) {
                    output.push('ʒ');
                } else {
                    output.push('g');
                }
            }
            'h' => {}
            'j' => output.push('ʒ'),
            'l' => output.push('l'),
            'm' => output.push('m'),
            'n' => output.push('n'),
            'p' => output.push('p'),
            'r' => {
                if is_word_final(index, &chars)
                    || next.map(|value| !is_vowel(value)).unwrap_or(false)
                {
                    output.push('ʁ');
                } else {
                    output.push('ɾ');
                }
            }
            's' => {
                if previous.map(is_vowel).unwrap_or(false) && next.map(is_vowel).unwrap_or(false) {
                    output.push('z');
                } else if is_word_final(index, &chars) && previous.map(is_vowel).unwrap_or(false) {
                    output.push('ʃ');
                } else {
                    output.push('s');
                }
            }
            't' => output.push('t'),
            'v' => output.push('v'),
            'x' => output.push('ʃ'),
            'z' => output.push(if is_word_final(index, &chars) {
                'ʃ'
            } else {
                'z'
            }),
            '\'' | '’' => {}
            _ => output.push(ch),
        }

        index += 1;
    }

    output
}

fn phonemize_german_word(word: &str) -> String {
    let lower = word.to_lowercase();
    if let Some(exception) = lookup_latin_exception(&lower, GERMAN_WORD_LIST) {
        return exception.to_string();
    }

    if let Some(phonemes) = phonemize_german_morphemes(&lower) {
        return phonemes;
    }

    let chars: Vec<char> = lower.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        if starts_with(&chars, index, "sp") && german_should_hush_s(&chars, index) {
            output.push_str("ʃp");
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "st") && german_should_hush_s(&chars, index) {
            output.push_str("ʃt");
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "tsch") {
            output.push('ʧ');
            index += 4;
            continue;
        }
        if starts_with(&chars, index, "sch") {
            output.push('ʃ');
            index += 3;
            continue;
        }
        if starts_with(&chars, index, "pf") {
            output.push_str("pf");
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ng") {
            output.push('ŋ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "nk") {
            output.push('ŋ');
            output.push('k');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "au") {
            output.push('a');
            output.push('ʊ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ei") {
            output.push('a');
            output.push('ɪ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ie") {
            output.push('i');
            output.push('ː');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "eu") || starts_with(&chars, index, "äu") {
            output.push('ɔ');
            output.push('ʏ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ph") {
            output.push('f');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "qu") {
            output.push_str("kv");
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "ig") && index + 2 == chars.len() {
            output.push('ɪ');
            output.push('ç');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "er") && index > 0 && index + 2 == chars.len() {
            output.push('ɐ');
            index += 2;
            continue;
        }
        if starts_with(&chars, index, "en") && index > 0 && index + 2 == chars.len() {
            output.push('ə');
            output.push('n');
            index += 2;
            continue;
        }

        let ch = chars[index];
        let next = chars.get(index + 1).copied();

        if next == Some(ch)
            && matches!(
                ch,
                'b' | 'd' | 'f' | 'g' | 'k' | 'l' | 'm' | 'n' | 'p' | 'r' | 's' | 't'
            )
        {
            match ch {
                'b' => output.push('b'),
                'd' => output.push('d'),
                'f' => output.push('f'),
                'g' => output.push('g'),
                'k' => output.push('k'),
                'l' => output.push('l'),
                'm' => output.push('m'),
                'n' => output.push('n'),
                'p' => output.push('p'),
                'r' => output.push('ʁ'),
                's' => output.push('s'),
                't' => output.push('t'),
                _ => {}
            }
            index += 2;
            continue;
        }

        match ch {
            'a' => {
                if next == Some('h') {
                    output.push('a');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('a');
                if german_has_open_syllable(&chars, index) {
                    output.push('ː');
                }
            }
            'ä' => {
                if next == Some('h') {
                    output.push('ɛ');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('ɛ');
            }
            'e' => {
                if index + 1 == chars.len() {
                    output.push('ə');
                } else if next == Some('h') {
                    output.push('e');
                    output.push('ː');
                    index += 2;
                    continue;
                } else {
                    output.push('e');
                    if german_has_open_syllable(&chars, index) {
                        output.push('ː');
                    }
                }
            }
            'i' => {
                if next == Some('h') {
                    output.push('i');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('i');
                if german_has_open_syllable(&chars, index) {
                    output.push('ː');
                }
            }
            'o' => {
                if next == Some('h') {
                    output.push('o');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('o');
                if german_has_open_syllable(&chars, index) {
                    output.push('ː');
                }
            }
            'ö' => {
                if next == Some('h') {
                    output.push('ø');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('ø');
            }
            'u' => {
                if next == Some('h') {
                    output.push('u');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('u');
                if german_has_open_syllable(&chars, index) {
                    output.push('ː');
                }
            }
            'ü' => {
                if next == Some('h') {
                    output.push('y');
                    output.push('ː');
                    index += 2;
                    continue;
                }
                output.push('y');
            }
            'ß' => output.push('s'),
            'b' => output.push(if index + 1 == chars.len() { 'p' } else { 'b' }),
            'c' => {
                if matches!(next, Some('e' | 'i' | 'y' | 'ä' | 'ö' | 'ü')) {
                    output.push('ʦ');
                } else {
                    output.push('k');
                }
            }
            'd' => output.push(if index + 1 == chars.len() { 't' } else { 'd' }),
            'f' => output.push('f'),
            'g' => output.push(if index + 1 == chars.len() { 'k' } else { 'g' }),
            'h' => {
                let previous = if index == 0 {
                    None
                } else {
                    chars.get(index - 1).copied()
                };
                if index == 0 || previous.map(|value| !is_vowel(value)).unwrap_or(true) {
                    output.push('h');
                }
            }
            'j' => output.push('j'),
            'k' => output.push('k'),
            'l' => output.push('l'),
            'm' => output.push('m'),
            'n' => output.push('n'),
            'p' => output.push('p'),
            'q' => output.push('k'),
            'r' => output.push('ʁ'),
            's' => {
                let previous = if index == 0 {
                    None
                } else {
                    chars.get(index - 1).copied()
                };
                if previous.map(is_vowel).unwrap_or(false) && next.map(is_vowel).unwrap_or(false) {
                    output.push('z');
                } else {
                    output.push('s');
                }
            }
            't' => output.push('t'),
            'v' => output.push('f'),
            'w' => output.push('v'),
            'x' => output.push_str("ks"),
            'y' => output.push('y'),
            'z' => output.push('ʦ'),
            _ => output.push(ch),
        }

        if ch == 'c' && matches!(next, Some('h')) {
            output.pop();
            output.push(german_ch_sound(&chars, index));
            index += 2;
            continue;
        }

        index += 1;
    }

    output
}

fn phonemize_french_word(word: &str) -> String {
    let lower = word.to_lowercase();
    if let Some(exception) = lookup_latin_exception(&lower, FRENCH_WORD_LIST) {
        return exception.to_string();
    }

    let chars: Vec<char> = lower.chars().collect();
    let mut output = String::new();
    let mut index = 0;

    while index < chars.len() {
        let mut matched = false;
        for (pattern, replacement) in [
            ("eaux", "o"),
            ("eau", "o"),
            ("ain", "ɛ\u{0303}"),
            ("ein", "ɛ\u{0303}"),
            ("oin", "wɛ\u{0303}"),
            ("ion", "jɔ\u{0303}"),
            ("ill", "j"),
            ("ou", "u"),
            ("oi", "wa"),
            ("eu", "ø"),
            ("œu", "ø"),
            ("an", "ɑ\u{0303}"),
            ("am", "ɑ\u{0303}"),
            ("en", "ɑ\u{0303}"),
            ("em", "ɑ\u{0303}"),
            ("on", "ɔ\u{0303}"),
            ("om", "ɔ\u{0303}"),
            ("un", "œ\u{0303}"),
            ("um", "œ\u{0303}"),
            ("in", "ɛ\u{0303}"),
            ("im", "ɛ\u{0303}"),
            ("yn", "ɛ\u{0303}"),
            ("ym", "ɛ\u{0303}"),
            ("gn", "ɲ"),
            ("ch", "ʃ"),
            ("ph", "f"),
            ("qu", "k"),
        ] {
            if starts_with(&chars, index, pattern) {
                output.push_str(replacement);
                index += pattern.chars().count();
                matched = true;
                break;
            }
        }

        if matched {
            continue;
        }

        let ch = chars[index];
        let next = chars.get(index + 1).copied();
        match ch {
            'a' | 'à' | 'â' => output.push('a'),
            'e' | 'é' | 'è' | 'ê' | 'ë' => output.push('e'),
            'i' | 'î' | 'ï' => output.push('i'),
            'o' | 'ô' => output.push('o'),
            'u' | 'ù' | 'û' | 'ü' => output.push('y'),
            'b' => output.push('b'),
            'c' => {
                if matches!(next, Some('e' | 'é' | 'i' | 'y')) {
                    output.push('s');
                } else {
                    output.push('k');
                }
            }
            'd' => output.push('d'),
            'f' => output.push('f'),
            'g' => {
                if matches!(next, Some('e' | 'é' | 'i' | 'y')) {
                    output.push('ʒ');
                } else {
                    output.push('g');
                }
            }
            'h' => {}
            'j' => output.push('ʒ'),
            'k' => output.push('k'),
            'l' => output.push('l'),
            'm' => output.push('m'),
            'n' => output.push('n'),
            'p' => output.push('p'),
            'q' => output.push('k'),
            'r' => output.push('ʁ'),
            's' => output.push('s'),
            't' => output.push('t'),
            'v' => output.push('v'),
            'w' => output.push('w'),
            'x' => output.push_str("ks"),
            'y' => output.push('j'),
            'z' => output.push('z'),
            '\'' | '’' => {}
            _ => output.push(ch),
        }

        index += 1;
    }

    trim_french_silent_final(&lower, output)
}

fn trim_french_silent_final(word: &str, mut phonemes: String) -> String {
    if matches!(
        word,
        "de" | "le" | "me" | "se" | "ce" | "je" | "que" | "très" | "plus" | "tous"
    ) {
        return phonemes;
    }

    for suffix in ['e', 's', 't', 'd', 'x', 'p'] {
        if word.ends_with(suffix) {
            phonemes.pop();
            break;
        }
    }

    phonemes
}

fn german_should_hush_s(chars: &[char], index: usize) -> bool {
    if !matches!(chars.get(index + 1), Some('p' | 't')) {
        return false;
    }

    if index == 0 {
        return true;
    }

    chars
        .get(index - 1)
        .copied()
        .map(|value| !is_vowel(value) && value != 's')
        .unwrap_or(false)
}

fn german_ch_sound(chars: &[char], index: usize) -> char {
    let previous = if index == 0 {
        None
    } else {
        chars.get(index - 1).copied()
    };
    let before_previous = if index < 2 {
        None
    } else {
        chars.get(index - 2).copied()
    };

    if matches!((before_previous, previous), (Some('a'), Some('u'))) {
        return 'χ';
    }

    if previous.map(is_front_vowel).unwrap_or(false)
        || matches!(previous, Some('l' | 'n' | 'r'))
        || matches!(
            (before_previous, previous),
            (Some('e'), Some('i')) | (Some('e'), Some('u')) | (Some('ä'), Some('u'))
        )
    {
        'ç'
    } else {
        'χ'
    }
}

fn starts_with(chars: &[char], index: usize, pattern: &str) -> bool {
    for (offset, expected) in pattern.chars().enumerate() {
        if chars.get(index + offset) != Some(&expected) {
            return false;
        }
    }
    true
}

fn is_word_final(index: usize, chars: &[char]) -> bool {
    index + 1 == chars.len()
}

fn push_stressed_phoneme(output: &mut String, phoneme: &str) {
    if !output.contains('ˈ') {
        output.push('ˈ');
    }
    output.push_str(phoneme);
}

fn german_has_open_syllable(chars: &[char], index: usize) -> bool {
    let Some(next) = chars.get(index + 1).copied() else {
        return false;
    };
    let Some(after_next) = chars.get(index + 2).copied() else {
        return false;
    };

    !is_vowel(next) && is_vowel(after_next) && after_next != next
}

fn phonemize_german_morphemes(word: &str) -> Option<String> {
    if let Some((stem, suffix)) = german_split_suffix(word) {
        return Some(format!("{}{}", phonemize_german_word(stem), suffix));
    }

    if let Some((prefix, stem)) = german_split_prefix(word) {
        let stem_phonemes = phonemize_german_word(stem);
        let stem_phonemes = if stem_phonemes.contains('ˈ') {
            stem_phonemes
        } else {
            format!("ˈ{stem_phonemes}")
        };
        return Some(format!("{prefix}{stem_phonemes}"));
    }

    if let Some((left, right)) = german_split_compound(word) {
        return Some(format!(
            "{}{}",
            phonemize_german_word(left),
            phonemize_german_word(right)
        ));
    }

    None
}

fn german_split_prefix(word: &str) -> Option<(&'static str, &str)> {
    GERMAN_PREFIXES.iter().find_map(|(prefix, phonemes)| {
        word.strip_prefix(prefix)
            .filter(|stem| stem.chars().count() >= 3)
            .map(|stem| (*phonemes, stem))
    })
}

fn german_split_suffix(word: &str) -> Option<(&str, &'static str)> {
    GERMAN_SUFFIXES.iter().find_map(|(suffix, phonemes)| {
        word.strip_suffix(suffix)
            .filter(|stem| stem.chars().count() >= 2)
            .map(|stem| (stem, *phonemes))
    })
}

fn german_split_compound(word: &str) -> Option<(&str, &str)> {
    GERMAN_COMPOUND_TAILS.iter().find_map(|tail| {
        word.find(tail)
            .filter(|index| *index >= 3)
            .map(|index| (&word[..index], &word[index..]))
    })
}

fn is_vowel(ch: char) -> bool {
    matches!(
        ch,
        'a' | 'e'
            | 'i'
            | 'o'
            | 'u'
            | 'y'
            | 'á'
            | 'à'
            | 'â'
            | 'ã'
            | 'ä'
            | 'é'
            | 'è'
            | 'ê'
            | 'ë'
            | 'í'
            | 'ì'
            | 'î'
            | 'ï'
            | 'ó'
            | 'ò'
            | 'ô'
            | 'õ'
            | 'ö'
            | 'ú'
            | 'ù'
            | 'û'
            | 'ü'
            | 'ă'
    )
}

fn is_front_vowel(ch: char) -> bool {
    matches!(ch, 'e' | 'i' | 'ä' | 'ö' | 'ü' | 'é' | 'è' | 'ê' | 'y')
}

fn hindi_independent_vowel(ch: char) -> Option<&'static str> {
    Some(match ch {
        'अ' => "ə",
        'आ' => "a",
        'इ' => "i",
        'ई' => "i",
        'उ' => "u",
        'ऊ' => "u",
        'ए' => "e",
        'ऐ' => "ɛ",
        'ओ' => "o",
        'औ' => "ɔ",
        'ऋ' => "ɾi",
        _ => return None,
    })
}

fn hindi_matra(ch: char) -> Option<&'static str> {
    Some(match ch {
        'ा' => "a",
        'ि' | 'ी' => "i",
        'ु' | 'ू' => "u",
        'े' => "e",
        'ै' => "ɛ",
        'ो' => "o",
        'ौ' => "ɔ",
        'ृ' => "ɾi",
        _ => return None,
    })
}

fn hindi_consonant(ch: char) -> Option<&'static str> {
    Some(match ch {
        'क' => "k",
        'ख' => "kʰ",
        'ग' => "g",
        'घ' => "gʰ",
        'ङ' => "ŋ",
        'च' => "ʧ",
        'छ' => "ʧʰ",
        'ज' => "ʤ",
        'झ' => "ʤʰ",
        'ञ' => "ɲ",
        'ट' => "ʈ",
        'ठ' => "ʈʰ",
        'ड' => "ɖ",
        'ढ' => "ɖʰ",
        'ण' => "ɳ",
        'त' => "t",
        'थ' => "tʰ",
        'द' => "d",
        'ध' => "dʰ",
        'न' => "n",
        'प' => "p",
        'फ' => "pʰ",
        'ब' => "b",
        'भ' => "bʰ",
        'म' => "m",
        'य' => "j",
        'र' => "ɾ",
        'ल' => "l",
        'व' => "ʋ",
        'श' => "ʃ",
        'ष' => "ʂ",
        'स' => "s",
        'ह' => "h",
        _ => return None,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    const TEXT_ALICE: &str =
        "Who are you? said the Caterpillar. Replied Alice, rather shyly, I hardly know, sir!";

    #[test]
    fn test_basic_en() {
        let phonemes = text_to_phonemes("test", "en-US", None, false, false)
            .expect("english phonemization should work")
            .join("");
        assert!(phonemes.ends_with('.'));
        assert!(phonemes.contains('t'));
    }

    #[test]
    fn test_empty_input_returns_no_sentences() {
        let phonemes = text_to_phonemes("   \n  ", "en-US", None, false, false)
            .expect("whitespace input should not fail");
        assert!(phonemes.is_empty());
    }

    #[test]
    fn test_unsupported_language_errors() {
        let err = text_to_phonemes("salve", "la", None, false, false)
            .expect_err("unsupported languages should error");
        assert!(err.to_string().contains("unsupported"));
    }

    #[test]
    fn test_english_acronym_normalization() {
        assert_eq!(
            normalize_english_text("AI API serves GPU TTS on the CPU."),
            "AI API serves GPU TTS on the CPU."
        );
    }

    #[test]
    fn test_it_splits_sentences() {
        let phonemes = text_to_phonemes(TEXT_ALICE, "en-US", None, false, false)
            .expect("english phonemization should work");
        assert_eq!(phonemes.len(), 3);
    }

    #[test]
    fn test_it_adds_phoneme_separator() {
        let phonemes = text_to_phonemes("test", "en-US", Some('_'), false, false)
            .expect("english phonemization should work")
            .join("");
        assert!(phonemes.contains('_'));
        assert!(phonemes.ends_with('.'));
    }

    #[test]
    fn test_it_preserves_clause_breakers() {
        let phonemes = text_to_phonemes(TEXT_ALICE, "en-US", None, false, false)
            .expect("english phonemization should work")
            .join("");
        for punctuation in [',', '.', '?', '!'] {
            assert!(
                phonemes.contains(punctuation),
                "missing punctuation {punctuation}"
            );
        }
    }

    #[test]
    fn test_stress_toggle() {
        let with_stress = text_to_phonemes(TEXT_ALICE, "en-US", None, false, false)
            .expect("english phonemization should work")
            .join("");
        let without_stress = text_to_phonemes(TEXT_ALICE, "en-US", None, false, true)
            .expect("english phonemization should work")
            .join("");

        assert!(with_stress.contains('ˈ') || with_stress.contains('ˌ'));
        assert!(!without_stress.contains('ˈ') && !without_stress.contains('ˌ'));
    }

    #[test]
    fn test_line_splitting() {
        let phonemes = text_to_phonemes("Hello\nThere\nAnd\nWelcome", "en-US", None, false, false)
            .expect("english phonemization should work");
        assert_eq!(phonemes.len(), 4);
    }

    #[test]
    fn test_kokoro_languages_smoke() {
        let cases = [
            ("Hello world", "en"),
            ("The schedule changed", "en-gb"),
            ("Hola mundo", "es"),
            ("Bonjour le monde", "fr"),
            ("Guten Tag", "de"),
            ("Ciao mondo", "it"),
            ("Olá mundo", "pt"),
            ("こんにちは世界", "ja"),
            ("你好世界", "zh"),
            ("안녕하세요", "ko"),
            ("नमस्ते दुनिया", "hi"),
        ];

        for (text, lang) in cases {
            let phonemes = text_to_phonemes(text, lang, None, false, false)
                .unwrap_or_else(|err| panic!("{lang} phonemization failed: {err}"))
                .join("");
            assert!(
                !phonemes.is_empty(),
                "expected non-empty phonemes for language {lang}"
            );
        }
    }

    #[test]
    fn test_german_word_list_and_suffix_rules() {
        assert_eq!(phonemize_german_word("hallo"), "hˈaloː");
        assert_eq!(phonemize_german_word("vollständig"), "fˈɔlʃtˌɛndɪç");
        assert_eq!(phonemize_german_word("gut"), "ɡˈuːt");
        assert_eq!(phonemize_german_word("und"), "ʊnt");
        assert_eq!(phonemize_german_word("rust"), "rˈʊst");
        assert_eq!(phonemize_german_word("kokoro"), "koːkˈoːroː");
        assert_eq!(phonemize_german_word("sprach"), "ʃpʁaːχ");
        assert_eq!(phonemize_german_word("sprachsynthese"), "ʃpʁaːχzyntˈeːzə");
        assert_eq!(phonemize_german_word("wegen"), "veːgən");
        assert_eq!(phonemize_german_word("rad"), "ʁat");
        assert_eq!(phonemize_german_word("ich"), "iç");
        assert_eq!(phonemize_german_word("bach"), "baχ");
        assert_eq!(phonemize_german_word("schönheit"), "ʃøːnhaɪt");

        let sentence = text_to_phonemes("Hallo vollständig", "de", None, false, false)
            .expect("german phonemization should work")
            .join("");
        assert!(sentence.starts_with("hˈaloː"));
        assert!(sentence.contains("ɪç"));
    }

    #[test]
    fn test_german_morpheme_and_compound_rules() {
        assert_eq!(phonemize_german_word("radsport"), "ʁatʃpɔʁt");
        assert_eq!(phonemize_german_word("überraschung"), "yːbɐˈʁaʃʊŋ");
        assert_eq!(phonemize_german_word("freundlichkeit"), "fʁɔʏntlɪçkaɪt");
        assert_eq!(phonemize_german_word("überraschend"), "ˌyːbɜrˈaʃənt");
        assert_eq!(phonemize_german_word("bleiben"), "blˈaɪbən");
        assert_eq!(phonemize_german_word("bleibt"), "blˈaɪpt");
        assert_eq!(phonemize_german_word("implementiert"), "ˌɪmpleːməntˈiːɾt");
    }

    #[test]
    fn test_japanese_ascii_loanwords_are_transliterated() {
        let phonemes = text_to_phonemes(
            "RustでKokoroの音声合成を試します。",
            "ja",
            None,
            false,
            false,
        )
        .expect("japanese phonemization should work")
        .join("");

        assert!(
            phonemes.contains("ɾasɯto"),
            "unexpected Rust output: {phonemes}"
        );
        assert!(
            phonemes.contains("kokoɾo"),
            "unexpected Kokoro output: {phonemes}"
        );
        assert!(
            !phonemes.contains("Rust"),
            "raw ASCII leaked into output: {phonemes}"
        );
    }

    #[test]
    fn test_french_word_list_common_words() {
        assert_eq!(phonemize_french_word("ami"), "amˈi");
        assert_eq!(phonemize_french_word("bonjour"), "bɔ̃ʒˈuʁ");
        assert_eq!(phonemize_french_word("de"), "də");
        assert_eq!(phonemize_french_word("synthèse"), "sɛ̃tˈɛz");
        assert_eq!(phonemize_french_word("vous"), "vu");
        assert_eq!(phonemize_french_word("êtes"), "ɛt");
    }

    #[test]
    fn test_french_liaison_common_case() {
        let phonemes = text_to_phonemes("Ceci est un ami", "fr", None, false, false)
            .expect("french phonemization should work")
            .join("");
        assert!(
            phonemes.contains("ɛt œ̃"),
            "expected est-un liaison, got {phonemes}"
        );
    }

    #[test]
    fn test_french_vous_liaison() {
        let phonemes = text_to_phonemes("Vous êtes un ami", "fr", None, false, false)
            .expect("french phonemization should work")
            .join("");
        assert!(
            phonemes.contains("vuz ɛtz"),
            "expected vous-etes liaison, got {phonemes}"
        );
        assert!(
            phonemes.contains("œ̃n amˈi"),
            "expected un-ami liaison, got {phonemes}"
        );
    }

    #[test]
    fn test_spanish_r_distinction() {
        assert_eq!(phonemize_spanish_word("rosa"), "rosa");
        assert_eq!(phonemize_spanish_word("perro"), "pero");
        assert_eq!(phonemize_spanish_word("yo"), "ʝˈo");
        assert_eq!(phonemize_spanish_word("quiero"), "kjˈeɾo");
        assert_eq!(phonemize_spanish_word("voz"), "βˈoθ");
        assert!(phonemize_spanish_word("madrid").ends_with('t'));
    }

    #[test]
    fn test_italian_word_list_and_qu_cluster() {
        assert_eq!(phonemize_italian_word("questo"), "kwˈesto");
        assert_eq!(phonemize_italian_word("sintesi"), "sˈintezi");
        assert_eq!(phonemize_italian_word("sciarpa"), "ʃˈarpa");
        assert_eq!(phonemize_italian_word("chiara"), "kjˈaɾa");
    }

    #[test]
    fn test_portuguese_word_list() {
        assert_eq!(phonemize_portuguese_word("olá"), "ɔlˈa");
        assert_eq!(phonemize_portuguese_word("síntese"), "sˈiŋtɨzɨ");
        assert_eq!(phonemize_portuguese_word("a"), "ɐ");
        assert_eq!(phonemize_portuguese_word("casa"), "kˈazɐ");
        assert_eq!(phonemize_portuguese_word("não"), "nˈɐ̃w");
        assert_eq!(phonemize_portuguese_word("do"), "dʊ");
        assert_eq!(phonemize_portuguese_word("mar"), "mˈaɹ");
        assert_eq!(phonemize_portuguese_word("um"), "ũŋ");
    }

    #[test]
    fn test_korean_ascii_segments_are_phonemized() {
        let phonemes = text_to_phonemes("Rust로 Kokoro 테스트", "ko", None, false, false)
            .expect("korean phonemization should work")
            .join("");
        assert!(
            phonemes.contains("ɹˈʌst"),
            "unexpected Rust output: {phonemes}"
        );
        assert!(
            phonemes.contains("kəkˈɔːɹoʊ"),
            "unexpected Kokoro output: {phonemes}"
        );
    }

    #[test]
    fn test_hindi_word_list_and_ascii_segments() {
        let phonemes = text_to_phonemes("नमस्ते Rust पाठ-से-भाषण", "hi", None, false, false)
            .expect("hindi phonemization should work")
            .join("");
        assert!(
            phonemes.contains("nəmˈʌsteː"),
            "unexpected नमस्ते output: {phonemes}"
        );
        assert!(
            phonemes.contains("ɹˈʌst"),
            "unexpected Rust output: {phonemes}"
        );
        assert!(
            phonemes.contains("pˈaːʈʰseːbʰˈaːʂəɳ"),
            "unexpected phrase output: {phonemes}"
        );
    }

    #[test]
    fn test_chinese_third_tone_sandhi() {
        let phonemes = text_to_phonemes("你好", "zh", None, false, false)
            .expect("chinese phonemization should work")
            .join("");
        assert!(
            phonemes.contains("ni↗"),
            "expected third-tone sandhi on 你, got {phonemes}"
        );
        assert!(
            phonemes.contains("χau↓"),
            "expected third tone on 好, got {phonemes}"
        );
    }

    #[test]
    fn test_chinese_phrase_overrides_polyphones() {
        let phonemes = text_to_phonemes("重庆银行", "zh", None, false, false)
            .expect("chinese phrase overrides should work")
            .join("");
        assert!(
            phonemes.contains("ʧʰʊŋ↗ ʨʰiŋ↘"),
            "expected chongqing override, got {phonemes}"
        );
        assert!(
            phonemes.contains("in↗ χɑŋ↗"),
            "expected yinhang override, got {phonemes}"
        );
    }

    #[test]
    fn test_chinese_ascii_loanwords_are_phonemized() {
        let phonemes = text_to_phonemes("AI语音TTS", "zh", None, false, false)
            .expect("mixed Chinese ASCII phonemization should work")
            .join("");
        assert!(
            !phonemes.contains("AI"),
            "expected ASCII AI to be phonemized, got {phonemes}"
        );
        assert!(
            !phonemes.contains("TTS"),
            "expected ASCII TTS to be phonemized, got {phonemes}"
        );
    }
}