any-tts 0.1.1

A Rust TTS library with Candle backends and runtime adapters for modern open TTS models
Documentation
//! Text-to-phoneme conversion for Kokoro-82M.
//!
//! Converts plain text to Kokoro-compatible IPA phoneme strings using an
//! in-tree pure-Rust `espeak-rs` compatibility layer.

use std::collections::HashMap;

use crate::error::{TtsError, TtsResult};

use super::espeak_compat::text_to_phonemes;

fn lang_to_espeak(lang: &str) -> &'static str {
    match lang {
        "en" | "en-us" | "a" => "en-us",
        "en-gb" | "b" => "en-gb",
        "ja" | "j" => "ja",
        "zh" | "z" => "cmn",
        "ko" | "k" => "ko",
        "fr" | "f" => "fr",
        "de" | "d" => "de",
        "it" | "i" => "it",
        "pt" | "p" => "pt",
        "es" | "e" => "es",
        "hi" | "h" => "hi",
        _ => "en-us",
    }
}

/// Infer the language from a Kokoro voice name's first letter.
///
/// Kokoro voice naming convention:
///   first letter = language (a=American, b=British, e=Spanish, f=French,
///   h=Hindi, i=Italian, j=Japanese, p=Portuguese, z=Chinese)
pub fn language_from_voice(voice: &str) -> &'static str {
    match voice.chars().next() {
        Some('a') => "en",
        Some('b') => "en-gb",
        Some('e') => "es",
        Some('f') => "fr",
        Some('h') => "hi",
        Some('i') => "it",
        Some('j') => "ja",
        Some('k') => "ko",
        Some('p') => "pt",
        Some('z') => "zh",
        _ => "en",
    }
}

fn apply_kokoro_replacements(phonemes: &str) -> String {
    let mut s = phonemes
        .replace('ʲ', "j")
        .replace('ɝ', "ɚ")
        .replace('g', "ɡ")
        .replace('x', "k")
        .replace("ɬ", "l");

    // Remove tie bars (espeak uses combining tie U+0361 for affricates)
    s = s.replace('\u{0361}', "");

    s
}

/// Filter a phoneme string to only characters present in the Kokoro vocab.
fn filter_to_vocab(phonemes: &str, vocab: &HashMap<String, u32>) -> String {
    phonemes
        .chars()
        .filter(|c| {
            let key = c.to_string();
            vocab.contains_key(&key)
        })
        .collect()
}

/// Convert plain text to Kokoro-compatible IPA phoneme string.
///
/// Pipeline:
/// 1. Use the pure-Rust `espeak-rs` compatibility layer for the requested language
/// 2. Apply Kokoro-specific cleanup
/// 3. Filter to only characters in the Kokoro vocab
pub fn phonemize(text: &str, language: &str, vocab: &HashMap<String, u32>) -> TtsResult<String> {
    let espeak_lang = lang_to_espeak(language);

    let raw_phonemes = text_to_phonemes(text, espeak_lang, None, true, false).map_err(|e| {
        TtsError::ModelError(format!(
            "pure-Rust phonemization failed for lang '{language}' (compat voice '{espeak_lang}'): {e}"
        ))
    })?;

    let joined = raw_phonemes.join("");
    let replaced = apply_kokoro_replacements(&joined);
    let filtered = filter_to_vocab(&replaced, vocab);

    if filtered.is_empty() {
        return Err(TtsError::TokenizerError(format!(
            "Phonemization produced no valid tokens for text: \"{text}\" (lang: {language})"
        )));
    }

    Ok(filtered)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn dummy_vocab() -> HashMap<String, u32> {
        // Build a minimal vocab containing common IPA chars
        let chars = "$;:,.!?¡¿—…\"«»\u{201c}\u{201d} \
            ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnoprstuvwxyz\
            ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰ\
            ŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢ\
            ǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘ᵻ";
        let mut vocab = HashMap::new();
        for (i, c) in chars.chars().enumerate() {
            vocab.insert(c.to_string(), i as u32);
        }
        vocab
    }

    #[test]
    fn test_language_from_voice() {
        assert_eq!(language_from_voice("af_heart"), "en");
        assert_eq!(language_from_voice("dm_speaker"), "en"); // unknown prefix
        assert_eq!(language_from_voice("jf_alpha"), "ja");
        assert_eq!(language_from_voice("ef_dora"), "es");
        assert_eq!(language_from_voice("ff_siwis"), "fr");
    }

    #[test]
    fn test_kokoro_replacements() {
        let input = "ʲrgxɬɝ";
        let output = apply_kokoro_replacements(input);
        assert_eq!(output, "jrɡklɚ");
    }

    #[test]
    fn test_phonemize_english() {
        let vocab = dummy_vocab();
        let result = phonemize("Hello world", "en", &vocab);
        assert!(result.is_ok(), "phonemize failed: {:?}", result.err());
        let ph = result.unwrap();
        assert!(!ph.is_empty(), "phonemes should not be empty");
        // Should contain IPA characters, not ASCII "Hello"
        assert!(
            ph.contains('ə') || ph.contains('ɛ') || ph.contains('ˈ') || ph.contains('l'),
            "Expected IPA phonemes, got: {ph}"
        );
    }

    #[test]
    fn test_phonemize_british_english_variant() {
        let vocab = dummy_vocab();
        let us = phonemize("schedule", "en", &vocab).expect("US phonemization should work");
        let gb = phonemize("schedule", "en-gb", &vocab).expect("British phonemization should work");

        assert!(!us.is_empty());
        assert!(!gb.is_empty());
        assert_ne!(us, gb, "expected dialect-specific phoneme output");
    }

    #[test]
    fn test_phonemize_multilingual_smoke() {
        let vocab = dummy_vocab();
        for (text, lang) in [
            ("Hola mundo", "es"),
            ("Bonjour le monde", "fr"),
            ("Guten Tag", "de"),
            ("Ciao mondo", "it"),
            ("Olá mundo", "pt"),
            ("こんにちは世界", "ja"),
            ("你好世界", "zh"),
            ("안녕하세요", "ko"),
            ("नमस्ते दुनिया", "hi"),
        ] {
            let result = phonemize(text, lang, &vocab);
            assert!(
                result.is_ok(),
                "phonemize failed for {lang}: {:?}",
                result.err()
            );
            assert!(
                !result.unwrap().is_empty(),
                "expected non-empty phonemes for {lang}"
            );
        }
    }

    #[test]
    fn test_filter_to_vocab() {
        let mut vocab = HashMap::new();
        vocab.insert("a".to_string(), 1);
        vocab.insert("b".to_string(), 2);
        assert_eq!(filter_to_vocab("abc", &vocab), "ab");
    }
}