use thiserror::Error;
/// Errors produced by language-tag parsing and by the espeak-ng-backed phonemizer.
#[derive(Debug, Error)]
pub enum G2pError {
/// The crate was compiled without the `tts-g2p` feature, so no backend is available.
#[error("tts-g2p feature is disabled — rebuild with `--features tts-g2p`")]
FeatureDisabled,
/// `Lang::parse` did not recognise the tag; carries the tag exactly as the caller passed it.
#[error("unsupported language: {0}")]
UnsupportedLanguage(String),
/// A call into espeak-ng failed; carries the debug-formatted backend error.
#[error("espeak-ng error: {0}")]
Backend(String),
}
/// Languages the phonemizer can be asked for.
///
/// Each variant maps to exactly one espeak-ng voice name via `Lang::espeak_voice`;
/// several region tags may collapse onto the same variant in `Lang::parse`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Lang {
/// American English; a bare `en` tag also parses to this variant.
EnUs,
/// British English (`en-gb`; `en-uk` is accepted as an alias).
EnGb,
/// Spanish (`es`, `es-es`, `es-mx`).
Es,
/// French (`fr`, `fr-fr`).
Fr,
/// Japanese (`ja`, `ja-jp`).
Ja,
/// Hindi (`hi`, `hi-in`).
Hi,
/// Italian (`it`, `it-it`).
It,
/// Portuguese (`pt`, `pt-br`, `pt-pt`).
Pt,
/// Chinese (`zh`, `zh-cn`, `zh-tw`); the espeak voice used is `cmn`.
Zh,
/// Korean (`ko`, `ko-kr`).
Ko,
}
impl std::str::FromStr for Lang {
type Err = G2pError;
fn from_str(s: &str) -> Result<Self, G2pError> {
Self::parse(s)
}
}
impl Lang {
pub fn parse(s: &str) -> Result<Self, G2pError> {
let normalized = s.to_ascii_lowercase().replace('_', "-");
match normalized.as_str() {
"en" | "en-us" => Ok(Self::EnUs),
"en-gb" | "en-uk" => Ok(Self::EnGb),
"es" | "es-es" | "es-mx" => Ok(Self::Es),
"fr" | "fr-fr" => Ok(Self::Fr),
"ja" | "ja-jp" => Ok(Self::Ja),
"hi" | "hi-in" => Ok(Self::Hi),
"it" | "it-it" => Ok(Self::It),
"pt" | "pt-br" | "pt-pt" => Ok(Self::Pt),
"zh" | "zh-cn" | "zh-tw" => Ok(Self::Zh),
"ko" | "ko-kr" => Ok(Self::Ko),
_ => Err(G2pError::UnsupportedLanguage(s.to_string())),
}
}
pub fn espeak_voice(self) -> &'static str {
match self {
Self::EnUs => "en-us",
Self::EnGb => "en-gb",
Self::Es => "es",
Self::Fr => "fr",
Self::Ja => "ja",
Self::Hi => "hi",
Self::It => "it",
Self::Pt => "pt",
Self::Zh => "cmn", Self::Ko => "ko",
}
}
}
/// Text-to-phoneme front end bound to a single language.
#[derive(Debug)]
pub struct Phonemizer {
// Language whose espeak voice is selected when phonemizing.
lang: Lang,
// Unit placeholder that only exists when the `tts-g2p` feature is enabled.
#[cfg(feature = "tts-g2p")]
_espeak: (),
}
impl Phonemizer {
    /// Creates a phonemizer for `lang`.
    ///
    /// # Errors
    ///
    /// * [`G2pError::FeatureDisabled`] when built without the `tts-g2p` feature.
    /// * [`G2pError::Backend`] when `espeakng::initialise` reports an error.
    pub fn new(lang: Lang) -> Result<Self, G2pError> {
        #[cfg(feature = "tts-g2p")]
        {
            match espeakng::initialise(None) {
                Ok(_) => Ok(Self { lang, _espeak: () }),
                Err(err) => Err(G2pError::Backend(format!("{err:?}"))),
            }
        }
        #[cfg(not(feature = "tts-g2p"))]
        {
            // Nothing to build without a backend; consume `lang` to avoid an
            // unused-variable warning in this configuration.
            let _ = lang;
            Err(G2pError::FeatureDisabled)
        }
    }

    /// Returns the language this phonemizer was created for.
    pub fn lang(&self) -> Lang {
        self.lang
    }

    /// Converts `text` to IPA with espeak-ng and splits it with `tokenize_ipa`.
    ///
    /// # Errors
    ///
    /// * [`G2pError::FeatureDisabled`] when built without the `tts-g2p` feature.
    /// * [`G2pError::Backend`] when creating the speaker, selecting the voice,
    ///   or synthesizing IPA fails; the message names the failing step.
    pub fn text_to_phonemes(&self, text: &str) -> Result<Vec<String>, G2pError> {
        #[cfg(feature = "tts-g2p")]
        {
            let mut speaker = match espeakng::Speaker::new() {
                Ok(created) => created,
                Err(err) => return Err(G2pError::Backend(format!("new speaker: {err:?}"))),
            };

            let voice = self.lang.espeak_voice();
            if let Err(err) = speaker.set_voice_by_name(voice) {
                return Err(G2pError::Backend(format!("set_voice: {err:?}")));
            }

            match speaker.synthesize_ipa(text) {
                Ok(ipa) => Ok(tokenize_ipa(&ipa)),
                Err(err) => Err(G2pError::Backend(format!("synthesize_ipa: {err:?}"))),
            }
        }
        #[cfg(not(feature = "tts-g2p"))]
        {
            let _ = text;
            Err(G2pError::FeatureDisabled)
        }
    }
}
/// Splits an IPA transcription into phoneme tokens.
///
/// Tokenization rules, applied character by character:
///
/// * A space ends the current token and emits one `<space>` token; runs of
///   spaces collapse to a single `<space>`, and a trailing `<space>` is removed.
/// * `_` ends the current token and is itself dropped.
/// * ASCII punctuation (`. , ? ! ; :`) ends the current token and becomes a
///   token of its own.
/// * A stress mark (`ˈ` or `ˌ`) starts a new token and is glued to the next
///   phoneme character.
/// * Characters accepted by `is_combining` (diacritics and the length mark `ː`)
///   are appended to the token in progress.
/// * Any other character starts a new token.
///
/// Returns the tokens in input order; an empty input yields an empty vector.
pub fn tokenize_ipa(ipa: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut current = String::new();
    for ch in ipa.chars() {
        match ch {
            ' ' => {
                flush(&mut current, &mut out);
                // Collapse consecutive word separators into one marker.
                if !matches!(out.last().map(String::as_str), Some("<space>")) {
                    out.push("<space>".to_string());
                }
            }
            '_' => {
                flush(&mut current, &mut out);
            }
            // Note: ASCII ':' is punctuation; the IPA length mark 'ː' is a
            // different character and is handled by `is_combining` below.
            '.' | ',' | '?' | '!' | ';' | ':' => {
                flush(&mut current, &mut out);
                out.push(ch.to_string());
            }
            'ˈ' | 'ˌ' => {
                flush(&mut current, &mut out);
                current.push(ch);
            }
            c if is_combining(c) => {
                current.push(c);
            }
            c => {
                // A pending stress mark is a prefix of this phoneme, so keep it
                // in the same token; anything else is a finished token.
                // (`flush` is a no-op when nothing is pending.)
                if !current_is_stress_prefix(&current) {
                    flush(&mut current, &mut out);
                }
                current.push(c);
            }
        }
    }
    flush(&mut current, &mut out);
    if matches!(out.last().map(String::as_str), Some("<space>")) {
        out.pop();
    }
    out
}
/// Moves the pending token (if any) from `current` onto `out`.
///
/// `current` is left empty afterwards; nothing is pushed when it was already
/// empty, so calling this repeatedly never produces empty tokens.
fn flush(current: &mut String, out: &mut Vec<String>) {
    if current.is_empty() {
        return;
    }
    let token = std::mem::take(current);
    out.push(token);
}
/// Returns `true` when `s` consists of exactly one stress mark (`ˈ` primary or
/// `ˌ` secondary) and nothing else.
fn current_is_stress_prefix(s: &str) -> bool {
    matches!(s, "ˈ" | "ˌ")
}
/// Returns `true` for characters that modify the preceding phoneme instead of
/// starting a new one.
///
/// Accepted characters are the Unicode Combining Diacritical Marks block
/// (U+0300..=U+036F) and the IPA length mark `ː` (U+02D0).
fn is_combining(c: char) -> bool {
    match c {
        // Length mark: a spacing character, so it is outside the combining block.
        '\u{02D0}' => true,
        // Nasalisation tilde and syllabic mark, spelled out for readers even
        // though both already fall inside the block checked below.
        '\u{0303}' | '\u{0329}' => true,
        other => ('\u{0300}'..='\u{036F}').contains(&other),
    }
}
/// Looks up every token in `vocab`, preserving order.
///
/// The result has one entry per input token: `Some(id)` when the vocabulary
/// knows the token, `None` when it does not. Unknown tokens are reported
/// rather than dropped so callers can decide how to handle them.
///
/// `V` may be unsized, so a `&dyn PhonemeVocab` can be passed as well as a
/// reference to a concrete vocabulary type.
pub fn phonemes_to_ids<V>(tokens: &[String], vocab: &V) -> Vec<Option<u32>>
where
    V: PhonemeVocab + ?Sized,
{
    tokens.iter().map(|token| vocab.lookup(token)).collect()
}
/// A vocabulary that maps phoneme tokens to integer ids.
pub trait PhonemeVocab {
/// Returns the id for `phoneme`, or `None` when the vocabulary has no entry for it.
fn lookup(&self, phoneme: &str) -> Option<u32>;
}
/// Any string-keyed `HashMap` of ids is a vocabulary, regardless of which
/// hasher it was built with (the default `RandomState` included).
impl<S> PhonemeVocab for std::collections::HashMap<String, u32, S>
where
    S: std::hash::BuildHasher,
{
    /// Returns a copy of the id stored under `phoneme`, or `None` if the key
    /// is absent.
    fn lookup(&self, phoneme: &str) -> Option<u32> {
        self.get(phoneme).copied()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Marker token the tokenizer emits between words.
    const SPACE: &str = "<space>";

    #[test]
    fn lang_parses_common_forms() {
        // Mixed case and both separator styles must be accepted.
        let accepted = [
            ("en", Lang::EnUs),
            ("en-US", Lang::EnUs),
            ("en_GB", Lang::EnGb),
            ("zh-CN", Lang::Zh),
        ];
        for (tag, expected) in accepted.iter() {
            assert_eq!(Lang::parse(tag).unwrap(), *expected);
        }
        assert!(Lang::parse("xx").is_err());
    }

    #[test]
    fn tokenize_ipa_inserts_spaces_between_words() {
        let tokens = tokenize_ipa("hɛloʊ wɝld");
        assert!(tokens.iter().any(|t| t == SPACE));
        // No two word separators may ever be adjacent.
        let doubled = tokens
            .windows(2)
            .any(|pair| pair[0] == SPACE && pair[1] == SPACE);
        assert!(!doubled);
    }

    #[test]
    fn tokenize_ipa_keeps_punctuation() {
        let tokens = tokenize_ipa("hɛloʊ, wɝld.");
        for mark in [",", "."].iter() {
            assert!(tokens.iter().any(|t| t == mark));
        }
    }

    #[test]
    fn tokenize_ipa_attaches_stress_to_next_phoneme() {
        let tokens = tokenize_ipa("ˈhɛ");
        assert_eq!(tokens.first().map(String::as_str), Some("ˈh"));
    }

    #[test]
    fn tokenize_ipa_attaches_length_mark() {
        let tokens = tokenize_ipa("aːb");
        assert_eq!(tokens.get(0).map(String::as_str), Some("aː"));
        assert_eq!(tokens.get(1).map(String::as_str), Some("b"));
    }

    #[test]
    fn tokenize_ipa_drops_syllable_separator() {
        assert_eq!(tokenize_ipa("a_b"), ["a", "b"]);
    }

    #[test]
    fn tokenize_ipa_trims_trailing_space() {
        assert_eq!(tokenize_ipa("a "), ["a"]);
    }

    #[test]
    fn phonemes_to_ids_looks_up_vocab() {
        let mut vocab: HashMap<String, u32> = HashMap::new();
        vocab.insert("h".to_string(), 1);
        vocab.insert("ɛ".to_string(), 2);

        // "?" is deliberately absent from the vocabulary.
        let tokens: Vec<String> = ["h", "ɛ", "?"].iter().map(|t| t.to_string()).collect();
        let ids = phonemes_to_ids(&tokens, &vocab);
        assert_eq!(ids, vec![Some(1), Some(2), None]);
    }

    #[test]
    #[cfg(not(feature = "tts-g2p"))]
    fn phonemizer_errors_without_feature() {
        let outcome = Phonemizer::new(Lang::EnUs);
        assert!(matches!(outcome.unwrap_err(), G2pError::FeatureDisabled));
    }
}