use alloc::{string::ToString, vec::Vec};
use serde::{Deserialize, Serialize};
use tracing::{trace, warn};
use svara::phoneme::Phoneme;
use svara::sequence::PhonemeEvent;
use crate::dictionary::PronunciationDict;
use crate::error::{Result, ShabdaError};
use crate::normalize;
use crate::prosody;
use crate::rules;
/// Language selector for [`G2PEngine`].
///
/// The engine uses this value to pick both the pronunciation dictionary it is
/// constructed with and the rule set used when a word is not in that dictionary.
/// Marked `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Language {
/// English: backed by `PronunciationDict::english()` and `rules::english_rules`.
English,
}
/// Text-to-phoneme engine: turns input text into a sequence of `PhonemeEvent`s.
///
/// Words are looked up in `dictionary` first; words that are not found are
/// handed to the rule set matching `language`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct G2PEngine {
// Language this engine was created for; selects the fallback rule set.
language: Language,
// Pronunciation dictionary consulted before the rules; exposed to callers
// through `dictionary()` / `dictionary_mut()`.
dictionary: PronunciationDict,
}
impl G2PEngine {
#[must_use]
pub fn new(language: Language) -> Self {
let dictionary = match language {
Language::English => PronunciationDict::english(),
};
Self {
language,
dictionary,
}
}
#[must_use]
pub fn language(&self) -> Language {
self.language
}
#[must_use]
pub fn dictionary(&self) -> &PronunciationDict {
&self.dictionary
}
pub fn dictionary_mut(&mut self) -> &mut PronunciationDict {
&mut self.dictionary
}
pub fn convert(&self, text: &str) -> Result<Vec<PhonemeEvent>> {
if text.trim().is_empty() {
return Err(ShabdaError::InvalidInput("empty text".to_string()));
}
let intonation = normalize::detect_intonation(text);
let normalized = normalize::normalize(text);
trace!(
input = text,
normalized = normalized.as_str(),
?intonation,
"converting text to phonemes"
);
let words: Vec<&str> = normalized.split_whitespace().collect();
let mut events = Vec::new();
for (i, word) in words.iter().enumerate() {
if *word == normalize::COMMA_PAUSE {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.15,
svara::prosody::Stress::Unstressed,
));
continue;
}
if *word == normalize::PERIOD_PAUSE {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.30,
svara::prosody::Stress::Unstressed,
));
continue;
}
let phonemes: Vec<Phoneme> = if let Some(dict_entry) = self.dictionary.lookup(word) {
trace!(word, phoneme_count = dict_entry.len(), "dictionary hit");
dict_entry.to_vec()
} else {
trace!(word, "dictionary miss, falling back to rules");
match self.language {
Language::English => rules::english_rules(word),
}
};
if phonemes.is_empty() {
warn!(word, "no phonemes produced, skipping word");
continue;
}
let is_content = prosody::is_content_word(word);
let syllables = crate::syllable::syllabify(&phonemes);
let word_events = if syllables.is_empty() {
trace!(word, "no syllables (consonant-only), using simple stress");
prosody::assign_stress(&phonemes, is_content)
} else {
trace!(
word,
syllable_count = syllables.len(),
is_content,
"syllabified"
);
prosody::assign_stress_syllabic(&syllables, is_content)
};
events.extend(word_events);
if i < words.len() - 1 {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.04,
svara::prosody::Stress::Unstressed,
));
}
}
Ok(events)
}
pub fn speak(
&self,
text: &str,
voice: &svara::voice::VoiceProfile,
sample_rate: f32,
) -> Result<Vec<f32>> {
let events = self.convert(text)?;
let mut seq = svara::sequence::PhonemeSequence::new();
for event in events {
seq.push(event);
}
seq.render(voice, sample_rate)
.map_err(|e| ShabdaError::RuleError(alloc::format!("audio synthesis failed: {e}")))
}
}