use alloc::{string::ToString, vec::Vec};
use serde::{Deserialize, Serialize};
use tracing::{trace, warn};
use svara::phoneme::Phoneme;
use svara::sequence::PhonemeEvent;
use crate::dictionary::PronunciationDict;
use crate::error::{Result, ShabdaError};
use crate::normalize;
use crate::prosody;
use crate::rules;
/// Language a [`G2PEngine`] converts text for.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Language {
/// English; `G2PEngine::new` pairs it with `PronunciationDict::english()`
/// and the engine falls back to `rules::english_rules` on dictionary misses.
English,
}
/// Guesses the language of `text` from the scripts its characters belong to.
///
/// Every candidate script code is resolved through `varna::script::by_code`
/// and scored by the number of characters of `text` it contains. The
/// candidate with the highest score wins; on a tie the candidate listed
/// first is kept. The only candidate at present is `"Latn"`, which maps to
/// [`Language::English`].
///
/// Returns `None` when `text` is empty or whitespace-only, and when no
/// character belongs to any candidate script (which includes the case where
/// a script code cannot be resolved).
///
/// Only compiled when the `varna` feature is enabled.
#[cfg(feature = "varna")]
#[must_use]
pub fn detect_language(text: &str) -> Option<Language> {
    if text.trim().is_empty() {
        return None;
    }

    let candidates = [("Latn", Language::English)];

    // Current winner as (language, number of matching characters).
    let mut leader: Option<(Language, usize)> = None;
    for (code, language) in &candidates {
        let script = match varna::script::by_code(code) {
            Some(script) => script,
            None => continue,
        };
        let hits = text
            .chars()
            .filter(|ch| script.contains_codepoint(u32::from(*ch)))
            .count();
        // A strictly-greater comparison keeps the earliest candidate on ties;
        // the zero baseline rejects scripts that match no character at all.
        let to_beat = leader.map_or(0, |(_, top)| top);
        if hits > to_beat {
            leader = Some((*language, hits));
        }
    }

    leader.map(|(language, _)| language)
}
/// Options for a single call to `G2PEngine::convert_with` / `speak_with`.
///
/// Every field is marked `#[serde(default)]`, so a field that is missing from
/// serialized input takes its `Default` value (`false` / `None`).
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ConvertOptions {
/// When `true`, `convert_with` normalizes the input with
/// `normalize::normalize_with_emphasis` instead of `normalize::normalize`.
#[serde(default)]
pub emphasis: bool,
/// Speaking rate (the name indicates words per minute). When set,
/// `convert_with` passes it to `prosody::apply_rate` over the finished events.
#[serde(default)]
pub speaking_rate_wpm: Option<f32>,
/// Timing profile. When set, `convert_with` passes it to
/// `prosody::apply_timing` after any rate adjustment.
#[serde(default)]
pub timing: Option<TimingProfile>,
}
/// Three scale factors handed to `prosody::apply_timing` through
/// `ConvertOptions::timing`. The `Default` value sets all of them to `1.0`.
///
/// NOTE(review): how each factor is applied lives in `prosody::apply_timing`,
/// which is not visible here; the field notes below are inferred from the
/// names and should be confirmed against that function.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct TimingProfile {
/// Presumably the multiplier for vowel durations — TODO confirm.
pub vowel_scale: f32,
/// Presumably the multiplier for consonant durations — TODO confirm.
pub consonant_scale: f32,
/// Presumably the multiplier for silence/pause durations — TODO confirm.
pub pause_scale: f32,
}
impl TimingProfile {
/// Builds a profile from the three scale factors.
///
/// The values are stored exactly as given; nothing is validated or clamped
/// here.
#[must_use]
pub fn new(vowel_scale: f32, consonant_scale: f32, pause_scale: f32) -> Self {
Self {
vowel_scale,
consonant_scale,
pause_scale,
}
}
}
impl Default for TimingProfile {
    /// Returns the profile in which every scale factor is `1.0`.
    fn default() -> Self {
        // One shared value so the three factors cannot drift apart.
        const NEUTRAL: f32 = 1.0;
        Self {
            vowel_scale: NEUTRAL,
            consonant_scale: NEUTRAL,
            pause_scale: NEUTRAL,
        }
    }
}
impl ConvertOptions {
    /// Creates options with every field at its default: no emphasis
    /// handling, no speaking rate and no timing profile.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns these options with `emphasis` replaced.
    #[must_use]
    pub fn with_emphasis(self, emphasis: bool) -> Self {
        Self { emphasis, ..self }
    }

    /// Returns these options with the speaking rate set to `wpm`.
    ///
    /// The value is stored as given; it is not validated here.
    #[must_use]
    pub fn with_speaking_rate(self, wpm: f32) -> Self {
        Self {
            speaking_rate_wpm: Some(wpm),
            ..self
        }
    }

    /// Returns these options with the timing profile set to `timing`.
    #[must_use]
    pub fn with_timing(self, timing: TimingProfile) -> Self {
        Self {
            timing: Some(timing),
            ..self
        }
    }
}
/// Grapheme-to-phoneme engine: a target [`Language`] together with the
/// pronunciation dictionary that is consulted before the rule-based fallback.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct G2PEngine {
/// Language that selects the letter-to-sound rules used on dictionary misses.
language: Language,
/// Word-to-phoneme dictionary; reachable through `dictionary()` and
/// `dictionary_mut()`.
dictionary: PronunciationDict,
}
impl G2PEngine {
/// Creates an engine for `language`, loading that language's built-in
/// pronunciation dictionary.
#[must_use]
pub fn new(language: Language) -> Self {
    Self {
        language,
        // Exhaustive on purpose: adding a `Language` variant must force a
        // decision about which dictionary it starts with.
        dictionary: match language {
            Language::English => PronunciationDict::english(),
        },
    }
}
/// Returns the language this engine was created for.
#[must_use]
pub fn language(&self) -> Language {
self.language
}
/// Returns a shared reference to the engine's pronunciation dictionary.
#[must_use]
pub fn dictionary(&self) -> &PronunciationDict {
&self.dictionary
}
/// Returns a mutable reference to the engine's pronunciation dictionary, so
/// callers can modify it in place; later conversions consult the modified
/// dictionary.
pub fn dictionary_mut(&mut self) -> &mut PronunciationDict {
&mut self.dictionary
}
/// Converts `text` to phoneme events using default [`ConvertOptions`].
///
/// # Errors
///
/// Propagates any error from `convert_with`, which rejects empty or
/// whitespace-only text with `ShabdaError::InvalidInput`.
pub fn convert(&self, text: &str) -> Result<Vec<PhonemeEvent>> {
    let defaults = ConvertOptions::default();
    self.convert_with(text, &defaults)
}
pub fn convert_with(&self, text: &str, options: &ConvertOptions) -> Result<Vec<PhonemeEvent>> {
if text.trim().is_empty() {
return Err(ShabdaError::InvalidInput("empty text".to_string()));
}
#[cfg(feature = "varna")]
let varna_inventory = crate::validate::inventory_for(self.language);
let intonation = normalize::detect_intonation(text);
let normalized = if options.emphasis {
normalize::normalize_with_emphasis(text)
} else {
normalize::normalize(text)
};
trace!(
input = text,
normalized = normalized.as_str(),
?intonation,
emphasis = options.emphasis,
rate = ?options.speaking_rate_wpm,
"converting text to phonemes"
);
let words: Vec<&str> = normalized.split_whitespace().collect();
let mut events = Vec::new();
let mut emphasis_active = false;
for (i, word) in words.iter().enumerate() {
if *word == normalize::EMPHASIS_START {
emphasis_active = true;
continue;
}
if *word == normalize::EMPHASIS_END {
emphasis_active = false;
continue;
}
if *word == normalize::COMMA_PAUSE {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.15,
svara::prosody::Stress::Unstressed,
));
continue;
}
if *word == normalize::PERIOD_PAUSE {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.30,
svara::prosody::Stress::Unstressed,
));
continue;
}
let preceding: Vec<&str> = words[..i]
.iter()
.rev()
.filter(|w| {
**w != normalize::COMMA_PAUSE
&& **w != normalize::PERIOD_PAUSE
&& **w != normalize::EMPHASIS_START
&& **w != normalize::EMPHASIS_END
})
.take(3)
.copied()
.collect();
let phonemes: Vec<Phoneme> = if let Some(rule) = crate::heteronym::lookup(word) {
if let Some(prons) = self.dictionary.lookup_all(word) {
trace!(word, variant_count = prons.len(), "heteronym lookup");
crate::heteronym::select_phonemes(rule, &preceding, prons).to_vec()
} else if let Some(dict_entry) = self.dictionary.lookup(word) {
dict_entry.to_vec()
} else {
match self.language {
Language::English => rules::english_rules(word),
}
}
} else if let Some(dict_entry) = self.dictionary.lookup(word) {
trace!(word, phoneme_count = dict_entry.len(), "dictionary hit");
dict_entry.to_vec()
} else if normalize::is_foreign_word(word) {
trace!(word, "foreign word detected, stripping diacritics");
let stripped = normalize::strip_diacritics(word);
if let Some(dict_entry) = self.dictionary.lookup(&stripped) {
dict_entry.to_vec()
} else {
match self.language {
Language::English => rules::english_rules(&stripped),
}
}
} else {
trace!(word, "dictionary miss, falling back to rules");
match self.language {
Language::English => rules::english_rules(word),
}
};
#[cfg(feature = "varna")]
{
let invalid = crate::validate::validate_phonemes(&phonemes, &varna_inventory);
debug_assert!(
invalid.is_empty(),
"word {word:?} produced phonemes not in varna inventory: {invalid:?}"
);
}
if phonemes.is_empty() {
warn!(word, "no phonemes produced, skipping word");
continue;
}
let is_content = prosody::is_content_word(word);
let syllables = crate::syllable::syllabify(&phonemes);
let mut word_events = if syllables.is_empty() {
trace!(word, "no syllables (consonant-only), using simple stress");
prosody::assign_stress(&phonemes, is_content)
} else {
trace!(
word,
syllable_count = syllables.len(),
is_content,
"syllabified"
);
prosody::assign_stress_syllabic(&syllables, is_content)
};
if emphasis_active {
prosody::apply_emphasis(&mut word_events);
}
events.extend(word_events);
if i < words.len() - 1 {
events.push(PhonemeEvent::new(
Phoneme::Silence,
0.04,
svara::prosody::Stress::Unstressed,
));
}
}
if let Some(wpm) = options.speaking_rate_wpm {
prosody::apply_rate(&mut events, wpm);
}
if let Some(ref timing) = options.timing {
prosody::apply_timing(&mut events, timing);
}
Ok(events)
}
/// Synthesizes `text` to audio samples with default [`ConvertOptions`].
///
/// # Errors
///
/// Propagates any error from `speak_with`.
pub fn speak(
    &self,
    text: &str,
    voice: &svara::voice::VoiceProfile,
    sample_rate: f32,
) -> Result<Vec<f32>> {
    let defaults = ConvertOptions::default();
    self.speak_with(text, voice, sample_rate, &defaults)
}
/// Converts `text` with `options`, then renders the resulting phoneme
/// sequence to audio samples with `voice` at `sample_rate`.
///
/// # Errors
///
/// Propagates conversion errors from `convert_with`. A rendering failure is
/// reported as `ShabdaError::RuleError` whose message wraps the underlying
/// error.
pub fn speak_with(
    &self,
    text: &str,
    voice: &svara::voice::VoiceProfile,
    sample_rate: f32,
    options: &ConvertOptions,
) -> Result<Vec<f32>> {
    let events = self.convert_with(text, options)?;

    let mut sequence = svara::sequence::PhonemeSequence::new();
    events.into_iter().for_each(|event| {
        sequence.push(event);
    });

    match sequence.render(voice, sample_rate) {
        Ok(samples) => Ok(samples),
        Err(e) => Err(ShabdaError::RuleError(alloc::format!(
            "audio synthesis failed: {e}"
        ))),
    }
}
/// Parses `ssml` and converts the resulting node tree to phoneme events,
/// starting from default [`ConvertOptions`].
///
/// # Errors
///
/// Returns `ShabdaError::InvalidInput` when `ssml` is empty or
/// whitespace-only, or when it fails to parse (the message wraps the parser
/// error). Errors from converting text nodes are propagated.
pub fn convert_ssml(&self, ssml: &str) -> Result<Vec<PhonemeEvent>> {
    if ssml.trim().is_empty() {
        return Err(ShabdaError::InvalidInput("empty SSML".to_string()));
    }

    let nodes = match crate::ssml::parse(ssml) {
        Ok(nodes) => nodes,
        Err(e) => {
            return Err(ShabdaError::InvalidInput(alloc::format!(
                "SSML parse error: {e}"
            )));
        }
    };

    let root_options = ConvertOptions::default();
    let mut events = Vec::new();
    self.render_ssml_nodes(&nodes, &root_options, &mut events)?;
    Ok(events)
}
/// Appends the events for `nodes` to `events`, recursing into container
/// nodes with options derived from `base_options`.
///
/// * `Text` — converted with `convert_with`; whitespace-only text is skipped
///   (it would otherwise be rejected as empty input).
/// * `Break` — one silence of `duration_ms / 1000` seconds.
/// * `Emphasis` — children are rendered with `emphasis` enabled, then the
///   events they produced are post-processed: `Strong` runs
///   `prosody::apply_emphasis`, `Moderate` leaves them untouched, `Reduced`
///   marks every one of them unstressed.
/// * `Prosody` — children are rendered with the node's rate (when present)
///   as the speaking rate.
///
/// # Errors
///
/// Propagates the first error returned by `convert_with`.
fn render_ssml_nodes(
    &self,
    nodes: &[crate::ssml::SsmlNode],
    base_options: &ConvertOptions,
    events: &mut Vec<PhonemeEvent>,
) -> Result<()> {
    use crate::ssml::{EmphasisLevel, SsmlNode};
    use svara::prosody::Stress;

    for node in nodes {
        match node {
            SsmlNode::Text(text) => {
                if text.trim().is_empty() {
                    continue;
                }
                events.extend(self.convert_with(text, base_options)?);
            }
            SsmlNode::Break { duration_ms } => {
                let seconds = *duration_ms as f32 / 1000.0;
                events.push(PhonemeEvent::new(
                    Phoneme::Silence,
                    seconds,
                    Stress::Unstressed,
                ));
            }
            SsmlNode::Emphasis { level, children } => {
                let mut child_options = base_options.clone();
                child_options.emphasis = true;

                // Remember where the children's output starts so that only
                // their events are post-processed below.
                let first_child_event = events.len();
                self.render_ssml_nodes(children, &child_options, events)?;
                let span = &mut events[first_child_event..];

                match level {
                    EmphasisLevel::Strong => {
                        prosody::apply_emphasis(span);
                    }
                    EmphasisLevel::Moderate => {}
                    EmphasisLevel::Reduced => {
                        span.iter_mut()
                            .for_each(|event| event.stress = Stress::Unstressed);
                    }
                }
            }
            SsmlNode::Prosody { rate, children } => match rate {
                Some(r) => {
                    let mut child_options = base_options.clone();
                    child_options.speaking_rate_wpm = Some(r.wpm());
                    self.render_ssml_nodes(children, &child_options, events)?;
                }
                // No rate override: the children inherit the options as-is.
                None => self.render_ssml_nodes(children, base_options, events)?,
            },
        }
    }
    Ok(())
}
pub fn convert_streaming<F>(&self, text: &str, mut callback: F) -> Result<()>
where
F: FnMut(&str, &[PhonemeEvent]),
{
if text.trim().is_empty() {
return Err(ShabdaError::InvalidInput("empty text".to_string()));
}
let normalized = normalize::normalize(text);
let words: Vec<&str> = normalized.split_whitespace().collect();
for word in &words {
if *word == normalize::COMMA_PAUSE {
let events = [PhonemeEvent::new(
Phoneme::Silence,
0.15,
svara::prosody::Stress::Unstressed,
)];
callback(word, &events);
continue;
}
if *word == normalize::PERIOD_PAUSE {
let events = [PhonemeEvent::new(
Phoneme::Silence,
0.30,
svara::prosody::Stress::Unstressed,
)];
callback(word, &events);
continue;
}
let phonemes: Vec<Phoneme> = if let Some(dict_entry) = self.dictionary.lookup(word) {
dict_entry.to_vec()
} else {
match self.language {
Language::English => rules::english_rules(word),
}
};
if phonemes.is_empty() {
continue;
}
let is_content = prosody::is_content_word(word);
let syllables = crate::syllable::syllabify(&phonemes);
let word_events = if syllables.is_empty() {
prosody::assign_stress(&phonemes, is_content)
} else {
prosody::assign_stress_syllabic(&syllables, is_content)
};
callback(word, &word_events);
}
Ok(())
}
}