use std::collections::HashMap;
use std::sync::Arc;
use crate::embedded_data::materialized_data_dir;
use crate::error::Result;
use crate::lang_detect::StreamingLanguageDetector;
use crate::phoneme::PhonemeData;
use crate::{
FullSentencePhonemeUpgrade, G2pToken, Language, SentenceUnit, TextExpand, TextUnit,
WordPhonemizer,
};
/// Grapheme-to-phoneme front end that owns one word phonemizer per configured
/// language plus a sentence-level upgrade stage producing the final tokens.
pub struct FullG2p {
/// Word phonemizers keyed by the language each one was built for.
phonemizers: HashMap<Language, WordPhonemizer>,
/// Sentence-level stage, built for the default language, that turns the
/// collected sentence units into the output tokens.
sentence_upgrade: FullSentencePhonemeUpgrade,
/// Language used for single-language expansion and as the fallback when a
/// unit's language has no phonemizer of its own.
default_language: Language,
/// Configured languages; when there is more than one, `g2p` hands them to
/// the language detector.
languages: Vec<Language>,
}
impl FullG2p {
    /// Creates a converter configured for a single `language`.
    ///
    /// This is shorthand for [`FullG2p::with_languages`] with `language` as
    /// both the only configured language and the default.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`FullG2p::with_languages`].
    pub fn new(language: Language) -> Result<Self> {
        Self::with_languages(&[language], language)
    }

    /// Creates a converter for `languages`, using `default_language` as the
    /// fallback.
    ///
    /// `default_language` is always part of the configured set: if it is
    /// missing from `languages` (including when `languages` is empty) it is
    /// appended, so a phonemizer for it is guaranteed to exist. Repeated
    /// entries in `languages` share a single phonemizer.
    ///
    /// # Errors
    ///
    /// Returns an error if the data directory cannot be materialized, the
    /// phoneme data cannot be loaded, a word phonemizer cannot be built for
    /// one of the languages, or the sentence upgrade stage cannot be built
    /// for `default_language`.
    pub fn with_languages(languages: &[Language], default_language: Language) -> Result<Self> {
        // `phonemizer_for` falls back to the default language and treats its
        // absence as a broken invariant, so enforce that invariant here
        // instead of letting `g2p` panic later.
        let mut languages = languages.to_vec();
        if !languages.contains(&default_language) {
            languages.push(default_language);
        }

        let data_dir = materialized_data_dir()?;
        // Loaded once and shared by every phonemizer through the `Arc`.
        let phdata = Arc::new(PhonemeData::load(data_dir)?);

        let mut phonemizers = HashMap::with_capacity(languages.len());
        for &lang in &languages {
            // Only build a phonemizer the first time a language is seen;
            // duplicates in the input would otherwise just overwrite it.
            if let std::collections::hash_map::Entry::Vacant(slot) = phonemizers.entry(lang) {
                slot.insert(WordPhonemizer::new_with_data(lang, Arc::clone(&phdata))?);
            }
        }

        Ok(Self {
            phonemizers,
            sentence_upgrade: FullSentencePhonemeUpgrade::new(default_language)?,
            default_language,
            languages,
        })
    }

    /// Converts `text` into a sequence of [`G2pToken`]s.
    ///
    /// The text is fed character by character into a [`TextExpand`]; every
    /// unit it emits is converted into a [`SentenceUnit`] using the
    /// phonemizer for the unit's language, and the collected units are then
    /// passed through the sentence upgrade stage.
    ///
    /// # Errors
    ///
    /// Propagates any error from converting an expanded unit into a
    /// [`SentenceUnit`].
    pub fn g2p(&self, text: &str) -> Result<Vec<G2pToken>> {
        let mut sentence_units = Vec::new();

        // With a single configured language there is nothing to detect, so
        // the expander is pinned to the default language.
        let mut expander = if self.languages.len() == 1 {
            TextExpand::with_language(self.default_language)
        } else {
            let detector =
                StreamingLanguageDetector::with_lingua(&self.languages, self.default_language);
            TextExpand::with_detector(&self.languages, self.default_language, detector)
        };

        // Units that become available while the text is streamed in.
        for ch in text.chars() {
            if let Some((unit, lang)) = expander.push(ch) {
                let text_unit = TextUnit::from_expand_unit(unit, lang);
                let phonemizer = self.phonemizer_for(lang);
                sentence_units.push(SentenceUnit::from_text_unit(text_unit, phonemizer)?);
            }
        }

        // Units still held by the expander once the input is exhausted;
        // `finish` is polled until it reports nothing more.
        while let Some((unit, lang)) = expander.finish() {
            let text_unit = TextUnit::from_expand_unit(unit, lang);
            let phonemizer = self.phonemizer_for(lang);
            sentence_units.push(SentenceUnit::from_text_unit(text_unit, phonemizer)?);
        }

        Ok(self.sentence_upgrade.upgrade(&sentence_units))
    }

    /// Returns the phonemizer for `lang`, falling back to the one for the
    /// default language when `lang` has none.
    fn phonemizer_for(&self, lang: Language) -> &WordPhonemizer {
        self.phonemizers
            .get(&lang)
            .or_else(|| self.phonemizers.get(&self.default_language))
            // `with_languages` always registers the default language, so a
            // miss here is a bug rather than a recoverable condition.
            .expect("at least the default language phonemizer must be present")
    }
}