use std::collections::HashMap;
use std::sync::Arc;
use crate::embedded_data::materialized_data_dir;
use crate::error::Result;
use crate::lang_detect::StreamingLanguageDetector;
use crate::phoneme::PhonemeData;
use crate::{
G2pToken, Language, SentenceUnit, StreamingSentencePhonemeUpgrade,
StreamingSentencePhonemeUpgradeSession, TextExpand, TextUnit, WordPhonemizer,
};
/// Mutable per-stream state used by [`StreamingG2P`].
///
/// A session is created by `StreamingG2P::new_session` and then passed by
/// mutable reference to `StreamingG2P::push_text` and `StreamingG2P::finish`.
pub struct StreamingG2pSession {
/// Receives the input one character at a time and yields `(unit, language)` pairs.
expander: TextExpand,
/// Sentence-level upgrade session that receives sentence units and yields output tokens.
session: StreamingSentencePhonemeUpgradeSession,
}
/// Streaming grapheme-to-phoneme converter holding the per-language phonemizers.
///
/// All streaming methods take `&self`; the mutable state lives in a
/// [`StreamingG2pSession`] that is passed in by the caller.
pub struct StreamingG2P {
/// Word phonemizers keyed by language.
phonemizers: HashMap<Language, WordPhonemizer>,
/// Sentence-level upgrade stage, constructed for the default language; used to
/// create the per-session upgrade state.
stream_sentence_upgrade: StreamingSentencePhonemeUpgrade,
/// Language used when a unit's language has no dedicated phonemizer.
default_language: Language,
/// Candidate languages; their count decides whether sessions use a language detector.
languages: Vec<Language>,
}
impl StreamingG2P {
    /// Creates a converter for a single language, which also acts as the default.
    ///
    /// # Errors
    ///
    /// Returns any error produced by [`StreamingG2P::with_languages`].
    pub fn new(lang: Language) -> Result<Self> {
        Self::with_languages(&[lang], lang)
    }

    /// Creates a converter for `languages`, falling back to `default_language`
    /// for units whose language has no phonemizer.
    ///
    /// Phoneme data is loaded once from the directory returned by
    /// `materialized_data_dir` and shared between all phonemizers through an
    /// [`Arc`]. Duplicate entries in `languages` are ignored, and a phonemizer
    /// for `default_language` is always built, even when it is not listed in
    /// `languages`, so the fallback lookup can never come up empty.
    ///
    /// # Errors
    ///
    /// Returns an error if the data directory cannot be materialized, the
    /// phoneme data cannot be loaded, or any phonemizer or the sentence
    /// upgrade stage fails to construct.
    pub fn with_languages(languages: &[Language], default_language: Language) -> Result<Self> {
        use std::collections::hash_map::Entry;

        let data_dir = materialized_data_dir()?;
        let phdata = Arc::new(PhonemeData::load(data_dir)?);

        // `+ 1` leaves room for the default language when it is not listed.
        let mut phonemizers: HashMap<Language, WordPhonemizer> =
            HashMap::with_capacity(languages.len() + 1);
        let mut unique_languages: Vec<Language> = Vec::with_capacity(languages.len());

        for &lang in languages {
            // A vacant entry means this language has not been seen yet, so each
            // phonemizer is built exactly once and first-seen order is kept.
            if let Entry::Vacant(slot) = phonemizers.entry(lang) {
                slot.insert(WordPhonemizer::new_with_data(lang, Arc::clone(&phdata))?);
                unique_languages.push(lang);
            }
        }

        // `phonemizer_for` falls back to the default language, so its phonemizer
        // must exist regardless of what the caller listed.
        if let Entry::Vacant(slot) = phonemizers.entry(default_language) {
            slot.insert(WordPhonemizer::new_with_data(
                default_language,
                Arc::clone(&phdata),
            )?);
        }

        Ok(Self {
            phonemizers,
            stream_sentence_upgrade: StreamingSentencePhonemeUpgrade::new(default_language)?,
            default_language,
            languages: unique_languages,
        })
    }

    /// Creates fresh per-stream state for use with [`StreamingG2P::push_text`]
    /// and [`StreamingG2P::finish`].
    ///
    /// With at most one distinct configured language the expander is fixed to
    /// the default language; otherwise it is given a streaming language
    /// detector over the configured languages.
    pub fn new_session(&self) -> StreamingG2pSession {
        // NOTE(review): with zero or one candidate there is nothing to detect, so
        // the detector is skipped; confirm nothing relies on a detector being
        // built for an empty language list.
        let expander = if self.languages.len() <= 1 {
            TextExpand::with_language(self.default_language)
        } else {
            let detector =
                StreamingLanguageDetector::with_lingua(&self.languages, self.default_language);
            TextExpand::with_detector(&self.languages, self.default_language, detector)
        };

        StreamingG2pSession {
            session: self.stream_sentence_upgrade.new_session(),
            expander,
        }
    }

    /// Feeds `text` into `session` character by character and returns the
    /// tokens that became available.
    ///
    /// An empty vector is returned when no unit was completed by this chunk.
    ///
    /// # Errors
    ///
    /// Returns the first error raised while converting a unit into a sentence
    /// unit. Tokens already collected during this call are dropped in that
    /// case, while the characters consumed so far remain in `session`.
    pub fn push_text(
        &self,
        session: &mut StreamingG2pSession,
        text: &str,
    ) -> Result<Vec<G2pToken>> {
        let mut outs = Vec::new();
        for ch in text.chars() {
            if let Some((unit, lang)) = session.expander.push(ch) {
                let text_unit = TextUnit::from_expand_unit(unit, lang);
                let su = SentenceUnit::from_text_unit(text_unit, self.phonemizer_for(lang))?;
                outs.extend(session.session.push(su));
            }
        }
        Ok(outs)
    }

    /// Drains everything still buffered in `session` and returns the remaining
    /// tokens.
    ///
    /// The expander is drained first, unit by unit, and then the sentence
    /// upgrade session is finished.
    ///
    /// # Errors
    ///
    /// Returns the first error raised while converting a drained unit into a
    /// sentence unit; tokens collected during this call are dropped in that
    /// case.
    pub fn finish(&self, session: &mut StreamingG2pSession) -> Result<Vec<G2pToken>> {
        let mut outs = Vec::new();
        while let Some((unit, lang)) = session.expander.finish() {
            let text_unit = TextUnit::from_expand_unit(unit, lang);
            let su = SentenceUnit::from_text_unit(text_unit, self.phonemizer_for(lang))?;
            outs.extend(session.session.push(su));
        }
        outs.extend(session.session.finish());
        Ok(outs)
    }

    /// Returns the phonemizer for `lang`, or the default language's phonemizer
    /// when `lang` has none.
    fn phonemizer_for(&self, lang: Language) -> &WordPhonemizer {
        self.phonemizers
            .get(&lang)
            .or_else(|| self.phonemizers.get(&self.default_language))
            // Invariant: `with_languages` always inserts the default language.
            .expect("at least the default language phonemizer must be present")
    }
}