use crate::error::{Error, Result};
use crate::model::audio::g2p::Phonemizer;
use crate::model::audio::kokoro::{
KokoroModelV2, KokoroPhonemeVocab, VoiceResolver, select_voice_style,
};
use numr::runtime::cpu::{CpuClient, CpuRuntime};
use numr::tensor::Tensor;
use std::sync::Arc;
/// CPU text-to-speech engine bundling a Kokoro model with the pieces
/// `synthesize` needs to turn text into a waveform.
pub struct KokoroEngine {
/// Model that performs synthesis (`synthesize_cpu`) and exposes the
/// configuration read by `sample_rate`.
pub model: KokoroModelV2<CpuRuntime>,
/// Phoneme vocabulary used to encode G2P output into token ids.
pub vocab: KokoroPhonemeVocab,
/// Loads a voice pack from the `voice_spec` string passed to `synthesize`.
pub resolver: VoiceResolver,
/// CPU client handed to the model's `synthesize_cpu` call.
pub client: Arc<CpuClient>,
/// Device on which the token-id tensor is built and the voice pack is loaded.
pub device: Arc<<CpuRuntime as numr::runtime::Runtime>::Device>,
/// Base value for the frame floor passed to `synthesize_cpu`; `synthesize`
/// divides it by the (clamped) speed and rounds, never going below 1.
pub min_frames_per_phoneme: u32,
}
impl KokoroEngine {
pub fn synthesize(&self, text: &str, voice_spec: &str, speed: f32) -> Result<Vec<f32>> {
if text.is_empty() {
return Err(Error::InvalidArgument {
arg: "text",
reason: "input text must not be empty".into(),
});
}
let phonemizer =
Phonemizer::new(self.model.config_lang()).map_err(|e| Error::DataError {
reason: format!("phonemizer init: {e}"),
})?;
let phonemes = phonemizer
.text_to_phonemes(text)
.map_err(|e| Error::DataError {
reason: format!("G2P: {e}"),
})?;
let ids: Vec<i64> = self
.vocab
.encode_skipping_unknown(&phonemes)
.into_iter()
.map(i64::from)
.collect();
if ids.is_empty() {
return Err(Error::DataError {
reason: "no phonemes mapped to vocab — check that G2P and vocab agree".into(),
});
}
let token_ids = Tensor::<CpuRuntime>::from_slice(&ids, &[1, ids.len()], &self.device);
let voice_pack = self.resolver.load::<CpuRuntime>(voice_spec, &self.device)?;
let voice_row = select_voice_style(&voice_pack, ids.len())?;
let floor = ((self.min_frames_per_phoneme as f32) / speed.max(0.1)).round() as u32;
let waveform_tensor =
self.model
.synthesize_cpu(&self.client, &token_ids, &voice_row, floor.max(1))?;
Ok(waveform_tensor.contiguous()?.to_vec())
}
pub fn sample_rate(&self) -> u32 {
self.model.config.sample_rate
}
}
impl KokoroModelV2<CpuRuntime> {
    /// Language used when building the phonemizer for this model.
    ///
    /// Currently a fixed value: every model reports `Lang::EnUs`, regardless
    /// of its configuration.
    fn config_lang(&self) -> crate::model::audio::g2p::Lang {
        use crate::model::audio::g2p::Lang;

        Lang::EnUs
    }
}
#[cfg(test)]
mod tests {
    use crate::model::audio::g2p::Lang;
    use crate::model::audio::kokoro::KokoroConfig;

    /// Smoke test: `Lang::EnUs` compares equal to itself and a default
    /// `KokoroConfig` can be constructed.
    ///
    /// NOTE(review): this never calls `config_lang`, so it does not verify the
    /// language the engine actually uses — consider building a model here and
    /// asserting on `config_lang()` if a cheap constructor exists.
    #[test]
    fn config_lang_default_is_en_us() {
        let expected = Lang::EnUs;
        let actual = Lang::EnUs;
        assert_eq!(expected, actual);

        let _cfg = KokoroConfig::default();
    }
}