use boostr::Runtime;
use boostr::format::load_voice_pt;
use boostr::model::audio::kokoro::{KokoroPhonemeVocab, load_kokoro_v2, select_voice_style};
use boostr::model::audio::wav_encode::{encode_wav_f32, encode_wav_pcm16};
use boostr::runtime::cpu::{CpuDevice, CpuRuntime};
use boostr::tensor::Tensor;
/// Smoke-test driver: synthesizes a fixed phoneme sequence with a Kokoro
/// checkpoint on the CPU runtime and writes the result as two WAV files.
///
/// Usage: `synth_kokoro MODEL_DIR VOICE_PATH`
///
/// * `MODEL_DIR` is handed to `load_kokoro_v2`; `config.json` inside it is read
///   for the phoneme vocabulary, and both output files (`synth_output.wav`,
///   `synth_output_f32.wav`) are written into it.
/// * `VOICE_PATH` is the voice pack file handed to `load_voice_pt`.
///
/// # Errors
///
/// Returns an error when an argument is missing, when any loading, encoding,
/// or synthesis step fails, when synthesis yields zero samples, or when an
/// output file cannot be written.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    const USAGE: &str = "usage: synth_kokoro MODEL_DIR VOICE_PATH";

    // Walk the argument list once; skip(1) drops the program name.
    let mut args = std::env::args().skip(1);
    let model_dir = args.next().ok_or(USAGE)?;
    let voice_path = args.next().ok_or(USAGE)?;

    let device = CpuDevice::new();
    let client = CpuRuntime::default_client(&device);

    println!("loading checkpoint ...");
    let model = load_kokoro_v2::<CpuRuntime, _>(&client, &model_dir, &device)?;

    let vocab_path = std::path::Path::new(&model_dir).join("config.json");
    let vocab = KokoroPhonemeVocab::from_json_file(&vocab_path)?;
    println!("vocab: {} symbols", vocab.len());

    // Fixed test utterance, one phoneme symbol per entry.
    let phonemes: Vec<String> = ["h", "ə", "l", "o", "ʊ", " ", "w", "ɜ", "ɹ", "l", "d"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    let ids = vocab.encode_strict(&phonemes)?;
    println!("phonemes {} → ids {:?}", phonemes.len(), ids);

    // The token tensor is built from i64 values with shape [1, token_count].
    let token_ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
    let token_ids =
        Tensor::<CpuRuntime>::from_slice(&token_ids_i64, &[1, token_ids_i64.len()], &device);

    println!("loading voice pack from {voice_path} ...");
    let voice_pack = load_voice_pt::<CpuRuntime>(&voice_path, &device)?;
    println!("voice pack shape: {:?}", voice_pack.shape());
    // NOTE(review): presumably selects the style row matching the token count;
    // confirm against `select_voice_style`.
    let voice_row = select_voice_style(&voice_pack, ids.len())?;
    println!("voice row shape: {:?}", voice_row.shape());

    println!("synthesizing ...");
    // Time only the synthesis call itself.
    let start = std::time::Instant::now();
    // NOTE(review): the meaning of the trailing `1` is not visible here;
    // check the `synthesize_cpu` signature.
    let waveform = model.synthesize_cpu(&client, &token_ids, &voice_row, 1)?;
    let elapsed = start.elapsed();

    let samples: Vec<f32> = waveform.contiguous()?.to_vec();
    // An empty waveform would otherwise report "ok" with NaN statistics and
    // write two empty WAV files; treat it as a failed run instead.
    if samples.is_empty() {
        return Err("synthesis produced an empty waveform".into());
    }

    let sample_rate = model.config.sample_rate;
    let audio_secs = samples.len() as f32 / sample_rate as f32;
    let synth_secs = elapsed.as_secs_f32();
    println!(
        "ok — {} samples @ {} Hz = {:.2}s audio, synth took {:.2}s ({:.2}x realtime)",
        samples.len(),
        sample_rate,
        audio_secs,
        synth_secs,
        audio_secs / synth_secs,
    );

    let min = samples.iter().copied().fold(f32::INFINITY, f32::min);
    let max = samples.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    // Accumulate the squares in f64 so long waveforms do not lose precision.
    let sum_sq: f64 = samples.iter().map(|&x| f64::from(x) * f64::from(x)).sum();
    let rms = (sum_sq / samples.len() as f64).sqrt();
    println!("amplitude: range=[{min:.4}, {max:.4}], rms={rms:.4}");

    // Both encodings are written next to the checkpoint, inside MODEL_DIR.
    let out_dir = std::path::Path::new(&model_dir);

    let pcm_path = out_dir.join("synth_output.wav");
    std::fs::write(&pcm_path, encode_wav_pcm16(&samples, sample_rate))?;
    println!("wrote {}", pcm_path.display());

    let f32_path = out_dir.join("synth_output_f32.wav");
    std::fs::write(&f32_path, encode_wav_f32(&samples, sample_rate))?;
    println!("wrote {}", f32_path.display());

    Ok(())
}