use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
use super::{AudioError, TtsEngine};
/// Entries that must exist inside the Kokoro model directory for the model to
/// be reported as installed.
///
/// Each name is joined onto the model directory and checked for existence
/// only; `espeak-ng-data` is handed to the runtime as a data directory, the
/// rest as individual files.
const KOKORO_REQUIRED_FILES: &[&str] = &[
"model.onnx",
"voices.bin",
"tokens.txt",
"espeak-ng-data",
"lexicon-us-en.txt",
];
/// Entries that must exist inside the Piper model directory for the model to
/// be reported as installed (existence check only).
const PIPER_REQUIRED_FILES: &[&str] = &["en_US-lessac-medium.onnx", "tokens.txt", "espeak-ng-data"];
/// Kokoro voice names paired with the numeric speaker id that is passed to the
/// runtime as `--sid`.
///
/// `kokoro_speaker_id` accepts either the name (ASCII case-insensitive) or the
/// numeric id of any entry listed here.
// NOTE(review): the name prefixes look like language/gender codes from the
// upstream model card — confirm there before documenting them.
const KOKORO_SPEAKERS: &[(&str, u16)] = &[
("af_alloy", 0),
("af_aoede", 1),
("af_bella", 2),
("af_heart", 3),
("af_jessica", 4),
("af_kore", 5),
("af_nicole", 6),
("af_nova", 7),
("af_river", 8),
("af_sarah", 9),
("af_sky", 10),
("am_adam", 11),
("am_echo", 12),
("am_eric", 13),
("am_fenrir", 14),
("am_liam", 15),
("am_michael", 16),
("am_onyx", 17),
("am_puck", 18),
("am_santa", 19),
("bf_alice", 20),
("bf_emma", 21),
("bf_isabella", 22),
("bf_lily", 23),
("bm_daniel", 24),
("bm_fable", 25),
("bm_george", 26),
("bm_lewis", 27),
("ef_dora", 28),
("em_alex", 29),
("ff_siwis", 30),
("hf_alpha", 31),
("hf_beta", 32),
("hm_omega", 33),
("hm_psi", 34),
("if_sara", 35),
("im_nicola", 36),
("jf_alpha", 37),
("jf_gongitsune", 38),
("jf_nezumi", 39),
("jf_tebukuro", 40),
("jm_kumo", 41),
("pf_dora", 42),
("pm_alex", 43),
("pm_santa", 44),
("zf_xiaobei", 45),
("zf_xiaoni", 46),
("zf_xiaoxiao", 47),
("zf_xiaoyi", 48),
("zm_yunjian", 49),
("zm_yunxi", 50),
("zm_yunxia", 51),
("zm_yunyang", 52),
];
/// Static description of one downloadable TTS model.
///
/// Instances live in `TTS_MODEL_SPECS` and are looked up by engine.
#[derive(Debug, Clone, Copy)]
pub struct TtsModelSpec {
/// Engine this spec belongs to; the lookup key used by `spec_for_engine`.
pub engine: TtsEngine,
/// Human-readable model name, used in download/extraction error messages.
pub display_name: &'static str,
/// Name of the sub-directory under the model root that holds the files.
pub directory: &'static str,
/// URL of the model archive fetched by `download_model`.
pub archive_url: &'static str,
/// Link to upstream documentation for the model.
// NOTE(review): not read by the code in this file — presumably shown to
// users by callers; confirm.
pub docs_url: &'static str,
/// Name of the runtime executable associated with the model.
// NOTE(review): the code in this file resolves the executable through
// `sherpa_binary()` and never reads this field — confirm it is used elsewhere.
pub runtime_binary: &'static str,
/// Entries that must exist in the model directory for it to count as installed.
pub required_files: &'static [&'static str],
/// Short free-text description of the model.
pub description: &'static str,
}
/// Installation state of one model, as computed by `model_status_for_engine`.
#[derive(Debug, Clone)]
pub struct TtsModelStatus {
/// Spec the status was computed for.
pub spec: &'static TtsModelSpec,
/// Directory in the model cache that was inspected.
pub path: PathBuf,
/// `true` exactly when `missing_files` is empty.
pub installed: bool,
/// Required entries that do not exist under `path`, in spec order.
pub missing_files: Vec<String>,
}
/// Outcome of a successful `download_model` call.
#[derive(Debug, Clone)]
pub struct TtsModelDownload {
/// Spec of the model that was requested.
pub spec: &'static TtsModelSpec,
/// Directory in the model cache that holds the model files.
pub path: PathBuf,
/// `true` when the archive was fetched and extracted by this call; `false`
/// when an already complete installation was reused.
pub downloaded: bool,
}
/// Catalogue of the TTS models that use the model cache.
///
/// `spec_for_engine` returns the first entry whose `engine` matches, so each
/// engine should appear at most once. Engines without an entry here (the tests
/// check `TtsEngine::System`) have no downloadable model.
pub const TTS_MODEL_SPECS: &[TtsModelSpec] = &[
TtsModelSpec {
engine: TtsEngine::Kokoro,
display_name: "Kokoro 82M multilingual",
directory: "kokoro",
archive_url: "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2",
docs_url: "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html",
runtime_binary: "sherpa-onnx-offline-tts",
required_files: KOKORO_REQUIRED_FILES,
description: "Natural local neural TTS with multi-speaker voice IDs.",
},
TtsModelSpec {
engine: TtsEngine::Piper,
display_name: "Piper en_US lessac medium",
directory: "piper",
archive_url: "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2",
docs_url: "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html",
runtime_binary: "sherpa-onnx-offline-tts",
required_files: PIPER_REQUIRED_FILES,
description: "Lightweight local Piper voice packaged for sherpa-onnx.",
},
];
/// Returns the root directory of the TTS model cache,
/// `$HOME/.axterminator/models/tts`.
///
/// The directory is only computed here, not created.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when `HOME` is unset, is not valid
/// Unicode, or is empty. An empty `HOME` is rejected because joining onto it
/// would silently yield a path relative to the current working directory.
pub fn model_root() -> Result<PathBuf, AudioError> {
    let home = std::env::var("HOME")
        .ok()
        .filter(|value| !value.is_empty())
        .ok_or_else(|| {
            AudioError::Synthesis("Cannot determine $HOME for TTS model directory".to_string())
        })?;
    Ok(PathBuf::from(home)
        .join(".axterminator")
        .join("models")
        .join("tts"))
}
/// Looks up the model spec registered for `engine`.
///
/// Returns the first entry of [`TTS_MODEL_SPECS`] whose `engine` equals the
/// argument, or `None` when the table has no entry for it.
#[must_use]
pub fn spec_for_engine(engine: TtsEngine) -> Option<&'static TtsModelSpec> {
    for candidate in TTS_MODEL_SPECS {
        if candidate.engine == engine {
            return Some(candidate);
        }
    }
    None
}
/// Returns the cache directory that holds the model files for `engine`.
///
/// The path is the model root joined with the spec's `directory`; nothing is
/// created or checked on disk.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when `engine` has no model spec, or when
/// the model root cannot be determined.
pub fn model_dir(engine: TtsEngine) -> Result<PathBuf, AudioError> {
    let Some(spec) = spec_for_engine(engine) else {
        return Err(AudioError::Synthesis(format!(
            "{} does not use the enhanced TTS model cache",
            engine.as_str()
        )));
    };
    let root = model_root()?;
    Ok(root.join(spec.directory))
}
/// Computes the installation status of every model in [`TTS_MODEL_SPECS`],
/// in table order.
///
/// # Errors
///
/// Stops at, and returns, the first error produced while computing a status.
pub fn model_statuses() -> Result<Vec<TtsModelStatus>, AudioError> {
    let mut statuses = Vec::with_capacity(TTS_MODEL_SPECS.len());
    for spec in TTS_MODEL_SPECS {
        statuses.push(model_status_for_engine(spec.engine)?);
    }
    Ok(statuses)
}
/// Inspects the model cache and reports whether the model for `engine` is
/// installed.
///
/// A model counts as installed when every entry in the spec's
/// `required_files` exists inside its model directory.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when `engine` has no model spec, or when
/// the model directory cannot be determined.
pub fn model_status_for_engine(engine: TtsEngine) -> Result<TtsModelStatus, AudioError> {
    let Some(spec) = spec_for_engine(engine) else {
        return Err(AudioError::Synthesis(format!(
            "Unknown enhanced TTS engine: {}",
            engine.as_str()
        )));
    };

    let path = model_dir(engine)?;
    let missing_files = missing_model_files_in_dir(&path, spec);
    let installed = missing_files.is_empty();

    Ok(TtsModelStatus {
        spec,
        path,
        installed,
        missing_files,
    })
}
/// Returns `true` when every required file of the model for `engine` exists.
///
/// Any failure to compute the status (unknown engine, undeterminable model
/// directory) is reported as `false`.
pub fn model_files_present(engine: TtsEngine) -> bool {
    matches!(model_status_for_engine(engine), Ok(status) if status.installed)
}
/// Lists the required entries of the model for `engine` that are absent from
/// its model directory.
///
/// # Errors
///
/// Propagates any error from [`model_status_for_engine`].
pub fn missing_model_files(engine: TtsEngine) -> Result<Vec<String>, AudioError> {
    model_status_for_engine(engine).map(|status| status.missing_files)
}
/// Downloads and unpacks the model archive for `engine` into the model cache.
///
/// Steps, in order:
/// 1. If the model is already complete and `force` is `false`, return
///    immediately with `downloaded: false`.
/// 2. With `force`, delete the existing model directory.
/// 3. Fetch the archive with `curl` into `<root>/<directory>.tar.bz2.download`.
/// 4. Extract it with `tar --strip-components=1` into the model directory.
/// 5. Verify that every required file is now present.
///
/// The temporary archive is removed on every path after the download was
/// attempted, including when `curl` fails or cannot be started, so a broken
/// download does not leave a partial archive in the cache root. Removal is
/// best-effort and its errors are ignored.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when `engine` has no downloadable model,
/// the model root cannot be determined, a directory cannot be removed or
/// created, `curl` or `tar` cannot be started or exit unsuccessfully, or
/// required files are still missing after extraction.
pub fn download_model(engine: TtsEngine, force: bool) -> Result<TtsModelDownload, AudioError> {
    let spec = spec_for_engine(engine).ok_or_else(|| {
        AudioError::Synthesis(format!(
            "Engine {} has no downloadable TTS model",
            engine.as_str()
        ))
    })?;
    let root = model_root()?;
    let target_dir = root.join(spec.directory);

    if !force && target_dir.exists() && model_files_present(engine) {
        return Ok(TtsModelDownload {
            spec,
            path: target_dir,
            downloaded: false,
        });
    }

    if force && target_dir.exists() {
        std::fs::remove_dir_all(&target_dir).map_err(|e| {
            AudioError::Synthesis(format!("Failed to remove {}: {e}", target_dir.display()))
        })?;
    }
    std::fs::create_dir_all(&target_dir).map_err(|e| {
        AudioError::Synthesis(format!("Failed to create {}: {e}", target_dir.display()))
    })?;

    let archive_path = root.join(format!("{}.tar.bz2.download", spec.directory));
    let curl_result = Command::new("curl")
        .args(["--fail", "--location", "--show-error", "--output"])
        .arg(&archive_path)
        .arg(spec.archive_url)
        .status();
    if !matches!(&curl_result, Ok(status) if status.success()) {
        // A failed transfer can leave a truncated archive behind; drop it so
        // it does not accumulate in the cache root. Best-effort only.
        let _ = std::fs::remove_file(&archive_path);
    }
    let curl_status =
        curl_result.map_err(|e| AudioError::Synthesis(format!("Failed to start curl: {e}")))?;
    if !curl_status.success() {
        return Err(AudioError::Synthesis(format!(
            "curl failed while downloading {} from {}",
            spec.display_name, spec.archive_url
        )));
    }

    let tar_status = Command::new("tar")
        .arg("-xf")
        .arg(&archive_path)
        .arg("-C")
        .arg(&target_dir)
        .arg("--strip-components=1")
        .status();
    // The archive is no longer needed whether or not extraction worked.
    let _ = std::fs::remove_file(&archive_path);
    let tar_status =
        tar_status.map_err(|e| AudioError::Synthesis(format!("Failed to start tar: {e}")))?;
    if !tar_status.success() {
        return Err(AudioError::Synthesis(format!(
            "tar failed while extracting {} into {}",
            spec.display_name,
            target_dir.display()
        )));
    }

    let status = model_status_for_engine(engine)?;
    if !status.installed {
        return Err(AudioError::Synthesis(format!(
            "{} extracted but required files are missing: {}",
            spec.display_name,
            status.missing_files.join(", ")
        )));
    }

    Ok(TtsModelDownload {
        spec,
        path: target_dir,
        downloaded: true,
    })
}
/// Synthesizes `text` with the sherpa-onnx runtime and plays the result.
///
/// The audio is rendered into a temporary `.wav` file, which is then played
/// with the `afplay` command. The temporary file is removed when this function
/// returns.
///
/// Returns the wall-clock time spent on synthesis plus playback; the clock
/// starts after the temporary file has been created.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when model files are missing, the
/// temporary file cannot be created, the synthesis command cannot be built or
/// started or exits unsuccessfully (its trimmed stderr is included), or
/// `afplay` cannot be started or exits unsuccessfully.
pub fn speak_with_sherpa(
    text: &str,
    voice: Option<&str>,
    engine: TtsEngine,
) -> Result<Duration, AudioError> {
    let absent = missing_model_files(engine)?;
    if !absent.is_empty() {
        let engine_name = engine.as_str();
        let listing = absent.join(", ");
        return Err(AudioError::Synthesis(format!(
            "{engine_name} model files not downloaded (missing: {listing}). Run `axterminator models tts download {engine_name}` first."
        )));
    }

    let wav = tempfile::Builder::new()
        .prefix("axterminator-tts-")
        .suffix(".wav")
        .tempfile()
        .map_err(|e| AudioError::Synthesis(format!("Failed to create temp WAV: {e}")))?;

    let clock = Instant::now();

    let mut synth_cmd = build_sherpa_command(engine, voice, text, wav.path())?;
    let synthesis = match synth_cmd.output() {
        Ok(finished) => finished,
        Err(e) => {
            return Err(AudioError::Synthesis(format!(
                "Failed to start {}. Install it or set AXTERMINATOR_SHERPA_ONNX_TTS: {e}",
                sherpa_binary()
            )));
        }
    };
    if !synthesis.status.success() {
        let diagnostics = String::from_utf8_lossy(&synthesis.stderr);
        return Err(AudioError::Synthesis(format!(
            "{} failed for {}: {}",
            sherpa_binary(),
            engine.as_str(),
            diagnostics.trim()
        )));
    }

    let playback = match Command::new("afplay").arg(wav.path()).status() {
        Ok(exit) => exit,
        Err(e) => {
            return Err(AudioError::Synthesis(format!(
                "Failed to start afplay: {e}"
            )));
        }
    };
    if !playback.success() {
        return Err(AudioError::Synthesis(format!(
            "afplay failed while playing {} output",
            engine.as_str()
        )));
    }

    Ok(clock.elapsed())
}
/// Returns the names from `spec.required_files` that do not exist under
/// `path`, preserving the order of the spec.
fn missing_model_files_in_dir(path: &Path, spec: &TtsModelSpec) -> Vec<String> {
    let mut absent = Vec::new();
    for name in spec.required_files {
        if !path.join(name).exists() {
            absent.push((*name).to_string());
        }
    }
    absent
}
/// Builds (but does not run) the sherpa-onnx command line that synthesizes
/// `text` into `output_path` using the model files of `engine`.
///
/// Argument order: `--num-threads=2`, `--output-filename=…`, the
/// engine-specific model flags, a lone `--`, then `text`.
///
/// For Kokoro, `voice` is resolved to a speaker id and passed as `--sid`;
/// `lexicon-us-en.txt` and `lexicon-zh.txt` are passed comma-separated via
/// `--kokoro-lexicon`, each only if it exists in the model directory. Piper
/// ignores `voice`.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] when the model directory cannot be
/// determined for `engine`, when the Kokoro voice is unknown, or when `engine`
/// is `TtsEngine::System`.
fn build_sherpa_command(
    engine: TtsEngine,
    voice: Option<&str>,
    text: &str,
    output_path: &Path,
) -> Result<Command, AudioError> {
    let dir = model_dir(engine)?;
    let mut cmd = Command::new(sherpa_binary());
    cmd.arg("--num-threads=2")
        .arg(format!("--output-filename={}", output_path.display()));

    match engine {
        TtsEngine::Kokoro => {
            let speaker = kokoro_speaker_id(voice)?;
            // Only lexicons that are actually present are passed on.
            let lexicons = ["lexicon-us-en.txt", "lexicon-zh.txt"]
                .iter()
                .map(|name| dir.join(name))
                .filter(|path| path.exists())
                .map(|path| path.display().to_string())
                .collect::<Vec<_>>()
                .join(",");
            cmd.arg(format!(
                "--kokoro-model={}",
                dir.join("model.onnx").display()
            ))
            .arg(format!(
                "--kokoro-voices={}",
                dir.join("voices.bin").display()
            ))
            .arg(format!(
                "--kokoro-tokens={}",
                dir.join("tokens.txt").display()
            ))
            .arg(format!(
                "--kokoro-data-dir={}",
                dir.join("espeak-ng-data").display()
            ))
            .arg(format!("--sid={speaker}"));
            if !lexicons.is_empty() {
                cmd.arg(format!("--kokoro-lexicon={lexicons}"));
            }
        }
        TtsEngine::Piper => {
            cmd.arg(format!(
                "--vits-model={}",
                dir.join("en_US-lessac-medium.onnx").display()
            ))
            .arg(format!(
                "--vits-tokens={}",
                dir.join("tokens.txt").display()
            ))
            .arg(format!(
                "--vits-data-dir={}",
                dir.join("espeak-ng-data").display()
            ));
        }
        TtsEngine::System => {
            return Err(AudioError::Synthesis(
                "system TTS does not use sherpa-onnx".to_string(),
            ));
        }
    }

    // The text is caller-supplied and may start with `--` (e.g. "--help" or
    // "--config=..."), which the runtime would otherwise parse as an option.
    // sherpa-onnx's Kaldi-derived option parser treats a lone `--` as the end
    // of named options, so everything after it is taken as the positional text.
    cmd.arg("--").arg(text);
    Ok(cmd)
}
/// Returns the executable used for synthesis.
///
/// The `AXTERMINATOR_SHERPA_ONNX_TTS` environment variable overrides the
/// default `sherpa-onnx-offline-tts`. An unset, non-Unicode, empty, or
/// whitespace-only value falls back to the default; a blank override would
/// otherwise make process spawning fail with an unhelpful error.
fn sherpa_binary() -> String {
    std::env::var("AXTERMINATOR_SHERPA_ONNX_TTS")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "sherpa-onnx-offline-tts".to_string())
}
/// Resolves a user-supplied Kokoro voice to its numeric speaker id.
///
/// * `None`, empty, or whitespace-only input selects the default voice.
/// * A number is accepted when it is the id of an entry in
///   [`KOKORO_SPEAKERS`].
/// * Otherwise the trimmed input is matched against the voice names, ignoring
///   ASCII case.
///
/// # Errors
///
/// Returns [`AudioError::Synthesis`] listing the known voice names and the id
/// range when the input matches neither an id nor a name. The range is derived
/// from [`KOKORO_SPEAKERS`] so the message cannot drift from the table.
fn kokoro_speaker_id(voice: Option<&str>) -> Result<u16, AudioError> {
    /// Id of the `af_heart` entry in `KOKORO_SPEAKERS`.
    const DEFAULT_SPEAKER_ID: u16 = 3;

    let Some(candidate) = voice.map(str::trim).filter(|v| !v.is_empty()) else {
        return Ok(DEFAULT_SPEAKER_ID);
    };

    // Numeric input counts only if it names an id that is actually in the table.
    let by_id = candidate
        .parse::<u16>()
        .ok()
        .filter(|id| KOKORO_SPEAKERS.iter().any(|&(_, known)| known == *id));
    let by_name = || {
        KOKORO_SPEAKERS
            .iter()
            .find(|(name, _)| name.eq_ignore_ascii_case(candidate))
            .map(|&(_, id)| id)
    };

    by_id.or_else(by_name).ok_or_else(|| {
        let names = KOKORO_SPEAKERS
            .iter()
            .map(|&(name, _)| name)
            .collect::<Vec<_>>()
            .join(", ");
        let lowest = KOKORO_SPEAKERS.iter().map(|&(_, id)| id).min().unwrap_or(0);
        let highest = KOKORO_SPEAKERS.iter().map(|&(_, id)| id).max().unwrap_or(0);
        AudioError::Synthesis(format!(
            "Unknown Kokoro voice \"{candidate}\". Use one of {names} or a speaker id {lowest}-{highest}."
        ))
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Only engines listed in the spec table resolve, each to its own directory.
    #[test]
    fn spec_lookup_excludes_system_engine() {
        assert!(spec_for_engine(TtsEngine::System).is_none());

        let expectations = [(TtsEngine::Kokoro, "kokoro"), (TtsEngine::Piper, "piper")];
        for (engine, expected_dir) in expectations {
            let spec = spec_for_engine(engine).unwrap();
            assert_eq!(spec.directory, expected_dir);
        }
    }

    /// Missing voice, voice name, and numeric id all map to the expected id.
    #[test]
    fn kokoro_voice_accepts_default_name_and_numeric_id() {
        let cases: [(Option<&str>, u16); 3] =
            [(None, 3), (Some("af_heart"), 3), (Some("27"), 27)];
        for (voice, expected_id) in cases {
            assert_eq!(kokoro_speaker_id(voice).unwrap(), expected_id);
        }
    }

    /// An unrecognised voice name surfaces as a synthesis error.
    #[test]
    fn kokoro_voice_rejects_unknown_name() {
        let outcome = kokoro_speaker_id(Some("does_not_exist"));
        let err = outcome.unwrap_err();
        assert_eq!(err.code(), "synthesis_error");
    }
}