use crate::tts::{AudioFormat, Speaker, SynthesizedAudio};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use objc2::rc::Retained;
use objc2_avf_audio::{AVSpeechSynthesisVoice, AVSpeechSynthesizer, AVSpeechUtterance};
use objc2_foundation::NSString;
// AVSpeech's "normal" utterance rate on its 0.0–1.0 scale (matches the
// framework default used as the baseline for user speed scaling below).
const AVSPEECH_DEFAULT_RATE: f32 = 0.5;
// say(1)'s baseline speaking rate in words per minute, used when mapping the
// AVSpeech rate onto a `-r` argument.
const SAY_DEFAULT_WPM: f32 = 175.0;
// Sample rate (Hz) requested from say(1) for synthesized WAV output.
const SYNTH_SAMPLE_RATE: u32 = 22050;
/// Text-to-speech speaker backed by Apple's speech facilities: `say(1)` for
/// synthesizing audio to a buffer, and `AVSpeechSynthesizer` for speaking
/// aloud directly.
#[derive(Debug, Clone)]
pub struct AppleSpeechSpeaker {
    // Voice identifier or name; empty string means "use the system default".
    voice: String,
    // Speaking rate on AVSpeech's 0.0–1.0 scale (0.5 is the framework default).
    rate: f32,
    // Pitch multiplier forwarded to AVSpeechUtterance (1.0 = unchanged).
    pitch: f32,
    // Playback volume forwarded to AVSpeechUtterance (1.0 = full volume).
    volume: f32,
}
impl AppleSpeechSpeaker {
    /// Build a speaker from the shared voice configuration.
    ///
    /// The configured speed multiplier is limited to [0.5, 2.0] and mapped
    /// onto AVSpeech's 0.0–1.0 rate scale around the framework default.
    /// Pitch and volume start at their neutral values.
    pub fn from_config(config: &VoiceConfig) -> Self {
        let clamped_speed = config.local_tts_speed.clamp(0.5, 2.0);
        Self {
            voice: config.local_tts_voice.clone(),
            rate: (AVSPEECH_DEFAULT_RATE * clamped_speed).clamp(0.0, 1.0),
            pitch: 1.0,
            volume: 1.0,
        }
    }

    /// Translate the AVSpeech rate into a words-per-minute value for
    /// say(1)'s `-r` flag, bounded to a sane 50–500 wpm window.
    fn say_rate_wpm(&self) -> u32 {
        let relative_speed = self.rate / AVSPEECH_DEFAULT_RATE;
        (SAY_DEFAULT_WPM * relative_speed).clamp(50.0, 500.0) as u32
    }

    /// Voice argument for say(1)'s `-v` flag.
    ///
    /// Returns `None` when no voice is configured; otherwise the last
    /// dot-separated component of the configured string (AVSpeech voice
    /// identifiers end with the plain voice name say(1) understands).
    fn say_voice_arg(&self) -> Option<String> {
        if self.voice.is_empty() {
            None
        } else {
            let last_component = self.voice.rsplit('.').next().unwrap_or(&self.voice);
            Some(last_component.to_string())
        }
    }
}
#[async_trait]
impl Speaker for AppleSpeechSpeaker {
async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
if text.is_empty() {
return Err(VoiceError::Tts(
"apple speech: refusing to synthesize empty text".into(),
));
}
let text = text.to_owned();
let voice = self.say_voice_arg();
let rate_wpm = self.say_rate_wpm();
let bytes = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
let tmp = tempfile::Builder::new()
.prefix("car-apple-tts-")
.suffix(".wav")
.tempfile()
.map_err(|e| VoiceError::Tts(format!("apple speech: tempfile: {e}")))?;
let path = tmp.path().to_owned();
let mut cmd = std::process::Command::new("/usr/bin/say");
cmd.arg("-o")
.arg(&path)
.arg("--file-format=WAVE")
.arg(format!("--data-format=LEI16@{}", SYNTH_SAMPLE_RATE))
.arg("-r")
.arg(rate_wpm.to_string());
if let Some(v) = voice.as_deref() {
cmd.arg("-v").arg(v);
}
cmd.arg(&text);
let output = cmd
.output()
.map_err(|e| VoiceError::Tts(format!("apple speech: spawn say: {e}")))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
return Err(VoiceError::Tts(format!(
"apple speech: say(1) exited {} — {}",
output.status,
stderr.trim()
)));
}
std::fs::read(&path)
.map_err(|e| VoiceError::Tts(format!("apple speech: read tempfile: {e}")))
})
.await
.map_err(|e| VoiceError::Tts(format!("apple speech: join error: {e}")))??;
Ok(SynthesizedAudio {
bytes,
format: AudioFormat::Wav,
})
}
async fn speak(&self, text: &str) -> Result<()> {
if text.is_empty() {
return Ok(());
}
let text = text.to_owned();
let voice = self.voice.clone();
let rate = self.rate;
let pitch = self.pitch;
let volume = self.volume;
tokio::task::spawn_blocking(move || speak_blocking(text, voice, rate, pitch, volume))
.await
.map_err(|e| VoiceError::Tts(format!("apple speech: join error: {e}")))?
}
}
/// Blocking AVSpeechSynthesizer playback: speaks `text` and polls until the
/// synthesizer reports it has finished, hard-stopping after a 60 s cap.
///
/// `voice_id` is tried first as an AVSpeech voice identifier, then as a
/// language code; an empty string leaves the system default voice in place.
/// `rate` is on AVSpeech's 0.0–1.0 scale; `pitch` and `volume` are forwarded
/// to the utterance unchanged.
fn speak_blocking(
    text: String,
    voice_id: String,
    rate: f32,
    pitch: f32,
    volume: f32,
) -> Result<()> {
    // SAFETY: plain ObjC constructor/message sends; objc2 marks these unsafe
    // but no additional invariants are required by this call pattern.
    let synth: Retained<AVSpeechSynthesizer> = unsafe { AVSpeechSynthesizer::new() };
    let utterance: Retained<AVSpeechUtterance> = unsafe {
        let ns_text = NSString::from_str(&text);
        AVSpeechUtterance::speechUtteranceWithString(&ns_text)
    };
    unsafe {
        utterance.setRate(rate);
        utterance.setPitchMultiplier(pitch);
        utterance.setVolume(volume);
        if !voice_id.is_empty() {
            let ns_id = NSString::from_str(&voice_id);
            // Identifier lookup first, language-code lookup as fallback; if
            // neither resolves, the utterance keeps the default voice.
            let v = AVSpeechSynthesisVoice::voiceWithIdentifier(&ns_id)
                .or_else(|| AVSpeechSynthesisVoice::voiceWithLanguage(Some(&ns_id)));
            if let Some(v) = v {
                utterance.setVoice(Some(&v));
            }
        }
    }
    unsafe { synth.speakUtterance(&utterance) };
    let poll_interval = std::time::Duration::from_millis(50);
    let max_wait = std::time::Duration::from_secs(60);
    let start = std::time::Instant::now();
    // speakUtterance starts playback asynchronously: give the synthesizer up
    // to 500 ms to actually begin, so a slow start is not mistaken for
    // "already finished" by the completion loop below.
    let warmup_deadline = start + std::time::Duration::from_millis(500);
    while !unsafe { synth.isSpeaking() } && std::time::Instant::now() < warmup_deadline {
        std::thread::sleep(poll_interval);
    }
    // Poll until playback ends; abort with an error past 60 s to protect
    // against a wedged synthesizer or pathologically long text.
    while unsafe { synth.isSpeaking() } {
        if start.elapsed() > max_wait {
            unsafe {
                use objc2_avf_audio::AVSpeechBoundary;
                synth.stopSpeakingAtBoundary(AVSpeechBoundary::Immediate);
            }
            return Err(VoiceError::Tts(
                "apple speech: speakUtterance still speaking after 60 s — aborted".into(),
            ));
        }
        std::thread::sleep(poll_interval);
    }
    Ok(())
}