//! macOS Apple-Speech TTS provider.
//!
//! Implements [`Speaker`] using the same Apple speech synthesis engine
//! that AVSpeechSynthesizer uses, packaged two different ways:
//!
//! - [`AppleSpeechSpeaker::synth`] shells out to `/usr/bin/say` with
//!   `--file-format=WAVE --data-format=LEI16@<rate>` to capture
//!   16-bit PCM WAV directly. `say(1)` is Apple's own thin wrapper
//!   around the same speech-synthesis stack — same voices, same
//!   quality, no model download. It writes WAV-without-LIST chunks
//!   that decode cleanly via `rodio`. A sample invocation is shown
//!   after this list.
//! - [`Speaker::speak`] overrides the trait default to use
//!   [`AVSpeechSynthesizer::speakUtterance`] directly, polling
//!   `isSpeaking` for completion. Direct playback path; avoids the
//!   PCM round-trip when the caller doesn't need the bytes.
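//!
//! The `synth` invocation, spelled out (this mirrors the flags the
//! code below passes; the voice name and rate vary with config):
//!
//! ```text
//! say -o out.wav --file-format=WAVE --data-format=LEI16@22050 \
//!     -r 175 -v Samantha "text to speak"
//! ```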
//!
//! ## Why not `writeUtterance:toBufferCallback:`?
//!
//! The earlier draft of this provider used
//! [`AVSpeechSynthesizer::writeUtterance_toBufferCallback`] to
//! capture PCM in-process. On macOS 26.4.1 (M5) the buffer-callback
//! block is **never invoked** for any voice we tested — `writeUtterance`
//! returns immediately, `isSpeaking` reports false, no callbacks fire.
//! The same synthesizer instance speaks fine via `speakUtterance` on
//! the same thread, so it's not a synthesizer-init problem; it appears
//! to be a regression or undocumented requirement in the
//! `toBufferCallback:` path on this macOS release.
//!
//! Rather than debug an Apple framework's block plumbing on a
//! single-host repro, we reach for `say(1)` — it's been part of macOS
//! since the early 2000s, ships with every install, and produces
//! exactly the audio we want. If Apple fixes the in-process API we can
//! swap back without changing the public [`Speaker`] surface.
//!
//! See `docs/proposals/macos-apple-frameworks.md` for the broader plan.
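//!
//! A minimal usage sketch, marked `ignore` because it assumes the
//! crate-root re-exports (`car_voice::…`) and a `Default` impl for
//! `VoiceConfig`, neither of which this module guarantees:
//!
//! ```ignore
//! use car_voice::tts::Speaker;
//! use car_voice::VoiceConfig;
//!
//! async fn demo() -> car_voice::Result<()> {
//!     let config = VoiceConfig::default(); // assumed Default impl
//!     let speaker = AppleSpeechSpeaker::from_config(&config);
//!
//!     // Direct playback through the system output device.
//!     speaker.speak("Hello from Apple speech.").await?;
//!
//!     // Or capture 16-bit PCM WAV bytes for the caller to play/store.
//!     let audio = speaker.synth("Hello again.").await?;
//!     assert!(!audio.bytes.is_empty());
//!     Ok(())
//! }
//! ```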

use crate::tts::{AudioFormat, Speaker, SynthesizedAudio};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use objc2::rc::Retained;
use objc2_avf_audio::{AVSpeechSynthesisVoice, AVSpeechSynthesizer, AVSpeechUtterance};
use objc2_foundation::NSString;

/// AVSpeechUtterance's "normal" speech rate. Apple defines this as
/// `AVSpeechUtteranceDefaultSpeechRate` in AVFAudio — 0.5 on the
/// legal scale `[AVSpeechUtteranceMinimumSpeechRate,
/// AVSpeechUtteranceMaximumSpeechRate]` (0.0 to 1.0). The `say(1)`
/// CLI uses words-per-minute on a different scale; we map between
/// them in [`AppleSpeechSpeaker::say_rate_wpm`].
const AVSPEECH_DEFAULT_RATE: f32 = 0.5;

/// `say(1)`'s default words-per-minute when invoked without `-r`.
/// We use this as the multiplier base when mapping
/// `local_tts_speed = 1.0` → "normal pace".
const SAY_DEFAULT_WPM: f32 = 175.0;

/// Sample rate we ask `say(1)` to render at. AVSpeechSynthesizer's
/// own internal rate has historically been 22050 Hz; staying compatible
/// avoids resampling overhead on the playback path.
const SYNTH_SAMPLE_RATE: u32 = 22050;

/// macOS-native TTS via Apple's speech synthesis engine.
///
/// `voice` is interpreted as either an `AVSpeechSynthesisVoice`
/// identifier (e.g. `com.apple.voice.compact.en-US.Samantha`), a
/// BCP-47 language tag (e.g. `en-US`), or an empty string for the
/// system default. For the `synth()` path we map identifier-style
/// voices to `say(1)`'s `-v <name>` argument by stripping the
/// reverse-DNS prefix.
#[derive(Debug, Clone)]
pub struct AppleSpeechSpeaker {
    voice: String,
    rate: f32,
    pitch: f32,
    volume: f32,
}

impl AppleSpeechSpeaker {
    /// Construct from a [`VoiceConfig`].
    ///
    /// `local_tts_speed` (which other providers treat as a `1.0 = normal`
    /// multiplier) is remapped onto AVSpeechUtterance's 0..1 absolute-rate
    /// scale where 0.5 ≈ normal. With the config default of 1.0 this
    /// yields ~0.5 (Apple's normal pace), not the jarring max-speed
    /// reading you'd get if we passed 1.0 through.
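    ///
    /// The resulting mapping, for illustration:
    ///
    /// ```text
    /// local_tts_speed 0.5  ->  rate 0.25  (half pace)
    /// local_tts_speed 1.0  ->  rate 0.5   (Apple's normal)
    /// local_tts_speed 2.0  ->  rate 1.0   (maximum)
    /// ```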
    pub fn from_config(config: &VoiceConfig) -> Self {
        let user_speed = config.local_tts_speed.clamp(0.5, 2.0);
        let rate = (AVSPEECH_DEFAULT_RATE * user_speed).clamp(0.0, 1.0);
        Self {
            voice: config.local_tts_voice.clone(),
            rate,
            pitch: 1.0,
            volume: 1.0,
        }
    }

    /// Convert the AVSpeech-style 0..1 rate into `say(1)`'s words-per-minute.
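    ///
    /// For illustration (values truncated to `u32`, clamped to `50..=500`):
    ///
    /// ```text
    /// rate 0.25 ->  87 WPM
    /// rate 0.5  -> 175 WPM
    /// rate 1.0  -> 350 WPM
    /// ```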
    fn say_rate_wpm(&self) -> u32 {
        // 0.5 = AVSpeech normal = ~175 WPM. Linear scaling from there
        // is an approximation, but it tracks the utterance rate closely
        // enough for a CLI mapping.
        let scale = self.rate / AVSPEECH_DEFAULT_RATE;
        (SAY_DEFAULT_WPM * scale).clamp(50.0, 500.0) as u32
    }

    /// Resolve the configured voice into a `say -v` argument.
    /// Returns `None` when the user wanted the system default.
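    ///
    /// For illustration:
    ///
    /// ```text
    /// ""                                       -> None (system default)
    /// "Samantha"                               -> Some("Samantha")
    /// "com.apple.voice.compact.en-US.Samantha" -> Some("Samantha")
    /// ```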
    fn say_voice_arg(&self) -> Option<String> {
        if self.voice.is_empty() {
            return None;
        }
        // `say` accepts the human-readable name (`Samantha`,
        // `Samantha (Premium)`). Identifier-style voices like
        // `com.apple.voice.compact.en-US.Samantha` get the trailing
        // segment after the last `.` extracted as the name.
        let name = self
            .voice
            .rsplit('.')
            .next()
            .unwrap_or(&self.voice)
            .to_string();
        Some(name)
    }
}

#[async_trait]
impl Speaker for AppleSpeechSpeaker {
    async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
        if text.is_empty() {
            return Err(VoiceError::Tts(
                "apple speech: refusing to synthesize empty text".into(),
            ));
        }

        let text = text.to_owned();
        let voice = self.say_voice_arg();
        let rate_wpm = self.say_rate_wpm();

        let bytes = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> {
            let tmp = tempfile::Builder::new()
                .prefix("car-apple-tts-")
                .suffix(".wav")
                .tempfile()
                .map_err(|e| VoiceError::Tts(format!("apple speech: tempfile: {e}")))?;
            let path = tmp.path().to_owned();

            let mut cmd = std::process::Command::new("/usr/bin/say");
            cmd.arg("-o")
                .arg(&path)
                .arg("--file-format=WAVE")
                .arg(format!("--data-format=LEI16@{}", SYNTH_SAMPLE_RATE))
                .arg("-r")
                .arg(rate_wpm.to_string());
            if let Some(v) = voice.as_deref() {
                cmd.arg("-v").arg(v);
            }
            cmd.arg(&text);

            let output = cmd
                .output()
                .map_err(|e| VoiceError::Tts(format!("apple speech: spawn say: {e}")))?;
            if !output.status.success() {
                let stderr = String::from_utf8_lossy(&output.stderr).into_owned();
                return Err(VoiceError::Tts(format!(
                    "apple speech: say(1) exited {}: {}",
                    output.status,
                    stderr.trim()
                )));
            }

            std::fs::read(&path)
                .map_err(|e| VoiceError::Tts(format!("apple speech: read tempfile: {e}")))
        })
        .await
        .map_err(|e| VoiceError::Tts(format!("apple speech: join error: {e}")))??;

        Ok(SynthesizedAudio {
            bytes,
            format: AudioFormat::Wav,
        })
    }

    /// Direct playback override. Skips the PCM round-trip — uses
    /// [`AVSpeechSynthesizer::speakUtterance`] to play through the
    /// system default output, then polls `isSpeaking` for completion.
    async fn speak(&self, text: &str) -> Result<()> {
        if text.is_empty() {
            return Ok(());
        }
        let text = text.to_owned();
        let voice = self.voice.clone();
        let rate = self.rate;
        let pitch = self.pitch;
        let volume = self.volume;

        tokio::task::spawn_blocking(move || speak_blocking(text, voice, rate, pitch, volume))
            .await
            .map_err(|e| VoiceError::Tts(format!("apple speech: join error: {e}")))?
    }
}

/// Blocking direct-playback path. Owns the AVSpeechSynthesizer for the
/// duration of the call and polls `isSpeaking` until synthesis ends.
fn speak_blocking(
    text: String,
    voice_id: String,
    rate: f32,
    pitch: f32,
    volume: f32,
) -> Result<()> {
    let synth: Retained<AVSpeechSynthesizer> = unsafe { AVSpeechSynthesizer::new() };

    let utterance: Retained<AVSpeechUtterance> = unsafe {
        let ns_text = NSString::from_str(&text);
        AVSpeechUtterance::speechUtteranceWithString(&ns_text)
    };
    unsafe {
        utterance.setRate(rate);
        utterance.setPitchMultiplier(pitch);
        utterance.setVolume(volume);
        if !voice_id.is_empty() {
            let ns_id = NSString::from_str(&voice_id);
            let v = AVSpeechSynthesisVoice::voiceWithIdentifier(&ns_id)
                .or_else(|| AVSpeechSynthesisVoice::voiceWithLanguage(Some(&ns_id)));
            if let Some(v) = v {
                utterance.setVoice(Some(&v));
            }
        }
    }

    unsafe { synth.speakUtterance(&utterance) };

    // Poll for completion. AVSpeechSynthesizer doesn't surface an
    // async completion future from its ObjC API; the documented
    // pattern is delegate callbacks (didFinishSpeechUtterance), but
    // wiring up an objc2 delegate adds a lot of surface for this one
    // observable. Polling at 50 ms is good enough — `isSpeaking`
    // flips to true within a few hundred ms of `speakUtterance`, and
    // most sentences finish within 5–10 s. The 60 s ceiling covers
    // any reasonable input.
    let poll_interval = std::time::Duration::from_millis(50);
    let max_wait = std::time::Duration::from_secs(60);
    let start = std::time::Instant::now();

    // Wait briefly for synthesis to actually start (isSpeaking flips
    // to true). Without this, the "stop polling when isSpeaking==false"
    // loop would terminate immediately for short utterances on slow
    // schedulers.
    let warmup_deadline = start + std::time::Duration::from_millis(500);
    while !unsafe { synth.isSpeaking() } && std::time::Instant::now() < warmup_deadline {
        std::thread::sleep(poll_interval);
    }

    while unsafe { synth.isSpeaking() } {
        if start.elapsed() > max_wait {
            unsafe {
                use objc2_avf_audio::AVSpeechBoundary;
                synth.stopSpeakingAtBoundary(AVSpeechBoundary::Immediate);
            }
            return Err(VoiceError::Tts(
                "apple speech: speakUtterance still speaking after 60 s — aborted".into(),
            ));
        }
        std::thread::sleep(poll_interval);
    }

    Ok(())
}
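
// A small test sketch for the pure mapping helpers above. It builds
// the speaker struct directly (its private fields are visible to this
// child module), so it makes no assumptions about `VoiceConfig`.
#[cfg(test)]
mod tests {
    use super::*;

    fn speaker(voice: &str, rate: f32) -> AppleSpeechSpeaker {
        AppleSpeechSpeaker {
            voice: voice.to_string(),
            rate,
            pitch: 1.0,
            volume: 1.0,
        }
    }

    #[test]
    fn rate_mapping_is_anchored_at_normal() {
        // AVSpeech "normal" (0.5) should land on say(1)'s default pace.
        assert_eq!(speaker("", AVSPEECH_DEFAULT_RATE).say_rate_wpm(), 175);
        // Double the rate doubles the WPM, within the clamp.
        assert_eq!(speaker("", 1.0).say_rate_wpm(), 350);
    }

    #[test]
    fn voice_identifier_maps_to_trailing_name() {
        // Empty string means "system default" — no -v argument.
        assert_eq!(speaker("", 0.5).say_voice_arg(), None);
        // Reverse-DNS identifiers reduce to the trailing name segment.
        assert_eq!(
            speaker("com.apple.voice.compact.en-US.Samantha", 0.5).say_voice_arg(),
            Some("Samantha".to_string())
        );
    }
}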