use crate::stt::SttProvider;
use crate::tts::encode_pcm_f32_to_wav_pcm16;
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use block2::{DynBlock, RcBlock};
use objc2::rc::Retained;
use objc2::AllocAnyThread;
use objc2_avf_audio::{AVAudioFormat, AVAudioPCMBuffer};
use objc2_foundation::{NSError, NSLocale, NSString, NSURL};
use objc2_speech::{
SFSpeechAudioBufferRecognitionRequest, SFSpeechRecognitionRequest, SFSpeechRecognitionResult,
SFSpeechRecognizer, SFSpeechRecognizerAuthorizationStatus, SFSpeechURLRecognitionRequest,
};
use std::path::PathBuf;
use std::sync::mpsc::sync_channel;
use tempfile::NamedTempFile;
use tokio::sync::mpsc::UnboundedSender;
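
/// A partial (interim) transcription hypothesis emitted while a streaming
/// recognition task is still running.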
#[derive(Debug, Clone)]
pub struct AppleSpeechPartial {
pub text: String,
}
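
/// Speech-to-text provider backed by Apple's Speech framework
/// (`SFSpeechRecognizer`). Recognition is on-device only by default; call
/// [`allow_server`](Self::allow_server) to permit Apple's server-based
/// fallback.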
#[derive(Debug, Clone)]
pub struct AppleSpeechSttProvider {
    locale: Option<String>,
    on_device_only: bool,
}

impl Default for AppleSpeechSttProvider {
    fn default() -> Self {
        // Match `new()` so a defaulted provider also stays on-device only.
        Self::new()
    }
}
impl AppleSpeechSttProvider {
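    /// Creates a provider that uses the system default locale and refuses
    /// server-based recognition.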
pub fn new() -> Self {
Self {
locale: None,
on_device_only: true,
}
}
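    /// Builds a provider from a [`VoiceConfig`], treating an empty
    /// `language` as "use the system default locale".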
pub fn from_config(config: &VoiceConfig) -> Self {
let locale = if config.language.is_empty() {
None
} else {
Some(config.language.clone())
};
Self {
locale,
on_device_only: true,
}
}
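    /// Sets the recognition locale, e.g. `"en-US"` (a BCP 47 identifier).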
pub fn with_locale(mut self, locale: impl Into<String>) -> Self {
self.locale = Some(locale.into());
self
}
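    /// Allows falling back to Apple's server-based recognition when the
    /// locale lacks on-device support.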
pub fn allow_server(mut self) -> Self {
self.on_device_only = false;
self
}
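    /// Transcribes mono f32 PCM `samples` at `sample_rate` Hz, invoking
    /// `on_partial` for each interim hypothesis and returning the final
    /// transcript. The Objective-C work runs on a blocking thread and
    /// forwards events back over an unbounded channel.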
pub async fn transcribe_streaming<F>(
&self,
samples: &[f32],
sample_rate: u32,
mut on_partial: F,
) -> Result<String>
where
F: FnMut(AppleSpeechPartial) + Send,
{
if samples.is_empty() {
return Ok(String::new());
}
let (event_tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel::<StreamEvent>();
let samples_owned: Vec<f32> = samples.to_vec();
let locale = self.locale.clone();
let on_device_only = self.on_device_only;
        // The Objective-C calls block, so run them off the async runtime; the
        // spawned task is detached and reports back through `event_tx`.
        let _join = tokio::task::spawn_blocking(move || {
stream_blocking(samples_owned, sample_rate, locale, on_device_only, event_tx);
});
loop {
match event_rx.recv().await {
Some(StreamEvent::Partial(p)) => on_partial(p),
Some(StreamEvent::Final(text)) => return Ok(text),
Some(StreamEvent::Error(msg)) => return Err(VoiceError::Stt(msg)),
None => {
return Err(VoiceError::Stt(
"apple speech: streaming task ended without a result".into(),
))
}
}
}
}
}
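
/// Events forwarded from the blocking recognition thread to the async caller.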
enum StreamEvent {
Partial(AppleSpeechPartial),
Final(String),
Error(String),
}
#[async_trait]
impl SttProvider for AppleSpeechSttProvider {
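    // One-shot path: encode the samples to a temporary WAV file and hand the
    // Speech framework a file-URL request, letting Apple handle decoding.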
async fn transcribe(&self, samples: &[f32], sample_rate: u32) -> Result<String> {
if samples.is_empty() {
return Ok(String::new());
}
let wav_bytes = encode_pcm_f32_to_wav_pcm16(samples, sample_rate)
.map_err(|e| VoiceError::Stt(format!("apple speech: wav encode: {e}")))?;
let mut tmp = NamedTempFile::with_suffix(".wav")
.map_err(|e| VoiceError::Stt(format!("apple speech: tempfile: {e}")))?;
{
let f = tmp.as_file_mut();
std::io::Write::write_all(f, &wav_bytes)
.map_err(|e| VoiceError::Stt(format!("apple speech: tempfile write: {e}")))?;
f.sync_all()
.map_err(|e| VoiceError::Stt(format!("apple speech: tempfile sync: {e}")))?;
}
let path = tmp.path().to_path_buf();
let locale = self.locale.clone();
let on_device_only = self.on_device_only;
let result =
tokio::task::spawn_blocking(move || transcribe_blocking(path, locale, on_device_only))
.await
.map_err(|e| VoiceError::Stt(format!("apple speech: join error: {e}")))??;
        // Keep the temp file alive until recognition completes, then delete it.
        drop(tmp);
Ok(result)
}
}
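
/// Synchronous one-shot recognition of a WAV file. Builds the recognizer,
/// validates availability and authorization, then waits up to 60 s for the
/// task's final result. Must run on a blocking thread.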
fn transcribe_blocking(
wav_path: PathBuf,
locale: Option<String>,
on_device_only: bool,
) -> Result<String> {
let recognizer: Retained<SFSpeechRecognizer> = match locale {
Some(loc) => {
let ns_loc_id = NSString::from_str(&loc);
let alloc = NSLocale::alloc();
let ns_locale: Retained<NSLocale> =
NSLocale::initWithLocaleIdentifier(alloc, &ns_loc_id);
unsafe { SFSpeechRecognizer::initWithLocale(SFSpeechRecognizer::alloc(), &ns_locale) }
.ok_or_else(|| {
VoiceError::Stt(format!(
"apple speech: locale '{loc}' not supported on this device"
))
})?
}
None => unsafe { SFSpeechRecognizer::new() },
};
    if !unsafe { recognizer.isAvailable() } {
return Err(VoiceError::Stt(
"apple speech: recognizer unavailable — locale may lack on-device \
support, network may be unreachable, or Speech Recognition may be \
disabled in System Settings > Privacy & Security"
.into(),
));
}
let status = unsafe { SFSpeechRecognizer::authorizationStatus() };
if status != SFSpeechRecognizerAuthorizationStatus::Authorized {
let reason = match status {
SFSpeechRecognizerAuthorizationStatus::NotDetermined => {
"not yet requested — host app must call \
SFSpeechRecognizer.requestAuthorization at startup"
}
SFSpeechRecognizerAuthorizationStatus::Denied => {
"denied by user — re-grant in System Settings > Privacy & \
Security > Speech Recognition"
}
SFSpeechRecognizerAuthorizationStatus::Restricted => {
"restricted by parental controls or MDM"
}
            SFSpeechRecognizerAuthorizationStatus::Authorized => "authorized",
            _ => "unknown status",
};
return Err(VoiceError::Stt(format!(
"apple speech: authorization {reason}. Note: bundle Info.plist \
must declare NSSpeechRecognitionUsageDescription or the app \
will crash on first request."
)));
}
let ns_path = NSString::from_str(&wav_path.to_string_lossy());
let url: Retained<NSURL> = NSURL::fileURLWithPath(&ns_path);
let request: Retained<SFSpeechURLRecognitionRequest> = unsafe {
let alloc = SFSpeechURLRecognitionRequest::alloc();
SFSpeechURLRecognitionRequest::initWithURL(alloc, &url)
};
    unsafe {
        // One-shot transcription: only the final result is needed here.
        request.setShouldReportPartialResults(false);
        if on_device_only {
            // Fail rather than silently fall back to Apple's servers when the
            // locale lacks on-device support.
            request.setRequiresOnDeviceRecognition(true);
        }
    }
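    /// Terminal outcomes the recognition handler can deliver to this thread.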
enum TaskOutcome {
Final(String),
Failed(String),
}
    // The handler may fire multiple times; a bounded channel of size 1 is
    // enough because only the first terminal outcome matters.
    let (tx, rx) = sync_channel::<TaskOutcome>(1);
let handler: RcBlock<dyn Fn(*mut SFSpeechRecognitionResult, *mut NSError)> = RcBlock::new(
move |result_ptr: *mut SFSpeechRecognitionResult, err_ptr: *mut NSError| {
if !err_ptr.is_null() {
let msg = unsafe { (*err_ptr).localizedDescription() };
                let _ = tx.try_send(TaskOutcome::Failed(format!(
"apple speech: recognition error: {}",
msg.to_string()
)));
return;
}
if result_ptr.is_null() {
return;
}
let result: &SFSpeechRecognitionResult = unsafe { &*result_ptr };
let is_final: bool = unsafe { result.isFinal() };
if !is_final {
return;
}
let transcription = unsafe { result.bestTranscription() };
let formatted = unsafe { transcription.formattedString() };
            let _ = tx.try_send(TaskOutcome::Final(formatted.to_string()));
},
);
let handler_ref: &DynBlock<dyn Fn(*mut SFSpeechRecognitionResult, *mut NSError)> = &*handler;
let request_super: &SFSpeechRecognitionRequest = &**request;
let task =
unsafe { recognizer.recognitionTaskWithRequest_resultHandler(request_super, handler_ref) };
let outcome = rx.recv_timeout(std::time::Duration::from_secs(60));
let result_str = match outcome {
Ok(TaskOutcome::Final(s)) => Ok(s),
Ok(TaskOutcome::Failed(msg)) => {
unsafe { task.cancel() };
Err(VoiceError::Stt(msg))
}
Err(_) => {
unsafe { task.cancel() };
Err(VoiceError::Stt(
"apple speech: recognition timed out (60 s)".into(),
))
}
};
drop(task);
result_str
}
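
/// Blocking streaming recognition: copies `samples` into an
/// `AVAudioPCMBuffer`, feeds it to a buffer-based request with partial
/// results enabled, and forwards every partial, final, and error event over
/// `event_tx`.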
fn stream_blocking(
samples: Vec<f32>,
sample_rate: u32,
locale: Option<String>,
on_device_only: bool,
event_tx: UnboundedSender<StreamEvent>,
) {
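    // Report a fatal error to the caller and bail out of this thread.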
macro_rules! abort {
($msg:expr) => {{
let _ = event_tx.send(StreamEvent::Error($msg.into()));
return;
}};
}
let recognizer: Retained<SFSpeechRecognizer> = match locale {
Some(loc) => {
let ns_loc_id = NSString::from_str(&loc);
let alloc = NSLocale::alloc();
let ns_locale: Retained<NSLocale> =
NSLocale::initWithLocaleIdentifier(alloc, &ns_loc_id);
match unsafe {
SFSpeechRecognizer::initWithLocale(SFSpeechRecognizer::alloc(), &ns_locale)
} {
Some(r) => r,
None => abort!(format!(
"apple speech: locale '{loc}' not supported on this device"
)),
}
}
None => unsafe { SFSpeechRecognizer::new() },
};
if !unsafe { recognizer.isAvailable() } {
abort!(
"apple speech: recognizer unavailable — locale may lack on-device \
support, network may be unreachable, or Speech Recognition may be \
disabled in System Settings > Privacy & Security"
);
}
let status = unsafe { SFSpeechRecognizer::authorizationStatus() };
if status != SFSpeechRecognizerAuthorizationStatus::Authorized {
abort!(format!(
"apple speech: authorization status not Authorized; host must call \
SFSpeechRecognizer.requestAuthorization at startup and the bundle \
must declare NSSpeechRecognitionUsageDescription in Info.plist \
(status={status:?})"
));
}
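    // The "standard" format is deinterleaved native-endian f32, which matches
    // the layout of `samples`.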
let format_alloc = AVAudioFormat::alloc();
let format: Retained<AVAudioFormat> = match unsafe {
AVAudioFormat::initStandardFormatWithSampleRate_channels(
format_alloc,
sample_rate as f64,
1,
)
} {
Some(f) => f,
None => abort!(format!(
"apple speech: cannot construct AVAudioFormat for sample_rate={sample_rate} mono"
)),
};
let frame_capacity = samples.len() as u32;
let buffer_alloc = AVAudioPCMBuffer::alloc();
let buffer: Retained<AVAudioPCMBuffer> = match unsafe {
AVAudioPCMBuffer::initWithPCMFormat_frameCapacity(buffer_alloc, &format, frame_capacity)
} {
Some(b) => b,
None => abort!("apple speech: cannot allocate AVAudioPCMBuffer"),
};
    unsafe {
        buffer.setFrameLength(frame_capacity);
        // floatChannelData returns a C array of per-channel pointers; for this
        // mono buffer the first entry points at channel 0's sample storage.
        let cd: *mut std::ptr::NonNull<f32> = buffer.floatChannelData();
        if cd.is_null() {
            abort!("apple speech: AVAudioPCMBuffer.floatChannelData returned null");
        }
        let ch0_ptr: *mut f32 = (*cd).as_ptr();
        std::ptr::copy_nonoverlapping(samples.as_ptr(), ch0_ptr, samples.len());
    }
let request: Retained<SFSpeechAudioBufferRecognitionRequest> = unsafe {
let alloc = SFSpeechAudioBufferRecognitionRequest::alloc();
SFSpeechAudioBufferRecognitionRequest::init(alloc)
};
    unsafe {
        request.setShouldReportPartialResults(true);
        if on_device_only {
            request.setRequiresOnDeviceRecognition(true);
        }
        // All audio is available up front, so append it as one buffer and
        // signal end-of-stream; partial results still arrive incrementally.
        request.appendAudioPCMBuffer(&buffer);
        request.endAudio();
    }
let (done_tx, done_rx) = std::sync::mpsc::channel::<()>();
let done_tx_block = done_tx.clone();
let event_tx_block = event_tx.clone();
let handler: RcBlock<dyn Fn(*mut SFSpeechRecognitionResult, *mut NSError)> = RcBlock::new(
move |result_ptr: *mut SFSpeechRecognitionResult, err_ptr: *mut NSError| {
if !err_ptr.is_null() {
let msg = unsafe { (*err_ptr).localizedDescription() };
let _ = event_tx_block.send(StreamEvent::Error(format!(
"apple speech: recognition error: {}",
msg.to_string()
)));
let _ = done_tx_block.send(());
return;
}
if result_ptr.is_null() {
return;
}
let result: &SFSpeechRecognitionResult = unsafe { &*result_ptr };
let transcription = unsafe { result.bestTranscription() };
let formatted = unsafe { transcription.formattedString() };
let text = formatted.to_string();
let is_final = unsafe { result.isFinal() };
if is_final {
let _ = event_tx_block.send(StreamEvent::Final(text));
let _ = done_tx_block.send(());
} else {
let _ = event_tx_block.send(StreamEvent::Partial(AppleSpeechPartial { text }));
}
},
);
let handler_ref: &DynBlock<dyn Fn(*mut SFSpeechRecognitionResult, *mut NSError)> = &*handler;
let request_super: &SFSpeechRecognitionRequest = &**request;
    // Drop the local sender so `done_rx` disconnects instead of hanging if
    // the handler block is released without ever signalling completion.
    drop(done_tx);
let task =
unsafe { recognizer.recognitionTaskWithRequest_resultHandler(request_super, handler_ref) };
if done_rx
.recv_timeout(std::time::Duration::from_secs(60))
.is_err()
{
let _ = event_tx.send(StreamEvent::Error(
"apple speech: streaming recognition timed out (60 s)".into(),
));
}
    // Cancelling a task that already finished is harmless; this only matters
    // on the timeout path.
    unsafe { task.cancel() };
drop(task);
}