car-ffi-common 0.24.1

//! JSON wrappers for voice streaming sessions.
//!
//! Each FFI binding (NAPI, PyO3, WebSocket server) provides:
//!   - a single shared [`VoiceSessionRegistry`] (typically behind a
//!     `OnceLock` so the same registry survives across function calls),
//!   - a [`VoiceEventSink`] impl that fans events to its own consumer
//!     (TSF callback, Python callable, WS notification frame).
//!
//! These wrappers do the JSON parsing, build the appropriate
//! [`Listener`], wrap it in a [`VoiceSession`], and hand it to the
//! registry. They do not own any TSF / Python / network state.
//!
//! ## AudioSourceSpec wire format
//!
//! ```jsonc
//! { "kind": "mic" }                                  // default microphone
//! { "kind": "file",     "path": "/abs/path.wav" }    // pre-recorded, streamed through the same pipeline
//! { "kind": "fifo",     "path": "/tmp/audio.fifo" }  // named pipe, caller writes 16-bit PCM
//! { "kind": "system" }                               // OS output capture (macOS with `system-audio-macos`, Windows, Linux)
//! { "kind": "pcm_push", "sample_rate": 16000, "channels": 1 }
//! ```
//!
//! Only the variants the current build actually wires up are accepted.
//! Unimplemented variants return a structured "not implemented" error
//! so JS / Python clients can distinguish missing capability from a bad
//! request.

use car_voice::{
    Listener, PushHandle, PushListener, VoiceConfig, VoiceEventSink, VoiceSession,
    VoiceSessionRegistry,
};
use dashmap::DashMap;
use serde::{Deserialize, Serialize};
use std::sync::{Arc, OnceLock};

/// Caller-supplied audio source description. Parsed from the JSON
/// passed to [`transcribe_stream_start`].
///
/// Serialized as an internally tagged object: the `kind` field carries
/// the snake_case variant name (`mic`, `file`, `fifo`, `system`,
/// `pcm_push`) and the variant's own fields sit next to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum AudioSourceSpec {
    /// Default OS microphone via `cpal`.
    Mic,
    /// Pre-recorded file streamed through the listener pipeline.
    /// Currently rejected by the listener builder with a
    /// "not yet wired" error.
    File { path: String },
    /// Named pipe / fifo — caller writes 16-bit PCM frames.
    /// Currently rejected by the listener builder with a
    /// "not yet wired" error.
    Fifo { path: String },
    /// OS audio output capture (system audio). Availability depends on
    /// the target OS and enabled cargo features.
    System,
    /// Caller pushes PCM frames via `transcribe_stream_push`.
    PcmPush {
        /// Sample rate of the pushed PCM, in Hz.
        sample_rate: u32,
        /// Channel count of the pushed PCM; defaults to 1 when the
        /// field is absent from the JSON.
        #[serde(default = "default_channels")]
        channels: u16,
    },
}

/// Serde default for the `channels` fields in this module: mono.
fn default_channels() -> u16 {
    const MONO: u16 = 1;
    MONO
}

/// Optional per-session configuration knobs.
///
/// Parsed from the `options_json` argument of
/// [`transcribe_stream_start`]; when that argument is absent or empty
/// the `Default` value of this struct is used.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TranscribeStreamOptions {
    /// STT model id override (e.g. "whisper-large-v3", "parakeet-tdt-0.6b").
    pub model: Option<String>,
    /// Language hint (BCP-47 like "en", "en-US"). None = auto-detect.
    pub language: Option<String>,
    /// Bias prompt — words/names to nudge the recognizer toward.
    pub prompt: Option<String>,
    /// If true, emit `audio_chunk` events with frame counts (samples
    /// are still omitted).
    /// NOTE(review): an earlier revision of this comment pointed at a
    /// separate flag for samples; no such field exists in this struct.
    #[serde(default)]
    pub emit_audio_meta: bool,
    /// Enable native streaming partials. Only consulted for `Mic`
    /// sources. On macOS the listener is given an Apple Speech
    /// streaming provider; on other platforms it is given a Parakeet
    /// streaming provider when the build was compiled with the
    /// `parakeet` feature. In every other case (non-macOS without the
    /// feature, or a non-Mic source) this flag is silently ignored.
    #[serde(default)]
    pub streaming: bool,

    /// Attach the prepared speaker diarizer to this session so
    /// `transcript` events carry `role: "other:speaker_N"` rather
    /// than `"unknown"`. Caller must `prepareDiarizer()` first.
    /// Only consulted for `Mic` sources and only in builds with the
    /// `diarization` feature. If no diarizer has been prepared, a
    /// warning is logged and the session runs without diarization.
    #[serde(default)]
    pub diarizer: bool,

    /// Attach the enrollment-based SpeakerPipeline so segments
    /// matching an enrolled voiceprint get `role: "enrolled_user"`.
    /// Pipeline is built lazily from `~/.car/voiceprints/` on first
    /// use; layered with the diarizer when both are on (pipeline
    /// wins for the enrolled user, diarizer provides the cluster id
    /// for everyone else). Only consulted for `Mic` sources; if the
    /// pipeline cannot be built a warning is logged and the session
    /// proceeds without enrollment.
    #[serde(default)]
    pub enrolled: bool,

    /// Voice-context prompt overlay prepended to system prompts on
    /// the voice-invoked inference path. `None` uses the built-in
    /// default (`car_voice::DEFAULT_VOICE_PROMPT_OVERLAY`). An
    /// empty string disables the overlay. Threaded into
    /// `VoiceConfig.voice_prompt_overlay`.
    #[serde(default)]
    pub voice_prompt_overlay: Option<String>,

    /// Streaming STT provider override. Today only meaningful with
    /// `AudioSourceSpec::PcmPush`: when set to `"elevenlabs"`, the
    /// listener opens an ElevenLabs Realtime websocket
    /// (`wss://api.elevenlabs.io/v1/speech-to-text/realtime`) and
    /// forwards pushed PCM frames to it instead of running the
    /// in-process VAD + batch STT pipeline. The ElevenLabs path
    /// requires 1-channel PCM. Default (`None`) and the explicit
    /// value `"local"` both keep the existing local pipeline; any
    /// other value is rejected for `PcmPush` sources. Requires
    /// `ELEVENLABS_API_KEY` in env, config, or keychain. Other source
    /// kinds ignore this field.
    #[serde(default)]
    pub provider: Option<String>,
}

/// Build a [`Listener`] for the requested audio source.
///
/// Each binding will eventually want to grow this beyond `Mic` —
/// system audio (task #5), file replay, fifo, push. For task #1 we
/// stand up the scaffold with `Mic` wired and the rest returning a
/// structured "not implemented" error so the FFI surface can be
/// landed and consumers can be developed against the shape.
/// Side registry mapping `session_id` → `PushHandle` for pcm_push
/// sessions. Populated when a pcm_push session starts; cleared when
/// it stops. The transcribe_stream_push entry point looks up the
/// handle here and forwards bytes.
fn push_handles() -> &'static DashMap<String, PushHandle> {
    static MAP: OnceLock<DashMap<String, PushHandle>> = OnceLock::new();
    MAP.get_or_init(DashMap::new)
}

// ─── Parakeet pre-warm (feature-gated) ────────────────────────────────────

/// Process-wide slot holding the Parakeet provider once it has been
/// prepared. Empty until `prepare_parakeet` succeeds.
#[cfg(feature = "parakeet")]
fn prepared_parakeet() -> &'static OnceLock<std::sync::Arc<car_voice::ParakeetSttProvider>> {
    static PROVIDER: OnceLock<Arc<car_voice::ParakeetSttProvider>> = OnceLock::new();
    &PROVIDER
}

/// Eagerly download and load the Parakeet model. Idempotent — safe
/// to call from app startup.
///
/// First call downloads ~600 MB to `~/.car/models/parakeet-tdt-0.6b-v2-int8/`
/// (network-bound, can take minutes). Once a provider has been
/// prepared successfully, subsequent calls skip the load entirely and
/// return immediately. A failed attempt leaves the slot empty, so a
/// later call retries.
///
/// # Returns
///
/// A JSON string of the form `{"model_dir": "<path>", "ready": true}`.
///
/// # Errors
///
/// Returns a human-readable message when the model directory cannot be
/// resolved, the blocking task fails to join, or the model load fails.
#[cfg(feature = "parakeet")]
pub async fn prepare_parakeet() -> Result<String, String> {
    let dir = car_voice::ParakeetSttProvider::default_model_dir()
        .map_err(|e| format!("model dir resolution failed: {}", e))?;
    let dir_str = dir.display().to_string();

    // Only load when nothing has been prepared yet. Re-running the load
    // for an already-populated slot would do the expensive work again
    // and then throw the result away, because `OnceLock::set` refuses
    // to overwrite.
    if prepared_parakeet().get().is_none() {
        let provider = std::sync::Arc::new(car_voice::ParakeetSttProvider::new(dir));
        let provider_for_load = provider.clone();
        // Model preparation is blocking work; keep it off the async executor.
        tokio::task::spawn_blocking(move || provider_for_load.prepare())
            .await
            .map_err(|e| format!("blocking task failed: {}", e))?
            .map_err(|e| format!("parakeet load failed: {}", e))?;
        // A concurrent caller may have won the race; either provider is
        // fully prepared, so losing the `set` is harmless.
        let _ = prepared_parakeet().set(provider);
    }

    Ok(serde_json::json!({ "model_dir": dir_str, "ready": true }).to_string())
}

/// Stand-in used when the `parakeet` feature is off, so FFI callers
/// still have a function to call. Always fails with a plain-text
/// message explaining how to rebuild with the feature enabled.
#[cfg(not(feature = "parakeet"))]
pub async fn prepare_parakeet() -> Result<String, String> {
    const MISSING_FEATURE: &str =
        "car-ffi-common was built without the `parakeet` feature; rebuild with --features parakeet";
    Err(String::from(MISSING_FEATURE))
}

/// Return a shared handle to the prepared Parakeet provider, or `None`
/// when `prepare_parakeet` has not completed successfully yet.
#[cfg(feature = "parakeet")]
#[allow(dead_code)] // conditionally compiled — used only under the `parakeet` feature
pub(crate) fn current_prepared_parakeet() -> Option<std::sync::Arc<car_voice::ParakeetSttProvider>>
{
    let provider = prepared_parakeet().get()?;
    Some(std::sync::Arc::clone(provider))
}

// ─── Diarizer pre-warm (feature-gated) ────────────────────────────────────

/// Process-wide slot holding the speaker diarizer once it has been
/// prepared. Empty until `prepare_diarizer` succeeds.
#[cfg(feature = "diarization")]
fn prepared_diarizer() -> &'static OnceLock<car_voice::SharedDiarizer> {
    static DIARIZER: OnceLock<car_voice::SharedDiarizer> = OnceLock::new();
    &DIARIZER
}

/// Eagerly download and load the speaker diarizer ONNX. Idempotent:
/// once a diarizer has been prepared, later calls skip the load and
/// return immediately. A failed attempt leaves the slot empty, so a
/// later call retries.
///
/// # Returns
///
/// The JSON string `{"ready": true}`.
///
/// # Errors
///
/// Returns a human-readable message when the blocking task fails to
/// join or the diarizer cannot be constructed.
#[cfg(feature = "diarization")]
pub async fn prepare_diarizer() -> Result<String, String> {
    // Only construct a diarizer when the slot is still empty. Building
    // another one would repeat the expensive load and then be dropped,
    // because `OnceLock::set` refuses to overwrite.
    if prepared_diarizer().get().is_none() {
        // Construction is blocking work; keep it off the async executor.
        let diarizer = tokio::task::spawn_blocking(|| {
            car_voice::SpeakerDiarizer::new(car_voice::DiarizationConfig::default())
        })
        .await
        .map_err(|e| format!("blocking task failed: {}", e))?
        .map_err(|e| format!("diarizer load failed: {}", e))?;
        // A concurrent caller may have won the race; losing the `set` is harmless.
        let _ = prepared_diarizer().set(std::sync::Arc::new(diarizer));
    }
    Ok(serde_json::json!({ "ready": true }).to_string())
}

/// Stand-in used when the `diarization` feature is off, so FFI callers
/// still have a function to call. Always fails with a plain-text
/// message explaining how to rebuild with the feature enabled.
#[cfg(not(feature = "diarization"))]
pub async fn prepare_diarizer() -> Result<String, String> {
    const MISSING_FEATURE: &str =
        "car-ffi-common was built without the `diarization` feature; rebuild with --features diarization";
    Err(String::from(MISSING_FEATURE))
}

/// Return a clone of the prepared diarizer handle, or `None` when
/// `prepare_diarizer` has not completed successfully yet.
#[cfg(feature = "diarization")]
pub fn current_prepared_diarizer() -> Option<car_voice::SharedDiarizer> {
    let shared = prepared_diarizer().get()?;
    Some(shared.clone())
}

// ─── Speaker enrollment ───────────────────────────────────────────────────

/// Pre-built SpeakerPipeline loaded from disk-saved enrollments.
/// Built lazily on the first `transcribe_stream_start` /
/// `start_meeting` call with `enrolled: true`.
fn prepared_pipeline() -> &'static OnceLock<std::sync::Arc<car_voice::SpeakerPipeline>> {
    static P: OnceLock<std::sync::Arc<car_voice::SpeakerPipeline>> = OnceLock::new();
    &P
}

/// Construct (or return the cached) SpeakerPipeline pre-populated
/// with the first enrollment found on disk. Today's pipeline supports
/// one enrolled user; multi-enrollment is a follow-up.
pub fn current_or_build_pipeline() -> Result<std::sync::Arc<car_voice::SpeakerPipeline>, String> {
    if let Some(p) = prepared_pipeline().get() {
        return Ok(p.clone());
    }
    let pipeline = car_voice::SpeakerPipeline::baseline();
    let pipeline = match car_voice::list_enrollments()
        .map_err(|e| format!("list enrollments: {}", e))?
        .into_iter()
        .next()
    {
        Some(info) => match car_voice::load_enrollment(&info.label) {
            Ok(e) => pipeline.with_enrollment(e),
            Err(e) => {
                tracing::warn!(
                    "[enrollment] failed to load '{}': {} — pipeline starts empty",
                    info.label,
                    e
                );
                pipeline
            }
        },
        None => pipeline,
    };
    let arc = std::sync::Arc::new(pipeline);
    let _ = prepared_pipeline().set(arc.clone());
    Ok(arc)
}

/// JSON shape for `enrollSpeaker`. Mirrors AudioSourceSpec — caller
/// declares whether they're handing over a WAV path or pre-decoded
/// PCM bytes (with sample rate and channel count).
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum EnrollAudioSpec {
    /// 16-bit signed little-endian PCM bytes the caller pre-decoded.
    Pcm {
        sample_rate: u32,
        #[serde(default = "default_channels")]
        channels: u16,
        /// PCM bytes (length must be a multiple of `channels * 2`).
        data_b64: String,
    },
    /// Path to a WAV file on disk; we decode via `hound`.
    Wav { path: String },
}

/// Enroll a new speaker. Returns the saved enrollment path JSON.
pub async fn enroll_speaker(label: &str, audio_json: &str) -> Result<String, String> {
    let spec: EnrollAudioSpec =
        serde_json::from_str(audio_json).map_err(|e| format!("invalid audio JSON: {}", e))?;
    let label = label.to_string();
    let result = tokio::task::spawn_blocking(move || -> Result<String, String> {
        let enrollment = match spec {
            EnrollAudioSpec::Pcm {
                sample_rate,
                channels,
                data_b64,
            } => {
                use base64::Engine;
                let bytes = base64::engine::general_purpose::STANDARD
                    .decode(&data_b64)
                    .map_err(|e| format!("invalid pcm data_b64: {}", e))?;
                if bytes.len() % 2 != 0 {
                    return Err("pcm bytes length must be even (16-bit samples)".to_string());
                }
                let samples_i16: Vec<i16> = bytes
                    .chunks_exact(2)
                    .map(|c| i16::from_le_bytes([c[0], c[1]]))
                    .collect();
                car_voice::enroll_from_pcm(&label, &samples_i16, sample_rate, channels)
                    .map_err(|e| format!("enroll: {}", e))?
            }
            EnrollAudioSpec::Wav { path } => car_voice::enroll_from_wav(&label, path.as_ref())
                .map_err(|e| format!("enroll: {}", e))?,
        };
        let path = car_voice::save_enrollment(&enrollment).map_err(|e| e.to_string())?;
        // Reset the cached pipeline so the next session picks this enrollment up.
        let _ = prepared_pipeline();
        Ok(serde_json::json!({
            "label": enrollment.label,
            "path": path.display().to_string(),
            "model_id": enrollment.embedding.model,
        })
        .to_string())
    })
    .await
    .map_err(|e| format!("blocking task: {}", e))??;
    Ok(result)
}

/// List all saved enrollments as JSON.
///
/// Returns `{enrollments: [{label, path, model_id}, ...]}`. Fails with
/// the stringified error when the underlying `car_voice` listing fails.
pub fn list_enrollments() -> Result<String, String> {
    let infos = car_voice::list_enrollments().map_err(|e| e.to_string())?;
    let mut entries: Vec<serde_json::Value> = Vec::new();
    for info in infos {
        let path_text = info.path.display().to_string();
        entries.push(serde_json::json!({
            "label": info.label,
            "path": path_text,
            "model_id": info.model_id,
        }));
    }
    let payload = serde_json::json!({ "enrollments": entries });
    Ok(payload.to_string())
}

/// Delete a saved enrollment by label and report the outcome as JSON
/// (`{label, removed: true}`).
///
/// Whether a missing label counts as success is decided by
/// `car_voice::remove_enrollment`; any error it reports is returned
/// in stringified form.
pub fn remove_enrollment(label: &str) -> Result<String, String> {
    match car_voice::remove_enrollment(label) {
        Ok(_) => {
            let payload = serde_json::json!({ "label": label, "removed": true });
            Ok(payload.to_string())
        }
        Err(e) => Err(e.to_string()),
    }
}

/// Build the boxed [`Listener`] for the requested audio source.
///
/// * `session_id` — key under which a `pcm_push` session's push handle
///   is stored in the side registry.
/// * `source` — which capture backend to construct.
/// * `streaming`, `diarizer`, `enrolled` — optional layers; only the
///   `Mic` arm looks at them.
/// * `provider` — streaming STT provider override; only the `PcmPush`
///   arm looks at it.
///
/// # Errors
///
/// Returns a message when the source kind is not wired up on this
/// build/platform, when the Parakeet model directory cannot be resolved,
/// when no Linux monitor source is found, when the ElevenLabs provider
/// is asked for multi-channel PCM, or when `provider` is unknown.
fn build_listener_for_source(
    session_id: &str,
    source: &AudioSourceSpec,
    streaming: bool,
    diarizer: bool,
    enrolled: bool,
    provider: Option<&str>,
) -> Result<Box<dyn Listener>, String> {
    match source {
        AudioSourceSpec::Mic => {
            // Layer the optional features on top of the base listener.
            // Order matters only for readability — both builders are
            // idempotent setters.
            let mut listener = car_voice::CpalListener::new();
            // Streaming-provider selection. macOS gets Apple Speech
            // (free, on-device, no model download); Linux/Windows fall
            // back to Parakeet when the cargo feature is on. When
            // neither applies the flag is a no-op and the configured
            // batch SttProvider runs as today.
            #[cfg(target_os = "macos")]
            if streaming {
                let provider =
                    std::sync::Arc::new(car_voice::AppleSpeechSttProvider::new());
                listener = listener.with_apple_speech_streaming(provider);
            }
            #[cfg(all(not(target_os = "macos"), feature = "parakeet"))]
            if streaming {
                let provider = match current_prepared_parakeet() {
                    Some(prepared) => prepared,
                    None => {
                        // Nothing was pre-warmed: build an unprepared
                        // provider on the default directory. Resolution
                        // failure is reported to the caller instead of
                        // panicking across the FFI boundary.
                        let dir = car_voice::ParakeetSttProvider::default_model_dir()
                            .map_err(|e| format!("model dir resolution failed: {}", e))?;
                        std::sync::Arc::new(car_voice::ParakeetSttProvider::new(dir))
                    }
                };
                listener = listener.with_parakeet_streaming(provider);
            }
            #[cfg(all(not(target_os = "macos"), not(feature = "parakeet")))]
            let _ = streaming;
            #[cfg(feature = "diarization")]
            if diarizer {
                if let Some(d) = current_prepared_diarizer() {
                    listener = listener.with_diarizer(d);
                } else {
                    tracing::warn!(
                        "[voice] diarizer requested but not prepared — call prepare_diarizer() first; defaulting to no diarization"
                    );
                }
            }
            #[cfg(not(feature = "diarization"))]
            let _ = diarizer;
            if enrolled {
                match current_or_build_pipeline() {
                    Ok(p) => listener = listener.with_speaker_pipeline(p),
                    Err(e) => tracing::warn!(
                        "[voice] failed to build SpeakerPipeline: {} — proceeding without enrollment",
                        e
                    ),
                }
            }
            Ok(Box::new(listener))
        }
        #[cfg(all(target_os = "macos", feature = "system-audio-macos"))]
        AudioSourceSpec::System => Ok(Box::new(car_voice::SystemAudioListener::new())),
        #[cfg(target_os = "windows")]
        AudioSourceSpec::System => Ok(Box::new(car_voice::WindowsLoopbackListener::new())),
        #[cfg(target_os = "linux")]
        AudioSourceSpec::System => {
            // Linux loopback is "select the monitor source as your input
            // device" — there is no separate listener type.
            let monitor = car_voice::default_monitor_source().ok_or_else(|| {
                "no PipeWire/PulseAudio monitor source found — install pipewire-pulse or pulseaudio".to_string()
            })?;
            tracing::info!(
                "[voice] linux system audio: using monitor source '{}'",
                monitor
            );
            // Known v1 limitation: the discovered monitor name is only
            // logged. CpalListener picks up `input_device` from
            // VoiceConfig at start time, which cannot be set from here
            // without changing the start signature, so capture runs on
            // the default cpal input device. The clean wiring requires
            // extending AudioSourceSpec::System with an optional device
            // name.
            Ok(Box::new(car_voice::CpalListener::new()))
        }
        #[cfg(not(any(
            all(target_os = "macos", feature = "system-audio-macos"),
            target_os = "windows",
            target_os = "linux",
        )))]
        AudioSourceSpec::System => Err(
            "system audio capture: macOS requires the `system-audio-macos` feature (full Xcode); other platforms (Windows / Linux) need building with their respective target_os"
                .to_string(),
        ),
        AudioSourceSpec::File { .. } => Err(
            "file source not yet wired into transcribeStream — call the existing batch transcribe() in the meantime"
                .to_string(),
        ),
        AudioSourceSpec::Fifo { .. } => {
            Err("fifo source not yet wired into transcribeStream".to_string())
        }
        AudioSourceSpec::PcmPush {
            sample_rate,
            channels,
        } => match provider {
            Some("elevenlabs") => {
                // Validate before touching the side registry so a
                // rejected request leaves no handle behind.
                if *channels != 1 {
                    return Err(format!(
                        "elevenlabs provider requires 1-channel PCM, got {channels}; \
                         downmix on the caller side or open a `PcmPush` with channels=1"
                    ));
                }
                let listener =
                    car_voice::ElevenLabsStreamingListener::new(*sample_rate);
                if let Some(handle) = listener.handle() {
                    push_handles().insert(session_id.to_string(), handle);
                }
                Ok(Box::new(listener))
            }
            // None or "local" both fall through to the local push pipeline.
            None | Some("local") => {
                let listener = PushListener::new(*sample_rate, *channels);
                // Snapshot the push handle into the side registry
                // before boxing — once we lose the typed reference we
                // can't get the handle back through `&dyn Listener`.
                if let Some(handle) = listener.handle() {
                    push_handles().insert(session_id.to_string(), handle);
                }
                Ok(Box::new(listener))
            }
            Some(other) => Err(format!(
                "unknown transcribe_stream provider '{other}'; supported: \"elevenlabs\", \"local\" (default)"
            )),
        },
    }
}

/// Start a streaming transcription session. Returns the session id
/// (echo of the caller-provided id, validated and registered).
///
/// Events flow asynchronously to `sink` until the session ends or
/// [`transcribe_stream_stop`] is called.
pub async fn transcribe_stream_start(
    session_id: &str,
    audio_source_json: &str,
    options_json: Option<&str>,
    registry: Arc<VoiceSessionRegistry>,
    sink: Arc<dyn VoiceEventSink>,
) -> Result<String, String> {
    if session_id.is_empty() {
        return Err("session_id must not be empty".to_string());
    }
    if registry.contains(session_id) {
        return Err(format!("session_id '{}' already exists", session_id));
    }

    let source: AudioSourceSpec = serde_json::from_str(audio_source_json)
        .map_err(|e| format!("invalid audioSource JSON: {}", e))?;
    let opts: TranscribeStreamOptions = match options_json {
        Some(s) if !s.is_empty() => {
            serde_json::from_str(s).map_err(|e| format!("invalid options JSON: {}", e))?
        }
        _ => TranscribeStreamOptions::default(),
    };

    let listener = build_listener_for_source(
        session_id,
        &source,
        opts.streaming,
        opts.diarizer,
        opts.enrolled,
        opts.provider.as_deref(),
    )?;
    let session = VoiceSession::new(session_id, listener);
    let config = build_voice_config(&opts);
    session
        .start(config, sink)
        .await
        .map_err(|e| format!("session start error: {}", e))?;
    registry
        .insert(session)
        .map_err(|e| format!("registry insert error: {}", e))?;
    Ok(serde_json::json!({ "session_id": session_id }).to_string())
}

/// Stop a streaming transcription session.
///
/// On success returns `{"session_id": "<id>", "stopped": true}`. If
/// the registry reports an error for the id, it is returned as a
/// `session stop error: …` message the caller can ignore if desired.
pub async fn transcribe_stream_stop(
    session_id: &str,
    registry: Arc<VoiceSessionRegistry>,
) -> Result<String, String> {
    // The push handle goes first: in-flight pushes should fail fast
    // instead of queueing against a session that is being torn down.
    let _ = push_handles().remove(session_id);

    match registry.stop(session_id).await {
        Ok(_) => {
            let payload = serde_json::json!({ "session_id": session_id, "stopped": true });
            Ok(payload.to_string())
        }
        Err(e) => Err(format!("session stop error: {}", e)),
    }
}

/// Push a PCM frame into a `pcm_push` session.
///
/// `pcm_frame` is interpreted as 16-bit signed little-endian PCM at
/// the sample rate / channel layout declared when the session was
/// started; any downmixing is the listener's job, not this
/// function's. The bytes are copied before being handed to the
/// session's push handle.
///
/// Returns `{accepted: true}` on success. Fails when the session id is
/// not registered, when the session has no push handle (it was not
/// started with a `pcm_push` source), or when the handle rejects the
/// frame.
pub async fn transcribe_stream_push(
    session_id: &str,
    pcm_frame: &[u8],
    registry: Arc<VoiceSessionRegistry>,
) -> Result<String, String> {
    let known = registry.contains(session_id);
    if !known {
        return Err(format!("unknown session_id '{}'", session_id));
    }

    // Clone the handle out of the map inside this statement so the map
    // guard is released before the await below.
    let handle = match push_handles().get(session_id) {
        Some(entry) => entry.value().clone(),
        None => {
            return Err(format!(
                "session '{}' is not a pcm_push session (push only works for sources of kind 'pcm_push')",
                session_id
            ));
        }
    };

    let frame = pcm_frame.to_vec();
    handle
        .feed_pcm(frame)
        .await
        .map_err(|e: car_voice::VoiceError| format!("push failed: {}", e))?;

    let ack = serde_json::json!({ "accepted": true });
    Ok(ack.to_string())
}

/// Report the ids of all voice sessions currently held by `registry`
/// as `{"sessions": [...]}`.
pub fn list_voice_sessions(registry: Arc<VoiceSessionRegistry>) -> String {
    let ids = registry.list();
    let payload = serde_json::json!({ "sessions": ids });
    payload.to_string()
}

// ─── TTS streaming ────────────────────────────────────────────────────────

/// Process-wide map of `stream_id` → abort handle for in-flight TTS
/// streams. `tts_stream_start` adds an entry after spawning the
/// forwarder task; the entry is removed either by that task when it
/// finishes on its own or by `tts_stream_cancel`. Created lazily on
/// first access.
fn tts_stream_handles() -> &'static DashMap<String, tokio::task::AbortHandle> {
    type AbortMap = DashMap<String, tokio::task::AbortHandle>;
    static STREAMS: OnceLock<AbortMap> = OnceLock::new();
    STREAMS.get_or_init(AbortMap::new)
}

/// Optional per-stream knobs accepted by [`tts_stream_start`]. Every
/// field is optional and stays provider-agnostic — concrete provider
/// selection lives in [`car_voice::build_tts_speaker`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TtsStreamOptions {
    /// TTS provider override (`"elevenlabs"`, `"local"`, `"kokoro"`,
    /// `"apple_speech"`). `None` uses the [`VoiceConfig`] default for
    /// the platform. Matching is case-insensitive and also accepts
    /// hyphenated / underscored spellings; unknown names make
    /// [`tts_stream_start`] fail.
    pub provider: Option<String>,
    /// Provider-specific voice id. [`tts_stream_start`] copies it into
    /// `VoiceConfig.elevenlabs_voice_id`, so today it is the ElevenLabs
    /// voice id; providers that don't read that config field ignore it.
    pub voice_id: Option<String>,
    /// When true, [`tts_stream_start`] delivers audio as binary frames
    /// through the sink's `send_binary` instead of JSON `tts_chunk`
    /// events, and requires the stream id to be accepted by the binary
    /// frame builder (32 hex chars). Defaults to false.
    #[serde(default)]
    pub binary_frames: bool,
}

fn parse_tts_provider_kind(s: &str) -> Result<car_voice::TtsProviderKind, String> {
    match s.to_ascii_lowercase().as_str() {
        "elevenlabs" | "eleven-labs" | "eleven_labs" => {
            Ok(car_voice::TtsProviderKind::Elevenlabs)
        }
        "local" => Ok(car_voice::TtsProviderKind::Local),
        "kokoro" => Ok(car_voice::TtsProviderKind::Kokoro),
        "apple_speech" | "apple-speech" | "applespeech" => {
            Ok(car_voice::TtsProviderKind::AppleSpeech)
        }
        other => Err(format!("unknown tts provider '{}'", other)),
    }
}

/// Start a streaming TTS synthesis. The forwarder task pulls
/// [`car_voice::TtsChunk`]s off the provider's `synth_stream` receiver
/// and forwards each one as a `voice.event` with `type = "tts_chunk"`
/// through the supplied [`VoiceEventSink`].
///
/// Caller-provided `stream_id` is opaque — it's used as the session id
/// the sink wraps each notification with, and as the lookup key for
/// [`tts_stream_cancel`]. Re-using an active stream id returns an
/// error.
///
/// Chunk JSON shape:
/// ```jsonc
/// {
///   "type": "tts_chunk",
///   "stream_id": "<id>",
///   "seq": 0,
///   "audio_b64": "<base64>",
///   "format": "mp3" | "wav",
///   "is_final": false
/// }
/// ```
///
/// The forwarder unregisters itself from the handle map on natural
/// completion or on cancellation.
pub async fn tts_stream_start(
    stream_id: &str,
    text: &str,
    options_json: Option<&str>,
    sink: Arc<dyn VoiceEventSink>,
) -> Result<String, String> {
    use base64::Engine as _;

    if stream_id.is_empty() {
        return Err("stream_id must not be empty".to_string());
    }
    if tts_stream_handles().contains_key(stream_id) {
        return Err(format!("stream_id '{}' already exists", stream_id));
    }

    let opts: TtsStreamOptions = match options_json {
        Some(s) if !s.is_empty() => {
            serde_json::from_str(s).map_err(|e| format!("invalid options JSON: {}", e))?
        }
        _ => TtsStreamOptions::default(),
    };

    let mut config = VoiceConfig::default();
    if let Some(provider) = &opts.provider {
        config.tts_provider = parse_tts_provider_kind(provider)?;
    }
    if let Some(voice_id) = &opts.voice_id {
        // ElevenLabs is the only provider with a config-level voice id
        // today. Other providers ignore the field.
        config.elevenlabs_voice_id = voice_id.clone();
    }

    let speaker = car_voice::build_tts_speaker(&config)
        .map_err(|e| format!("tts provider init: {}", e))?;

    let mut rx = speaker
        .synth_stream(text)
        .await
        .map_err(|e| format!("synth_stream: {}", e))?;

    let stream_id_owned = stream_id.to_string();
    let sink_for_task = sink.clone();
    let binary_frames = opts.binary_frames;

    // Reject binary mode if the stream_id isn't a valid 32-char hex
    // UUID — the binary header requires 16 raw UUID bytes and the
    // forwarder can't fall back gracefully mid-stream.
    if binary_frames {
        binary::build_frame(binary::FRAME_TYPE_TTS_CHUNK, &stream_id_owned, 0, 0, &[]).map_err(
            |e| {
                format!(
                    "binary_frames=true requires stream_id be 32 hex chars: {}",
                    e
                )
            },
        )?;
    }

    let join_handle = tokio::spawn(async move {
        while let Some(chunk) = rx.recv().await {
            let format_byte = match chunk.format {
                car_voice::AudioFormat::Mp3 => binary::FORMAT_MP3,
                car_voice::AudioFormat::Wav => binary::FORMAT_WAV,
            };
            let format_str = match chunk.format {
                car_voice::AudioFormat::Mp3 => "mp3",
                car_voice::AudioFormat::Wav => "wav",
            };

            if binary_frames {
                // Binary path: audio rides only in the binary frame; the
                // bot picks up format/seq/final from the header.
                if let Ok(frame) = binary::build_frame(
                    binary::FRAME_TYPE_TTS_CHUNK,
                    &stream_id_owned,
                    chunk.seq,
                    format_byte,
                    &chunk.bytes,
                ) {
                    sink_for_task.send_binary(frame);
                }
                if chunk.is_final {
                    if let Ok(marker) = binary::build_frame(
                        binary::FRAME_TYPE_TTS_FINAL,
                        &stream_id_owned,
                        chunk.seq,
                        0,
                        &[],
                    ) {
                        sink_for_task.send_binary(marker);
                    }
                    break;
                }
            } else {
                let event_json = serde_json::json!({
                    "type": "tts_chunk",
                    "stream_id": stream_id_owned,
                    "seq": chunk.seq,
                    "audio_b64": base64::engine::general_purpose::STANDARD.encode(&chunk.bytes),
                    "format": format_str,
                    "is_final": chunk.is_final,
                })
                .to_string();
                sink_for_task.send(&stream_id_owned, event_json);
                if chunk.is_final {
                    break;
                }
            }
        }
        tts_stream_handles().remove(&stream_id_owned);
    });

    tts_stream_handles().insert(stream_id.to_string(), join_handle.abort_handle());

    Ok(serde_json::json!({
        "stream_id": stream_id,
        "binary_frames": binary_frames,
    })
    .to_string())
}

/// Cancel an in-flight TTS stream by id.
///
/// Removes the stream's abort handle from the handle map and aborts
/// the forwarder task. An id with no registered handle is not an
/// error: the reply is `{cancelled: false}`, so callers can race
/// cancellation against natural completion safely. This function
/// never returns `Err`.
pub async fn tts_stream_cancel(stream_id: &str) -> Result<String, String> {
    let removed = tts_stream_handles().remove(stream_id);
    let cancelled = removed.is_some();
    if let Some((_, abort_handle)) = removed {
        abort_handle.abort();
    }
    let reply = serde_json::json!({ "stream_id": stream_id, "cancelled": cancelled });
    Ok(reply.to_string())
}

/// Report the ids of every TTS stream currently present in the handle
/// table, as a JSON object of the form `{"streams": [...]}`.
///
/// Useful for diagnostics and for the bot to verify that a cancellation
/// took effect.
pub fn list_tts_streams() -> String {
    let handles = tts_stream_handles();
    let mut ids: Vec<String> = Vec::new();
    for entry in handles.iter() {
        ids.push(entry.key().clone());
    }
    serde_json::json!({ "streams": ids }).to_string()
}

// ─── Binary frame wire format ─────────────────────────────────────────────

/// Wire format for CAR's binary WebSocket frames. Used by the meeting-bot
/// PCM hot-path (both directions) to avoid base64 + JSON overhead.
///
/// All multi-byte fields are little-endian. 26-byte fixed header followed
/// by an opaque payload:
///
/// ```text
///   offset  size  field
///   0       1     type tag
///   1       16    session/stream UUID (raw bytes; lowercase hex form,
///                 32 chars without dashes, is the JSON-RPC session_id /
///                 stream_id)
///   17      8     seq (u64 LE) — monotonic within a session/stream
///   25      1     format byte (type-specific; see FORMAT_* consts)
///   26+     N     payload bytes
/// ```
pub mod binary {
    /// Inbound — client → server PCM ingest for a `pcm_push` session.
    /// Payload is 16-bit signed LE PCM at the rate / channels declared
    /// at session start.
    pub const FRAME_TYPE_INBOUND_PCM: u8 = 0x01;
    /// Outbound — server → client TTS audio chunk. Format byte
    /// indicates the codec; payload is the raw provider bytes.
    pub const FRAME_TYPE_TTS_CHUNK: u8 = 0x02;
    /// Outbound — server → client end-of-stream marker. Zero-length
    /// payload. Format byte is ignored.
    pub const FRAME_TYPE_TTS_FINAL: u8 = 0x03;
    /// Outbound — server → client error. Payload is a UTF-8 error
    /// message (no length prefix; the whole payload is the message).
    pub const FRAME_TYPE_TTS_ERROR: u8 = 0x04;

    /// Format byte: raw 16-bit signed little-endian PCM.
    pub const FORMAT_PCM_S16LE: u8 = 0x00;
    /// Format byte: MP3.
    pub const FORMAT_MP3: u8 = 0x01;
    /// Format byte: WAV.
    pub const FORMAT_WAV: u8 = 0x02;

    /// Fixed header length in bytes.
    pub const HEADER_LEN: usize = 26;

    /// Parsed view over a binary frame's header + payload slice.
    /// `session_id_hex` is the 32-char lowercase hex form of the UUID
    /// bytes — directly comparable to a registry session_id.
    /// `payload` borrows from the buffer handed to [`parse_frame`].
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct ParsedFrame<'a> {
        pub frame_type: u8,
        pub session_id_hex: String,
        pub seq: u64,
        pub format: u8,
        pub payload: &'a [u8],
    }

    /// Parse the fixed header out of a raw binary frame.
    ///
    /// Everything after the first [`HEADER_LEN`] bytes is returned
    /// untouched as the payload (possibly empty). The type tag and
    /// format byte are not validated here; callers dispatch on them.
    ///
    /// # Errors
    ///
    /// Returns a message naming the actual and required lengths when
    /// `bytes` is shorter than [`HEADER_LEN`].
    pub fn parse_frame(bytes: &[u8]) -> Result<ParsedFrame<'_>, String> {
        use std::fmt::Write as _;

        if bytes.len() < HEADER_LEN {
            return Err(format!(
                "binary frame shorter than header ({} bytes; need {})",
                bytes.len(),
                HEADER_LEN
            ));
        }
        let (header, payload) = bytes.split_at(HEADER_LEN);

        // Bytes 1..17 are the raw UUID; render them as 32 lowercase hex chars.
        let mut session_id_hex = String::with_capacity(32);
        for b in &header[1..17] {
            // Writing into a `String` cannot fail.
            let _ = write!(session_id_hex, "{:02x}", b);
        }

        // Bytes 17..25 are the little-endian sequence number. The slice is
        // exactly 8 bytes long because `header` is exactly HEADER_LEN bytes.
        let mut seq_bytes = [0u8; 8];
        seq_bytes.copy_from_slice(&header[17..25]);

        Ok(ParsedFrame {
            frame_type: header[0],
            session_id_hex,
            seq: u64::from_le_bytes(seq_bytes),
            format: header[25],
            payload,
        })
    }

    /// Build a binary frame from its header fields and a payload.
    ///
    /// `session_id_hex` must be exactly 32 hex digits with no dashes,
    /// signs, or whitespace. Lowercase is the canonical form; uppercase
    /// digits are tolerated and encode to the same bytes, but
    /// [`parse_frame`] always reports the id in lowercase.
    ///
    /// # Errors
    ///
    /// Returns a message when `session_id_hex` has the wrong length or
    /// contains a character that is not a hex digit.
    pub fn build_frame(
        frame_type: u8,
        session_id_hex: &str,
        seq: u64,
        format: u8,
        payload: &[u8],
    ) -> Result<Vec<u8>, String> {
        let uuid_bytes = parse_session_id_hex(session_id_hex)?;
        let mut out = Vec::with_capacity(HEADER_LEN + payload.len());
        out.push(frame_type);
        out.extend_from_slice(&uuid_bytes);
        out.extend_from_slice(&seq.to_le_bytes());
        out.push(format);
        out.extend_from_slice(payload);
        Ok(out)
    }

    /// Decode one ASCII hex digit to its 4-bit value, or `None` if the
    /// byte is not in `0-9`, `a-f` or `A-F`.
    fn hex_nibble(b: u8) -> Option<u8> {
        match b {
            b'0'..=b'9' => Some(b - b'0'),
            b'a'..=b'f' => Some(b - b'a' + 10),
            b'A'..=b'F' => Some(b - b'A' + 10),
            _ => None,
        }
    }

    /// Decode a 32-char hex session id into its 16 raw bytes.
    ///
    /// Each byte is validated individually rather than delegating to
    /// `u8::from_str_radix`, which accepts a leading `+` and would
    /// therefore let pairs such as `"+f"` through as valid hex.
    fn parse_session_id_hex(s: &str) -> Result<[u8; 16], String> {
        if s.len() != 32 {
            return Err(format!(
                "session_id must be 32 hex chars (got {} chars)",
                s.len()
            ));
        }
        if !s.is_ascii() {
            return Err("session_id contains non-ASCII".to_string());
        }
        let mut out = [0u8; 16];
        for (i, (slot, pair)) in out
            .iter_mut()
            .zip(s.as_bytes().chunks_exact(2))
            .enumerate()
        {
            match (hex_nibble(pair[0]), hex_nibble(pair[1])) {
                (Some(hi), Some(lo)) => *slot = (hi << 4) | lo,
                _ => {
                    return Err(format!(
                        "session_id has non-hex chars at index {}",
                        i * 2
                    ))
                }
            }
        }
        Ok(out)
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn roundtrip_preserves_fields() {
            let id = "0123456789abcdef0123456789abcdef";
            let payload = b"hello world";
            let frame = build_frame(FRAME_TYPE_TTS_CHUNK, id, 42, FORMAT_MP3, payload).unwrap();
            let parsed = parse_frame(&frame).unwrap();
            assert_eq!(parsed.frame_type, FRAME_TYPE_TTS_CHUNK);
            assert_eq!(parsed.session_id_hex, id);
            assert_eq!(parsed.seq, 42);
            assert_eq!(parsed.format, FORMAT_MP3);
            assert_eq!(parsed.payload, payload);
        }

        #[test]
        fn short_frame_rejected() {
            assert!(parse_frame(&[0u8; 10]).is_err());
        }

        #[test]
        fn header_only_frame_has_empty_payload() {
            let parsed = parse_frame(&[0u8; HEADER_LEN]).unwrap();
            assert!(parsed.payload.is_empty());
        }

        #[test]
        fn bad_hex_rejected() {
            let bad = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz";
            assert!(build_frame(0x01, bad, 0, 0, &[]).is_err());
        }

        #[test]
        fn wrong_length_hex_rejected() {
            assert!(build_frame(0x01, "abcd", 0, 0, &[]).is_err());
        }

        #[test]
        fn sign_prefixed_pairs_rejected() {
            // `u8::from_str_radix("+1", 16)` is `Ok(1)`; the frame builder
            // must not inherit that leniency.
            let bad = "+1+2+3+4+5+6+7+8+9+a+b+c+d+e+f+0";
            assert!(build_frame(0x01, bad, 0, 0, &[]).is_err());
        }

        #[test]
        fn uppercase_hex_normalizes_to_lowercase() {
            let upper = "0123456789ABCDEF0123456789ABCDEF";
            let frame = build_frame(0x01, upper, 0, 0, &[]).unwrap();
            let parsed = parse_frame(&frame).unwrap();
            assert_eq!(parsed.session_id_hex, upper.to_lowercase());
        }
    }
}

/// Derive the listener's [`VoiceConfig`] from caller-supplied options.
///
/// Starts from `VoiceConfig::default()` and overrides only the fields
/// the caller actually set:
///
/// - `model` is written to `whisper_cpp_model`,
/// - `language` is written to `language`,
/// - `voice_prompt_overlay` is written to `voice_prompt_overlay`.
///
/// `prompt` (bias text) has no corresponding field on `VoiceConfig`
/// yet. It is accepted at the FFI boundary so the JSON shape stays
/// stable, but it is deliberately not applied: inventing a destination
/// for it would silently lose data, which is worse than being explicit
/// about the gap.
fn build_voice_config(opts: &TranscribeStreamOptions) -> VoiceConfig {
    let mut config = VoiceConfig::default();

    // Today the only on-device provider is whisper.cpp, so any model
    // override goes there. When Parakeet lands (task #6) this will
    // branch on `config.stt_provider` and set the right field.
    if let Some(model) = opts.model.clone() {
        config.whisper_cpp_model = model;
    }

    if let Some(language) = opts.language.clone() {
        config.language = language;
    }

    // Only overwrite the default overlay when the caller provided one.
    if let Some(overlay) = opts.voice_prompt_overlay.clone() {
        config.voice_prompt_overlay = Some(overlay);
    }

    // Intentionally ignored — see the doc comment on this function.
    let _ = &opts.prompt;

    config
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `{"kind":"mic"}` must deserialize to the unit `Mic` variant.
    #[test]
    fn parses_mic_source() {
        let s: AudioSourceSpec = serde_json::from_str(r#"{"kind":"mic"}"#).unwrap();
        // `matches!` only evaluates to a bool; without `assert!` the
        // variant check is discarded and the test can never fail on it.
        assert!(matches!(s, AudioSourceSpec::Mic));
    }

    /// `channels` is omitted from the JSON and is expected to come back as 1.
    #[test]
    fn parses_pcm_push_with_defaults() {
        let s: AudioSourceSpec =
            serde_json::from_str(r#"{"kind":"pcm_push","sample_rate":48000}"#).unwrap();
        match s {
            AudioSourceSpec::PcmPush {
                sample_rate,
                channels,
            } => {
                assert_eq!(sample_rate, 48000);
                assert_eq!(channels, 1);
            }
            _ => panic!("wrong variant"),
        }
    }

    /// The `file` variant carries its `path` through unchanged.
    #[test]
    fn parses_file_source() {
        let s: AudioSourceSpec =
            serde_json::from_str(r#"{"kind":"file","path":"/tmp/a.wav"}"#).unwrap();
        match s {
            AudioSourceSpec::File { path } => assert_eq!(path, "/tmp/a.wav"),
            _ => panic!("wrong variant"),
        }
    }

    /// An unrecognized `kind` tag is a deserialization error.
    #[test]
    fn rejects_unknown_kind() {
        let res: Result<AudioSourceSpec, _> = serde_json::from_str(r#"{"kind":"radio"}"#);
        assert!(res.is_err());
    }

    /// An empty options object leaves every checked field unset / false.
    #[test]
    fn options_defaults_are_empty() {
        let opts: TranscribeStreamOptions = serde_json::from_str("{}").unwrap();
        assert!(opts.model.is_none());
        assert!(opts.language.is_none());
        assert!(!opts.emit_audio_meta);
    }

    /// Stopping a session id the registry has never seen yields an error
    /// that mentions the id or says the session is not running.
    #[tokio::test]
    async fn unknown_session_stop_returns_error() {
        let registry = Arc::new(VoiceSessionRegistry::new());
        let err = transcribe_stream_stop("nope", registry).await.unwrap_err();
        assert!(err.contains("nope") || err.contains("not running"));
    }
}