car-ffi-common 0.6.0

//! JSON wrappers for voice streaming sessions.
//!
//! Each FFI binding (NAPI, PyO3, WebSocket server) provides:
//!   - a single shared [`VoiceSessionRegistry`] (typically behind a
//!     `OnceLock` so the same registry survives across function calls),
//!   - a [`VoiceEventSink`] impl that fans events to its own consumer
//!     (TSF callback, Python callable, WS notification frame).
//!
//! These wrappers do the JSON parsing, build the appropriate
//! [`Listener`], wrap it in a [`VoiceSession`], and hand it to the
//! registry. They do not own any TSF / Python / network state.
//!
//! ## AudioSourceSpec wire format
//!
//! ```jsonc
//! { "kind": "mic" }                                  // default microphone
//! { "kind": "file",     "path": "/abs/path.wav" }    // pre-recorded, streamed through the same pipeline
//! { "kind": "fifo",     "path": "/tmp/audio.fifo" }  // named pipe, caller writes 16-bit PCM
//! { "kind": "system" }                               // OS output capture (macOS needs the `system-audio-macos` feature; Windows loopback; Linux monitor lookup)
//! { "kind": "pcm_push", "sample_rate": 16000, "channels": 1 }
//! ```
//!
//! Only the variants the current build actually wires up are accepted.
//! Unimplemented variants return a structured "not implemented" error
//! so JS / Python clients can distinguish missing capability from a bad
//! request.

use car_voice::{
    Listener, PushHandle, PushListener, VoiceConfig, VoiceEventSink, VoiceSession,
    VoiceSessionRegistry,
};
use dashmap::DashMap;
use serde::{Deserialize, Serialize};
use std::sync::{Arc, OnceLock};

/// Caller-supplied audio source description. Parsed from the JSON
/// passed to [`transcribe_stream_start`].
///
/// Serialized as an internally tagged object: the `kind` field selects
/// the variant using its snake_case name (`mic`, `file`, `fifo`,
/// `system`, `pcm_push`) and the remaining keys are that variant's
/// fields. Parsing succeeds for every variant; whether a listener can
/// actually be built is decided later by `build_listener_for_source`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum AudioSourceSpec {
    /// Default OS microphone via `cpal`.
    Mic,
    /// Pre-recorded file streamed through the listener pipeline.
    /// Currently rejected at session start ("not yet wired").
    File { path: String },
    /// Named pipe / fifo — caller writes 16-bit PCM frames.
    /// Currently rejected at session start ("not yet wired").
    Fifo { path: String },
    /// OS audio output capture (system audio). Availability depends on
    /// the target OS and enabled cargo features.
    System,
    /// Caller pushes PCM frames via `transcribe_stream_push`.
    PcmPush {
        /// Sample rate of the pushed frames, handed to `PushListener::new`.
        sample_rate: u32,
        /// Channel count of the pushed frames; defaults to 1 when the
        /// key is absent from the JSON.
        #[serde(default = "default_channels")]
        channels: u16,
    },
}

/// Serde default for the `channels` field of PCM specs: mono.
fn default_channels() -> u16 {
    const MONO: u16 = 1;
    MONO
}

/// Optional per-session configuration knobs, parsed from the options
/// JSON passed to [`transcribe_stream_start`]. Every key may be
/// omitted; `{}` yields the all-default value.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TranscribeStreamOptions {
    /// STT model id override (e.g. "whisper-large-v3", "parakeet-tdt-0.6b").
    /// `build_voice_config` copies it into `VoiceConfig::whisper_cpp_model`.
    pub model: Option<String>,
    /// Language hint (BCP-47 like "en", "en-US"). When `None` the
    /// `VoiceConfig` default language is left untouched (presumably
    /// auto-detect — confirm against `car_voice`).
    pub language: Option<String>,
    /// Bias prompt — words/names to nudge the recognizer toward.
    /// Accepted so the JSON shape is stable, but currently ignored by
    /// `build_voice_config`.
    pub prompt: Option<String>,
    /// If true, emit `audio_chunk` events with frame counts (samples
    /// are not included).
    /// NOTE(review): this flag is parsed but not read anywhere in this
    /// module — confirm where (or whether) it is consumed.
    #[serde(default)]
    pub emit_audio_meta: bool,
    /// Enable native streaming partials. Only takes effect for `Mic`
    /// sources: on macOS the listener is given an Apple Speech
    /// provider; on other platforms it is given Parakeet when the
    /// build was compiled with the `parakeet` feature. In every other
    /// combination (no feature, or a non-`Mic` source) the flag is
    /// ignored without an error.
    #[serde(default)]
    pub streaming: bool,

    /// Attach the prepared speaker diarizer to this session so
    /// `transcript` events carry `role: "other:speaker_N"` rather
    /// than `"unknown"`. Caller must `prepareDiarizer()` first.
    /// Requires the `diarization` feature. Ignored (with a warning
    /// log) if no diarizer has been prepared, and ignored entirely if
    /// the source isn't `Mic`.
    #[serde(default)]
    pub diarizer: bool,

    /// Attach the enrollment-based SpeakerPipeline so segments
    /// matching an enrolled voiceprint get `role: "enrolled_user"`.
    /// Pipeline is built lazily from the enrollments saved on disk on
    /// first use; layered with the diarizer when both are on (pipeline
    /// wins for the enrolled user, diarizer provides the cluster id
    /// for everyone else). Only applied to `Mic` sources; a pipeline
    /// build failure is logged and the session proceeds without it.
    #[serde(default)]
    pub enrolled: bool,
}

/// Build a [`Listener`] for the requested audio source.
///
/// Each binding will eventually want to grow this beyond `Mic` —
/// system audio (task #5), file replay, fifo, push. For task #1 we
/// stand up the scaffold with `Mic` wired and the rest returning a
/// structured "not implemented" error so the FFI surface can be
/// landed and consumers can be developed against the shape.
/// Side registry mapping `session_id` → `PushHandle` for pcm_push
/// sessions. Populated when a pcm_push session starts; cleared when
/// it stops. The transcribe_stream_push entry point looks up the
/// handle here and forwards bytes.
fn push_handles() -> &'static DashMap<String, PushHandle> {
    static MAP: OnceLock<DashMap<String, PushHandle>> = OnceLock::new();
    MAP.get_or_init(DashMap::new)
}

// ─── Parakeet pre-warm (feature-gated) ────────────────────────────────────

/// Process-wide slot for the pre-warmed Parakeet provider.
///
/// Stays empty until [`prepare_parakeet`] succeeds; read through
/// `current_prepared_parakeet`.
#[cfg(feature = "parakeet")]
fn prepared_parakeet() -> &'static OnceLock<Arc<car_voice::ParakeetSttProvider>> {
    static SLOT: OnceLock<Arc<car_voice::ParakeetSttProvider>> = OnceLock::new();
    &SLOT
}

/// Eagerly download and load the Parakeet model. Idempotent — safe
/// to call from app startup.
///
/// First call downloads ~600 MB to `~/.car/models/parakeet-tdt-0.6b-v2-int8/`
/// (network-bound, can take minutes). Once a provider has been cached,
/// subsequent calls skip the load entirely and return immediately.
///
/// # Returns
///
/// A JSON string of the form `{"model_dir": "<path>", "ready": true}`.
///
/// # Errors
///
/// Returns a plain error string when the model directory cannot be
/// resolved, the blocking task fails to join, or the model fails to load.
#[cfg(feature = "parakeet")]
pub async fn prepare_parakeet() -> Result<String, String> {
    let dir = car_voice::ParakeetSttProvider::default_model_dir()
        .map_err(|e| format!("model dir resolution failed: {}", e))?;
    let dir_str = dir.display().to_string();

    // Only pay for the download / model load when nothing is cached yet.
    // Re-running it on every call would build a second provider whose
    // `set` below is rejected, throwing the freshly loaded model away.
    if prepared_parakeet().get().is_none() {
        let provider = Arc::new(car_voice::ParakeetSttProvider::new(dir));
        let provider_for_load = Arc::clone(&provider);
        // `prepare()` blocks on network and disk; keep it off the async executor.
        tokio::task::spawn_blocking(move || provider_for_load.prepare())
            .await
            .map_err(|e| format!("blocking task failed: {}", e))?
            .map_err(|e| format!("parakeet load failed: {}", e))?;
        // A concurrent caller may have won the race; either provider is
        // fully loaded, so losing `set` is harmless.
        let _ = prepared_parakeet().set(provider);
    }

    Ok(serde_json::json!({ "model_dir": dir_str, "ready": true }).to_string())
}

/// Stand-in used when the `parakeet` feature is compiled out, so FFI
/// callers always have a `prepare_parakeet` symbol to call.
///
/// Performs no work and always returns `Err` with a plain-text message
/// telling the caller to rebuild with the feature enabled.
#[cfg(not(feature = "parakeet"))]
pub async fn prepare_parakeet() -> Result<String, String> {
    let message = String::from(
        "car-ffi-common was built without the `parakeet` feature; rebuild with --features parakeet",
    );
    Err(message)
}

/// Return a clone of the pre-warmed Parakeet provider, or `None` when
/// [`prepare_parakeet`] has not completed successfully yet.
#[cfg(feature = "parakeet")]
pub(crate) fn current_prepared_parakeet() -> Option<Arc<car_voice::ParakeetSttProvider>> {
    let cached = prepared_parakeet().get()?;
    Some(Arc::clone(cached))
}

// ─── Diarizer pre-warm (feature-gated) ────────────────────────────────────

/// Process-wide slot for the pre-warmed speaker diarizer.
///
/// Stays empty until [`prepare_diarizer`] succeeds; read through
/// `current_prepared_diarizer`.
#[cfg(feature = "diarization")]
fn prepared_diarizer() -> &'static OnceLock<car_voice::SharedDiarizer> {
    static SLOT: OnceLock<car_voice::SharedDiarizer> = OnceLock::new();
    &SLOT
}

/// Eagerly download and load the speaker diarizer ONNX. Idempotent:
/// once a diarizer has been cached, later calls skip the load and
/// return immediately.
///
/// # Returns
///
/// The JSON string `{"ready": true}`.
///
/// # Errors
///
/// Returns a plain error string when the blocking task fails to join
/// or the diarizer fails to load.
#[cfg(feature = "diarization")]
pub async fn prepare_diarizer() -> Result<String, String> {
    // Loading again after a successful prepare would only build a
    // second diarizer that `set` rejects and drops.
    if prepared_diarizer().get().is_none() {
        // Construction may touch network and disk; keep it off the async executor.
        let diarizer = tokio::task::spawn_blocking(|| {
            car_voice::SpeakerDiarizer::new(car_voice::DiarizationConfig::default())
        })
        .await
        .map_err(|e| format!("blocking task failed: {}", e))?
        .map_err(|e| format!("diarizer load failed: {}", e))?;
        // A concurrent caller may have won the race; ignoring the
        // rejected `set` is fine because a loaded diarizer is cached
        // either way.
        let _ = prepared_diarizer().set(std::sync::Arc::new(diarizer));
    }
    Ok(serde_json::json!({ "ready": true }).to_string())
}

/// Stand-in used when the `diarization` feature is compiled out, so
/// FFI callers always have a `prepare_diarizer` symbol to call.
///
/// Performs no work and always returns `Err` with a plain-text message
/// telling the caller to rebuild with the feature enabled.
#[cfg(not(feature = "diarization"))]
pub async fn prepare_diarizer() -> Result<String, String> {
    let message = String::from(
        "car-ffi-common was built without the `diarization` feature; rebuild with --features diarization",
    );
    Err(message)
}

/// Return a clone of the pre-warmed diarizer, or `None` when
/// [`prepare_diarizer`] has not completed successfully yet.
#[cfg(feature = "diarization")]
pub(crate) fn current_prepared_diarizer() -> Option<car_voice::SharedDiarizer> {
    let cached = prepared_diarizer().get()?;
    Some(cached.clone())
}

// ─── Speaker enrollment ───────────────────────────────────────────────────

/// Storage type for the cached SpeakerPipeline: `None` means "not
/// built yet, or invalidated since the last build".
type PipelineCache = std::sync::Mutex<Option<Arc<car_voice::SpeakerPipeline>>>;

/// Cache slot for the SpeakerPipeline loaded from disk-saved
/// enrollments. Filled lazily by [`current_or_build_pipeline`] on the
/// first `transcribe_stream_start` / `start_meeting` call with
/// `enrolled: true`, and emptied again by [`enroll_speaker`].
///
/// This is a `Mutex<Option<_>>` rather than a `OnceLock` because a
/// `OnceLock` held in a `static` can never be cleared, which made it
/// impossible to pick up enrollments saved after the first build.
fn prepared_pipeline() -> &'static PipelineCache {
    static P: PipelineCache = std::sync::Mutex::new(None);
    &P
}

/// Drop the cached pipeline so the next [`current_or_build_pipeline`]
/// call rebuilds it from disk. Sessions that already hold an `Arc` to
/// the old pipeline keep using it.
fn invalidate_prepared_pipeline() {
    // A poisoned lock only means another thread panicked mid-build; the
    // slot is a plain `Option`, so it is always safe to overwrite.
    let mut slot = prepared_pipeline()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    *slot = None;
}

/// Construct (or return the cached) SpeakerPipeline pre-populated
/// with the first enrollment found on disk. Today's pipeline supports
/// one enrolled user; multi-enrollment is a follow-up.
///
/// If that enrollment fails to load, a warning is logged and the
/// baseline (empty) pipeline is cached instead.
///
/// # Errors
///
/// Returns `Err` when the enrollment directory cannot be listed;
/// nothing is cached in that case, so a later call retries.
pub(crate) fn current_or_build_pipeline(
) -> Result<std::sync::Arc<car_voice::SpeakerPipeline>, String> {
    // The lock is held for the whole build so concurrent callers end up
    // sharing one pipeline instead of each building (and keeping) their
    // own copy.
    let mut slot = prepared_pipeline()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    if let Some(cached) = slot.as_ref() {
        return Ok(Arc::clone(cached));
    }

    let pipeline = car_voice::SpeakerPipeline::baseline();
    let pipeline = match car_voice::list_enrollments()
        .map_err(|e| format!("list enrollments: {}", e))?
        .into_iter()
        .next()
    {
        Some(info) => match car_voice::load_enrollment(&info.label) {
            Ok(e) => pipeline.with_enrollment(e),
            Err(e) => {
                tracing::warn!(
                    "[enrollment] failed to load '{}': {} — pipeline starts empty",
                    info.label,
                    e
                );
                pipeline
            }
        },
        None => pipeline,
    };

    let arc = Arc::new(pipeline);
    *slot = Some(Arc::clone(&arc));
    Ok(arc)
}

/// JSON shape for `enrollSpeaker`. Mirrors AudioSourceSpec — caller
/// declares whether they're handing over a WAV path or pre-decoded
/// PCM bytes (with sample rate and channel count).
///
/// Internally tagged on `kind` (`"pcm"` or `"wav"`).
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum EnrollAudioSpec {
    /// 16-bit signed little-endian PCM bytes the caller pre-decoded.
    Pcm {
        /// Sample rate of the PCM data, forwarded to `enroll_from_pcm`.
        sample_rate: u32,
        /// Channel count; defaults to 1 when absent from the JSON.
        #[serde(default = "default_channels")]
        channels: u16,
        /// Standard-alphabet base64 of the PCM bytes (decoded length
        /// must be a multiple of `channels * 2`).
        data_b64: String,
    },
    /// Path to a WAV file on disk; we decode via `hound`.
    Wav { path: String },
}

/// Enroll a new speaker and persist the enrollment to disk.
///
/// `audio_json` is an [`EnrollAudioSpec`]. Decoding, enrollment and
/// saving run on a blocking thread. After a successful save the cached
/// SpeakerPipeline is invalidated so the next session started with
/// `enrolled: true` rebuilds it from disk.
///
/// # Returns
///
/// A JSON string `{"label": ..., "path": ..., "model_id": ...}`.
///
/// # Errors
///
/// Returns a plain error string for malformed JSON, invalid base64,
/// PCM data whose length is not a whole number of 16-bit frames for
/// the declared channel count, enrollment or save failures, or a
/// failed blocking task.
pub async fn enroll_speaker(label: &str, audio_json: &str) -> Result<String, String> {
    let spec: EnrollAudioSpec =
        serde_json::from_str(audio_json).map_err(|e| format!("invalid audio JSON: {}", e))?;
    let label = label.to_string();
    tokio::task::spawn_blocking(move || -> Result<String, String> {
        let enrollment = match spec {
            EnrollAudioSpec::Pcm {
                sample_rate,
                channels,
                data_b64,
            } => {
                use base64::Engine;
                let bytes = base64::engine::general_purpose::STANDARD
                    .decode(&data_b64)
                    .map_err(|e| format!("invalid pcm data_b64: {}", e))?;
                if bytes.len() % 2 != 0 {
                    return Err("pcm bytes length must be even (16-bit samples)".to_string());
                }
                // Enforce the documented frame alignment: each frame is
                // one 16-bit sample per channel. Guard zero first so the
                // modulo below cannot divide by zero.
                if channels == 0 {
                    return Err("pcm channels must be at least 1".to_string());
                }
                let frame_bytes = usize::from(channels) * 2;
                if bytes.len() % frame_bytes != 0 {
                    return Err(format!(
                        "pcm bytes length must be a multiple of channels * 2 ({} bytes per frame)",
                        frame_bytes
                    ));
                }
                let samples_i16: Vec<i16> = bytes
                    .chunks_exact(2)
                    .map(|c| i16::from_le_bytes([c[0], c[1]]))
                    .collect();
                car_voice::enroll_from_pcm(&label, &samples_i16, sample_rate, channels)
                    .map_err(|e| format!("enroll: {}", e))?
            }
            EnrollAudioSpec::Wav { path } => car_voice::enroll_from_wav(&label, path.as_ref())
                .map_err(|e| format!("enroll: {}", e))?,
        };
        let path = car_voice::save_enrollment(&enrollment).map_err(|e| e.to_string())?;
        // Reset the cached pipeline so the next session picks this enrollment up.
        invalidate_prepared_pipeline();
        Ok(serde_json::json!({
            "label": enrollment.label,
            "path": path.display().to_string(),
            "model_id": enrollment.embedding.model,
        })
        .to_string())
    })
    .await
    .map_err(|e| format!("blocking task: {}", e))?
}

/// List all saved enrollments.
///
/// Returns the JSON string `{enrollments: [{label, path, model_id}, ...]}`,
/// or the stringified `car_voice` error when the listing fails.
pub fn list_enrollments() -> Result<String, String> {
    let mut entries: Vec<serde_json::Value> = Vec::new();
    for info in car_voice::list_enrollments().map_err(|e| e.to_string())? {
        entries.push(serde_json::json!({
            "label": info.label,
            "path": info.path.display().to_string(),
            "model_id": info.model_id,
        }));
    }
    let payload = serde_json::json!({ "enrollments": entries });
    Ok(payload.to_string())
}

/// Delete a saved enrollment by label.
///
/// Returns the JSON string `{"label": ..., "removed": true}` whenever
/// `car_voice::remove_enrollment` succeeds (presumably including the
/// case where no such enrollment existed — confirm against `car_voice`),
/// or the stringified error otherwise.
///
/// NOTE(review): this does not touch the SpeakerPipeline cached by
/// `current_or_build_pipeline`, so sessions started afterwards may
/// still match the removed voiceprint.
pub fn remove_enrollment(label: &str) -> Result<String, String> {
    match car_voice::remove_enrollment(label) {
        Ok(_) => Ok(serde_json::json!({ "label": label, "removed": true }).to_string()),
        Err(e) => Err(e.to_string()),
    }
}

/// Build a [`Listener`] for the requested audio source.
///
/// * `Mic` — a `CpalListener`, optionally layered with a streaming
///   provider (`streaming`), the prepared diarizer (`diarizer`) and the
///   enrollment pipeline (`enrolled`).
/// * `System` — platform-specific capture; which branch exists depends
///   on the target OS and cargo features.
/// * `File` / `Fifo` — not wired yet; always return an error.
/// * `PcmPush` — a [`PushListener`]; its handle is recorded in
///   `push_handles()` under `session_id` so `transcribe_stream_push`
///   can find it later.
///
/// `streaming`, `diarizer` and `enrolled` are ignored for every source
/// other than `Mic`.
///
/// # Errors
///
/// Returns a plain error string for sources this build cannot serve,
/// when no Linux monitor source is found, or when the Parakeet model
/// directory cannot be resolved.
fn build_listener_for_source(
    session_id: &str,
    source: &AudioSourceSpec,
    streaming: bool,
    diarizer: bool,
    enrolled: bool,
) -> Result<Box<dyn Listener>, String> {
    match source {
        AudioSourceSpec::Mic => {
            // Layer the optional features on top of the base listener.
            // Order matters only for readability — both builders are
            // idempotent setters.
            let mut listener = car_voice::CpalListener::new();
            // Streaming-provider selection. macOS gets Apple Speech
            // (free, on-device, no model download); Linux/Windows fall
            // back to Parakeet when the cargo feature is on. When
            // neither applies the flag is a no-op and the configured
            // batch SttProvider runs as today.
            #[cfg(target_os = "macos")]
            if streaming {
                let provider =
                    std::sync::Arc::new(car_voice::AppleSpeechSttProvider::new());
                listener = listener.with_apple_speech_streaming(provider);
            }
            #[cfg(all(not(target_os = "macos"), feature = "parakeet"))]
            if streaming {
                let provider = match current_prepared_parakeet() {
                    Some(prepared) => prepared,
                    None => {
                        // Not pre-warmed: fall back to an unprepared
                        // provider on the default model dir. Report a
                        // resolution failure to the caller instead of
                        // panicking across the FFI boundary.
                        let dir = car_voice::ParakeetSttProvider::default_model_dir()
                            .map_err(|e| format!("model dir resolution failed: {}", e))?;
                        std::sync::Arc::new(car_voice::ParakeetSttProvider::new(dir))
                    }
                };
                listener = listener.with_parakeet_streaming(provider);
            }
            #[cfg(all(not(target_os = "macos"), not(feature = "parakeet")))]
            let _ = streaming;
            #[cfg(feature = "diarization")]
            if diarizer {
                if let Some(d) = current_prepared_diarizer() {
                    listener = listener.with_diarizer(d);
                } else {
                    tracing::warn!(
                        "[voice] diarizer requested but not prepared — call prepare_diarizer() first; defaulting to no diarization"
                    );
                }
            }
            #[cfg(not(feature = "diarization"))]
            let _ = diarizer;
            if enrolled {
                // Best-effort: a pipeline failure downgrades the session
                // rather than failing the start.
                match current_or_build_pipeline() {
                    Ok(p) => listener = listener.with_speaker_pipeline(p),
                    Err(e) => tracing::warn!(
                        "[voice] failed to build SpeakerPipeline: {} — proceeding without enrollment",
                        e
                    ),
                }
            }
            Ok(Box::new(listener))
        }
        #[cfg(all(target_os = "macos", feature = "system-audio-macos"))]
        AudioSourceSpec::System => Ok(Box::new(car_voice::SystemAudioListener::new())),
        #[cfg(target_os = "windows")]
        AudioSourceSpec::System => Ok(Box::new(car_voice::WindowsLoopbackListener::new())),
        #[cfg(target_os = "linux")]
        AudioSourceSpec::System => {
            // Linux loopback is "select the monitor source as your input
            // device" — there is no separate listener type. We require a
            // monitor source to exist and log its name.
            let monitor = car_voice::default_monitor_source().ok_or_else(|| {
                "no PipeWire/PulseAudio monitor source found — install pipewire-pulse or pulseaudio".to_string()
            })?;
            tracing::info!(
                "[voice] linux system audio: using monitor source '{}'",
                monitor
            );
            // KNOWN LIMITATION (v1): the discovered monitor is only
            // logged, not selected. CpalListener picks up `input_device`
            // from VoiceConfig at start time, which this function cannot
            // set without changing the start signature. The clean wiring
            // is an optional device name on AudioSourceSpec::System;
            // until then capture uses the default cpal input device.
            Ok(Box::new(car_voice::CpalListener::new()))
        }
        #[cfg(not(any(
            all(target_os = "macos", feature = "system-audio-macos"),
            target_os = "windows",
            target_os = "linux",
        )))]
        AudioSourceSpec::System => Err(
            "system audio capture: macOS requires the `system-audio-macos` feature (full Xcode); other platforms (Windows / Linux) need building with their respective target_os"
                .to_string(),
        ),
        AudioSourceSpec::File { .. } => Err(
            "file source not yet wired into transcribeStream — call the existing batch transcribe() in the meantime"
                .to_string(),
        ),
        AudioSourceSpec::Fifo { .. } => {
            Err("fifo source not yet wired into transcribeStream".to_string())
        }
        AudioSourceSpec::PcmPush {
            sample_rate,
            channels,
        } => {
            let listener = PushListener::new(*sample_rate, *channels);
            // Snapshot the push handle into the side registry before
            // boxing — once we lose the typed reference we can't get
            // the handle back through `&dyn Listener`.
            if let Some(handle) = listener.handle() {
                push_handles().insert(session_id.to_string(), handle);
            }
            Ok(Box::new(listener))
        }
    }
}

/// Start a streaming transcription session. Returns the session id
/// (echo of the caller-provided id, validated and registered).
///
/// Events flow asynchronously to `sink` until the session ends or
/// [`transcribe_stream_stop`] is called.
pub async fn transcribe_stream_start(
    session_id: &str,
    audio_source_json: &str,
    options_json: Option<&str>,
    registry: Arc<VoiceSessionRegistry>,
    sink: Arc<dyn VoiceEventSink>,
) -> Result<String, String> {
    if session_id.is_empty() {
        return Err("session_id must not be empty".to_string());
    }
    if registry.contains(session_id) {
        return Err(format!("session_id '{}' already exists", session_id));
    }

    let source: AudioSourceSpec = serde_json::from_str(audio_source_json)
        .map_err(|e| format!("invalid audioSource JSON: {}", e))?;
    let opts: TranscribeStreamOptions = match options_json {
        Some(s) if !s.is_empty() => serde_json::from_str(s)
            .map_err(|e| format!("invalid options JSON: {}", e))?,
        _ => TranscribeStreamOptions::default(),
    };

    let listener = build_listener_for_source(
        session_id,
        &source,
        opts.streaming,
        opts.diarizer,
        opts.enrolled,
    )?;
    let session = VoiceSession::new(session_id, listener);
    let config = build_voice_config(&opts);
    session
        .start(config, sink)
        .await
        .map_err(|e| format!("session start error: {}", e))?;
    registry
        .insert(session)
        .map_err(|e| format!("registry insert error: {}", e))?;
    Ok(serde_json::json!({ "session_id": session_id }).to_string())
}

/// Stop a streaming transcription session.
///
/// Any push handle registered for `session_id` is dropped first, then
/// the registry is asked to stop the session. On success the JSON
/// string `{"session_id": ..., "stopped": true}` is returned. If the
/// registry reports a failure (for example an unknown id), the result
/// is `Err("session stop error: ...")`, which callers may ignore if
/// they treat stop as best-effort.
pub async fn transcribe_stream_stop(
    session_id: &str,
    registry: Arc<VoiceSessionRegistry>,
) -> Result<String, String> {
    // Remove the handle before teardown starts so in-flight pushes fail
    // fast instead of queueing against a session that is going away.
    push_handles().remove(session_id);

    let outcome = registry.stop(session_id).await;
    match outcome {
        Ok(_) => Ok(serde_json::json!({ "session_id": session_id, "stopped": true }).to_string()),
        Err(e) => Err(format!("session stop error: {}", e)),
    }
}

/// Push a PCM frame into a `pcm_push` session.
///
/// `pcm_frame` is interpreted as 16-bit signed little-endian PCM at
/// the sample rate / channel layout declared when the session was
/// started. Stereo input is downmixed to mono inside the listener.
/// The bytes are copied before being handed to the session's
/// `PushHandle`.
///
/// Returns `{accepted: true}` on success. Fails when `session_id` is
/// not registered, when the session has no push handle (it was not
/// started with a `pcm_push` source), or when feeding the handle fails.
pub async fn transcribe_stream_push(
    session_id: &str,
    pcm_frame: &[u8],
    registry: Arc<VoiceSessionRegistry>,
) -> Result<String, String> {
    if !registry.contains(session_id) {
        return Err(format!("unknown session_id '{}'", session_id));
    }

    // Clone the handle out of the map so no map guard is alive while we
    // await the feed below.
    let handle = match push_handles().get(session_id) {
        Some(entry) => entry.value().clone(),
        None => {
            return Err(format!(
                "session '{}' is not a pcm_push session (push only works for sources of kind 'pcm_push')",
                session_id
            ));
        }
    };

    handle
        .feed_pcm(pcm_frame.to_vec())
        .await
        .map(|_| serde_json::json!({ "accepted": true }).to_string())
        .map_err(|err: car_voice::VoiceError| format!("push failed: {}", err))
}

/// Report the ids of every session currently held by `registry` as the
/// JSON string `{"sessions": [...]}`.
pub fn list_voice_sessions(registry: Arc<VoiceSessionRegistry>) -> String {
    let ids = registry.list();
    let payload = serde_json::json!({ "sessions": ids });
    payload.to_string()
}

/// Translate caller options into the listener's [`VoiceConfig`].
///
/// Starts from `VoiceConfig::default()` and overrides only what the
/// caller supplied:
///
/// * `model` → `whisper_cpp_model`. whisper.cpp is the only on-device
///   provider today, so every model override lands there; once Parakeet
///   lands (task #6) this should branch on `config.stt_provider`.
/// * `language` → `language`.
/// * `prompt` (bias text) has no home on `VoiceConfig` yet. It is
///   accepted at the FFI boundary so the JSON shape stays stable, but
///   it is deliberately dropped here rather than stored somewhere it
///   would be silently lost.
fn build_voice_config(opts: &TranscribeStreamOptions) -> VoiceConfig {
    let mut config = VoiceConfig::default();

    if let Some(model_id) = opts.model.as_deref() {
        config.whisper_cpp_model = model_id.to_owned();
    }
    if let Some(language) = opts.language.as_deref() {
        config.language = language.to_owned();
    }
    // `opts.prompt` is intentionally not read — see the doc comment.

    config
}

/// Unit tests for the JSON wire shapes and the stop-path error.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_mic_source() {
        let s: AudioSourceSpec = serde_json::from_str(r#"{"kind":"mic"}"#).unwrap();
        // `matches!` only yields a bool; without `assert!` this test
        // could never fail.
        assert!(matches!(s, AudioSourceSpec::Mic));
    }

    #[test]
    fn parses_pcm_push_with_defaults() {
        let s: AudioSourceSpec =
            serde_json::from_str(r#"{"kind":"pcm_push","sample_rate":48000}"#).unwrap();
        match s {
            AudioSourceSpec::PcmPush { sample_rate, channels } => {
                assert_eq!(sample_rate, 48000);
                // `channels` was omitted, so the serde default applies.
                assert_eq!(channels, 1);
            }
            _ => panic!("wrong variant"),
        }
    }

    #[test]
    fn parses_file_source() {
        let s: AudioSourceSpec =
            serde_json::from_str(r#"{"kind":"file","path":"/tmp/a.wav"}"#).unwrap();
        match s {
            AudioSourceSpec::File { path } => assert_eq!(path, "/tmp/a.wav"),
            _ => panic!("wrong variant"),
        }
    }

    #[test]
    fn rejects_unknown_kind() {
        let res: Result<AudioSourceSpec, _> = serde_json::from_str(r#"{"kind":"radio"}"#);
        assert!(res.is_err());
    }

    #[test]
    fn options_defaults_are_empty() {
        let opts: TranscribeStreamOptions = serde_json::from_str("{}").unwrap();
        assert!(opts.model.is_none());
        assert!(opts.language.is_none());
        assert!(opts.prompt.is_none());
        assert!(!opts.emit_audio_meta);
        assert!(!opts.streaming);
        assert!(!opts.diarizer);
        assert!(!opts.enrolled);
    }

    #[tokio::test]
    async fn unknown_session_stop_returns_error() {
        let registry = Arc::new(VoiceSessionRegistry::new());
        let err = transcribe_stream_stop("nope", registry).await.unwrap_err();
        assert!(err.contains("nope") || err.contains("not running"));
    }
}