car-voice 0.17.0

//! ElevenLabs streaming text-to-speech provider.
//!
//! Ported from `app/src/lib/voiceEngine.ts:streamAndPlay`. Uses the
//! `/v1/text-to-speech/{voice_id}/stream` endpoint. `synth` buffers the
//! whole response (for callers that want one blob); `synth_stream`
//! forwards each MP3 chunk as it arrives over an mpsc channel — that's
//! the low-latency path used by `voice.tts_stream.*` JSON-RPC.

use crate::tts::{AudioFormat, Speaker, SynthesizedAudio, TtsChunk, TTS_STREAM_QUEUE_CAPACITY};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use futures::StreamExt;
use serde::Serialize;
use tokio::sync::mpsc;

/// ElevenLabs streaming TTS speaker.
///
/// Bundles the HTTP client, the credential and the voice/model selection
/// that are used to build each request to the
/// `/v1/text-to-speech/{voice_id}/stream` endpoint.
pub struct ElevenLabsSpeaker {
    /// HTTP client used to send synthesis requests.
    client: reqwest::Client,
    /// ElevenLabs API key, sent as the `xi-api-key` request header.
    /// Printed as `<redacted>` by the hand-written `Debug` impl.
    api_key: String,
    /// Voice identifier interpolated into the endpoint URL path.
    voice_id: String,
    /// Model identifier sent as `model_id` in the request body.
    model: String,
    /// Tuning values sent as `voice_settings` in the request body.
    voice_settings: VoiceSettings,
}

// `Debug` is written by hand so the API key can never leak through a panic
// message, an error chain or a log line: the key is replaced by a fixed
// placeholder and the HTTP client by its type name. Do NOT swap this for
// `#[derive(Debug)]` — the derive would print the key verbatim.
impl std::fmt::Debug for ElevenLabsSpeaker {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("ElevenLabsSpeaker");
        // Opaque / sensitive fields get static placeholders.
        out.field("client", &"reqwest::Client");
        out.field("api_key", &"<redacted>");
        // Everything else is safe to show as-is.
        out.field("voice_id", &self.voice_id);
        out.field("model", &self.model);
        out.field("voice_settings", &self.voice_settings);
        out.finish()
    }
}

/// Voice tuning knobs sent to ElevenLabs. Stability/similarity/style each
/// range over `[0.0, 1.0]`. Defaults match the prod TS values from
/// `voiceEngine.ts`.
///
/// NOTE(review): the code visible in this file forwards these values to the
/// request body unchanged — no clamping or validation against the
/// documented range happens here; confirm whether callers are expected to
/// validate.
#[derive(Debug, Clone, Serialize)]
pub struct VoiceSettings {
    /// Sent as `voice_settings.stability`; defaults to `0.45`.
    pub stability: f32,
    /// Sent as `voice_settings.similarity_boost`; defaults to `0.85`.
    pub similarity_boost: f32,
    /// Sent as `voice_settings.style`; defaults to `0.20`.
    pub style: f32,
}

impl Default for VoiceSettings {
    fn default() -> Self {
        Self {
            stability: 0.45,
            similarity_boost: 0.85,
            style: 0.20,
        }
    }
}

impl ElevenLabsSpeaker {
    /// Build a speaker from an explicit API key, voice id and model id.
    ///
    /// A fresh `reqwest::Client` is created and the voice tuning starts at
    /// [`VoiceSettings::default`]. Production code should normally use
    /// [`ElevenLabsSpeaker::from_config`] instead.
    pub fn new(
        api_key: impl Into<String>,
        voice_id: impl Into<String>,
        model: impl Into<String>,
    ) -> Self {
        let client = reqwest::Client::new();
        let (api_key, voice_id, model) = (api_key.into(), voice_id.into(), model.into());
        Self {
            client,
            api_key,
            voice_id,
            model,
            voice_settings: VoiceSettings::default(),
        }
    }

    /// Build a speaker from a [`VoiceConfig`].
    ///
    /// The API key is taken from the first source that yields one:
    /// 1. the `elevenlabs_api_key` config field, if present and non-empty;
    /// 2. the `ELEVENLABS_API_KEY` process env var;
    /// 3. the OS keychain under CAR's default service.
    ///
    /// This mirrors `car-inference`'s resolution order — see #140.
    ///
    /// # Errors
    /// Returns [`VoiceError::Config`] when no source provides a key.
    pub fn from_config(config: &VoiceConfig) -> Result<Self> {
        // An empty string in the config counts as "not configured".
        let configured = config
            .elevenlabs_api_key
            .clone()
            .filter(|key| !key.is_empty());

        let resolved = match configured {
            Some(key) => Some(key),
            None => car_secrets::resolve_env_or_keychain("ELEVENLABS_API_KEY"),
        };

        match resolved {
            Some(api_key) => Ok(Self::new(
                api_key,
                config.elevenlabs_voice_id.clone(),
                config.elevenlabs_tts_model.clone(),
            )),
            None => Err(VoiceError::Config(
                "ELEVENLABS_API_KEY not set; set the env var or store it via \
                 `car secrets put ELEVENLABS_API_KEY`"
                    .into(),
            )),
        }
    }

    /// Replace the voice tuning settings, builder-style.
    pub fn with_voice_settings(self, settings: VoiceSettings) -> Self {
        let mut speaker = self;
        speaker.voice_settings = settings;
        speaker
    }

    /// Build the JSON request body sent to ElevenLabs. Exposed for tests.
    ///
    /// The object carries `text`, `model_id` and a `voice_settings` object
    /// holding `stability`, `similarity_boost` and `style`.
    pub fn request_body(&self, text: &str) -> serde_json::Value {
        // `VoiceSettings` derives `Serialize` with exactly the field names
        // the API expects, so it can be embedded directly.
        serde_json::json!({
            "text": text,
            "model_id": self.model,
            "voice_settings": self.voice_settings,
        })
    }

    /// Build the streaming endpoint URL for the configured voice.
    ///
    /// The voice id is inserted into the path verbatim and the output
    /// format is fixed to `mp3_44100_128`.
    pub fn stream_url(&self) -> String {
        let mut url = String::from("https://api.elevenlabs.io/v1/text-to-speech/");
        url.push_str(&self.voice_id);
        url.push_str("/stream?output_format=mp3_44100_128");
        url
    }
}

impl ElevenLabsSpeaker {
    /// Send the synthesis request for `text` and hand back the open
    /// response without reading its body. Callers consume
    /// `.bytes_stream()` from the result.
    ///
    /// # Errors
    /// Returns [`VoiceError::Tts`] with an `http: …` message when the
    /// request cannot be sent, or `API <status>: <body>` when the service
    /// answers with a non-success status.
    async fn open_stream(&self, text: &str) -> Result<reqwest::Response> {
        let payload = self.request_body(text);
        let request = self
            .client
            .post(self.stream_url())
            .header("xi-api-key", &self.api_key)
            .header("accept", "audio/mpeg")
            .json(&payload);

        let resp = match request.send().await {
            Ok(resp) => resp,
            Err(e) => return Err(VoiceError::Tts(format!("http: {e}"))),
        };

        let status = resp.status();
        if status.is_success() {
            return Ok(resp);
        }

        // Best effort: an unreadable error body is reported as empty.
        let body = resp.text().await.unwrap_or_default();
        Err(VoiceError::Tts(format!("API {status}: {body}")))
    }
}

#[async_trait]
impl Speaker for ElevenLabsSpeaker {
    /// Synthesize `text` and return the whole MP3 payload in one buffer.
    ///
    /// # Errors
    /// Returns [`VoiceError::Tts`] when `text` is blank, when the request
    /// or the body stream fails, or when the response carries no bytes.
    async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
        if text.trim().is_empty() {
            return Err(VoiceError::Tts("empty text".into()));
        }

        let mut body = self.open_stream(text).await?.bytes_stream();
        let mut audio = Vec::with_capacity(64 * 1024);
        loop {
            match body.next().await {
                Some(Ok(part)) => audio.extend_from_slice(&part),
                Some(Err(e)) => return Err(VoiceError::Tts(format!("stream: {e}"))),
                None => break,
            }
        }

        if audio.is_empty() {
            Err(VoiceError::Tts("empty audio response".into()))
        } else {
            Ok(SynthesizedAudio {
                bytes: audio,
                format: AudioFormat::Mp3,
            })
        }
    }

    /// Synthesize `text` and forward each MP3 chunk over a bounded channel
    /// as it arrives.
    ///
    /// The request is opened before this method returns; forwarding then
    /// runs on a task spawned on the voice runtime. Chunks carry increasing
    /// `seq` numbers and the last one sent has `is_final = true`. A stream
    /// error is logged and ends forwarding early; the final marker is still
    /// sent. Dropping the receiver stops the task.
    ///
    /// # Errors
    /// Returns [`VoiceError::Tts`] when `text` is blank or the request
    /// cannot be opened.
    async fn synth_stream(&self, text: &str) -> Result<mpsc::Receiver<TtsChunk>> {
        if text.trim().is_empty() {
            return Err(VoiceError::Tts("empty text".into()));
        }
        let response = self.open_stream(text).await?;
        let (sender, receiver) = mpsc::channel(TTS_STREAM_QUEUE_CAPACITY);

        crate::voice_runtime_handle().spawn(async move {
            let body = response.bytes_stream();
            tokio::pin!(body);
            let mut next_seq: u64 = 0;
            // One chunk is always held back so that the last chunk can be
            // flagged `is_final` without buffering the whole stream.
            let mut held_back: Option<Vec<u8>> = None;

            loop {
                let incoming = match body.next().await {
                    None => break,
                    Some(Err(e)) => {
                        tracing::error!("[elevenlabs] stream error: {e}");
                        break;
                    }
                    Some(Ok(b)) => b.to_vec(),
                };

                // Swap the new chunk in; whatever was held is now known not
                // to be the last one and can be forwarded.
                if let Some(ready) = held_back.replace(incoming) {
                    let message = TtsChunk {
                        seq: next_seq,
                        bytes: ready,
                        format: AudioFormat::Mp3,
                        is_final: false,
                    };
                    if sender.send(message).await.is_err() {
                        // Receiver dropped — caller cancelled (barge-in).
                        return;
                    }
                    next_seq += 1;
                }
            }

            // Flush the held chunk as the terminator. An empty stream still
            // yields a zero-byte final chunk so the consumer always sees one.
            let closing = TtsChunk {
                seq: next_seq,
                bytes: held_back.unwrap_or_default(),
                format: AudioFormat::Mp3,
                is_final: true,
            };
            let _ = sender.send(closing).await;
        });

        Ok(receiver)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Puts an environment variable back to its captured state when
    /// dropped, so the process environment is restored even if an
    /// assertion in the test panics midway.
    struct EnvVarGuard {
        name: &'static str,
        // `OsString` so a non-UTF-8 value is restored faithfully.
        saved: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        /// Remember the current value (or absence) of `name`.
        fn capture(name: &'static str) -> Self {
            Self {
                name,
                saved: std::env::var_os(name),
            }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match self.saved.take() {
                Some(value) => std::env::set_var(self.name, value),
                None => std::env::remove_var(self.name),
            }
        }
    }

    #[test]
    fn request_body_includes_text_model_and_voice_settings() {
        let speaker = ElevenLabsSpeaker::new("k", "v", "eleven_turbo_v2_5");
        let body = speaker.request_body("hello tokhn");
        assert_eq!(body["text"], "hello tokhn");
        assert_eq!(body["model_id"], "eleven_turbo_v2_5");
        // f32 -> JSON loses precision; compare with epsilon
        let stability = body["voice_settings"]["stability"].as_f64().unwrap();
        let similarity = body["voice_settings"]["similarity_boost"].as_f64().unwrap();
        let style = body["voice_settings"]["style"].as_f64().unwrap();
        assert!((stability - 0.45).abs() < 1e-4);
        assert!((similarity - 0.85).abs() < 1e-4);
        assert!((style - 0.20).abs() < 1e-4);
    }

    #[test]
    fn stream_url_uses_voice_id_and_format_query() {
        let speaker = ElevenLabsSpeaker::new("k", "voice_xyz", "m");
        let url = speaker.stream_url();
        assert!(url.contains("/text-to-speech/voice_xyz/stream"));
        assert!(url.contains("output_format=mp3_44100_128"));
    }

    /// The hand-written `Debug` impl must never print the API key.
    #[test]
    fn debug_output_redacts_api_key() {
        let speaker = ElevenLabsSpeaker::new("super-secret-key", "v", "m");
        let rendered = format!("{speaker:?}");
        assert!(!rendered.contains("super-secret-key"));
        assert!(rendered.contains("<redacted>"));
    }

    /// Combined env-var test — env mutation is process-global so we can't
    /// run two tests touching `ELEVENLABS_API_KEY` in parallel. Asserting
    /// all behaviors in one test keeps them serial without pulling in
    /// `serial_test`.
    #[test]
    fn from_config_env_var_fallback_behavior() {
        // Restores the variable on every exit path, including a failed
        // assertion, so other tests never observe our mutations.
        let _restore = EnvVarGuard::capture("ELEVENLABS_API_KEY");

        // 1) Explicit api_key in config wins, even with the env var set.
        std::env::set_var("ELEVENLABS_API_KEY", "env-key");
        let cfg = VoiceConfig {
            elevenlabs_api_key: Some("explicit-key".into()),
            ..VoiceConfig::default()
        };
        let speaker = ElevenLabsSpeaker::from_config(&cfg).unwrap();
        assert_eq!(speaker.api_key, "explicit-key");

        // 2) Falls back to env var when config field is None.
        let cfg = VoiceConfig {
            elevenlabs_api_key: None,
            ..VoiceConfig::default()
        };
        let speaker = ElevenLabsSpeaker::from_config(&cfg).unwrap();
        assert_eq!(speaker.api_key, "env-key");

        // 2b) An empty config value is treated like an absent one.
        let cfg = VoiceConfig {
            elevenlabs_api_key: Some(String::new()),
            ..VoiceConfig::default()
        };
        let speaker = ElevenLabsSpeaker::from_config(&cfg).unwrap();
        assert_eq!(speaker.api_key, "env-key");

        // 3) Errors when neither is set.
        // NOTE(review): the resolver presumably also consults the OS
        // keychain, so this could fail on a machine that has a stored
        // key — confirm against `car_secrets`.
        std::env::remove_var("ELEVENLABS_API_KEY");
        let cfg = VoiceConfig {
            elevenlabs_api_key: None,
            ..VoiceConfig::default()
        };
        let err = ElevenLabsSpeaker::from_config(&cfg).unwrap_err();
        assert!(matches!(err, VoiceError::Config(_)));
    }

    #[tokio::test]
    async fn synth_rejects_empty_text() {
        let speaker = ElevenLabsSpeaker::new("k", "v", "m");
        let err = speaker.synth("   ").await.unwrap_err();
        assert!(matches!(err, VoiceError::Tts(_)));
    }

    /// Blank input is rejected before any request is opened.
    #[tokio::test]
    async fn synth_stream_rejects_empty_text() {
        let speaker = ElevenLabsSpeaker::new("k", "v", "m");
        let result = speaker.synth_stream("   ").await;
        assert!(matches!(result, Err(VoiceError::Tts(_))));
    }
}