use crate::tts::{AudioFormat, Speaker, SynthesizedAudio, TtsChunk, TTS_STREAM_QUEUE_CAPACITY};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use futures::StreamExt;
use serde::Serialize;
use tokio::sync::mpsc;
/// Text-to-speech client for the ElevenLabs streaming HTTP endpoint.
///
/// `Debug` is implemented by hand for this type so the API key is never printed.
pub struct ElevenLabsSpeaker {
/// HTTP client used to send synthesis requests.
client: reqwest::Client,
/// Secret sent in the `xi-api-key` request header.
api_key: String,
/// Voice identifier interpolated into the request URL path.
voice_id: String,
/// Model name sent as `model_id` in the JSON request body.
model: String,
/// Tuning values sent as `voice_settings` in the JSON request body.
voice_settings: VoiceSettings,
}
/// Hand-written `Debug` so the secret never reaches logs.
///
/// The HTTP client is rendered as a fixed placeholder and the API key as
/// `<redacted>`; the remaining fields use their own `Debug` output.
impl std::fmt::Debug for ElevenLabsSpeaker {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("ElevenLabsSpeaker");
        // Placeholders: neither the client internals nor the key are printed.
        out.field("client", &"reqwest::Client");
        out.field("api_key", &"<redacted>");
        out.field("voice_id", &self.voice_id);
        out.field("model", &self.model);
        out.field("voice_settings", &self.voice_settings);
        out.finish()
    }
}
/// Voice tuning values forwarded in the `voice_settings` object of each
/// synthesis request.
// NOTE(review): the valid range of each value is defined by the ElevenLabs
// API and is not checked anywhere in this file — confirm before exposing to users.
#[derive(Debug, Clone, Serialize)]
pub struct VoiceSettings {
/// Sent as `voice_settings.stability`; the `Default` impl uses 0.45.
pub stability: f32,
/// Sent as `voice_settings.similarity_boost`; the `Default` impl uses 0.85.
pub similarity_boost: f32,
/// Sent as `voice_settings.style`; the `Default` impl uses 0.20.
pub style: f32,
}
/// Baseline tuning applied when the caller does not supply its own settings.
impl Default for VoiceSettings {
    fn default() -> Self {
        let stability = 0.45;
        let similarity_boost = 0.85;
        let style = 0.20;
        Self {
            stability,
            similarity_boost,
            style,
        }
    }
}
impl ElevenLabsSpeaker {
    /// Creates a speaker from explicit credentials.
    ///
    /// A fresh HTTP client is created and [`VoiceSettings::default`] is used;
    /// call [`Self::with_voice_settings`] to override the tuning.
    pub fn new(
        api_key: impl Into<String>,
        voice_id: impl Into<String>,
        model: impl Into<String>,
    ) -> Self {
        Self {
            client: reqwest::Client::new(),
            api_key: api_key.into(),
            voice_id: voice_id.into(),
            model: model.into(),
            voice_settings: VoiceSettings::default(),
        }
    }

    /// Builds a speaker from `config`.
    ///
    /// The API key is taken from `config.elevenlabs_api_key` when that holds a
    /// non-blank value; otherwise it is resolved through
    /// `car_secrets::resolve_env_or_keychain("ELEVENLABS_API_KEY")`. In both
    /// cases surrounding whitespace is stripped and a blank result counts as
    /// "not set", so a stray space or trailing newline neither blocks the
    /// fallback nor ends up in the `xi-api-key` header.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::Config`] when no usable key is found.
    pub fn from_config(config: &VoiceConfig) -> Result<Self> {
        let api_key = config
            .elevenlabs_api_key
            .as_deref()
            .map(str::trim)
            .filter(|k| !k.is_empty())
            .map(str::to_owned)
            .or_else(|| {
                car_secrets::resolve_env_or_keychain("ELEVENLABS_API_KEY")
                    .map(|k| k.trim().to_owned())
                    .filter(|k| !k.is_empty())
            })
            .ok_or_else(|| {
                VoiceError::Config(
                    "ELEVENLABS_API_KEY not set; set the env var or store it via \
                     `car secrets put ELEVENLABS_API_KEY`"
                        .into(),
                )
            })?;
        // Share the construction path with `new` so the two cannot drift apart.
        Ok(Self::new(
            api_key,
            config.elevenlabs_voice_id.clone(),
            config.elevenlabs_tts_model.clone(),
        ))
    }

    /// Replaces the voice tuning and returns the updated speaker (builder style).
    pub fn with_voice_settings(mut self, settings: VoiceSettings) -> Self {
        self.voice_settings = settings;
        self
    }

    /// Builds the JSON request body for synthesizing `text`.
    ///
    /// The body carries `text`, `model_id` and a `voice_settings` object.
    /// The settings are serialized through `VoiceSettings`' own `Serialize`
    /// derive, so the struct is the single source of truth for that object.
    pub fn request_body(&self, text: &str) -> serde_json::Value {
        serde_json::json!({
            "text": text,
            "model_id": self.model,
            "voice_settings": &self.voice_settings,
        })
    }

    /// Returns the streaming endpoint URL for this speaker's voice.
    ///
    /// The voice id is inserted into the path verbatim (no percent-encoding)
    /// and the output format is fixed to `mp3_44100_128`.
    pub fn stream_url(&self) -> String {
        format!(
            "https://api.elevenlabs.io/v1/text-to-speech/{}/stream?output_format=mp3_44100_128",
            self.voice_id
        )
    }
}
impl ElevenLabsSpeaker {
    /// Sends the synthesis request for `text` and returns the response once
    /// the server has answered with a success status.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::Tts`] when the request cannot be sent, or when the
    /// server answers with a non-success status (the response body is included
    /// in the message on a best-effort basis).
    async fn open_stream(&self, text: &str) -> Result<reqwest::Response> {
        let request = self
            .client
            .post(self.stream_url())
            .header("xi-api-key", &self.api_key)
            .header("accept", "audio/mpeg")
            .json(&self.request_body(text));

        let resp = match request.send().await {
            Ok(resp) => resp,
            Err(e) => return Err(VoiceError::Tts(format!("http: {e}"))),
        };

        let status = resp.status();
        if status.is_success() {
            return Ok(resp);
        }

        // A failure to read the error body must not mask the status itself.
        let body = resp.text().await.unwrap_or_default();
        Err(VoiceError::Tts(format!("API {status}: {body}")))
    }
}
#[async_trait]
impl Speaker for ElevenLabsSpeaker {
    /// Synthesizes `text` and returns the complete MP3 payload.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::Tts`] when `text` is blank, when the request or
    /// the body stream fails, or when the server sends no audio bytes at all.
    async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
        if text.trim().is_empty() {
            return Err(VoiceError::Tts("empty text".into()));
        }

        let mut body = self.open_stream(text).await?.bytes_stream();
        let mut audio: Vec<u8> = Vec::with_capacity(64 * 1024);
        while let Some(piece) = body.next().await {
            match piece {
                Ok(data) => audio.extend_from_slice(&data),
                Err(e) => return Err(VoiceError::Tts(format!("stream: {e}"))),
            }
        }

        if audio.is_empty() {
            return Err(VoiceError::Tts("empty audio response".into()));
        }
        Ok(SynthesizedAudio {
            bytes: audio,
            format: AudioFormat::Mp3,
        })
    }

    /// Synthesizes `text` and delivers the MP3 data incrementally.
    ///
    /// The request is opened before this returns; a task spawned on
    /// `crate::voice_runtime_handle()` then forwards the body as `TtsChunk`s
    /// with increasing `seq`. Exactly the last chunk sent has `is_final: true`.
    /// If the body stream fails part-way, the error is logged and whatever was
    /// buffered (possibly nothing) is still sent as the final chunk. If the
    /// receiver is dropped, the task stops without sending anything further.
    ///
    /// # Errors
    ///
    /// Returns [`VoiceError::Tts`] when `text` is blank or the request fails.
    async fn synth_stream(&self, text: &str) -> Result<mpsc::Receiver<TtsChunk>> {
        if text.trim().is_empty() {
            return Err(VoiceError::Tts("empty text".into()));
        }

        let resp = self.open_stream(text).await?;
        let (tx, rx) = mpsc::channel(TTS_STREAM_QUEUE_CAPACITY);

        let forward = async move {
            let frame = |seq: u64, bytes: Vec<u8>, is_final: bool| TtsChunk {
                seq,
                bytes,
                format: AudioFormat::Mp3,
                is_final,
            };

            let stream = resp.bytes_stream();
            tokio::pin!(stream);
            let mut seq: u64 = 0;
            // One chunk is always held back: only when the stream ends do we
            // know which chunk was the last and must carry `is_final`.
            let mut held: Option<Vec<u8>> = None;

            loop {
                let data = match stream.next().await {
                    Some(Ok(b)) => b.to_vec(),
                    Some(Err(e)) => {
                        tracing::error!("[elevenlabs] stream error: {e}");
                        break;
                    }
                    None => break,
                };
                // Swap the new chunk in and forward the one held so far.
                if let Some(prev) = held.replace(data) {
                    if tx.send(frame(seq, prev, false)).await.is_err() {
                        // Receiver is gone; nobody is left to notify.
                        return;
                    }
                    seq += 1;
                }
            }

            let tail = held.unwrap_or_default();
            let _ = tx.send(frame(seq, tail, true)).await;
        };
        crate::voice_runtime_handle().spawn(forward);

        Ok(rx)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Snapshot of one environment variable that is put back when dropped.
    ///
    /// Restoring in `Drop` means a failing assertion (which unwinds) cannot
    /// leave the modified value behind for other tests in the same process.
    /// The value is captured with `var_os`, so non-UTF-8 values survive too.
    struct EnvVarGuard {
        key: &'static str,
        saved: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        fn capture(key: &'static str) -> Self {
            Self {
                key,
                saved: std::env::var_os(key),
            }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match self.saved.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// The JSON body carries the text, the model id and all three settings.
    #[test]
    fn request_body_includes_text_model_and_voice_settings() {
        let speaker = ElevenLabsSpeaker::new("k", "v", "eleven_turbo_v2_5");
        let body = speaker.request_body("hello tokhn");
        assert_eq!(body["text"], "hello tokhn");
        assert_eq!(body["model_id"], "eleven_turbo_v2_5");
        // Settings are f32, so compare with a tolerance after widening to f64.
        let stability = body["voice_settings"]["stability"].as_f64().unwrap();
        let similarity = body["voice_settings"]["similarity_boost"].as_f64().unwrap();
        let style = body["voice_settings"]["style"].as_f64().unwrap();
        assert!((stability - 0.45).abs() < 1e-4);
        assert!((similarity - 0.85).abs() < 1e-4);
        assert!((style - 0.20).abs() < 1e-4);
    }

    /// The URL embeds the voice id and the fixed output format.
    #[test]
    fn stream_url_uses_voice_id_and_format_query() {
        let speaker = ElevenLabsSpeaker::new("k", "voice_xyz", "m");
        let url = speaker.stream_url();
        assert!(url.contains("/text-to-speech/voice_xyz/stream"));
        assert!(url.contains("output_format=mp3_44100_128"));
    }

    /// Explicit config key wins, then the environment, then a config error.
    ///
    /// The three phases stay in one test because they share a process-global
    /// environment variable and would race if run as separate parallel tests.
    #[test]
    fn from_config_env_var_fallback_behavior() {
        let _restore = EnvVarGuard::capture("ELEVENLABS_API_KEY");

        let cfg = VoiceConfig {
            elevenlabs_api_key: Some("explicit-key".into()),
            ..VoiceConfig::default()
        };
        let speaker = ElevenLabsSpeaker::from_config(&cfg).unwrap();
        assert_eq!(speaker.api_key, "explicit-key");

        std::env::set_var("ELEVENLABS_API_KEY", "env-key");
        let cfg = VoiceConfig {
            elevenlabs_api_key: None,
            ..VoiceConfig::default()
        };
        let speaker = ElevenLabsSpeaker::from_config(&cfg).unwrap();
        assert_eq!(speaker.api_key, "env-key");

        std::env::remove_var("ELEVENLABS_API_KEY");
        let cfg = VoiceConfig {
            elevenlabs_api_key: None,
            ..VoiceConfig::default()
        };
        // NOTE(review): `resolve_env_or_keychain` presumably also consults the
        // OS keychain; a key stored there would make this phase fail — confirm.
        let err = ElevenLabsSpeaker::from_config(&cfg).unwrap_err();
        assert!(matches!(err, VoiceError::Config(_)));
    }

    /// Blank input is rejected before any network request is attempted.
    #[tokio::test]
    async fn synth_rejects_empty_text() {
        let speaker = ElevenLabsSpeaker::new("k", "v", "m");
        let err = speaker.synth(" ").await.unwrap_err();
        assert!(matches!(err, VoiceError::Tts(_)));
    }
}