// llm/backends/elevenlabs.rs

1use crate::chat::{ChatMessage, ChatProvider, ChatResponse, Tool};
2use crate::completion::{CompletionProvider, CompletionRequest, CompletionResponse};
3use crate::embedding::EmbeddingProvider;
4#[cfg(feature = "elevenlabs")]
5use crate::error::LLMError;
6use crate::models::ModelsProvider;
7use crate::stt::SpeechToTextProvider;
8use crate::tts::TextToSpeechProvider;
9use crate::LLMProvider;
10use async_trait::async_trait;
11use reqwest::Client;
12use serde::{Deserialize, Serialize};
13use std::time::Duration;
14
/// ElevenLabs speech-to-text / text-to-speech backend.
///
/// Wraps the ElevenLabs HTTP API for transcription (`/speech-to-text`) and
/// synthesis (`/text-to-speech/{voice}`). The other LLM provider traits
/// (chat, completion, embedding) are implemented only as stubs that report
/// the capability as unsupported.
pub struct ElevenLabs {
    /// API key sent in the `xi-api-key` header on every request
    api_key: String,
    /// Model identifier sent as `model_id` (used for both STT and TTS requests)
    model_id: String,
    /// Base URL for API requests, without a trailing slash (paths are appended as "/speech-to-text" etc.)
    base_url: String,
    /// Optional per-request timeout in seconds; no timeout is applied when `None`
    timeout_seconds: Option<u64>,
    /// Shared HTTP client for making requests
    client: Client,
    /// Voice ID to use for speech synthesis; a default voice is used when `None`
    voice: Option<String>,
}
33
/// Internal (wire-format) representation of a word from the ElevenLabs
/// speech-to-text API response. Field names must match the JSON payload.
#[derive(Debug, Deserialize)]
struct ElevenLabsWord {
    /// The transcribed word text
    text: String,
    /// Start time of the word in seconds; 0.0 when absent from the payload
    #[serde(default)]
    start: f32,
    /// End time of the word in seconds; 0.0 when absent from the payload
    #[serde(default)]
    end: f32,
}
46
/// Public representation of a transcribed word with timing information.
///
/// Mirrors [`ElevenLabsWord`] but is exported (and serializable) so callers
/// can consume word-level timings without depending on the wire format.
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct Word {
    /// The transcribed word text
    pub text: String,
    /// Start time of the word in seconds
    pub start: f32,
    /// End time of the word in seconds
    pub end: f32,
}
57
58/// Response structure from ElevenLabs speech-to-text API
59#[allow(dead_code)]
60#[derive(Debug, Deserialize)]
61struct ElevenLabsResponse {
62    /// Detected language code if available
63    #[serde(skip_serializing_if = "Option::is_none")]
64    language_code: Option<String>,
65    /// Probability of the detected language if available
66    #[serde(skip_serializing_if = "Option::is_none")]
67    language_probability: Option<f32>,
68    /// Full transcribed text
69    #[serde(skip_serializing_if = "Option::is_none")]
70    text: String,
71    /// Optional list of words with timing information
72    words: Option<Vec<ElevenLabsWord>>,
73}
74
75impl ElevenLabs {
76    /// Creates a new ElevenLabs instance
77    ///
78    /// # Arguments
79    ///
80    /// * `api_key` - API key for ElevenLabs authentication
81    /// * `model_id` - Model identifier for speech-to-text
82    /// * `base_url` - Base URL for API requests
83    /// * `timeout_seconds` - Optional timeout duration in seconds
84    ///
85    /// # Returns
86    ///
87    /// A new ElevenLabs instance
88    pub fn new(
89        api_key: String,
90        model_id: String,
91        base_url: String,
92        timeout_seconds: Option<u64>,
93        voice: Option<String>,
94    ) -> Self {
95        Self {
96            api_key,
97            model_id,
98            base_url,
99            timeout_seconds,
100            client: Client::new(),
101            voice,
102        }
103    }
104}
105
106#[async_trait]
107impl SpeechToTextProvider for ElevenLabs {
108    /// Transcribes audio data to text using ElevenLabs API
109    ///
110    /// # Arguments
111    ///
112    /// * `audio` - Raw audio data as bytes
113    ///
114    /// # Returns
115    ///
116    /// * `Ok(String)` - Transcribed text
117    /// * `Err(LLMError)` - Error if transcription fails
118    async fn transcribe(&self, audio: Vec<u8>) -> Result<String, LLMError> {
119        let url = format!("{}/speech-to-text", self.base_url);
120        let part = reqwest::multipart::Part::bytes(audio).file_name("audio.wav");
121        let form = reqwest::multipart::Form::new()
122            .text("model_id", self.model_id.clone())
123            .part("file", part);
124
125        let mut req = self
126            .client
127            .post(url)
128            .header("xi-api-key", &self.api_key)
129            .multipart(form);
130
131        if let Some(t) = self.timeout_seconds {
132            req = req.timeout(Duration::from_secs(t));
133        }
134
135        let resp = req.send().await?.error_for_status()?;
136        let text = resp.text().await?;
137        let raw = text.clone();
138        let parsed: ElevenLabsResponse =
139            serde_json::from_str(&text).map_err(|e| LLMError::ResponseFormatError {
140                message: e.to_string(),
141                raw_response: raw,
142            })?;
143
144        let words: Option<Vec<Word>> = parsed.words.map(|ws| {
145            ws.into_iter()
146                .map(|w| Word {
147                    text: w.text,
148                    start: w.start,
149                    end: w.end,
150                })
151                .collect()
152        });
153
154        Ok(words
155            .unwrap_or_default()
156            .into_iter()
157            .map(|w| w.text)
158            .collect())
159    }
160
161    /// Transcribes audio file to text using ElevenLabs API
162    ///
163    /// # Arguments
164    ///
165    /// * `file_path` - Path to the audio file
166    ///
167    /// # Returns
168    ///
169    /// * `Ok(String)` - Transcribed text
170    /// * `Err(LLMError)` - Error if transcription fails
171    async fn transcribe_file(&self, file_path: &str) -> Result<String, LLMError> {
172        let url = format!("{}/speech-to-text", self.base_url);
173        let form = reqwest::multipart::Form::new()
174            .text("model_id", self.model_id.clone())
175            .file("file", file_path)
176            .await
177            .map_err(|e| LLMError::HttpError(e.to_string()))?;
178
179        let mut req = self
180            .client
181            .post(url)
182            .header("xi-api-key", &self.api_key)
183            .multipart(form);
184
185        if let Some(t) = self.timeout_seconds {
186            req = req.timeout(Duration::from_secs(t));
187        }
188
189        let resp = req.send().await?.error_for_status()?;
190        let text = resp.text().await?;
191        let raw = text.clone();
192        let parsed: ElevenLabsResponse =
193            serde_json::from_str(&text).map_err(|e| LLMError::ResponseFormatError {
194                message: e.to_string(),
195                raw_response: raw,
196            })?;
197
198        let words: Option<Vec<Word>> = parsed.words.map(|ws| {
199            ws.into_iter()
200                .map(|w| Word {
201                    text: w.text,
202                    start: w.start,
203                    end: w.end,
204                })
205                .collect()
206        });
207
208        Ok(words
209            .unwrap_or_default()
210            .into_iter()
211            .map(|w| w.text)
212            .collect())
213    }
214}
215
216#[async_trait]
217impl CompletionProvider for ElevenLabs {
218    /// Returns a not implemented message for completion requests
219    async fn complete(&self, _req: &CompletionRequest) -> Result<CompletionResponse, LLMError> {
220        Ok(CompletionResponse {
221            text: "ElevenLabs completion not implemented.".into(),
222        })
223    }
224}
225
226#[async_trait]
227impl EmbeddingProvider for ElevenLabs {
228    /// Returns an error indicating embedding is not supported
229    async fn embed(&self, _text: Vec<String>) -> Result<Vec<Vec<f32>>, LLMError> {
230        Err(LLMError::ProviderError(
231            "Embedding not supported".to_string(),
232        ))
233    }
234}
235
236#[async_trait]
237impl ChatProvider for ElevenLabs {
238    /// Returns an error indicating chat is not supported
239    async fn chat(&self, _messages: &[ChatMessage]) -> Result<Box<dyn ChatResponse>, LLMError> {
240        Err(LLMError::ProviderError("Chat not supported".to_string()))
241    }
242
243    /// Returns an error indicating chat with tools is not supported
244    async fn chat_with_tools(
245        &self,
246        _messages: &[ChatMessage],
247        _tools: Option<&[Tool]>,
248    ) -> Result<Box<dyn ChatResponse>, LLMError> {
249        Err(LLMError::ProviderError(
250            "Chat with tools not supported".to_string(),
251        ))
252    }
253}
254
// Marker impl: no `ModelsProvider` methods are overridden, so model listing
// uses whatever default behavior the trait provides.
#[async_trait]
impl ModelsProvider for ElevenLabs {}
257
impl LLMProvider for ElevenLabs {
    /// Always returns `None`: this backend exposes no tools for tool-calling.
    fn tools(&self) -> Option<&[Tool]> {
        None
    }
}
264
265#[async_trait]
266impl TextToSpeechProvider for ElevenLabs {
267    /// Converts text to speech using ElevenLabs API
268    ///
269    /// # Arguments
270    ///
271    /// * `text` - Text to convert to speech
272    /// * `voice_id` - Voice ID to use for speech synthesis
273    ///
274    /// # Returns
275    ///
276    /// * `Ok(Vec<u8>)` - Audio data as bytes
277    /// * `Err(LLMError)` - Error if conversion fails
278    async fn speech(&self, text: &str) -> Result<Vec<u8>, LLMError> {
279        let url = format!(
280            "{}/text-to-speech/{}?output_format=mp3_44100_128",
281            self.base_url,
282            self.voice
283                .clone()
284                .unwrap_or("JBFqnCBsd6RMkjVDRZzb".to_string())
285        );
286
287        let body = serde_json::json!({
288            "text": text,
289            "model_id": self.model_id
290        });
291
292        let mut req = self
293            .client
294            .post(url)
295            .header("xi-api-key", &self.api_key)
296            .header("Content-Type", "application/json")
297            .json(&body);
298
299        if let Some(t) = self.timeout_seconds {
300            req = req.timeout(Duration::from_secs(t));
301        }
302
303        let resp = req.send().await?.error_for_status()?;
304        let audio_data = resp.bytes().await?;
305
306        Ok(audio_data.to_vec())
307    }
308}