golem_ai_tts_deepgram/
lib.rs

1use crate::client::{get_available_models, DeepgramTtsApi, Model, RateLimitConfig};
2use crate::conversions::{
3    audio_data_to_synthesis_result, deepgram_model_to_voice_info, estimate_audio_duration,
4    get_max_chars_for_model, models_to_language_info, split_text_intelligently,
5    synthesis_options_to_tts_request, validate_synthesis_request, validate_text_input,
6};
7use golem_ai_tts::durability::{DurableTts, ExtendedTtsProvider};
8use golem_ai_tts::model::advanced::{
9    AudioSample, LongFormOperation, LongFormResult, OperationStatus, PronunciationEntry,
10    PronunciationLexicon, VoiceDesignParams,
11};
12use golem_ai_tts::model::streaming::{StreamStatus, SynthesisStream, VoiceConversionStream};
13use golem_ai_tts::model::synthesis::{SynthesisOptions, ValidationResult};
14use golem_ai_tts::model::types::{
15    AudioChunk, AudioFormat, LanguageCode, SynthesisMetadata, SynthesisResult, TextInput,
16    TimingInfo, TtsError, VoiceGender, VoiceQuality, VoiceSettings,
17};
18use golem_ai_tts::model::voices::{LanguageInfo, Voice, VoiceFilter, VoiceInfo, VoiceResults};
19use golem_ai_tts::wasi_compat::{subscribe_zero, Pollable};
20use golem_ai_tts::{
21    AdvancedTtsProvider, LongFormOperationInterface, PronunciationLexiconInterface,
22    StreamingVoiceProvider, SynthesisStreamInterface, SynthesizeProvider,
23    VoiceConversionStreamInterface, VoiceInterface, VoiceProvider, VoiceResultsInterface,
24};
25use log::{info, warn};
26use std::cell::{Cell, RefCell};
27
28pub mod config;
29pub use config::DeepgramConfig;
30#[cfg(feature = "golem")]
31pub use config::DeepgramHostConfig;
32
33mod client;
34mod conversions;
35
36pub struct DeepgramVoiceImpl {
37    model_data: Model,
38    client: DeepgramTtsApi,
39}
40
41impl DeepgramVoiceImpl {
42    fn new(model_data: Model, client: DeepgramTtsApi) -> Self {
43        Self { model_data, client }
44    }
45}
46
47impl VoiceInterface for DeepgramVoiceImpl {
48    fn as_any(&self) -> &dyn std::any::Any {
49        self
50    }
51    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
52        self
53    }
54
55    fn get_id(&self) -> String {
56        self.model_data.voice_id.clone()
57    }
58
59    fn get_name(&self) -> String {
60        self.model_data.name.clone()
61    }
62
63    fn get_provider_id(&self) -> Option<String> {
64        Some("Deepgram".to_string())
65    }
66
67    fn get_language(&self) -> LanguageCode {
68        conversions::normalize_language_code(&self.model_data.language)
69    }
70
71    fn get_additional_languages(&self) -> Vec<LanguageCode> {
72        vec![]
73    }
74
75    fn get_gender(&self) -> VoiceGender {
76        conversions::parse_gender(&self.model_data.gender)
77    }
78
79    fn get_quality(&self) -> VoiceQuality {
80        conversions::infer_quality_from_model(&self.model_data.voice_id)
81    }
82
83    fn get_description(&self) -> Option<String> {
84        Some(format!(
85            "{} voice with {} accent, {}. Characteristics: {}. Suitable for: {}",
86            self.model_data.gender,
87            self.model_data.accent,
88            self.model_data.age,
89            self.model_data.characteristics.join(", "),
90            self.model_data.use_cases.join(", ")
91        ))
92    }
93
94    fn supports_ssml(&self) -> bool {
95        false
96    }
97
98    fn get_sample_rates(&self) -> Vec<u32> {
99        vec![8000, 16000, 22050, 24000, 32000, 48000]
100    }
101
102    fn get_supported_formats(&self) -> Vec<AudioFormat> {
103        vec![
104            AudioFormat::Mp3,
105            AudioFormat::Wav,
106            AudioFormat::Pcm,
107            AudioFormat::OggOpus,
108            AudioFormat::Aac,
109            AudioFormat::Flac,
110            AudioFormat::Mulaw,
111            AudioFormat::Alaw,
112        ]
113    }
114
115    fn update_settings(&self, _settings: VoiceSettings) -> Result<(), TtsError> {
116        Err(TtsError::UnsupportedOperation(
117            "Deepgram does not support voice settings updates".to_string(),
118        ))
119    }
120
121    fn delete(&self) -> Result<(), TtsError> {
122        Err(TtsError::UnsupportedOperation(
123            "Deepgram voices cannot be deleted".to_string(),
124        ))
125    }
126
127    fn clone(&self) -> Result<Voice, TtsError> {
128        Err(TtsError::UnsupportedOperation(
129            "Deepgram does not support voice cloning".to_string(),
130        ))
131    }
132
133    fn preview(&self, text: String) -> Result<Vec<u8>, TtsError> {
134        let (request, params) = synthesis_options_to_tts_request(text, None)?;
135        let mut params = params.unwrap();
136        params.model = Some(self.model_data.voice_id.clone());
137
138        self.client.text_to_speech(&request, Some(&params))
139    }
140}
141
142pub struct DeepgramVoiceResults {
143    voices: RefCell<Vec<VoiceInfo>>,
144    current_index: Cell<usize>,
145}
146
147impl DeepgramVoiceResults {
148    fn new(voices: Vec<VoiceInfo>) -> Self {
149        Self {
150            voices: RefCell::new(voices),
151            current_index: Cell::new(0),
152        }
153    }
154}
155
156impl VoiceResultsInterface for DeepgramVoiceResults {
157    fn as_any(&self) -> &dyn std::any::Any {
158        self
159    }
160    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
161        self
162    }
163
164    fn has_more(&self) -> bool {
165        self.current_index.get() < self.voices.borrow().len()
166    }
167
168    fn get_next(&self) -> Result<Vec<VoiceInfo>, TtsError> {
169        let voices = self.voices.borrow();
170        let current = self.current_index.get();
171
172        if current >= voices.len() {
173            return Ok(vec![]);
174        }
175
176        let remaining: Vec<VoiceInfo> = voices[current..].to_vec();
177        self.current_index.set(voices.len());
178
179        Ok(remaining)
180    }
181
182    fn get_total_count(&self) -> Option<u32> {
183        Some(self.voices.borrow().len() as u32)
184    }
185}
186
187#[warn(dead_code)]
188pub struct DeepgramSynthesisStream {
189    client: DeepgramTtsApi,
190    current_request: RefCell<Option<client::TextToSpeechRequest>>,
191    params: RefCell<Option<client::TextToSpeechParams>>,
192    response_stream: RefCell<Option<golem_wasi_http::Response>>,
193    chunk_buffer: RefCell<Vec<u8>>,
194    bytes_streamed: Cell<usize>,
195    total_chunks_received: Cell<u32>,
196    finished: Cell<bool>,
197    sequence_number: Cell<u32>,
198    stream_started: Cell<bool>,
199}
200
201impl DeepgramSynthesisStream {
202    fn new(voice_id: String, client: DeepgramTtsApi, options: Option<SynthesisOptions>) -> Self {
203        let (request, params) = synthesis_options_to_tts_request(String::new(), options)
204            .unwrap_or_else(|_| {
205                let request = client::TextToSpeechRequest {
206                    text: String::new(),
207                };
208                let params = Some(client::TextToSpeechParams {
209                    model: Some(voice_id.clone()),
210                    encoding: Some("linear16".to_string()),
211                    container: Some("wav".to_string()),
212                    sample_rate: Some(24000),
213                    bit_rate: None,
214                });
215                (request, params)
216            });
217        let mut params = params.unwrap();
218        params.model = Some(voice_id.clone());
219
220        Self {
221            client,
222            current_request: RefCell::new(Some(request)),
223            params: RefCell::new(Some(params)),
224            response_stream: RefCell::new(None),
225            chunk_buffer: RefCell::new(Vec::new()),
226            bytes_streamed: Cell::new(0),
227            total_chunks_received: Cell::new(0),
228            finished: Cell::new(false),
229            sequence_number: Cell::new(0),
230            stream_started: Cell::new(false),
231        }
232    }
233
234    #[allow(dead_code)]
235    fn get_progress(&self) -> (usize, u32) {
236        (self.bytes_streamed.get(), self.total_chunks_received.get())
237    }
238}
239
240impl SynthesisStreamInterface for DeepgramSynthesisStream {
241    fn as_any(&self) -> &dyn std::any::Any {
242        self
243    }
244    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
245        self
246    }
247
248    fn send_text(&self, input: TextInput) -> Result<(), TtsError> {
249        info!("[DEEPGRAM] send_text called with: '{}'", input.content);
250
251        if self.finished.get() {
252            warn!("[DEEPGRAM] Stream already finished, returning error");
253            return Err(TtsError::InvalidConfiguration(
254                "Stream already finished".to_string(),
255            ));
256        }
257
258        {
259            let mut request_ref = self.current_request.borrow_mut();
260            if let Some(mut request) = request_ref.take() {
261                info!(
262                    "[DEEPGRAM] Updating request text from '{}' to '{}'",
263                    request.text, input.content
264                );
265                request.text = input.content;
266                *request_ref = Some(request);
267            } else {
268                warn!("[DEEPGRAM] Warning: No current request to update");
269            }
270        }
271
272        Ok(())
273    }
274
275    fn finish(&self) -> Result<(), TtsError> {
276        info!("[DEEPGRAM] finish() called");
277
278        if self.stream_started.get() {
279            info!("[DEEPGRAM] Stream already started, returning OK");
280            return Ok(());
281        }
282
283        warn!("[DEEPGRAM] Checking current request state...");
284        let request_debug = self.current_request.borrow();
285        match request_debug.as_ref() {
286            Some(req) => {
287                info!(
288                    "[DEEPGRAM] Request exists with text: '{}' (length: {})",
289                    req.text,
290                    req.text.len()
291                );
292            }
293            None => {
294                warn!("[DEEPGRAM] No request found!");
295                return Err(TtsError::InternalError("No request available".to_string()));
296            }
297        }
298        drop(request_debug);
299
300        if let Some(request) = self.current_request.borrow().as_ref() {
301            warn!("[DEEPGRAM] Current request text: '{}'", request.text);
302            if !request.text.is_empty() {
303                warn!("[DEEPGRAM] Making API call to Deepgram...");
304
305                if let Some(params) = self.params.borrow().as_ref() {
306                    warn!(
307                        "[DEEPGRAM] Params - model: {:?}, encoding: {:?}, sample_rate: {:?}",
308                        params.model, params.encoding, params.sample_rate
309                    );
310                } else {
311                    warn!("[DEEPGRAM] Warning: No params available");
312                }
313
314                match self
315                    .client
316                    .text_to_speech_stream(request, self.params.borrow().as_ref())
317                {
318                    Ok(response) => {
319                        *self.response_stream.borrow_mut() = Some(response);
320                        self.stream_started.set(true);
321                    }
322                    Err(e) => {
323                        self.finished.set(true);
324                        return Err(e);
325                    }
326                }
327            } else {
328                return Err(TtsError::InvalidText(
329                    "No text provided for synthesis".to_string(),
330                ));
331            }
332        } else {
333            return Err(TtsError::InternalError("No request available".to_string()));
334        }
335
336        Ok(())
337    }
338
339    fn receive_chunk(&self) -> Result<Option<AudioChunk>, TtsError> {
340        if self.finished.get() {
341            return Ok(None);
342        }
343
344        if !self.stream_started.get() && self.has_pending_audio() {
345            self.finish()?;
346        }
347
348        if let Some(response) = self.response_stream.borrow_mut().take() {
349            const CHUNK_SIZE: usize = 8192;
350
351            match response.bytes() {
352                Ok(bytes) => {
353                    if bytes.is_empty() {
354                        self.finished.set(true);
355                        return Ok(None);
356                    }
357
358                    let mut current_buffer = self.chunk_buffer.borrow_mut();
359                    current_buffer.extend_from_slice(&bytes);
360
361                    if current_buffer.len() >= CHUNK_SIZE || bytes.len() < CHUNK_SIZE {
362                        let chunk_data: Vec<u8> = if current_buffer.len() <= CHUNK_SIZE {
363                            current_buffer.drain(..).collect()
364                        } else {
365                            current_buffer.drain(..CHUNK_SIZE).collect()
366                        };
367
368                        let sequence = self.sequence_number.get();
369                        self.sequence_number.set(sequence + 1);
370                        self.bytes_streamed
371                            .set(self.bytes_streamed.get() + chunk_data.len());
372                        self.total_chunks_received
373                            .set(self.total_chunks_received.get() + 1);
374
375                        let is_final = bytes.len() < CHUNK_SIZE && current_buffer.is_empty();
376                        if is_final {
377                            self.finished.set(true);
378                        }
379
380                        let chunk = AudioChunk {
381                            data: chunk_data,
382                            sequence_number: sequence,
383                            is_final,
384                            timing_info: None,
385                        };
386
387                        return Ok(Some(chunk));
388                    }
389
390                    Ok(None)
391                }
392                Err(e) => {
393                    self.finished.set(true);
394                    Err(TtsError::NetworkError(format!(
395                        "Failed to read response: {}",
396                        e
397                    )))
398                }
399            }
400        } else {
401            if self.stream_started.get() && self.chunk_buffer.borrow().is_empty() {
402                self.finished.set(true);
403            }
404            Ok(None)
405        }
406    }
407
408    fn has_pending_audio(&self) -> bool {
409        !self.finished.get()
410            && (self.response_stream.borrow().is_some()
411                || !self.chunk_buffer.borrow().is_empty()
412                || (!self.stream_started.get()
413                    && self
414                        .current_request
415                        .borrow()
416                        .as_ref()
417                        .is_some_and(|r| !r.text.is_empty())))
418    }
419
420    fn get_status(&self) -> StreamStatus {
421        if self.finished.get() {
422            StreamStatus::Finished
423        } else if self.stream_started.get() || self.response_stream.borrow().is_some() {
424            StreamStatus::Processing
425        } else {
426            StreamStatus::Ready
427        }
428    }
429
430    fn close(&self) {
431        self.finished.set(true);
432        self.stream_started.set(false);
433        *self.response_stream.borrow_mut() = None;
434        self.chunk_buffer.borrow_mut().clear();
435    }
436}
437
438pub struct DeepgramVoiceConversionStream {
439    _voice_id: String,
440}
441
442impl DeepgramVoiceConversionStream {
443    fn new(voice_id: String, _client: DeepgramTtsApi) -> Self {
444        Self {
445            _voice_id: voice_id,
446        }
447    }
448}
449
450impl VoiceConversionStreamInterface for DeepgramVoiceConversionStream {
451    fn as_any(&self) -> &dyn std::any::Any {
452        self
453    }
454    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
455        self
456    }
457
458    fn send_audio(&self, _audio_data: Vec<u8>) -> Result<(), TtsError> {
459        Err(TtsError::UnsupportedOperation(
460            "Deepgram does not support voice conversion".to_string(),
461        ))
462    }
463
464    fn receive_converted(&self) -> Result<Option<AudioChunk>, TtsError> {
465        Err(TtsError::UnsupportedOperation(
466            "Deepgram does not support voice conversion".to_string(),
467        ))
468    }
469
470    fn finish(&self) -> Result<(), TtsError> {
471        Err(TtsError::UnsupportedOperation(
472            "Deepgram does not support voice conversion".to_string(),
473        ))
474    }
475
476    fn close(&self) {}
477}
478
479pub struct DeepgramPronunciationLexicon {
480    _name: String,
481}
482
483impl DeepgramPronunciationLexicon {
484    fn new(
485        name: String,
486        _language: LanguageCode,
487        _entries: Option<Vec<PronunciationEntry>>,
488    ) -> Self {
489        Self { _name: name }
490    }
491}
492
493impl PronunciationLexiconInterface for DeepgramPronunciationLexicon {
494    fn as_any(&self) -> &dyn std::any::Any {
495        self
496    }
497    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
498        self
499    }
500
501    fn get_name(&self) -> String {
502        self._name.clone()
503    }
504
505    fn get_language(&self) -> LanguageCode {
506        "en".to_string()
507    }
508
509    fn get_entry_count(&self) -> u32 {
510        0
511    }
512
513    fn add_entry(&self, _word: String, _pronunciation: String) -> Result<(), TtsError> {
514        Err(TtsError::UnsupportedOperation(
515            "Deepgram does not support pronunciation lexicons".to_string(),
516        ))
517    }
518
519    fn remove_entry(&self, _word: String) -> Result<(), TtsError> {
520        Err(TtsError::UnsupportedOperation(
521            "Deepgram does not support pronunciation lexicons".to_string(),
522        ))
523    }
524
525    fn export_content(&self) -> Result<String, TtsError> {
526        Err(TtsError::UnsupportedOperation(
527            "Deepgram does not support pronunciation lexicons".to_string(),
528        ))
529    }
530}
531
532pub struct DeepgramLongFormOperation {
533    content: String,
534    voice_id: String,
535    client: DeepgramTtsApi,
536    status: Cell<OperationStatus>,
537    progress: Cell<f32>,
538    audio_chunks: RefCell<Option<Vec<Vec<u8>>>>,
539}
540
541impl DeepgramLongFormOperation {
542    fn new(
543        content: String,
544        _output_location: String,
545        voice_id: String,
546        client: DeepgramTtsApi,
547        _chapter_breaks: Option<Vec<u32>>,
548    ) -> Self {
549        Self {
550            content,
551            voice_id,
552            client,
553            status: Cell::new(OperationStatus::Pending),
554            progress: Cell::new(0.0),
555            audio_chunks: RefCell::new(None),
556        }
557    }
558
559    fn process_long_form(&self) -> Result<(), TtsError> {
560        self.status.set(OperationStatus::Processing);
561
562        let chunks = split_text_intelligently(&self.content, 1000);
563        let mut audio_chunks = Vec::new();
564
565        for (i, chunk) in chunks.iter().enumerate() {
566            let (request, mut params) = synthesis_options_to_tts_request(chunk.clone(), None)?;
567            if let Some(ref mut p) = params {
568                p.model = Some(self.voice_id.clone());
569            }
570
571            match self
572                .client
573                .text_to_speech_with_metadata(&request, params.as_ref())
574            {
575                Ok(tts_response) => {
576                    audio_chunks.push(tts_response.audio_data);
577                    self.progress.set((i + 1) as f32 / chunks.len() as f32);
578                }
579                Err(e) => {
580                    self.status.set(OperationStatus::Failed);
581                    return Err(e);
582                }
583            }
584        }
585
586        *self.audio_chunks.borrow_mut() = Some(audio_chunks);
587        self.status.set(OperationStatus::Completed);
588        self.progress.set(1.0);
589
590        Ok(())
591    }
592}
593
594impl LongFormOperationInterface for DeepgramLongFormOperation {
595    fn as_any(&self) -> &dyn std::any::Any {
596        self
597    }
598    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
599        self
600    }
601
602    fn get_status(&self) -> OperationStatus {
603        self.status.get()
604    }
605
606    fn get_progress(&self) -> f32 {
607        self.progress.get()
608    }
609
610    fn cancel(&self) -> Result<(), TtsError> {
611        self.status.set(OperationStatus::Cancelled);
612        Ok(())
613    }
614
615    fn get_result(&self) -> Result<LongFormResult, TtsError> {
616        if self.status.get() != OperationStatus::Completed {
617            return Err(TtsError::InvalidConfiguration(
618                "Operation not completed".to_string(),
619            ));
620        }
621
622        if let Some(chunks) = self.audio_chunks.borrow().as_ref() {
623            let mut combined_audio = Vec::new();
624            for chunk in chunks {
625                combined_audio.extend_from_slice(chunk);
626            }
627
628            let result = LongFormResult {
629                output_location: "deepgram-synthesis".to_string(),
630                total_duration: estimate_audio_duration(&combined_audio, 24000),
631                chapter_durations: None,
632                metadata: audio_data_to_synthesis_result(
633                    combined_audio.clone(),
634                    &self.content,
635                    "linear16",
636                    24000,
637                )
638                .metadata,
639            };
640
641            Ok(result)
642        } else {
643            Err(TtsError::InternalError(
644                "No audio data available".to_string(),
645            ))
646        }
647    }
648}
649
650pub struct DeepgramTts;
651
652impl DeepgramTts {
653    fn create_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
654        DeepgramTtsApi::new(provider_config)
655    }
656
657    fn create_batch_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
658        let batch_config = RateLimitConfig {
659            max_retries: 5,
660            initial_delay: std::time::Duration::from_millis(500),
661            max_delay: std::time::Duration::from_secs(60),
662            backoff_multiplier: 1.5,
663        };
664        DeepgramTtsApi::new(provider_config).with_rate_limit_config(batch_config)
665    }
666
667    fn create_streaming_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
668        let streaming_config = RateLimitConfig {
669            max_retries: 3,
670            initial_delay: std::time::Duration::from_millis(200),
671            max_delay: std::time::Duration::from_secs(5),
672            backoff_multiplier: 2.0,
673        };
674        DeepgramTtsApi::new(provider_config).with_rate_limit_config(streaming_config)
675    }
676}
677
678impl VoiceProvider for DeepgramTts {
679    type Voice = DeepgramVoiceImpl;
680    type VoiceResults = DeepgramVoiceResults;
681    type ProviderConfig = DeepgramConfig;
682
683    fn list_voices(
684        provider_config: Self::ProviderConfig,
685        filter: Option<VoiceFilter>,
686    ) -> Result<VoiceResults, TtsError> {
687        let client = Self::create_client(&provider_config);
688        let models = get_available_models();
689
690        if let Some(f) = filter.as_ref() {
691            let mut voice_filters = crate::client::VoiceFilters::new();
692
693            if let Some(lang) = &f.language {
694                voice_filters = voice_filters.with_language(lang.clone());
695            }
696
697            if let Some(gender) = f.gender {
698                let gender_str = match gender {
699                    VoiceGender::Male => "masculine",
700                    VoiceGender::Female => "feminine",
701                    VoiceGender::Neutral => "neutral",
702                };
703                voice_filters = voice_filters.with_gender(gender_str.to_string());
704            }
705
706            if let Some(quality) = f.quality {
707                let _quality_filter = match quality {
708                    VoiceQuality::Standard => crate::client::VoiceQuality::Standard,
709                    VoiceQuality::Premium => crate::client::VoiceQuality::Premium,
710                    VoiceQuality::Neural => crate::client::VoiceQuality::Professional,
711                    VoiceQuality::Studio => crate::client::VoiceQuality::Professional,
712                };
713                voice_filters = voice_filters.with_version(crate::client::ModelVersion::Aura2);
714            }
715
716            if let Some(query) = &f.search_query {
717                voice_filters = voice_filters.with_search(query.clone());
718            }
719
720            let filtered_response = client.get_models_filtered(&voice_filters)?;
721            let voice_infos: Vec<VoiceInfo> = filtered_response
722                .models
723                .into_iter()
724                .map(deepgram_model_to_voice_info)
725                .collect();
726
727            return Ok(VoiceResults::new(DeepgramVoiceResults::new(voice_infos)));
728        }
729
730        let mut voice_infos: Vec<VoiceInfo> = models
731            .into_iter()
732            .map(deepgram_model_to_voice_info)
733            .collect();
734
735        if let Some(f) = filter {
736            voice_infos.retain(|voice| {
737                let mut matches = true;
738
739                if let Some(gender) = f.gender {
740                    matches = matches && voice.gender == gender;
741                }
742
743                if let Some(quality) = f.quality {
744                    matches = matches && voice.quality == quality;
745                }
746
747                if let Some(lang) = &f.language {
748                    matches = matches && voice.language == *lang;
749                }
750
751                if let Some(provider) = &f.provider {
752                    matches = matches && voice.provider.contains(provider);
753                }
754
755                if let Some(query) = &f.search_query {
756                    let query_lower = query.to_lowercase();
757                    matches = matches
758                        && (voice.name.to_lowercase().contains(&query_lower)
759                            || voice
760                                .description
761                                .as_ref()
762                                .is_some_and(|d| d.to_lowercase().contains(&query_lower))
763                            || voice
764                                .use_cases
765                                .iter()
766                                .any(|uc| uc.to_lowercase().contains(&query_lower)));
767                }
768
769                matches
770            });
771        }
772
773        Ok(VoiceResults::new(DeepgramVoiceResults::new(voice_infos)))
774    }
775
776    fn get_voice(
777        provider_config: Self::ProviderConfig,
778        voice_id: String,
779    ) -> Result<Voice, TtsError> {
780        let client = Self::create_client(&provider_config);
781        let models = get_available_models();
782
783        if let Some(model) = models.into_iter().find(|m| m.voice_id == voice_id) {
784            Ok(Voice::new(DeepgramVoiceImpl::new(model, client)))
785        } else {
786            Err(TtsError::VoiceNotFound(format!(
787                "Voice '{}' not found",
788                voice_id
789            )))
790        }
791    }
792
793    fn search_voices(
794        provider_config: Self::ProviderConfig,
795        filter: Option<VoiceFilter>,
796    ) -> Result<Vec<VoiceInfo>, TtsError> {
797        let client = Self::create_client(&provider_config);
798
799        // Extract search query from filter
800        // deepgram does not have the native api for querying we are trying a simple search over all models
801        let search_query = filter
802            .as_ref()
803            .and_then(|f| f.search_query.as_ref())
804            .cloned()
805            .unwrap_or_else(String::new);
806
807        let mut voice_infos: Vec<VoiceInfo> = if !search_query.is_empty() {
808            let search_results = client.search_models(&search_query)?;
809            search_results
810                .into_iter()
811                .map(deepgram_model_to_voice_info)
812                .collect()
813        } else {
814            let all_models = get_available_models();
815            all_models
816                .into_iter()
817                .map(deepgram_model_to_voice_info)
818                .collect()
819        };
820
821        if let Some(f) = filter {
822            voice_infos.retain(|voice| {
823                let mut matches = true;
824
825                if let Some(gender) = f.gender {
826                    matches = matches && voice.gender == gender;
827                }
828
829                if let Some(quality) = f.quality {
830                    matches = matches && voice.quality == quality;
831                }
832
833                if let Some(lang) = &f.language {
834                    matches = matches && voice.language == *lang;
835                }
836
837                if let Some(provider) = &f.provider {
838                    matches = matches && voice.provider.contains(provider);
839                }
840
841                matches
842            });
843        }
844
845        Ok(voice_infos)
846    }
847
848    fn list_languages(
849        _provider_config: Self::ProviderConfig,
850    ) -> Result<Vec<LanguageInfo>, TtsError> {
851        let models = get_available_models();
852        Ok(models_to_language_info(models))
853    }
854}
855
856impl SynthesizeProvider for DeepgramTts {
857    type ProviderConfig = DeepgramConfig;
858
859    fn synthesize(
860        provider_config: Self::ProviderConfig,
861        input: TextInput,
862        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
863        options: Option<SynthesisOptions>,
864    ) -> Result<SynthesisResult, TtsError> {
865        validate_synthesis_request(
866            &input.content,
867            input.text_type,
868            input.language.as_deref(),
869            options.as_ref(),
870        )?;
871
872        let client = Self::create_client(&provider_config);
873        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
874
875        let max_chars = get_max_chars_for_model(Some(&voice_id));
876
877        if input.content.len() <= max_chars {
878            let (request, mut params) =
879                synthesis_options_to_tts_request(input.content.clone(), options)?;
880            if let Some(ref mut p) = params {
881                p.model = Some(voice_id);
882            }
883
884            let tts_response = client.text_to_speech_with_metadata(&request, params.as_ref())?;
885            let encoding = params
886                .as_ref()
887                .and_then(|p| p.encoding.as_ref())
888                .unwrap_or(&"linear16".to_string())
889                .clone();
890            let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
891
892            let mut synthesis_result = audio_data_to_synthesis_result(
893                tts_response.audio_data,
894                &input.content,
895                &encoding,
896                sample_rate,
897            );
898
899            synthesis_result.metadata.provider_info = Some(format!(
900                "Deepgram TTS - Model: {}, Characters: {}",
901                tts_response.metadata.dg_model_name, tts_response.metadata.dg_char_count
902            ));
903
904            Ok(synthesis_result)
905        } else {
906            let chunks = split_text_intelligently(&input.content, max_chars);
907            let mut combined_audio = Vec::new();
908            let mut total_characters = 0u32;
909            let mut total_words = 0u32;
910            let mut total_duration = 0.0f32;
911
912            for (chunk_index, chunk) in chunks.iter().enumerate() {
913                let (request, mut params) =
914                    synthesis_options_to_tts_request(chunk.clone(), options.clone())?;
915                if let Some(ref mut p) = params {
916                    p.model = Some(voice_id.clone());
917                }
918
919                let tts_response =
920                    client.text_to_speech_with_metadata(&request, params.as_ref())?;
921
922                combined_audio.extend_from_slice(&tts_response.audio_data);
923
924                total_characters += chunk.chars().count() as u32;
925                total_words += chunk.split_whitespace().count() as u32;
926
927                let encoding = params
928                    .as_ref()
929                    .and_then(|p| p.encoding.as_ref())
930                    .cloned()
931                    .unwrap_or_else(|| "linear16".to_string());
932                let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
933                total_duration += estimate_audio_duration(&tts_response.audio_data, sample_rate);
934
935                if chunk_index < chunks.len() - 1 {
936                    let silence_samples = (sample_rate as f32 * 0.1) as usize;
937                    let silence_bytes = match encoding.as_str() {
938                        "linear16" => silence_samples * 2,
939                        "mulaw" | "alaw" => silence_samples,
940                        _ => silence_samples * 2,
941                    };
942                    combined_audio.extend(vec![0u8; silence_bytes]);
943                    total_duration += 0.1;
944                }
945            }
946
947            let audio_size_bytes = combined_audio.len() as u32;
948            let synthesis_result = SynthesisResult {
949                audio_data: combined_audio,
950                metadata: SynthesisMetadata {
951                    duration_seconds: total_duration,
952                    character_count: total_characters,
953                    word_count: total_words,
954                    audio_size_bytes,
955                    request_id: format!("deepgram-chunked-{}", chrono::Utc::now().timestamp()),
956                    provider_info: Some(format!(
957                        "Deepgram TTS - Model: {}, Chunks: {}, Total Characters: {}",
958                        voice_id,
959                        chunks.len(),
960                        total_characters
961                    )),
962                },
963            };
964
965            Ok(synthesis_result)
966        }
967    }
968
969    fn synthesize_batch(
970        provider_config: Self::ProviderConfig,
971        inputs: Vec<TextInput>,
972        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
973        options: Option<SynthesisOptions>,
974    ) -> Result<Vec<SynthesisResult>, TtsError> {
975        let mut results = Vec::new();
976        let client = Self::create_batch_client(&provider_config);
977        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
978
979        for input in inputs {
980            validate_synthesis_request(
981                &input.content,
982                input.text_type,
983                input.language.as_deref(),
984                options.as_ref(),
985            )?;
986
987            let (request, mut params) =
988                synthesis_options_to_tts_request(input.content.clone(), options.clone())?;
989            if let Some(ref mut p) = params {
990                p.model = Some(voice_id.clone());
991            }
992
993            match client.text_to_speech_with_metadata(&request, params.as_ref()) {
994                Ok(tts_response) => {
995                    let encoding = params
996                        .as_ref()
997                        .and_then(|p| p.encoding.as_ref())
998                        .unwrap_or(&"linear16".to_string())
999                        .clone();
1000                    let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
1001
1002                    let mut synthesis_result = audio_data_to_synthesis_result(
1003                        tts_response.audio_data,
1004                        &input.content,
1005                        &encoding,
1006                        sample_rate,
1007                    );
1008
1009                    synthesis_result.metadata.provider_info = Some(format!(
1010                        "Deepgram TTS - Model: {}, Characters: {}",
1011                        tts_response.metadata.dg_model_name, tts_response.metadata.dg_char_count
1012                    ));
1013
1014                    results.push(synthesis_result);
1015                }
1016                Err(e) => {
1017                    return Err(e);
1018                }
1019            }
1020        }
1021
1022        Ok(results)
1023    }
1024
1025    fn get_timing_marks(
1026        _provider_config: Self::ProviderConfig,
1027        _input: TextInput,
1028        _voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1029    ) -> Result<Vec<TimingInfo>, TtsError> {
1030        Err(TtsError::UnsupportedOperation(
1031            "Timing marks not supported by Deepgram".to_string(),
1032        ))
1033    }
1034
1035    fn validate_input(
1036        _provider_config: Self::ProviderConfig,
1037        input: TextInput,
1038        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1039    ) -> Result<ValidationResult, TtsError> {
1040        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1041
1042        let mut _is_valid = true;
1043        let mut messages = Vec::new();
1044
1045        if input.content.is_empty() {
1046            _is_valid = false;
1047            messages.push("Text input cannot be empty".to_string());
1048        }
1049
1050        let char_limit = if voice_id.starts_with("aura-2") {
1051            2000
1052        } else {
1053            1000
1054        };
1055        if input.content.len() > char_limit {
1056            _is_valid = false;
1057            messages.push(format!(
1058                "Text exceeds {} character limit for {}",
1059                char_limit, voice_id
1060            ));
1061        }
1062
1063        if input
1064            .content
1065            .chars()
1066            .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
1067        {
1068            messages.push(
1069                "Warning: Text contains control characters that may not be processed correctly"
1070                    .to_string(),
1071            );
1072        }
1073
1074        let _message = if messages.is_empty() {
1075            None
1076        } else {
1077            Some(messages.join("; "))
1078        };
1079
1080        Ok(validate_text_input(&input.content, Some(&voice_id)))
1081    }
1082}
1083
1084impl StreamingVoiceProvider for DeepgramTts {
1085    type SynthesisStream = DeepgramSynthesisStream;
1086    type VoiceConversionStream = DeepgramVoiceConversionStream;
1087    type ProviderConfig = DeepgramConfig;
1088
1089    fn create_stream(
1090        provider_config: Self::ProviderConfig,
1091        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1092        options: Option<SynthesisOptions>,
1093    ) -> Result<SynthesisStream, TtsError> {
1094        let client = Self::create_streaming_client(&provider_config);
1095        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1096
1097        let stream = DeepgramSynthesisStream::new(voice_id, client, options);
1098        Ok(SynthesisStream::new(stream))
1099    }
1100
1101    fn create_voice_conversion_stream(
1102        provider_config: Self::ProviderConfig,
1103        target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1104        _options: Option<SynthesisOptions>,
1105    ) -> Result<VoiceConversionStream, TtsError> {
1106        let client = Self::create_client(&provider_config);
1107        let voice_id = target_voice.get::<DeepgramVoiceImpl>().get_id();
1108
1109        let stream = DeepgramVoiceConversionStream::new(voice_id, client);
1110        Ok(VoiceConversionStream::new(stream))
1111    }
1112}
1113
1114impl AdvancedTtsProvider for DeepgramTts {
1115    type PronunciationLexicon = DeepgramPronunciationLexicon;
1116    type LongFormOperation = DeepgramLongFormOperation;
1117    type ProviderConfig = DeepgramConfig;
1118
1119    fn create_voice_clone(
1120        _provider_config: Self::ProviderConfig,
1121        _name: String,
1122        _audio_samples: Vec<AudioSample>,
1123        _description: Option<String>,
1124    ) -> Result<Voice, TtsError> {
1125        Err(TtsError::UnsupportedOperation(
1126            "Deepgram does not support voice cloning".to_string(),
1127        ))
1128    }
1129
1130    fn design_voice(
1131        _provider_config: Self::ProviderConfig,
1132        _name: String,
1133        _characteristics: VoiceDesignParams,
1134    ) -> Result<Voice, TtsError> {
1135        Err(TtsError::UnsupportedOperation(
1136            "Deepgram does not support voice design".to_string(),
1137        ))
1138    }
1139
1140    fn convert_voice(
1141        _provider_config: Self::ProviderConfig,
1142        _input_audio: Vec<u8>,
1143        _target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1144        _preserve_timing: Option<bool>,
1145    ) -> Result<Vec<u8>, TtsError> {
1146        Err(TtsError::UnsupportedOperation(
1147            "Deepgram does not support voice conversion".to_string(),
1148        ))
1149    }
1150
1151    fn generate_sound_effect(
1152        _provider_config: Self::ProviderConfig,
1153        _description: String,
1154        _duration_seconds: Option<f32>,
1155        _style_influence: Option<f32>,
1156    ) -> Result<Vec<u8>, TtsError> {
1157        Err(TtsError::UnsupportedOperation(
1158            "Deepgram does not support sound effect generation".to_string(),
1159        ))
1160    }
1161
1162    fn create_lexicon(
1163        _provider_config: Self::ProviderConfig,
1164        name: String,
1165        language: LanguageCode,
1166        entries: Option<Vec<PronunciationEntry>>,
1167    ) -> Result<PronunciationLexicon, TtsError> {
1168        let lexicon = DeepgramPronunciationLexicon::new(name, language, entries);
1169        Ok(PronunciationLexicon::new(lexicon))
1170    }
1171
1172    fn synthesize_long_form(
1173        provider_config: Self::ProviderConfig,
1174        content: String,
1175        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1176        output_location: String,
1177        chapter_breaks: Option<Vec<u32>>,
1178    ) -> Result<LongFormOperation, TtsError> {
1179        let client = Self::create_batch_client(&provider_config);
1180        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1181
1182        let operation = DeepgramLongFormOperation::new(
1183            content,
1184            output_location,
1185            voice_id,
1186            client,
1187            chapter_breaks,
1188        );
1189
1190        operation.process_long_form()?;
1191
1192        Ok(LongFormOperation::new(operation))
1193    }
1194}
1195
1196impl ExtendedTtsProvider for DeepgramTts {
1197    fn unwrapped_synthesis_stream(
1198        provider_config: <Self as VoiceProvider>::ProviderConfig,
1199        voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1200        options: Option<SynthesisOptions>,
1201    ) -> Self::SynthesisStream {
1202        let client = Self::create_streaming_client(&provider_config);
1203        let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1204
1205        DeepgramSynthesisStream::new(voice_id, client, options)
1206    }
1207
1208    fn unwrapped_voice_conversion_stream(
1209        provider_config: <Self as VoiceProvider>::ProviderConfig,
1210        target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1211        _options: Option<SynthesisOptions>,
1212    ) -> Self::VoiceConversionStream {
1213        let client = Self::create_client(&provider_config);
1214        let voice_id = target_voice.get::<DeepgramVoiceImpl>().get_id();
1215
1216        DeepgramVoiceConversionStream::new(voice_id, client)
1217    }
1218
1219    fn subscribe_synthesis_stream(_stream: &Self::SynthesisStream) -> Pollable {
1220        subscribe_zero()
1221    }
1222
1223    fn subscribe_voice_conversion_stream(_stream: &Self::VoiceConversionStream) -> Pollable {
1224        subscribe_zero()
1225    }
1226}
1227
/// `DeepgramTts` wrapped in `DurableTts` (imported from `golem_ai_tts::durability`).
/// NOTE(review): presumably the type callers should use when they want the durability
/// layer; what that layer adds is defined in `golem_ai_tts`, not here.
1228pub type DurableDeepgramTts = DurableTts<DeepgramTts>;
golem_ai_tts_deepgram/lib.rs

golem_ai_tts_deepgram/
lib.rs