1use crate::client::{get_available_models, DeepgramTtsApi, Model, RateLimitConfig};
2use crate::conversions::{
3 audio_data_to_synthesis_result, deepgram_model_to_voice_info, estimate_audio_duration,
4 get_max_chars_for_model, models_to_language_info, split_text_intelligently,
5 synthesis_options_to_tts_request, validate_synthesis_request, validate_text_input,
6};
7use golem_ai_tts::durability::{DurableTts, ExtendedTtsProvider};
8use golem_ai_tts::model::advanced::{
9 AudioSample, LongFormOperation, LongFormResult, OperationStatus, PronunciationEntry,
10 PronunciationLexicon, VoiceDesignParams,
11};
12use golem_ai_tts::model::streaming::{StreamStatus, SynthesisStream, VoiceConversionStream};
13use golem_ai_tts::model::synthesis::{SynthesisOptions, ValidationResult};
14use golem_ai_tts::model::types::{
15 AudioChunk, AudioFormat, LanguageCode, SynthesisMetadata, SynthesisResult, TextInput,
16 TimingInfo, TtsError, VoiceGender, VoiceQuality, VoiceSettings,
17};
18use golem_ai_tts::model::voices::{LanguageInfo, Voice, VoiceFilter, VoiceInfo, VoiceResults};
19use golem_ai_tts::wasi_compat::{subscribe_zero, Pollable};
20use golem_ai_tts::{
21 AdvancedTtsProvider, LongFormOperationInterface, PronunciationLexiconInterface,
22 StreamingVoiceProvider, SynthesisStreamInterface, SynthesizeProvider,
23 VoiceConversionStreamInterface, VoiceInterface, VoiceProvider, VoiceResultsInterface,
24};
25use log::{info, warn};
26use std::cell::{Cell, RefCell};
27
28pub mod config;
29pub use config::DeepgramConfig;
30#[cfg(feature = "golem")]
31pub use config::DeepgramHostConfig;
32
33mod client;
34mod conversions;
35
36pub struct DeepgramVoiceImpl {
37 model_data: Model,
38 client: DeepgramTtsApi,
39}
40
41impl DeepgramVoiceImpl {
42 fn new(model_data: Model, client: DeepgramTtsApi) -> Self {
43 Self { model_data, client }
44 }
45}
46
47impl VoiceInterface for DeepgramVoiceImpl {
48 fn as_any(&self) -> &dyn std::any::Any {
49 self
50 }
51 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
52 self
53 }
54
55 fn get_id(&self) -> String {
56 self.model_data.voice_id.clone()
57 }
58
59 fn get_name(&self) -> String {
60 self.model_data.name.clone()
61 }
62
63 fn get_provider_id(&self) -> Option<String> {
64 Some("Deepgram".to_string())
65 }
66
67 fn get_language(&self) -> LanguageCode {
68 conversions::normalize_language_code(&self.model_data.language)
69 }
70
71 fn get_additional_languages(&self) -> Vec<LanguageCode> {
72 vec![]
73 }
74
75 fn get_gender(&self) -> VoiceGender {
76 conversions::parse_gender(&self.model_data.gender)
77 }
78
79 fn get_quality(&self) -> VoiceQuality {
80 conversions::infer_quality_from_model(&self.model_data.voice_id)
81 }
82
83 fn get_description(&self) -> Option<String> {
84 Some(format!(
85 "{} voice with {} accent, {}. Characteristics: {}. Suitable for: {}",
86 self.model_data.gender,
87 self.model_data.accent,
88 self.model_data.age,
89 self.model_data.characteristics.join(", "),
90 self.model_data.use_cases.join(", ")
91 ))
92 }
93
94 fn supports_ssml(&self) -> bool {
95 false
96 }
97
98 fn get_sample_rates(&self) -> Vec<u32> {
99 vec![8000, 16000, 22050, 24000, 32000, 48000]
100 }
101
102 fn get_supported_formats(&self) -> Vec<AudioFormat> {
103 vec![
104 AudioFormat::Mp3,
105 AudioFormat::Wav,
106 AudioFormat::Pcm,
107 AudioFormat::OggOpus,
108 AudioFormat::Aac,
109 AudioFormat::Flac,
110 AudioFormat::Mulaw,
111 AudioFormat::Alaw,
112 ]
113 }
114
115 fn update_settings(&self, _settings: VoiceSettings) -> Result<(), TtsError> {
116 Err(TtsError::UnsupportedOperation(
117 "Deepgram does not support voice settings updates".to_string(),
118 ))
119 }
120
121 fn delete(&self) -> Result<(), TtsError> {
122 Err(TtsError::UnsupportedOperation(
123 "Deepgram voices cannot be deleted".to_string(),
124 ))
125 }
126
127 fn clone(&self) -> Result<Voice, TtsError> {
128 Err(TtsError::UnsupportedOperation(
129 "Deepgram does not support voice cloning".to_string(),
130 ))
131 }
132
133 fn preview(&self, text: String) -> Result<Vec<u8>, TtsError> {
134 let (request, params) = synthesis_options_to_tts_request(text, None)?;
135 let mut params = params.unwrap();
136 params.model = Some(self.model_data.voice_id.clone());
137
138 self.client.text_to_speech(&request, Some(¶ms))
139 }
140}
141
142pub struct DeepgramVoiceResults {
143 voices: RefCell<Vec<VoiceInfo>>,
144 current_index: Cell<usize>,
145}
146
147impl DeepgramVoiceResults {
148 fn new(voices: Vec<VoiceInfo>) -> Self {
149 Self {
150 voices: RefCell::new(voices),
151 current_index: Cell::new(0),
152 }
153 }
154}
155
156impl VoiceResultsInterface for DeepgramVoiceResults {
157 fn as_any(&self) -> &dyn std::any::Any {
158 self
159 }
160 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
161 self
162 }
163
164 fn has_more(&self) -> bool {
165 self.current_index.get() < self.voices.borrow().len()
166 }
167
168 fn get_next(&self) -> Result<Vec<VoiceInfo>, TtsError> {
169 let voices = self.voices.borrow();
170 let current = self.current_index.get();
171
172 if current >= voices.len() {
173 return Ok(vec![]);
174 }
175
176 let remaining: Vec<VoiceInfo> = voices[current..].to_vec();
177 self.current_index.set(voices.len());
178
179 Ok(remaining)
180 }
181
182 fn get_total_count(&self) -> Option<u32> {
183 Some(self.voices.borrow().len() as u32)
184 }
185}
186
187#[warn(dead_code)]
188pub struct DeepgramSynthesisStream {
189 client: DeepgramTtsApi,
190 current_request: RefCell<Option<client::TextToSpeechRequest>>,
191 params: RefCell<Option<client::TextToSpeechParams>>,
192 response_stream: RefCell<Option<golem_wasi_http::Response>>,
193 chunk_buffer: RefCell<Vec<u8>>,
194 bytes_streamed: Cell<usize>,
195 total_chunks_received: Cell<u32>,
196 finished: Cell<bool>,
197 sequence_number: Cell<u32>,
198 stream_started: Cell<bool>,
199}
200
201impl DeepgramSynthesisStream {
202 fn new(voice_id: String, client: DeepgramTtsApi, options: Option<SynthesisOptions>) -> Self {
203 let (request, params) = synthesis_options_to_tts_request(String::new(), options)
204 .unwrap_or_else(|_| {
205 let request = client::TextToSpeechRequest {
206 text: String::new(),
207 };
208 let params = Some(client::TextToSpeechParams {
209 model: Some(voice_id.clone()),
210 encoding: Some("linear16".to_string()),
211 container: Some("wav".to_string()),
212 sample_rate: Some(24000),
213 bit_rate: None,
214 });
215 (request, params)
216 });
217 let mut params = params.unwrap();
218 params.model = Some(voice_id.clone());
219
220 Self {
221 client,
222 current_request: RefCell::new(Some(request)),
223 params: RefCell::new(Some(params)),
224 response_stream: RefCell::new(None),
225 chunk_buffer: RefCell::new(Vec::new()),
226 bytes_streamed: Cell::new(0),
227 total_chunks_received: Cell::new(0),
228 finished: Cell::new(false),
229 sequence_number: Cell::new(0),
230 stream_started: Cell::new(false),
231 }
232 }
233
234 #[allow(dead_code)]
235 fn get_progress(&self) -> (usize, u32) {
236 (self.bytes_streamed.get(), self.total_chunks_received.get())
237 }
238}
239
240impl SynthesisStreamInterface for DeepgramSynthesisStream {
241 fn as_any(&self) -> &dyn std::any::Any {
242 self
243 }
244 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
245 self
246 }
247
248 fn send_text(&self, input: TextInput) -> Result<(), TtsError> {
249 info!("[DEEPGRAM] send_text called with: '{}'", input.content);
250
251 if self.finished.get() {
252 warn!("[DEEPGRAM] Stream already finished, returning error");
253 return Err(TtsError::InvalidConfiguration(
254 "Stream already finished".to_string(),
255 ));
256 }
257
258 {
259 let mut request_ref = self.current_request.borrow_mut();
260 if let Some(mut request) = request_ref.take() {
261 info!(
262 "[DEEPGRAM] Updating request text from '{}' to '{}'",
263 request.text, input.content
264 );
265 request.text = input.content;
266 *request_ref = Some(request);
267 } else {
268 warn!("[DEEPGRAM] Warning: No current request to update");
269 }
270 }
271
272 Ok(())
273 }
274
275 fn finish(&self) -> Result<(), TtsError> {
276 info!("[DEEPGRAM] finish() called");
277
278 if self.stream_started.get() {
279 info!("[DEEPGRAM] Stream already started, returning OK");
280 return Ok(());
281 }
282
283 warn!("[DEEPGRAM] Checking current request state...");
284 let request_debug = self.current_request.borrow();
285 match request_debug.as_ref() {
286 Some(req) => {
287 info!(
288 "[DEEPGRAM] Request exists with text: '{}' (length: {})",
289 req.text,
290 req.text.len()
291 );
292 }
293 None => {
294 warn!("[DEEPGRAM] No request found!");
295 return Err(TtsError::InternalError("No request available".to_string()));
296 }
297 }
298 drop(request_debug);
299
300 if let Some(request) = self.current_request.borrow().as_ref() {
301 warn!("[DEEPGRAM] Current request text: '{}'", request.text);
302 if !request.text.is_empty() {
303 warn!("[DEEPGRAM] Making API call to Deepgram...");
304
305 if let Some(params) = self.params.borrow().as_ref() {
306 warn!(
307 "[DEEPGRAM] Params - model: {:?}, encoding: {:?}, sample_rate: {:?}",
308 params.model, params.encoding, params.sample_rate
309 );
310 } else {
311 warn!("[DEEPGRAM] Warning: No params available");
312 }
313
314 match self
315 .client
316 .text_to_speech_stream(request, self.params.borrow().as_ref())
317 {
318 Ok(response) => {
319 *self.response_stream.borrow_mut() = Some(response);
320 self.stream_started.set(true);
321 }
322 Err(e) => {
323 self.finished.set(true);
324 return Err(e);
325 }
326 }
327 } else {
328 return Err(TtsError::InvalidText(
329 "No text provided for synthesis".to_string(),
330 ));
331 }
332 } else {
333 return Err(TtsError::InternalError("No request available".to_string()));
334 }
335
336 Ok(())
337 }
338
339 fn receive_chunk(&self) -> Result<Option<AudioChunk>, TtsError> {
340 if self.finished.get() {
341 return Ok(None);
342 }
343
344 if !self.stream_started.get() && self.has_pending_audio() {
345 self.finish()?;
346 }
347
348 if let Some(response) = self.response_stream.borrow_mut().take() {
349 const CHUNK_SIZE: usize = 8192;
350
351 match response.bytes() {
352 Ok(bytes) => {
353 if bytes.is_empty() {
354 self.finished.set(true);
355 return Ok(None);
356 }
357
358 let mut current_buffer = self.chunk_buffer.borrow_mut();
359 current_buffer.extend_from_slice(&bytes);
360
361 if current_buffer.len() >= CHUNK_SIZE || bytes.len() < CHUNK_SIZE {
362 let chunk_data: Vec<u8> = if current_buffer.len() <= CHUNK_SIZE {
363 current_buffer.drain(..).collect()
364 } else {
365 current_buffer.drain(..CHUNK_SIZE).collect()
366 };
367
368 let sequence = self.sequence_number.get();
369 self.sequence_number.set(sequence + 1);
370 self.bytes_streamed
371 .set(self.bytes_streamed.get() + chunk_data.len());
372 self.total_chunks_received
373 .set(self.total_chunks_received.get() + 1);
374
375 let is_final = bytes.len() < CHUNK_SIZE && current_buffer.is_empty();
376 if is_final {
377 self.finished.set(true);
378 }
379
380 let chunk = AudioChunk {
381 data: chunk_data,
382 sequence_number: sequence,
383 is_final,
384 timing_info: None,
385 };
386
387 return Ok(Some(chunk));
388 }
389
390 Ok(None)
391 }
392 Err(e) => {
393 self.finished.set(true);
394 Err(TtsError::NetworkError(format!(
395 "Failed to read response: {}",
396 e
397 )))
398 }
399 }
400 } else {
401 if self.stream_started.get() && self.chunk_buffer.borrow().is_empty() {
402 self.finished.set(true);
403 }
404 Ok(None)
405 }
406 }
407
408 fn has_pending_audio(&self) -> bool {
409 !self.finished.get()
410 && (self.response_stream.borrow().is_some()
411 || !self.chunk_buffer.borrow().is_empty()
412 || (!self.stream_started.get()
413 && self
414 .current_request
415 .borrow()
416 .as_ref()
417 .is_some_and(|r| !r.text.is_empty())))
418 }
419
420 fn get_status(&self) -> StreamStatus {
421 if self.finished.get() {
422 StreamStatus::Finished
423 } else if self.stream_started.get() || self.response_stream.borrow().is_some() {
424 StreamStatus::Processing
425 } else {
426 StreamStatus::Ready
427 }
428 }
429
430 fn close(&self) {
431 self.finished.set(true);
432 self.stream_started.set(false);
433 *self.response_stream.borrow_mut() = None;
434 self.chunk_buffer.borrow_mut().clear();
435 }
436}
437
438pub struct DeepgramVoiceConversionStream {
439 _voice_id: String,
440}
441
442impl DeepgramVoiceConversionStream {
443 fn new(voice_id: String, _client: DeepgramTtsApi) -> Self {
444 Self {
445 _voice_id: voice_id,
446 }
447 }
448}
449
450impl VoiceConversionStreamInterface for DeepgramVoiceConversionStream {
451 fn as_any(&self) -> &dyn std::any::Any {
452 self
453 }
454 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
455 self
456 }
457
458 fn send_audio(&self, _audio_data: Vec<u8>) -> Result<(), TtsError> {
459 Err(TtsError::UnsupportedOperation(
460 "Deepgram does not support voice conversion".to_string(),
461 ))
462 }
463
464 fn receive_converted(&self) -> Result<Option<AudioChunk>, TtsError> {
465 Err(TtsError::UnsupportedOperation(
466 "Deepgram does not support voice conversion".to_string(),
467 ))
468 }
469
470 fn finish(&self) -> Result<(), TtsError> {
471 Err(TtsError::UnsupportedOperation(
472 "Deepgram does not support voice conversion".to_string(),
473 ))
474 }
475
476 fn close(&self) {}
477}
478
479pub struct DeepgramPronunciationLexicon {
480 _name: String,
481}
482
483impl DeepgramPronunciationLexicon {
484 fn new(
485 name: String,
486 _language: LanguageCode,
487 _entries: Option<Vec<PronunciationEntry>>,
488 ) -> Self {
489 Self { _name: name }
490 }
491}
492
493impl PronunciationLexiconInterface for DeepgramPronunciationLexicon {
494 fn as_any(&self) -> &dyn std::any::Any {
495 self
496 }
497 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
498 self
499 }
500
501 fn get_name(&self) -> String {
502 self._name.clone()
503 }
504
505 fn get_language(&self) -> LanguageCode {
506 "en".to_string()
507 }
508
509 fn get_entry_count(&self) -> u32 {
510 0
511 }
512
513 fn add_entry(&self, _word: String, _pronunciation: String) -> Result<(), TtsError> {
514 Err(TtsError::UnsupportedOperation(
515 "Deepgram does not support pronunciation lexicons".to_string(),
516 ))
517 }
518
519 fn remove_entry(&self, _word: String) -> Result<(), TtsError> {
520 Err(TtsError::UnsupportedOperation(
521 "Deepgram does not support pronunciation lexicons".to_string(),
522 ))
523 }
524
525 fn export_content(&self) -> Result<String, TtsError> {
526 Err(TtsError::UnsupportedOperation(
527 "Deepgram does not support pronunciation lexicons".to_string(),
528 ))
529 }
530}
531
532pub struct DeepgramLongFormOperation {
533 content: String,
534 voice_id: String,
535 client: DeepgramTtsApi,
536 status: Cell<OperationStatus>,
537 progress: Cell<f32>,
538 audio_chunks: RefCell<Option<Vec<Vec<u8>>>>,
539}
540
541impl DeepgramLongFormOperation {
542 fn new(
543 content: String,
544 _output_location: String,
545 voice_id: String,
546 client: DeepgramTtsApi,
547 _chapter_breaks: Option<Vec<u32>>,
548 ) -> Self {
549 Self {
550 content,
551 voice_id,
552 client,
553 status: Cell::new(OperationStatus::Pending),
554 progress: Cell::new(0.0),
555 audio_chunks: RefCell::new(None),
556 }
557 }
558
559 fn process_long_form(&self) -> Result<(), TtsError> {
560 self.status.set(OperationStatus::Processing);
561
562 let chunks = split_text_intelligently(&self.content, 1000);
563 let mut audio_chunks = Vec::new();
564
565 for (i, chunk) in chunks.iter().enumerate() {
566 let (request, mut params) = synthesis_options_to_tts_request(chunk.clone(), None)?;
567 if let Some(ref mut p) = params {
568 p.model = Some(self.voice_id.clone());
569 }
570
571 match self
572 .client
573 .text_to_speech_with_metadata(&request, params.as_ref())
574 {
575 Ok(tts_response) => {
576 audio_chunks.push(tts_response.audio_data);
577 self.progress.set((i + 1) as f32 / chunks.len() as f32);
578 }
579 Err(e) => {
580 self.status.set(OperationStatus::Failed);
581 return Err(e);
582 }
583 }
584 }
585
586 *self.audio_chunks.borrow_mut() = Some(audio_chunks);
587 self.status.set(OperationStatus::Completed);
588 self.progress.set(1.0);
589
590 Ok(())
591 }
592}
593
594impl LongFormOperationInterface for DeepgramLongFormOperation {
595 fn as_any(&self) -> &dyn std::any::Any {
596 self
597 }
598 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
599 self
600 }
601
602 fn get_status(&self) -> OperationStatus {
603 self.status.get()
604 }
605
606 fn get_progress(&self) -> f32 {
607 self.progress.get()
608 }
609
610 fn cancel(&self) -> Result<(), TtsError> {
611 self.status.set(OperationStatus::Cancelled);
612 Ok(())
613 }
614
615 fn get_result(&self) -> Result<LongFormResult, TtsError> {
616 if self.status.get() != OperationStatus::Completed {
617 return Err(TtsError::InvalidConfiguration(
618 "Operation not completed".to_string(),
619 ));
620 }
621
622 if let Some(chunks) = self.audio_chunks.borrow().as_ref() {
623 let mut combined_audio = Vec::new();
624 for chunk in chunks {
625 combined_audio.extend_from_slice(chunk);
626 }
627
628 let result = LongFormResult {
629 output_location: "deepgram-synthesis".to_string(),
630 total_duration: estimate_audio_duration(&combined_audio, 24000),
631 chapter_durations: None,
632 metadata: audio_data_to_synthesis_result(
633 combined_audio.clone(),
634 &self.content,
635 "linear16",
636 24000,
637 )
638 .metadata,
639 };
640
641 Ok(result)
642 } else {
643 Err(TtsError::InternalError(
644 "No audio data available".to_string(),
645 ))
646 }
647 }
648}
649
650pub struct DeepgramTts;
651
652impl DeepgramTts {
653 fn create_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
654 DeepgramTtsApi::new(provider_config)
655 }
656
657 fn create_batch_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
658 let batch_config = RateLimitConfig {
659 max_retries: 5,
660 initial_delay: std::time::Duration::from_millis(500),
661 max_delay: std::time::Duration::from_secs(60),
662 backoff_multiplier: 1.5,
663 };
664 DeepgramTtsApi::new(provider_config).with_rate_limit_config(batch_config)
665 }
666
667 fn create_streaming_client(provider_config: &DeepgramConfig) -> DeepgramTtsApi {
668 let streaming_config = RateLimitConfig {
669 max_retries: 3,
670 initial_delay: std::time::Duration::from_millis(200),
671 max_delay: std::time::Duration::from_secs(5),
672 backoff_multiplier: 2.0,
673 };
674 DeepgramTtsApi::new(provider_config).with_rate_limit_config(streaming_config)
675 }
676}
677
678impl VoiceProvider for DeepgramTts {
679 type Voice = DeepgramVoiceImpl;
680 type VoiceResults = DeepgramVoiceResults;
681 type ProviderConfig = DeepgramConfig;
682
683 fn list_voices(
684 provider_config: Self::ProviderConfig,
685 filter: Option<VoiceFilter>,
686 ) -> Result<VoiceResults, TtsError> {
687 let client = Self::create_client(&provider_config);
688 let models = get_available_models();
689
690 if let Some(f) = filter.as_ref() {
691 let mut voice_filters = crate::client::VoiceFilters::new();
692
693 if let Some(lang) = &f.language {
694 voice_filters = voice_filters.with_language(lang.clone());
695 }
696
697 if let Some(gender) = f.gender {
698 let gender_str = match gender {
699 VoiceGender::Male => "masculine",
700 VoiceGender::Female => "feminine",
701 VoiceGender::Neutral => "neutral",
702 };
703 voice_filters = voice_filters.with_gender(gender_str.to_string());
704 }
705
706 if let Some(quality) = f.quality {
707 let _quality_filter = match quality {
708 VoiceQuality::Standard => crate::client::VoiceQuality::Standard,
709 VoiceQuality::Premium => crate::client::VoiceQuality::Premium,
710 VoiceQuality::Neural => crate::client::VoiceQuality::Professional,
711 VoiceQuality::Studio => crate::client::VoiceQuality::Professional,
712 };
713 voice_filters = voice_filters.with_version(crate::client::ModelVersion::Aura2);
714 }
715
716 if let Some(query) = &f.search_query {
717 voice_filters = voice_filters.with_search(query.clone());
718 }
719
720 let filtered_response = client.get_models_filtered(&voice_filters)?;
721 let voice_infos: Vec<VoiceInfo> = filtered_response
722 .models
723 .into_iter()
724 .map(deepgram_model_to_voice_info)
725 .collect();
726
727 return Ok(VoiceResults::new(DeepgramVoiceResults::new(voice_infos)));
728 }
729
730 let mut voice_infos: Vec<VoiceInfo> = models
731 .into_iter()
732 .map(deepgram_model_to_voice_info)
733 .collect();
734
735 if let Some(f) = filter {
736 voice_infos.retain(|voice| {
737 let mut matches = true;
738
739 if let Some(gender) = f.gender {
740 matches = matches && voice.gender == gender;
741 }
742
743 if let Some(quality) = f.quality {
744 matches = matches && voice.quality == quality;
745 }
746
747 if let Some(lang) = &f.language {
748 matches = matches && voice.language == *lang;
749 }
750
751 if let Some(provider) = &f.provider {
752 matches = matches && voice.provider.contains(provider);
753 }
754
755 if let Some(query) = &f.search_query {
756 let query_lower = query.to_lowercase();
757 matches = matches
758 && (voice.name.to_lowercase().contains(&query_lower)
759 || voice
760 .description
761 .as_ref()
762 .is_some_and(|d| d.to_lowercase().contains(&query_lower))
763 || voice
764 .use_cases
765 .iter()
766 .any(|uc| uc.to_lowercase().contains(&query_lower)));
767 }
768
769 matches
770 });
771 }
772
773 Ok(VoiceResults::new(DeepgramVoiceResults::new(voice_infos)))
774 }
775
776 fn get_voice(
777 provider_config: Self::ProviderConfig,
778 voice_id: String,
779 ) -> Result<Voice, TtsError> {
780 let client = Self::create_client(&provider_config);
781 let models = get_available_models();
782
783 if let Some(model) = models.into_iter().find(|m| m.voice_id == voice_id) {
784 Ok(Voice::new(DeepgramVoiceImpl::new(model, client)))
785 } else {
786 Err(TtsError::VoiceNotFound(format!(
787 "Voice '{}' not found",
788 voice_id
789 )))
790 }
791 }
792
793 fn search_voices(
794 provider_config: Self::ProviderConfig,
795 filter: Option<VoiceFilter>,
796 ) -> Result<Vec<VoiceInfo>, TtsError> {
797 let client = Self::create_client(&provider_config);
798
799 let search_query = filter
802 .as_ref()
803 .and_then(|f| f.search_query.as_ref())
804 .cloned()
805 .unwrap_or_else(String::new);
806
807 let mut voice_infos: Vec<VoiceInfo> = if !search_query.is_empty() {
808 let search_results = client.search_models(&search_query)?;
809 search_results
810 .into_iter()
811 .map(deepgram_model_to_voice_info)
812 .collect()
813 } else {
814 let all_models = get_available_models();
815 all_models
816 .into_iter()
817 .map(deepgram_model_to_voice_info)
818 .collect()
819 };
820
821 if let Some(f) = filter {
822 voice_infos.retain(|voice| {
823 let mut matches = true;
824
825 if let Some(gender) = f.gender {
826 matches = matches && voice.gender == gender;
827 }
828
829 if let Some(quality) = f.quality {
830 matches = matches && voice.quality == quality;
831 }
832
833 if let Some(lang) = &f.language {
834 matches = matches && voice.language == *lang;
835 }
836
837 if let Some(provider) = &f.provider {
838 matches = matches && voice.provider.contains(provider);
839 }
840
841 matches
842 });
843 }
844
845 Ok(voice_infos)
846 }
847
848 fn list_languages(
849 _provider_config: Self::ProviderConfig,
850 ) -> Result<Vec<LanguageInfo>, TtsError> {
851 let models = get_available_models();
852 Ok(models_to_language_info(models))
853 }
854}
855
856impl SynthesizeProvider for DeepgramTts {
857 type ProviderConfig = DeepgramConfig;
858
859 fn synthesize(
860 provider_config: Self::ProviderConfig,
861 input: TextInput,
862 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
863 options: Option<SynthesisOptions>,
864 ) -> Result<SynthesisResult, TtsError> {
865 validate_synthesis_request(
866 &input.content,
867 input.text_type,
868 input.language.as_deref(),
869 options.as_ref(),
870 )?;
871
872 let client = Self::create_client(&provider_config);
873 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
874
875 let max_chars = get_max_chars_for_model(Some(&voice_id));
876
877 if input.content.len() <= max_chars {
878 let (request, mut params) =
879 synthesis_options_to_tts_request(input.content.clone(), options)?;
880 if let Some(ref mut p) = params {
881 p.model = Some(voice_id);
882 }
883
884 let tts_response = client.text_to_speech_with_metadata(&request, params.as_ref())?;
885 let encoding = params
886 .as_ref()
887 .and_then(|p| p.encoding.as_ref())
888 .unwrap_or(&"linear16".to_string())
889 .clone();
890 let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
891
892 let mut synthesis_result = audio_data_to_synthesis_result(
893 tts_response.audio_data,
894 &input.content,
895 &encoding,
896 sample_rate,
897 );
898
899 synthesis_result.metadata.provider_info = Some(format!(
900 "Deepgram TTS - Model: {}, Characters: {}",
901 tts_response.metadata.dg_model_name, tts_response.metadata.dg_char_count
902 ));
903
904 Ok(synthesis_result)
905 } else {
906 let chunks = split_text_intelligently(&input.content, max_chars);
907 let mut combined_audio = Vec::new();
908 let mut total_characters = 0u32;
909 let mut total_words = 0u32;
910 let mut total_duration = 0.0f32;
911
912 for (chunk_index, chunk) in chunks.iter().enumerate() {
913 let (request, mut params) =
914 synthesis_options_to_tts_request(chunk.clone(), options.clone())?;
915 if let Some(ref mut p) = params {
916 p.model = Some(voice_id.clone());
917 }
918
919 let tts_response =
920 client.text_to_speech_with_metadata(&request, params.as_ref())?;
921
922 combined_audio.extend_from_slice(&tts_response.audio_data);
923
924 total_characters += chunk.chars().count() as u32;
925 total_words += chunk.split_whitespace().count() as u32;
926
927 let encoding = params
928 .as_ref()
929 .and_then(|p| p.encoding.as_ref())
930 .cloned()
931 .unwrap_or_else(|| "linear16".to_string());
932 let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
933 total_duration += estimate_audio_duration(&tts_response.audio_data, sample_rate);
934
935 if chunk_index < chunks.len() - 1 {
936 let silence_samples = (sample_rate as f32 * 0.1) as usize;
937 let silence_bytes = match encoding.as_str() {
938 "linear16" => silence_samples * 2,
939 "mulaw" | "alaw" => silence_samples,
940 _ => silence_samples * 2,
941 };
942 combined_audio.extend(vec![0u8; silence_bytes]);
943 total_duration += 0.1;
944 }
945 }
946
947 let audio_size_bytes = combined_audio.len() as u32;
948 let synthesis_result = SynthesisResult {
949 audio_data: combined_audio,
950 metadata: SynthesisMetadata {
951 duration_seconds: total_duration,
952 character_count: total_characters,
953 word_count: total_words,
954 audio_size_bytes,
955 request_id: format!("deepgram-chunked-{}", chrono::Utc::now().timestamp()),
956 provider_info: Some(format!(
957 "Deepgram TTS - Model: {}, Chunks: {}, Total Characters: {}",
958 voice_id,
959 chunks.len(),
960 total_characters
961 )),
962 },
963 };
964
965 Ok(synthesis_result)
966 }
967 }
968
969 fn synthesize_batch(
970 provider_config: Self::ProviderConfig,
971 inputs: Vec<TextInput>,
972 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
973 options: Option<SynthesisOptions>,
974 ) -> Result<Vec<SynthesisResult>, TtsError> {
975 let mut results = Vec::new();
976 let client = Self::create_batch_client(&provider_config);
977 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
978
979 for input in inputs {
980 validate_synthesis_request(
981 &input.content,
982 input.text_type,
983 input.language.as_deref(),
984 options.as_ref(),
985 )?;
986
987 let (request, mut params) =
988 synthesis_options_to_tts_request(input.content.clone(), options.clone())?;
989 if let Some(ref mut p) = params {
990 p.model = Some(voice_id.clone());
991 }
992
993 match client.text_to_speech_with_metadata(&request, params.as_ref()) {
994 Ok(tts_response) => {
995 let encoding = params
996 .as_ref()
997 .and_then(|p| p.encoding.as_ref())
998 .unwrap_or(&"linear16".to_string())
999 .clone();
1000 let sample_rate = params.as_ref().and_then(|p| p.sample_rate).unwrap_or(24000);
1001
1002 let mut synthesis_result = audio_data_to_synthesis_result(
1003 tts_response.audio_data,
1004 &input.content,
1005 &encoding,
1006 sample_rate,
1007 );
1008
1009 synthesis_result.metadata.provider_info = Some(format!(
1010 "Deepgram TTS - Model: {}, Characters: {}",
1011 tts_response.metadata.dg_model_name, tts_response.metadata.dg_char_count
1012 ));
1013
1014 results.push(synthesis_result);
1015 }
1016 Err(e) => {
1017 return Err(e);
1018 }
1019 }
1020 }
1021
1022 Ok(results)
1023 }
1024
1025 fn get_timing_marks(
1026 _provider_config: Self::ProviderConfig,
1027 _input: TextInput,
1028 _voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1029 ) -> Result<Vec<TimingInfo>, TtsError> {
1030 Err(TtsError::UnsupportedOperation(
1031 "Timing marks not supported by Deepgram".to_string(),
1032 ))
1033 }
1034
1035 fn validate_input(
1036 _provider_config: Self::ProviderConfig,
1037 input: TextInput,
1038 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1039 ) -> Result<ValidationResult, TtsError> {
1040 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1041
1042 let mut _is_valid = true;
1043 let mut messages = Vec::new();
1044
1045 if input.content.is_empty() {
1046 _is_valid = false;
1047 messages.push("Text input cannot be empty".to_string());
1048 }
1049
1050 let char_limit = if voice_id.starts_with("aura-2") {
1051 2000
1052 } else {
1053 1000
1054 };
1055 if input.content.len() > char_limit {
1056 _is_valid = false;
1057 messages.push(format!(
1058 "Text exceeds {} character limit for {}",
1059 char_limit, voice_id
1060 ));
1061 }
1062
1063 if input
1064 .content
1065 .chars()
1066 .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
1067 {
1068 messages.push(
1069 "Warning: Text contains control characters that may not be processed correctly"
1070 .to_string(),
1071 );
1072 }
1073
1074 let _message = if messages.is_empty() {
1075 None
1076 } else {
1077 Some(messages.join("; "))
1078 };
1079
1080 Ok(validate_text_input(&input.content, Some(&voice_id)))
1081 }
1082}
1083
1084impl StreamingVoiceProvider for DeepgramTts {
1085 type SynthesisStream = DeepgramSynthesisStream;
1086 type VoiceConversionStream = DeepgramVoiceConversionStream;
1087 type ProviderConfig = DeepgramConfig;
1088
1089 fn create_stream(
1090 provider_config: Self::ProviderConfig,
1091 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1092 options: Option<SynthesisOptions>,
1093 ) -> Result<SynthesisStream, TtsError> {
1094 let client = Self::create_streaming_client(&provider_config);
1095 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1096
1097 let stream = DeepgramSynthesisStream::new(voice_id, client, options);
1098 Ok(SynthesisStream::new(stream))
1099 }
1100
1101 fn create_voice_conversion_stream(
1102 provider_config: Self::ProviderConfig,
1103 target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1104 _options: Option<SynthesisOptions>,
1105 ) -> Result<VoiceConversionStream, TtsError> {
1106 let client = Self::create_client(&provider_config);
1107 let voice_id = target_voice.get::<DeepgramVoiceImpl>().get_id();
1108
1109 let stream = DeepgramVoiceConversionStream::new(voice_id, client);
1110 Ok(VoiceConversionStream::new(stream))
1111 }
1112}
1113
1114impl AdvancedTtsProvider for DeepgramTts {
1115 type PronunciationLexicon = DeepgramPronunciationLexicon;
1116 type LongFormOperation = DeepgramLongFormOperation;
1117 type ProviderConfig = DeepgramConfig;
1118
1119 fn create_voice_clone(
1120 _provider_config: Self::ProviderConfig,
1121 _name: String,
1122 _audio_samples: Vec<AudioSample>,
1123 _description: Option<String>,
1124 ) -> Result<Voice, TtsError> {
1125 Err(TtsError::UnsupportedOperation(
1126 "Deepgram does not support voice cloning".to_string(),
1127 ))
1128 }
1129
1130 fn design_voice(
1131 _provider_config: Self::ProviderConfig,
1132 _name: String,
1133 _characteristics: VoiceDesignParams,
1134 ) -> Result<Voice, TtsError> {
1135 Err(TtsError::UnsupportedOperation(
1136 "Deepgram does not support voice design".to_string(),
1137 ))
1138 }
1139
1140 fn convert_voice(
1141 _provider_config: Self::ProviderConfig,
1142 _input_audio: Vec<u8>,
1143 _target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1144 _preserve_timing: Option<bool>,
1145 ) -> Result<Vec<u8>, TtsError> {
1146 Err(TtsError::UnsupportedOperation(
1147 "Deepgram does not support voice conversion".to_string(),
1148 ))
1149 }
1150
1151 fn generate_sound_effect(
1152 _provider_config: Self::ProviderConfig,
1153 _description: String,
1154 _duration_seconds: Option<f32>,
1155 _style_influence: Option<f32>,
1156 ) -> Result<Vec<u8>, TtsError> {
1157 Err(TtsError::UnsupportedOperation(
1158 "Deepgram does not support sound effect generation".to_string(),
1159 ))
1160 }
1161
1162 fn create_lexicon(
1163 _provider_config: Self::ProviderConfig,
1164 name: String,
1165 language: LanguageCode,
1166 entries: Option<Vec<PronunciationEntry>>,
1167 ) -> Result<PronunciationLexicon, TtsError> {
1168 let lexicon = DeepgramPronunciationLexicon::new(name, language, entries);
1169 Ok(PronunciationLexicon::new(lexicon))
1170 }
1171
1172 fn synthesize_long_form(
1173 provider_config: Self::ProviderConfig,
1174 content: String,
1175 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1176 output_location: String,
1177 chapter_breaks: Option<Vec<u32>>,
1178 ) -> Result<LongFormOperation, TtsError> {
1179 let client = Self::create_batch_client(&provider_config);
1180 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1181
1182 let operation = DeepgramLongFormOperation::new(
1183 content,
1184 output_location,
1185 voice_id,
1186 client,
1187 chapter_breaks,
1188 );
1189
1190 operation.process_long_form()?;
1191
1192 Ok(LongFormOperation::new(operation))
1193 }
1194}
1195
1196impl ExtendedTtsProvider for DeepgramTts {
1197 fn unwrapped_synthesis_stream(
1198 provider_config: <Self as VoiceProvider>::ProviderConfig,
1199 voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1200 options: Option<SynthesisOptions>,
1201 ) -> Self::SynthesisStream {
1202 let client = Self::create_streaming_client(&provider_config);
1203 let voice_id = voice.get::<DeepgramVoiceImpl>().get_id();
1204
1205 DeepgramSynthesisStream::new(voice_id, client, options)
1206 }
1207
1208 fn unwrapped_voice_conversion_stream(
1209 provider_config: <Self as VoiceProvider>::ProviderConfig,
1210 target_voice: golem_ai_tts::model::voices::VoiceBorrow<'_>,
1211 _options: Option<SynthesisOptions>,
1212 ) -> Self::VoiceConversionStream {
1213 let client = Self::create_client(&provider_config);
1214 let voice_id = target_voice.get::<DeepgramVoiceImpl>().get_id();
1215
1216 DeepgramVoiceConversionStream::new(voice_id, client)
1217 }
1218
1219 fn subscribe_synthesis_stream(_stream: &Self::SynthesisStream) -> Pollable {
1220 subscribe_zero()
1221 }
1222
1223 fn subscribe_voice_conversion_stream(_stream: &Self::VoiceConversionStream) -> Pollable {
1224 subscribe_zero()
1225 }
1226}
1227
/// The Deepgram provider wrapped in the `DurableTts` adapter from
/// `golem_ai_tts::durability`.
pub type DurableDeepgramTts = DurableTts<DeepgramTts>;