quantum_sdk/
audio.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::client::Client;
6use crate::error::Result;
7
/// Request body for text-to-speech.
///
/// `model` and `text` are always serialized; each `Option` field is left out
/// of the serialized body while it is `None`. The struct derives `Default`,
/// so callers can set only the fields they need.
#[derive(Debug, Clone, Serialize, Default)]
pub struct TextToSpeechRequest {
    /// TTS model (e.g. "tts-1", "eleven_multilingual_v2", "grok-3-tts").
    pub model: String,

    /// Text to synthesise into speech.
    pub text: String,

    /// Voice to use (e.g. "alloy", "echo", "nova", "Rachel").
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,

    /// Audio format (e.g. "mp3", "wav", "opus"). Default: "mp3".
    /// Serialized under the key `format` (not `output_format`); not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,

    /// Speech rate (provider-dependent). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f64>,
}

/// Backwards-compatible alias for [`TextToSpeechRequest`].
pub type TtsRequest = TextToSpeechRequest;
32
33/// Response from text-to-speech.
34#[derive(Debug, Clone, Deserialize)]
35pub struct TextToSpeechResponse {
36    /// Base64-encoded audio data.
37    pub audio_base64: String,
38
39    /// Audio format (e.g. "mp3").
40    pub format: String,
41
42    /// Audio file size.
43    pub size_bytes: i64,
44
45    /// Model that generated the audio.
46    pub model: String,
47
48    /// Total cost in ticks.
49    #[serde(default)]
50    pub cost_ticks: i64,
51
52    /// Unique request identifier.
53    #[serde(default)]
54    pub request_id: String,
55}
56
57/// Backwards-compatible alias.
58pub type TtsResponse = TextToSpeechResponse;
59
/// Request body for speech-to-text.
///
/// `model` and `audio_base64` are always serialized; the optional hints are
/// left out of the serialized body while they are `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SpeechToTextRequest {
    /// STT model (e.g. "whisper-1", "scribe_v2").
    pub model: String,

    /// Base64-encoded audio data.
    pub audio_base64: String,

    /// Original filename (helps with format detection). Default: "audio.mp3".
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub filename: Option<String>,

    /// BCP-47 language code hint (e.g. "en", "de"). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
}

/// Backwards-compatible alias for [`SpeechToTextRequest`].
pub type SttRequest = SpeechToTextRequest;
80
81/// Response from speech-to-text.
82#[derive(Debug, Clone, Deserialize)]
83pub struct SpeechToTextResponse {
84    /// Transcribed text.
85    pub text: String,
86
87    /// Model that performed transcription.
88    pub model: String,
89
90    /// Total cost in ticks.
91    #[serde(default)]
92    pub cost_ticks: i64,
93
94    /// Unique request identifier.
95    #[serde(default)]
96    pub request_id: String,
97}
98
99/// Backwards-compatible alias.
100pub type SttResponse = SpeechToTextResponse;
101
/// Request body for music generation.
///
/// `model` and `prompt` are always serialized; `duration_seconds` only when set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct MusicRequest {
    /// Music generation model (e.g. "lyria").
    pub model: String,

    /// Describes the music to generate.
    pub prompt: String,

    /// Target duration in seconds (default 30). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration_seconds: Option<i32>,
}
115
/// Response from music generation.
///
/// Every field is `#[serde(default)]`, so any of them may be missing from the
/// input and will then hold its `Default` value.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicResponse {
    /// Generated music clips. Empty when absent from the input.
    #[serde(default)]
    pub audio_clips: Vec<MusicClip>,

    /// Model that generated the music. Empty when absent from the input.
    #[serde(default)]
    pub model: String,

    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,

    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
135
/// A single generated music clip, as carried in `MusicResponse::audio_clips`.
///
/// `base64` is required; the remaining fields default when absent.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicClip {
    /// Base64-encoded audio data.
    pub base64: String,

    /// Audio format (e.g. "mp3", "wav"). Empty when absent from the input.
    #[serde(default)]
    pub format: String,

    /// Audio file size. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,

    /// Clip index within the batch. 0 when absent from the input.
    #[serde(default)]
    pub index: i32,
}
154
/// Request body for sound effects generation.
///
/// `prompt` is always serialized; `duration_seconds` only when set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SoundEffectRequest {
    /// Text prompt describing the sound effect.
    pub prompt: String,

    /// Optional duration in seconds (fractional values allowed).
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration_seconds: Option<f64>,
}
165
/// Response from sound effects generation.
///
/// `audio_base64` and `format` are required; the metadata fields default
/// when absent from the input.
#[derive(Debug, Clone, Deserialize)]
pub struct SoundEffectResponse {
    /// Base64-encoded audio data.
    pub audio_base64: String,

    /// Audio format (e.g. "mp3").
    pub format: String,

    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,

    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,

    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,

    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
191
192// ---------------------------------------------------------------------------
193// Advanced Audio Types
194// ---------------------------------------------------------------------------
195
/// Generic audio response used by multiple advanced audio endpoints.
///
/// No field is mandatory: the audio-related fields are `Option`s, the
/// bookkeeping fields default, and any key that does not match a named field
/// is collected into `extra` through `#[serde(flatten)]`.
#[derive(Debug, Clone, Deserialize)]
pub struct AudioResponse {
    /// Base64-encoded audio data. `None` when absent from the input.
    #[serde(default)]
    pub audio_base64: Option<String>,

    /// Audio format (e.g. "mp3", "wav"). `None` when absent from the input.
    #[serde(default)]
    pub format: Option<String>,

    /// File size in bytes. `None` when absent from the input.
    #[serde(default)]
    pub size_bytes: Option<i64>,

    /// Model used. `None` when absent from the input.
    #[serde(default)]
    pub model: Option<String>,

    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,

    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,

    /// Additional response fields: every key not claimed by a field above,
    /// kept as raw JSON values.
    #[serde(flatten)]
    pub extra: HashMap<String, serde_json::Value>,
}
227
/// A single dialogue turn (used for building the request — converted to text + voices).
///
/// `DialogueRequest::from_turns` consumes these: `speaker` and `text` form
/// one script line, and `voice` (when set) supplies the speaker's voice mapping.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DialogueTurn {
    /// Speaker name or identifier.
    pub speaker: String,

    /// Text for this speaker to say.
    pub text: String,

    /// Voice ID to use for this speaker. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
}
241
/// Voice mapping for ElevenLabs dialogue: ties one speaker name to one voice.
#[derive(Debug, Clone, Serialize)]
pub struct DialogueVoice {
    /// Identifier of the voice assigned to the speaker.
    pub voice_id: String,
    /// Speaker name this voice applies to (`from_turns` fills it from `DialogueTurn::speaker`).
    pub name: String,
}
248
/// Request body sent to the QAI proxy for dialogue generation.
/// The proxy expects `text` (full script) + `voices` (speaker-to-voice mapping).
///
/// `text` and `voices` are always serialized; the optional fields only when set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct DialogueRequest {
    /// Full dialogue script (e.g. "Speaker1: Hello!\nSpeaker2: Hi there!").
    pub text: String,

    /// Voice mappings — each speaker name mapped to a voice_id.
    pub voices: Vec<DialogueVoice>,

    /// Dialogue model. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// Output audio format. Not serialized when `None`.
    /// Unlike the other request types in this file, the key stays
    /// `output_format` rather than being renamed to `format`.
    #[serde(rename = "output_format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,

    /// Seed for reproducible generation. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
}
271
272impl DialogueRequest {
273    /// Build a DialogueRequest from individual turns.
274    /// Converts turns into the text + voices format the API expects.
275    pub fn from_turns(turns: Vec<DialogueTurn>, model: Option<String>) -> Self {
276        // Build the script text: "Speaker: text\n..."
277        let text = turns.iter()
278            .map(|t| format!("{}: {}", t.speaker, t.text))
279            .collect::<Vec<_>>()
280            .join("\n");
281
282        // Deduplicate voices — one entry per unique speaker
283        let mut seen = std::collections::HashSet::new();
284        let voices: Vec<DialogueVoice> = turns.iter()
285            .filter(|t| t.voice.is_some() && seen.insert(t.speaker.clone()))
286            .map(|t| DialogueVoice {
287                voice_id: t.voice.clone().unwrap_or_default(),
288                name: t.speaker.clone(),
289            })
290            .collect();
291
292        Self {
293            text,
294            voices,
295            model,
296            ..Default::default()
297        }
298    }
299}
300
/// Request body for speech-to-speech conversion.
///
/// Only `audio_base64` is always serialized; every `Option` field is left
/// out of the serialized body while it is `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SpeechToSpeechRequest {
    /// Model for conversion. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// Base64-encoded source audio.
    pub audio_base64: String,

    /// Target voice identifier. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice_id: Option<String>,

    /// Target voice name (alternative to voice_id). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,

    /// Output audio format. Serialized under the key `format`; not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,
}
323
/// Request body for voice isolation.
#[derive(Debug, Clone, Serialize, Default)]
pub struct IsolateVoiceRequest {
    /// Base64-encoded audio to isolate voice from.
    pub audio_base64: String,

    /// Output audio format. Serialized under the key `format`; not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,
}

/// Backwards-compatible alias for [`IsolateVoiceRequest`].
pub type IsolateRequest = IsolateVoiceRequest;
337
/// Request body for voice remixing.
///
/// Only `audio_base64` is always serialized; the `Option` fields only when set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct RemixVoiceRequest {
    /// Base64-encoded source audio.
    pub audio_base64: String,

    /// Target voice for the remix. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,

    /// Model for remixing. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,

    /// Output audio format. Serialized under the key `format`; not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,
}

/// Backwards-compatible alias for [`RemixVoiceRequest`].
pub type RemixRequest = RemixVoiceRequest;
359
/// Request body for audio dubbing.
///
/// `target_lang` is the only field that is always serialized. The source
/// media is given through `audio_base64` or `source_url`; this type does not
/// itself check that one of them is set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct DubRequest {
    /// Base64-encoded source audio or video. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_base64: Option<String>,

    /// Original filename (helps detect format). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub filename: Option<String>,

    /// URL to source media (alternative to audio_base64). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_url: Option<String>,

    /// Target language (BCP-47 code, e.g. "es", "de").
    pub target_lang: String,

    /// Source language (auto-detected if omitted). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_lang: Option<String>,

    /// Number of speakers (optional). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub num_speakers: Option<i32>,

    /// Enable highest quality processing. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub highest_resolution: Option<bool>,
}
390
/// Request body for audio alignment / forced alignment.
///
/// `audio_base64` and `text` are always serialized; `language` only when set.
#[derive(Debug, Clone, Serialize, Default)]
pub struct AlignRequest {
    /// Base64-encoded audio data.
    pub audio_base64: String,

    /// Transcript text to align against the audio.
    pub text: String,

    /// Language code. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
}
404
/// A single alignment segment, as carried in `AlignResponse::segments`.
///
/// All three fields are required when deserializing.
#[derive(Debug, Clone, Deserialize)]
pub struct AlignmentSegment {
    /// Aligned text.
    pub text: String,

    /// Start time in seconds.
    pub start: f64,

    /// End time in seconds.
    pub end: f64,
}
417
/// A single word with timing information from forced alignment, as carried
/// in `AlignResponse::alignment`.
///
/// Note the timing keys are `start_time` / `end_time` here, whereas
/// `AlignmentSegment` uses `start` / `end`.
#[derive(Debug, Clone, Deserialize)]
pub struct AlignedWord {
    /// Word text.
    pub text: String,

    /// Start time in seconds.
    pub start_time: f64,

    /// End time in seconds.
    pub end_time: f64,

    /// Alignment confidence score. 0.0 when absent from the input.
    #[serde(default)]
    pub confidence: f64,
}
434
/// Response from audio alignment.
///
/// Every field is `#[serde(default)]`; a field missing from the input holds
/// its `Default` value (empty list, empty string, or 0).
#[derive(Debug, Clone, Deserialize)]
pub struct AlignResponse {
    /// Aligned segments. Empty when absent from the input.
    #[serde(default)]
    pub segments: Vec<AlignmentSegment>,

    /// Word-level alignment. Empty when absent from the input.
    #[serde(default)]
    pub alignment: Vec<AlignedWord>,

    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,

    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,

    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
458
459// ---------------------------------------------------------------------------
460// Typed response structs (parity with Go SDK)
461// ---------------------------------------------------------------------------
462
/// Response from dialogue generation.
///
/// `audio_base64` and `format` are required; the other fields default when absent.
// NOTE(review): `Client::dialogue` in this file returns `AudioResponse`, not
// this type — confirm where this typed response is meant to be used.
#[derive(Debug, Clone, Deserialize)]
pub struct DialogueResponse {
    /// Base64-encoded audio data.
    pub audio_base64: String,
    /// Audio format.
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,
    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
477
/// Response from speech-to-speech conversion.
///
/// `audio_base64` and `format` are required; the other fields default when absent.
// NOTE(review): `Client::speech_to_speech` in this file returns
// `AudioResponse`, not this type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct SpeechToSpeechResponse {
    /// Base64-encoded audio data.
    pub audio_base64: String,
    /// Audio format.
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,
    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
492
/// Response from voice isolation.
///
/// `audio_base64` and `format` are required; the other fields default when
/// absent. This type has no `model` field.
// NOTE(review): `Client::isolate_voice` in this file returns `AudioResponse`,
// not this type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct IsolateVoiceResponse {
    /// Base64-encoded audio data.
    pub audio_base64: String,
    /// Audio format.
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
505
/// Response from voice remixing.
///
/// `format` is the only required field; the audio payload and `voice_id` are
/// optional and the remaining fields default when absent.
// NOTE(review): `Client::remix_voice` in this file returns `AudioResponse`,
// not this type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct RemixVoiceResponse {
    /// Base64-encoded audio data. `None` when absent from the input.
    #[serde(default)]
    pub audio_base64: Option<String>,
    /// Audio format.
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,
    /// Voice identifier reported with the remix. `None` when absent from the input.
    #[serde(default)]
    pub voice_id: Option<String>,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
521
/// Response from dubbing.
///
/// `dubbing_id`, `audio_base64` and `format` are required; the remaining
/// fields default when absent.
// NOTE(review): `Client::dub` in this file returns `AudioResponse`, not this
// type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct DubResponse {
    /// Identifier of the dubbing.
    pub dubbing_id: String,
    /// Base64-encoded audio data.
    pub audio_base64: String,
    /// Audio format.
    pub format: String,
    /// Target language of the dub. Empty when absent from the input.
    #[serde(default)]
    pub target_lang: String,
    /// Status string reported for the dubbing. Empty when absent from the input.
    #[serde(default)]
    pub status: String,
    /// Processing time in seconds. 0.0 when absent from the input.
    #[serde(default)]
    pub processing_time_seconds: f64,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
539
/// Response from voice design.
///
/// `previews` is required; the bookkeeping fields default when absent.
// NOTE(review): `Client::voice_design` in this file returns `AudioResponse`,
// where a `previews` key would end up in `extra` — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct VoiceDesignResponse {
    /// Generated voice previews.
    pub previews: Vec<VoicePreview>,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
549
/// A single voice preview from voice design, as carried in
/// `VoiceDesignResponse::previews`. All fields are required.
#[derive(Debug, Clone, Deserialize)]
pub struct VoicePreview {
    /// Identifier of the generated voice.
    pub generated_voice_id: String,
    /// Base64-encoded preview audio.
    pub audio_base64: String,
    /// Audio format of the preview.
    pub format: String,
}
557
/// Response from Starfish TTS.
///
/// `format` is the only required field. The audio may arrive inline
/// (`audio_base64`) and/or as a `url`; both are optional.
// NOTE(review): `Client::starfish_tts` in this file returns `AudioResponse`,
// not this type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct StarfishTTSResponse {
    /// Base64-encoded audio data. `None` when absent from the input.
    #[serde(default)]
    pub audio_base64: Option<String>,
    /// URL of the audio. `None` when absent from the input.
    #[serde(default)]
    pub url: Option<String>,
    /// Audio format.
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size_bytes: i64,
    /// Audio duration (presumably seconds — TODO confirm). 0.0 when absent from the input.
    #[serde(default)]
    pub duration: f64,
    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
577
/// Advanced music generation request.
///
/// `prompt` is always serialized; the `Option` fields only when set.
// NOTE(review): `Client::generate_music_advanced` in this file takes
// `ElevenMusicRequest`; no method visible here uses this type.
#[derive(Debug, Clone, Serialize, Default)]
pub struct MusicAdvancedRequest {
    /// Describes the music to generate.
    pub prompt: String,
    /// Target duration in seconds. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration_seconds: Option<i32>,
    /// Music generation model. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Identifier of a finetune to generate with. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub finetune_id: Option<String>,
}
589
/// A single clip from advanced music generation.
///
/// Every field defaults when absent. Same field set as `ElevenMusicClip`;
/// note the size key is `size`, not `size_bytes` as in `MusicClip`.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicAdvancedClip {
    /// Base64-encoded audio data. Empty when absent from the input.
    #[serde(default)]
    pub base64: String,
    /// Audio format. Empty when absent from the input.
    #[serde(default)]
    pub format: String,
    /// File size (presumably bytes — TODO confirm). 0 when absent from the input.
    #[serde(default)]
    pub size: i64,
}
600
/// Response from advanced music generation.
///
/// Every field defaults when absent. Same field set as `ElevenMusicResponse`,
/// which is the type `Client::generate_music_advanced` returns in this file.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicAdvancedResponse {
    /// Generated clips. Empty when absent from the input.
    #[serde(default)]
    pub clips: Vec<MusicAdvancedClip>,
    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
613
/// Music finetune info.
///
/// `finetune_id` and `name` are required when deserializing; the other
/// fields default when absent. A superset of the fields in `FinetuneInfo`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MusicFinetuneInfo {
    /// Identifier of the finetune.
    pub finetune_id: String,
    /// Name of the finetune.
    pub name: String,
    /// Optional description. `None` when absent from the input.
    #[serde(default)]
    pub description: Option<String>,
    /// Status string of the finetune. Empty when absent from the input.
    #[serde(default)]
    pub status: String,
    /// Optional model identifier. `None` when absent from the input.
    #[serde(default)]
    pub model_id: Option<String>,
    /// Creation timestamp kept as a string (format not visible here). `None` when absent.
    #[serde(default)]
    pub created_at: Option<String>,
}
628
/// Response from listing music finetunes.
///
/// `finetunes` is required when deserializing.
// NOTE(review): `Client::list_finetunes` in this file returns
// `ListFinetunesResponse`, not this type — confirm intended usage.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicFinetuneListResponse {
    /// The listed finetunes.
    pub finetunes: Vec<MusicFinetuneInfo>,
}
634
/// Request to create a music finetune.
///
/// Does not derive `Default`, so every field must be given explicitly.
// NOTE(review): `Client::create_finetune` in this file builds a multipart
// form instead of serializing this type — confirm intended usage.
#[derive(Debug, Clone, Serialize)]
pub struct MusicFinetuneCreateRequest {
    /// Name for the new finetune.
    pub name: String,
    /// Optional description. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Training samples as strings (presumably encoded audio — TODO confirm the expected encoding).
    pub samples: Vec<String>,
}
643
/// Request body for voice design (generating a voice from a description).
///
/// The Rust field names differ from the serialized keys: `description` →
/// `voice_description`, `text` → `sample_text`, `output_format` → `format`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct VoiceDesignRequest {
    /// Text description of the desired voice. Serialized as `voice_description`.
    #[serde(rename = "voice_description")]
    pub description: String,

    /// Sample text to speak with the designed voice. Serialized as `sample_text`.
    #[serde(rename = "sample_text")]
    pub text: String,

    /// Output audio format. Serialized as `format`; not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,
}
659
/// Request body for Starfish TTS.
///
/// Only `text` is always serialized; every `Option` field is left out of the
/// serialized body while it is `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct StarfishTTSRequest {
    /// Text to synthesise.
    pub text: String,

    /// HeyGen voice identifier. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice_id: Option<String>,

    /// Voice name (alternative to voice_id). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,

    /// Output audio format. Serialized under the key `format`; not serialized when `None`.
    #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
    pub output_format: Option<String>,

    /// Input type (e.g. "text", "ssml"). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_type: Option<String>,

    /// Speech speed multiplier. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f64>,

    /// BCP-47 language code. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Locale code. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub locale: Option<String>,
}
694
695// ---------------------------------------------------------------------------
696// Eleven Music (advanced music generation with sections, finetunes, etc.)
697// ---------------------------------------------------------------------------
698
/// A section within an Eleven Music generation request
/// (see `ElevenMusicRequest::sections`).
///
/// `section_type` is always serialized; the `Option` fields only when set.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct MusicSection {
    /// Kind of section (accepted values are not visible here — TODO confirm).
    pub section_type: String,
    /// Lyrics for this section. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lyrics: Option<String>,
    /// Style hint for this section. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style: Option<String>,
    /// Style exclusion for this section. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style_exclude: Option<String>,
}
710
/// Request body for advanced music generation (ElevenLabs Eleven Music).
///
/// `model` and `prompt` are always serialized; every `Option` field is left
/// out of the serialized body while it is `None`. This is the request type
/// taken by `Client::generate_music_advanced`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct ElevenMusicRequest {
    /// Music generation model.
    pub model: String,
    /// Describes the music to generate.
    pub prompt: String,
    /// Per-section instructions. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sections: Option<Vec<MusicSection>>,
    /// Target duration in seconds. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration_seconds: Option<i32>,
    /// Language code. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Vocals flag (presumably toggles vocals on/off — TODO confirm). Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vocals: Option<bool>,
    /// Style hint for the whole track. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style: Option<String>,
    /// Style exclusion for the whole track. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style_exclude: Option<String>,
    /// Identifier of a finetune to generate with. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub finetune_id: Option<String>,
    /// Reference identifier for an edit (semantics not visible here — TODO confirm).
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_reference_id: Option<String>,
    /// Instruction text for an edit. Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub edit_instruction: Option<String>,
}
735
/// A single music clip from advanced generation, as carried in
/// `ElevenMusicResponse::clips`. Every field defaults when absent.
#[derive(Debug, Clone, Deserialize)]
pub struct ElevenMusicClip {
    /// Base64-encoded audio data. Empty when absent from the input.
    #[serde(default)]
    pub base64: String,
    /// Audio format (e.g. "mp3"). Empty when absent from the input.
    #[serde(default)]
    pub format: String,
    /// File size in bytes. 0 when absent from the input.
    #[serde(default)]
    pub size: i64,
}
749
/// Response from advanced music generation.
/// Backend returns: { clips: [...], model, cost_ticks, request_id }
///
/// Every field defaults when absent from the input. Returned by
/// `Client::generate_music_advanced`.
#[derive(Debug, Clone, Deserialize)]
pub struct ElevenMusicResponse {
    /// Generated music clips. Empty when absent from the input.
    #[serde(default)]
    pub clips: Vec<ElevenMusicClip>,
    /// Model used. Empty when absent from the input.
    #[serde(default)]
    pub model: String,
    /// Total cost in ticks. 0 when absent from the input.
    #[serde(default)]
    pub cost_ticks: i64,
    /// Unique request identifier. Empty when absent from the input.
    #[serde(default)]
    pub request_id: String,
}
767
/// Info about a music finetune.
///
/// `finetune_id` and `name` are required when deserializing; `status` and
/// `created_at` default when absent. Returned by `Client::create_finetune`
/// and listed in `ListFinetunesResponse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FinetuneInfo {
    /// Identifier of the finetune.
    pub finetune_id: String,
    /// Name of the finetune.
    pub name: String,
    /// Status string of the finetune. Empty when absent from the input.
    #[serde(default)]
    pub status: String,
    /// Creation timestamp kept as a string (format not visible here). `None` when absent.
    #[serde(default)]
    pub created_at: Option<String>,
}
778
/// Response from listing finetunes, as returned by `Client::list_finetunes`.
///
/// `finetunes` is required when deserializing.
#[derive(Debug, Clone, Deserialize)]
pub struct ListFinetunesResponse {
    /// The listed finetunes.
    pub finetunes: Vec<FinetuneInfo>,
}
784
785// ---------------------------------------------------------------------------
786// Client impl
787// ---------------------------------------------------------------------------
788
789impl Client {
790    /// Generates speech from text.
791    pub async fn speak(&self, req: &TextToSpeechRequest) -> Result<TextToSpeechResponse> {
792        let (mut resp, meta) = self
793            .post_json::<TextToSpeechRequest, TextToSpeechResponse>("/qai/v1/audio/tts", req)
794            .await?;
795        if resp.cost_ticks == 0 {
796            resp.cost_ticks = meta.cost_ticks;
797        }
798        if resp.request_id.is_empty() {
799            resp.request_id = meta.request_id;
800        }
801        Ok(resp)
802    }
803
804    /// Converts speech to text.
805    pub async fn transcribe(&self, req: &SpeechToTextRequest) -> Result<SpeechToTextResponse> {
806        let (mut resp, meta) = self
807            .post_json::<SpeechToTextRequest, SpeechToTextResponse>("/qai/v1/audio/stt", req)
808            .await?;
809        if resp.cost_ticks == 0 {
810            resp.cost_ticks = meta.cost_ticks;
811        }
812        if resp.request_id.is_empty() {
813            resp.request_id = meta.request_id;
814        }
815        Ok(resp)
816    }
817
818    /// Generates sound effects from a text prompt (ElevenLabs).
819    pub async fn sound_effects(&self, req: &SoundEffectRequest) -> Result<SoundEffectResponse> {
820        let (mut resp, meta) = self
821            .post_json::<SoundEffectRequest, SoundEffectResponse>(
822                "/qai/v1/audio/sound-effects",
823                req,
824            )
825            .await?;
826        if resp.cost_ticks == 0 {
827            resp.cost_ticks = meta.cost_ticks;
828        }
829        if resp.request_id.is_empty() {
830            resp.request_id = meta.request_id;
831        }
832        Ok(resp)
833    }
834
835    /// Generates music from a text prompt.
836    pub async fn generate_music(&self, req: &MusicRequest) -> Result<MusicResponse> {
837        let (mut resp, meta) = self
838            .post_json::<MusicRequest, MusicResponse>("/qai/v1/audio/music", req)
839            .await?;
840        if resp.cost_ticks == 0 {
841            resp.cost_ticks = meta.cost_ticks;
842        }
843        if resp.request_id.is_empty() {
844            resp.request_id = meta.request_id;
845        }
846        Ok(resp)
847    }
848
849    /// Generates multi-speaker dialogue audio.
850    pub async fn dialogue(&self, req: &DialogueRequest) -> Result<AudioResponse> {
851        let (mut resp, meta) = self
852            .post_json::<DialogueRequest, AudioResponse>("/qai/v1/audio/dialogue", req)
853            .await?;
854        if resp.cost_ticks == 0 {
855            resp.cost_ticks = meta.cost_ticks;
856        }
857        if resp.request_id.is_empty() {
858            resp.request_id = meta.request_id;
859        }
860        Ok(resp)
861    }
862
863    /// Converts speech to a different voice.
864    pub async fn speech_to_speech(
865        &self,
866        req: &SpeechToSpeechRequest,
867    ) -> Result<AudioResponse> {
868        let (mut resp, meta) = self
869            .post_json::<SpeechToSpeechRequest, AudioResponse>(
870                "/qai/v1/audio/speech-to-speech",
871                req,
872            )
873            .await?;
874        if resp.cost_ticks == 0 {
875            resp.cost_ticks = meta.cost_ticks;
876        }
877        if resp.request_id.is_empty() {
878            resp.request_id = meta.request_id;
879        }
880        Ok(resp)
881    }
882
883    /// Isolates voice from background noise and music.
884    pub async fn isolate_voice(&self, req: &IsolateVoiceRequest) -> Result<AudioResponse> {
885        let (mut resp, meta) = self
886            .post_json::<IsolateVoiceRequest, AudioResponse>("/qai/v1/audio/isolate", req)
887            .await?;
888        if resp.cost_ticks == 0 {
889            resp.cost_ticks = meta.cost_ticks;
890        }
891        if resp.request_id.is_empty() {
892            resp.request_id = meta.request_id;
893        }
894        Ok(resp)
895    }
896
897    /// Remixes audio with a different voice.
898    pub async fn remix_voice(&self, req: &RemixVoiceRequest) -> Result<AudioResponse> {
899        let (mut resp, meta) = self
900            .post_json::<RemixVoiceRequest, AudioResponse>("/qai/v1/audio/remix", req)
901            .await?;
902        if resp.cost_ticks == 0 {
903            resp.cost_ticks = meta.cost_ticks;
904        }
905        if resp.request_id.is_empty() {
906            resp.request_id = meta.request_id;
907        }
908        Ok(resp)
909    }
910
911    /// Dubs audio or video into a target language.
912    pub async fn dub(&self, req: &DubRequest) -> Result<AudioResponse> {
913        let (mut resp, meta) = self
914            .post_json::<DubRequest, AudioResponse>("/qai/v1/audio/dub", req)
915            .await?;
916        if resp.cost_ticks == 0 {
917            resp.cost_ticks = meta.cost_ticks;
918        }
919        if resp.request_id.is_empty() {
920            resp.request_id = meta.request_id;
921        }
922        Ok(resp)
923    }
924
925    /// Performs forced alignment of text against audio.
926    pub async fn align(&self, req: &AlignRequest) -> Result<AlignResponse> {
927        let (mut resp, meta) = self
928            .post_json::<AlignRequest, AlignResponse>("/qai/v1/audio/align", req)
929            .await?;
930        if resp.cost_ticks == 0 {
931            resp.cost_ticks = meta.cost_ticks;
932        }
933        if resp.request_id.is_empty() {
934            resp.request_id = meta.request_id;
935        }
936        Ok(resp)
937    }
938
939    /// Designs a new voice from a text description and generates sample audio.
940    pub async fn voice_design(&self, req: &VoiceDesignRequest) -> Result<AudioResponse> {
941        let (mut resp, meta) = self
942            .post_json::<VoiceDesignRequest, AudioResponse>("/qai/v1/audio/voice-design", req)
943            .await?;
944        if resp.cost_ticks == 0 {
945            resp.cost_ticks = meta.cost_ticks;
946        }
947        if resp.request_id.is_empty() {
948            resp.request_id = meta.request_id;
949        }
950        Ok(resp)
951    }
952
953    /// Generates speech using Starfish TTS (HeyGen).
954    pub async fn starfish_tts(&self, req: &StarfishTTSRequest) -> Result<AudioResponse> {
955        let (mut resp, meta) = self
956            .post_json::<StarfishTTSRequest, AudioResponse>("/qai/v1/audio/starfish-tts", req)
957            .await?;
958        if resp.cost_ticks == 0 {
959            resp.cost_ticks = meta.cost_ticks;
960        }
961        if resp.request_id.is_empty() {
962            resp.request_id = meta.request_id;
963        }
964        Ok(resp)
965    }
966
967    /// Generates music via ElevenLabs Eleven Music (advanced: sections, finetunes, edits).
968    pub async fn generate_music_advanced(
969        &self,
970        req: &ElevenMusicRequest,
971    ) -> Result<ElevenMusicResponse> {
972        let (mut resp, meta) = self
973            .post_json::<ElevenMusicRequest, ElevenMusicResponse>(
974                "/qai/v1/audio/music/advanced",
975                req,
976            )
977            .await?;
978        if resp.cost_ticks == 0 {
979            resp.cost_ticks = meta.cost_ticks;
980        }
981        if resp.request_id.is_empty() {
982            resp.request_id = meta.request_id;
983        }
984        Ok(resp)
985    }
986
987    /// Lists all music finetunes for the authenticated user.
988    pub async fn list_finetunes(&self) -> Result<ListFinetunesResponse> {
989        let (resp, _) = self
990            .get_json::<ListFinetunesResponse>("/qai/v1/audio/finetunes")
991            .await?;
992        Ok(resp)
993    }
994
995    /// Creates a new music finetune from audio sample files.
996    pub async fn create_finetune(
997        &self,
998        name: &str,
999        files: Vec<crate::voices::CloneVoiceFile>,
1000    ) -> Result<FinetuneInfo> {
1001        let mut form = reqwest::multipart::Form::new().text("name", name.to_string());
1002
1003        for file in files {
1004            let part = reqwest::multipart::Part::bytes(file.data)
1005                .file_name(file.filename)
1006                .mime_str(&file.mime_type)
1007                .map_err(|e| crate::error::Error::Http(e.into()))?;
1008            form = form.part("files", part);
1009        }
1010
1011        let (resp, _) = self
1012            .post_multipart::<FinetuneInfo>("/qai/v1/audio/finetunes", form)
1013            .await?;
1014        Ok(resp)
1015    }
1016
1017    /// Deletes a music finetune by ID.
1018    pub async fn delete_finetune(&self, id: &str) -> Result<serde_json::Value> {
1019        let path = format!("/qai/v1/audio/finetunes/{id}");
1020        let (resp, _) = self.delete_json::<serde_json::Value>(&path).await?;
1021        Ok(resp)
1022    }
1023}
quantum_sdk/audio.rs

quantum_sdk/
audio.rs