1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::client::Client;
6use crate::error::Result;
7
8#[derive(Debug, Clone, Serialize, Default)]
10pub struct TtsRequest {
11 pub model: String,
13
14 pub text: String,
16
17 #[serde(skip_serializing_if = "Option::is_none")]
19 pub voice: Option<String>,
20
21 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
23 pub output_format: Option<String>,
24
25 #[serde(skip_serializing_if = "Option::is_none")]
27 pub speed: Option<f64>,
28}
29
30#[derive(Debug, Clone, Deserialize)]
32pub struct TtsResponse {
33 pub audio_base64: String,
35
36 pub format: String,
38
39 pub size_bytes: i64,
41
42 pub model: String,
44
45 #[serde(default)]
47 pub cost_ticks: i64,
48
49 #[serde(default)]
51 pub request_id: String,
52}
53
54#[derive(Debug, Clone, Serialize, Default)]
56pub struct SttRequest {
57 pub model: String,
59
60 pub audio_base64: String,
62
63 #[serde(skip_serializing_if = "Option::is_none")]
65 pub filename: Option<String>,
66
67 #[serde(skip_serializing_if = "Option::is_none")]
69 pub language: Option<String>,
70}
71
72#[derive(Debug, Clone, Deserialize)]
74pub struct SttResponse {
75 pub text: String,
77
78 pub model: String,
80
81 #[serde(default)]
83 pub cost_ticks: i64,
84
85 #[serde(default)]
87 pub request_id: String,
88}
89
90#[derive(Debug, Clone, Serialize, Default)]
92pub struct MusicRequest {
93 pub model: String,
95
96 pub prompt: String,
98
99 #[serde(skip_serializing_if = "Option::is_none")]
101 pub duration_seconds: Option<i32>,
102}
103
104#[derive(Debug, Clone, Deserialize)]
106pub struct MusicResponse {
107 #[serde(default)]
109 pub audio_clips: Vec<MusicClip>,
110
111 #[serde(default)]
113 pub model: String,
114
115 #[serde(default)]
117 pub cost_ticks: i64,
118
119 #[serde(default)]
121 pub request_id: String,
122}
123
124#[derive(Debug, Clone, Deserialize)]
126pub struct MusicClip {
127 pub base64: String,
129
130 #[serde(default)]
132 pub format: String,
133
134 #[serde(default)]
136 pub size_bytes: i64,
137
138 #[serde(default)]
140 pub index: i32,
141}
142
143#[derive(Debug, Clone, Serialize, Default)]
145pub struct SoundEffectRequest {
146 pub prompt: String,
148
149 #[serde(skip_serializing_if = "Option::is_none")]
151 pub duration_seconds: Option<f64>,
152}
153
154#[derive(Debug, Clone, Deserialize)]
156pub struct SoundEffectResponse {
157 pub audio_base64: String,
159
160 pub format: String,
162
163 #[serde(default)]
165 pub size_bytes: i64,
166
167 #[serde(default)]
169 pub model: String,
170
171 #[serde(default)]
173 pub cost_ticks: i64,
174
175 #[serde(default)]
177 pub request_id: String,
178}
179
180#[derive(Debug, Clone, Deserialize)]
186pub struct AudioResponse {
187 #[serde(default)]
189 pub audio_base64: Option<String>,
190
191 #[serde(default)]
193 pub format: Option<String>,
194
195 #[serde(default)]
197 pub size_bytes: Option<i64>,
198
199 #[serde(default)]
201 pub model: Option<String>,
202
203 #[serde(default)]
205 pub cost_ticks: i64,
206
207 #[serde(default)]
209 pub request_id: String,
210
211 #[serde(flatten)]
213 pub extra: HashMap<String, serde_json::Value>,
214}
215
216#[derive(Debug, Clone, Serialize, Deserialize, Default)]
218pub struct DialogueTurn {
219 pub speaker: String,
221
222 pub text: String,
224
225 #[serde(skip_serializing_if = "Option::is_none")]
227 pub voice: Option<String>,
228}
229
230#[derive(Debug, Clone, Serialize)]
232pub struct DialogueVoice {
233 pub voice_id: String,
234 pub name: String,
235}
236
237#[derive(Debug, Clone, Serialize, Default)]
240pub struct DialogueRequest {
241 pub text: String,
243
244 pub voices: Vec<DialogueVoice>,
246
247 #[serde(skip_serializing_if = "Option::is_none")]
249 pub model: Option<String>,
250
251 #[serde(rename = "output_format", skip_serializing_if = "Option::is_none")]
253 pub output_format: Option<String>,
254
255 #[serde(skip_serializing_if = "Option::is_none")]
257 pub seed: Option<i32>,
258}
259
260impl DialogueRequest {
261 pub fn from_turns(turns: Vec<DialogueTurn>, model: Option<String>) -> Self {
264 let text = turns.iter()
266 .map(|t| format!("{}: {}", t.speaker, t.text))
267 .collect::<Vec<_>>()
268 .join("\n");
269
270 let mut seen = std::collections::HashSet::new();
272 let voices: Vec<DialogueVoice> = turns.iter()
273 .filter(|t| t.voice.is_some() && seen.insert(t.speaker.clone()))
274 .map(|t| DialogueVoice {
275 voice_id: t.voice.clone().unwrap_or_default(),
276 name: t.speaker.clone(),
277 })
278 .collect();
279
280 Self {
281 text,
282 voices,
283 model,
284 ..Default::default()
285 }
286 }
287}
288
289#[derive(Debug, Clone, Serialize, Default)]
291pub struct SpeechToSpeechRequest {
292 #[serde(skip_serializing_if = "Option::is_none")]
294 pub model: Option<String>,
295
296 pub audio_base64: String,
298
299 #[serde(skip_serializing_if = "Option::is_none")]
301 pub voice: Option<String>,
302
303 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
305 pub output_format: Option<String>,
306}
307
308#[derive(Debug, Clone, Serialize, Default)]
310pub struct IsolateRequest {
311 pub audio_base64: String,
313
314 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
316 pub output_format: Option<String>,
317}
318
319#[derive(Debug, Clone, Serialize, Default)]
321pub struct RemixRequest {
322 pub audio_base64: String,
324
325 #[serde(skip_serializing_if = "Option::is_none")]
327 pub voice: Option<String>,
328
329 #[serde(skip_serializing_if = "Option::is_none")]
331 pub model: Option<String>,
332
333 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
335 pub output_format: Option<String>,
336}
337
338#[derive(Debug, Clone, Serialize, Default)]
340pub struct DubRequest {
341 pub audio_base64: String,
343
344 #[serde(skip_serializing_if = "Option::is_none")]
346 pub filename: Option<String>,
347
348 pub target_language: String,
350
351 #[serde(skip_serializing_if = "Option::is_none")]
353 pub source_language: Option<String>,
354}
355
356#[derive(Debug, Clone, Serialize, Default)]
358pub struct AlignRequest {
359 pub audio_base64: String,
361
362 pub text: String,
364
365 #[serde(skip_serializing_if = "Option::is_none")]
367 pub language: Option<String>,
368}
369
370#[derive(Debug, Clone, Deserialize)]
372pub struct AlignmentSegment {
373 pub text: String,
375
376 pub start: f64,
378
379 pub end: f64,
381}
382
383#[derive(Debug, Clone, Deserialize)]
385pub struct AlignResponse {
386 pub segments: Vec<AlignmentSegment>,
388
389 #[serde(default)]
391 pub cost_ticks: i64,
392
393 #[serde(default)]
395 pub request_id: String,
396}
397
398#[derive(Debug, Clone, Serialize, Default)]
400pub struct VoiceDesignRequest {
401 #[serde(rename = "voice_description")]
403 pub description: String,
404
405 #[serde(rename = "sample_text")]
407 pub text: String,
408
409 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
411 pub output_format: Option<String>,
412}
413
414#[derive(Debug, Clone, Serialize, Default)]
416pub struct StarfishTTSRequest {
417 pub text: String,
419
420 #[serde(skip_serializing_if = "Option::is_none")]
422 pub voice: Option<String>,
423
424 #[serde(rename = "format", skip_serializing_if = "Option::is_none")]
426 pub output_format: Option<String>,
427
428 #[serde(skip_serializing_if = "Option::is_none")]
430 pub speed: Option<f64>,
431}
432
433#[derive(Debug, Clone, Serialize, Deserialize, Default)]
439pub struct MusicSection {
440 pub section_type: String,
441 #[serde(skip_serializing_if = "Option::is_none")]
442 pub lyrics: Option<String>,
443 #[serde(skip_serializing_if = "Option::is_none")]
444 pub style: Option<String>,
445 #[serde(skip_serializing_if = "Option::is_none")]
446 pub style_exclude: Option<String>,
447}
448
449#[derive(Debug, Clone, Serialize, Default)]
451pub struct ElevenMusicRequest {
452 pub model: String,
453 pub prompt: String,
454 #[serde(skip_serializing_if = "Option::is_none")]
455 pub sections: Option<Vec<MusicSection>>,
456 #[serde(skip_serializing_if = "Option::is_none")]
457 pub duration_seconds: Option<i32>,
458 #[serde(skip_serializing_if = "Option::is_none")]
459 pub language: Option<String>,
460 #[serde(skip_serializing_if = "Option::is_none")]
461 pub vocals: Option<bool>,
462 #[serde(skip_serializing_if = "Option::is_none")]
463 pub style: Option<String>,
464 #[serde(skip_serializing_if = "Option::is_none")]
465 pub style_exclude: Option<String>,
466 #[serde(skip_serializing_if = "Option::is_none")]
467 pub finetune_id: Option<String>,
468 #[serde(skip_serializing_if = "Option::is_none")]
469 pub edit_reference_id: Option<String>,
470 #[serde(skip_serializing_if = "Option::is_none")]
471 pub edit_instruction: Option<String>,
472}
473
474#[derive(Debug, Clone, Deserialize)]
476pub struct ElevenMusicClip {
477 #[serde(default)]
479 pub base64: String,
480 #[serde(default)]
482 pub format: String,
483 #[serde(default)]
485 pub size: i64,
486}
487
488#[derive(Debug, Clone, Deserialize)]
491pub struct ElevenMusicResponse {
492 #[serde(default)]
494 pub clips: Vec<ElevenMusicClip>,
495 #[serde(default)]
497 pub model: String,
498 #[serde(default)]
500 pub cost_ticks: i64,
501 #[serde(default)]
503 pub request_id: String,
504}
505
506#[derive(Debug, Clone, Serialize, Deserialize)]
508pub struct FinetuneInfo {
509 pub finetune_id: String,
510 pub name: String,
511 #[serde(default)]
512 pub status: String,
513 #[serde(default)]
514 pub created_at: Option<String>,
515}
516
517#[derive(Debug, Clone, Deserialize)]
519pub struct ListFinetunesResponse {
520 pub finetunes: Vec<FinetuneInfo>,
521}
522
523impl Client {
528 pub async fn speak(&self, req: &TtsRequest) -> Result<TtsResponse> {
530 let (mut resp, meta) = self
531 .post_json::<TtsRequest, TtsResponse>("/qai/v1/audio/tts", req)
532 .await?;
533 if resp.cost_ticks == 0 {
534 resp.cost_ticks = meta.cost_ticks;
535 }
536 if resp.request_id.is_empty() {
537 resp.request_id = meta.request_id;
538 }
539 Ok(resp)
540 }
541
542 pub async fn transcribe(&self, req: &SttRequest) -> Result<SttResponse> {
544 let (mut resp, meta) = self
545 .post_json::<SttRequest, SttResponse>("/qai/v1/audio/stt", req)
546 .await?;
547 if resp.cost_ticks == 0 {
548 resp.cost_ticks = meta.cost_ticks;
549 }
550 if resp.request_id.is_empty() {
551 resp.request_id = meta.request_id;
552 }
553 Ok(resp)
554 }
555
556 pub async fn sound_effects(&self, req: &SoundEffectRequest) -> Result<SoundEffectResponse> {
558 let (mut resp, meta) = self
559 .post_json::<SoundEffectRequest, SoundEffectResponse>(
560 "/qai/v1/audio/sound-effects",
561 req,
562 )
563 .await?;
564 if resp.cost_ticks == 0 {
565 resp.cost_ticks = meta.cost_ticks;
566 }
567 if resp.request_id.is_empty() {
568 resp.request_id = meta.request_id;
569 }
570 Ok(resp)
571 }
572
573 pub async fn generate_music(&self, req: &MusicRequest) -> Result<MusicResponse> {
575 let (mut resp, meta) = self
576 .post_json::<MusicRequest, MusicResponse>("/qai/v1/audio/music", req)
577 .await?;
578 if resp.cost_ticks == 0 {
579 resp.cost_ticks = meta.cost_ticks;
580 }
581 if resp.request_id.is_empty() {
582 resp.request_id = meta.request_id;
583 }
584 Ok(resp)
585 }
586
587 pub async fn dialogue(&self, req: &DialogueRequest) -> Result<AudioResponse> {
589 let (mut resp, meta) = self
590 .post_json::<DialogueRequest, AudioResponse>("/qai/v1/audio/dialogue", req)
591 .await?;
592 if resp.cost_ticks == 0 {
593 resp.cost_ticks = meta.cost_ticks;
594 }
595 if resp.request_id.is_empty() {
596 resp.request_id = meta.request_id;
597 }
598 Ok(resp)
599 }
600
601 pub async fn speech_to_speech(
603 &self,
604 req: &SpeechToSpeechRequest,
605 ) -> Result<AudioResponse> {
606 let (mut resp, meta) = self
607 .post_json::<SpeechToSpeechRequest, AudioResponse>(
608 "/qai/v1/audio/speech-to-speech",
609 req,
610 )
611 .await?;
612 if resp.cost_ticks == 0 {
613 resp.cost_ticks = meta.cost_ticks;
614 }
615 if resp.request_id.is_empty() {
616 resp.request_id = meta.request_id;
617 }
618 Ok(resp)
619 }
620
621 pub async fn isolate_voice(&self, req: &IsolateRequest) -> Result<AudioResponse> {
623 let (mut resp, meta) = self
624 .post_json::<IsolateRequest, AudioResponse>("/qai/v1/audio/isolate", req)
625 .await?;
626 if resp.cost_ticks == 0 {
627 resp.cost_ticks = meta.cost_ticks;
628 }
629 if resp.request_id.is_empty() {
630 resp.request_id = meta.request_id;
631 }
632 Ok(resp)
633 }
634
635 pub async fn remix_voice(&self, req: &RemixRequest) -> Result<AudioResponse> {
637 let (mut resp, meta) = self
638 .post_json::<RemixRequest, AudioResponse>("/qai/v1/audio/remix", req)
639 .await?;
640 if resp.cost_ticks == 0 {
641 resp.cost_ticks = meta.cost_ticks;
642 }
643 if resp.request_id.is_empty() {
644 resp.request_id = meta.request_id;
645 }
646 Ok(resp)
647 }
648
649 pub async fn dub(&self, req: &DubRequest) -> Result<AudioResponse> {
651 let (mut resp, meta) = self
652 .post_json::<DubRequest, AudioResponse>("/qai/v1/audio/dub", req)
653 .await?;
654 if resp.cost_ticks == 0 {
655 resp.cost_ticks = meta.cost_ticks;
656 }
657 if resp.request_id.is_empty() {
658 resp.request_id = meta.request_id;
659 }
660 Ok(resp)
661 }
662
663 pub async fn align(&self, req: &AlignRequest) -> Result<AlignResponse> {
665 let (mut resp, meta) = self
666 .post_json::<AlignRequest, AlignResponse>("/qai/v1/audio/align", req)
667 .await?;
668 if resp.cost_ticks == 0 {
669 resp.cost_ticks = meta.cost_ticks;
670 }
671 if resp.request_id.is_empty() {
672 resp.request_id = meta.request_id;
673 }
674 Ok(resp)
675 }
676
677 pub async fn voice_design(&self, req: &VoiceDesignRequest) -> Result<AudioResponse> {
679 let (mut resp, meta) = self
680 .post_json::<VoiceDesignRequest, AudioResponse>("/qai/v1/audio/voice-design", req)
681 .await?;
682 if resp.cost_ticks == 0 {
683 resp.cost_ticks = meta.cost_ticks;
684 }
685 if resp.request_id.is_empty() {
686 resp.request_id = meta.request_id;
687 }
688 Ok(resp)
689 }
690
691 pub async fn starfish_tts(&self, req: &StarfishTTSRequest) -> Result<AudioResponse> {
693 let (mut resp, meta) = self
694 .post_json::<StarfishTTSRequest, AudioResponse>("/qai/v1/audio/starfish-tts", req)
695 .await?;
696 if resp.cost_ticks == 0 {
697 resp.cost_ticks = meta.cost_ticks;
698 }
699 if resp.request_id.is_empty() {
700 resp.request_id = meta.request_id;
701 }
702 Ok(resp)
703 }
704
705 pub async fn generate_music_advanced(
707 &self,
708 req: &ElevenMusicRequest,
709 ) -> Result<ElevenMusicResponse> {
710 let (mut resp, meta) = self
711 .post_json::<ElevenMusicRequest, ElevenMusicResponse>(
712 "/qai/v1/audio/music/advanced",
713 req,
714 )
715 .await?;
716 if resp.cost_ticks == 0 {
717 resp.cost_ticks = meta.cost_ticks;
718 }
719 if resp.request_id.is_empty() {
720 resp.request_id = meta.request_id;
721 }
722 Ok(resp)
723 }
724
725 pub async fn list_finetunes(&self) -> Result<ListFinetunesResponse> {
727 let (resp, _) = self
728 .get_json::<ListFinetunesResponse>("/qai/v1/audio/finetunes")
729 .await?;
730 Ok(resp)
731 }
732
733 pub async fn create_finetune(
735 &self,
736 name: &str,
737 files: Vec<crate::voices::CloneVoiceFile>,
738 ) -> Result<FinetuneInfo> {
739 let mut form = reqwest::multipart::Form::new().text("name", name.to_string());
740
741 for file in files {
742 let part = reqwest::multipart::Part::bytes(file.data)
743 .file_name(file.filename)
744 .mime_str(&file.mime_type)
745 .map_err(|e| crate::error::Error::Http(e.into()))?;
746 form = form.part("files", part);
747 }
748
749 let (resp, _) = self
750 .post_multipart::<FinetuneInfo>("/qai/v1/audio/finetunes", form)
751 .await?;
752 Ok(resp)
753 }
754
755 pub async fn delete_finetune(&self, id: &str) -> Result<serde_json::Value> {
757 let path = format!("/qai/v1/audio/finetunes/{id}");
758 let (resp, _) = self.delete_json::<serde_json::Value>(&path).await?;
759 Ok(resp)
760 }
761}