use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use crate::client::Client;
use crate::error::Result;
/// Request body for the text-to-speech endpoint; sent by `Client::speak`.
///
/// `Option` fields are left out of the serialized payload when they are `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct TextToSpeechRequest {
/// Model name; always serialized.
pub model: String,
/// Input text; always serialized.
pub text: String,
/// Voice selection; omitted from the payload when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
/// Requested output format. Serialized under the key `format`, not `output_format`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
/// Speed setting; omitted when `None`.
/// NOTE(review): accepted range and meaning of 1.0 are not visible here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub speed: Option<f64>,
}
/// Short alias for `TextToSpeechRequest`.
pub type TtsRequest = TextToSpeechRequest;
/// Response body of the text-to-speech endpoint; returned by `Client::speak`.
///
/// `audio_base64`, `format`, `size_bytes` and `model` have no `#[serde(default)]`,
/// so deserialization fails when any of them is missing from the payload.
#[derive(Debug, Clone, Deserialize)]
pub struct TextToSpeechResponse {
/// Audio payload as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; required.
/// NOTE(review): presumably the decoded audio size in bytes — confirm.
pub size_bytes: i64,
/// Model name reported by the server; required.
pub model: String,
/// Cost of the request; 0 when absent. `Client::speak` replaces a 0 here
/// with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Defaults to 0 when absent.
/// NOTE(review): presumably the account balance after this charge — confirm.
#[serde(default)]
pub balance_after: i64,
/// Request identifier; empty when absent. `Client::speak` replaces an empty
/// value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// Short alias for `TextToSpeechResponse`.
pub type TtsResponse = TextToSpeechResponse;
/// Request body for the speech-to-text endpoint; sent by `Client::transcribe`.
///
/// `Option` fields are left out of the serialized payload when they are `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SpeechToTextRequest {
/// Model name; always serialized.
pub model: String,
/// Input audio as a base64 string (per the field name); always serialized.
pub audio_base64: String,
/// File name accompanying the audio; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub filename: Option<String>,
/// Language of the audio; omitted when `None`.
/// NOTE(review): expected code format (e.g. ISO 639-1) is not visible here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
}
/// Short alias for `SpeechToTextRequest`.
pub type SttRequest = SpeechToTextRequest;
/// Response body of the speech-to-text endpoint; returned by `Client::transcribe`.
///
/// `text` and `model` are required; the remaining fields fall back to their
/// type defaults when absent from the payload.
#[derive(Debug, Clone, Deserialize)]
pub struct SpeechToTextResponse {
/// Transcribed text; required.
pub text: String,
/// Model name reported by the server; required.
pub model: String,
/// Cost of the request; 0 when absent. `Client::transcribe` replaces a 0 here
/// with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Defaults to 0 when absent.
/// NOTE(review): presumably the account balance after this charge — confirm.
#[serde(default)]
pub balance_after: i64,
/// Request identifier; empty when absent. `Client::transcribe` replaces an
/// empty value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// Short alias for `SpeechToTextResponse`.
pub type SttResponse = SpeechToTextResponse;
/// Request body for the music endpoint; sent by `Client::generate_music`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct MusicRequest {
/// Model name; always serialized.
pub model: String,
/// Text prompt; always serialized.
pub prompt: String,
/// Requested duration in seconds (per the field name); omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub duration_seconds: Option<i32>,
}
/// Response body of the music endpoint; returned by `Client::generate_music`.
///
/// Every field is `#[serde(default)]`, so an empty JSON object deserializes
/// successfully into an all-default value.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicResponse {
/// Generated clips; empty when the key is absent.
#[serde(default)]
pub audio_clips: Vec<MusicClip>,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent. `Client::generate_music` replaces a 0
/// here with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Defaults to 0 when absent.
/// NOTE(review): presumably the account balance after this charge — confirm.
#[serde(default)]
pub balance_after: i64,
/// Request identifier; empty when absent. `Client::generate_music` replaces an
/// empty value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// One entry of `MusicResponse::audio_clips`.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicClip {
/// Clip payload as a base64 string (per the field name); required.
pub base64: String,
/// Format label of the clip; empty when absent.
#[serde(default)]
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Index reported by the server; 0 when absent.
/// NOTE(review): presumably the clip's position within the batch — confirm.
#[serde(default)]
pub index: i32,
}
/// Request body for the sound-effects endpoint; sent by `Client::sound_effects`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SoundEffectRequest {
/// Text prompt; always serialized.
pub prompt: String,
/// Requested duration in seconds (per the field name); omitted when `None`.
/// Note this is fractional (`f64`), unlike `MusicRequest::duration_seconds`.
#[serde(skip_serializing_if = "Option::is_none")]
pub duration_seconds: Option<f64>,
}
/// Response body of the sound-effects endpoint; returned by `Client::sound_effects`.
///
/// `audio_base64` and `format` are required; the other fields fall back to
/// their type defaults when absent.
#[derive(Debug, Clone, Deserialize)]
pub struct SoundEffectResponse {
/// Audio payload as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent. `Client::sound_effects` replaces a 0
/// here with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent. `Client::sound_effects` replaces an
/// empty value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// Loosely typed audio response shared by several `Client` methods in this file
/// (`dialogue`, `speech_to_speech`, `isolate_voice`, `remix_voice`, `dub`,
/// `voice_design`, `starfish_tts`).
///
/// No field is required: the typed fields are optional or defaulted, and every
/// payload key that does not match a typed field is kept in `extra`.
#[derive(Debug, Clone, Deserialize)]
pub struct AudioResponse {
/// Audio payload as a base64 string (per the field name); `None` when absent.
#[serde(default)]
pub audio_base64: Option<String>,
/// Format label of the returned audio; `None` when absent.
#[serde(default)]
pub format: Option<String>,
/// Size reported by the server; `None` when absent.
#[serde(default)]
pub size_bytes: Option<i64>,
/// Model name reported by the server; `None` when absent.
#[serde(default)]
pub model: Option<String>,
/// Cost of the request; 0 when absent. The `Client` methods returning this
/// type replace a 0 with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent. The `Client` methods returning this
/// type replace an empty value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
/// All remaining top-level keys of the payload (via `#[serde(flatten)]`),
/// e.g. endpoint-specific fields that have no typed counterpart above.
#[serde(flatten)]
pub extra: HashMap<String, serde_json::Value>,
}
/// One line of a dialogue: who speaks, what they say, and optionally with which voice.
///
/// Consumed by `DialogueRequest::from_turns`, which flattens a list of turns
/// into the `text` / `voices` shape of `DialogueRequest`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DialogueTurn {
/// Speaker label; becomes the prefix of the rendered line and the voice `name`.
pub speaker: String,
/// Spoken text for this turn.
pub text: String,
/// Voice for this speaker; omitted from serialized output when `None`.
/// `from_turns` uses it as `DialogueVoice::voice_id`.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
}
/// Speaker-to-voice mapping entry carried in `DialogueRequest::voices`.
#[derive(Debug, Clone, Serialize)]
pub struct DialogueVoice {
/// Voice identifier; `DialogueRequest::from_turns` fills it from `DialogueTurn::voice`.
pub voice_id: String,
/// Speaker name; `DialogueRequest::from_turns` fills it from `DialogueTurn::speaker`.
pub name: String,
}
/// Request body for the dialogue endpoint; sent by `Client::dialogue`.
///
/// Can be built by hand or from a list of turns via `DialogueRequest::from_turns`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct DialogueRequest {
/// Full dialogue script; `from_turns` renders it as one `speaker: text` line per turn.
pub text: String,
/// Speaker-to-voice mappings; always serialized, even when empty.
pub voices: Vec<DialogueVoice>,
/// Model name; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub model: Option<String>,
/// Requested output format; omitted when `None`.
/// The explicit rename equals the field name, so the key is `output_format` —
/// unlike sibling request types in this file, which serialize this field as `format`.
/// NOTE(review): confirm the endpoint really expects `output_format` here.
#[serde(rename = "output_format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
/// Seed value; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub seed: Option<i32>,
}
impl DialogueRequest {
    /// Builds a request from an ordered list of dialogue turns.
    ///
    /// * `text` is one `"{speaker}: {text}"` line per turn, joined with `\n`
    ///   (empty string for an empty `turns`).
    /// * `voices` holds one entry per distinct speaker, in order of first
    ///   appearance *with a voice*: turns whose `voice` is `None` are skipped and
    ///   do not mark the speaker as seen, so a later turn of the same speaker can
    ///   still supply the voice. Once a speaker has an entry, later voices for
    ///   that speaker are ignored.
    /// * `model` is passed through unchanged; `output_format` and `seed` are
    ///   left at their defaults (`None`).
    pub fn from_turns(turns: Vec<DialogueTurn>, model: Option<String>) -> Self {
        let text = turns
            .iter()
            .map(|turn| format!("{}: {}", turn.speaker, turn.text))
            .collect::<Vec<_>>()
            .join("\n");

        // Keys borrow from `turns`, so the membership test needs no per-turn
        // `String` allocation.
        let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
        let voices: Vec<DialogueVoice> = turns
            .iter()
            .filter_map(|turn| {
                // Voice-less turns bail out before touching `seen`.
                let voice_id = turn.voice.as_ref()?;
                seen.insert(turn.speaker.as_str()).then(|| DialogueVoice {
                    voice_id: voice_id.clone(),
                    name: turn.speaker.clone(),
                })
            })
            .collect();

        Self {
            text,
            voices,
            model,
            ..Default::default()
        }
    }
}
/// Request body for the speech-to-speech endpoint; sent by `Client::speech_to_speech`.
///
/// `Option` fields are left out of the serialized payload when they are `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct SpeechToSpeechRequest {
/// Model name; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub model: Option<String>,
/// Input audio as a base64 string (per the field name); always serialized.
pub audio_base64: String,
/// Target voice identifier; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice_id: Option<String>,
/// Target voice; omitted when `None`.
/// NOTE(review): how `voice` and `voice_id` interact when both are set is not
/// visible here — confirm against the API.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
/// Requested output format. Serialized under the key `format`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
}
/// Request body for the voice-isolation endpoint; sent by `Client::isolate_voice`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct IsolateVoiceRequest {
/// Input audio as a base64 string (per the field name); always serialized.
pub audio_base64: String,
/// Requested output format. Serialized under the key `format`; omitted when `None`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
}
/// Short alias for `IsolateVoiceRequest`.
pub type IsolateRequest = IsolateVoiceRequest;
/// Request body for the voice-remix endpoint; sent by `Client::remix_voice`.
///
/// `Option` fields are left out of the serialized payload when they are `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct RemixVoiceRequest {
/// Input audio as a base64 string (per the field name); always serialized.
pub audio_base64: String,
/// Voice selection; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
/// Model name; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub model: Option<String>,
/// Requested output format. Serialized under the key `format`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
}
/// Short alias for `RemixVoiceRequest`.
pub type RemixRequest = RemixVoiceRequest;
/// Request body for the dubbing endpoint; sent by `Client::dub`.
///
/// Only `target_lang` is always serialized; every other field is omitted when `None`.
/// NOTE(review): presumably exactly one of `audio_base64` / `source_url` must be
/// supplied — nothing in this type enforces that; confirm against the API.
#[derive(Debug, Clone, Serialize, Default)]
pub struct DubRequest {
/// Input media as a base64 string (per the field name); omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub audio_base64: Option<String>,
/// File name accompanying the media; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub filename: Option<String>,
/// URL of the source media; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub source_url: Option<String>,
/// Target language; always serialized.
pub target_lang: String,
/// Source language; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub source_lang: Option<String>,
/// Number of speakers; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub num_speakers: Option<i32>,
/// Highest-resolution flag; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub highest_resolution: Option<bool>,
}
/// Request body for the alignment endpoint; sent by `Client::align`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct AlignRequest {
/// Input audio as a base64 string (per the field name); always serialized.
pub audio_base64: String,
/// Text to align against the audio; always serialized.
pub text: String,
/// Language of the input; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
}
/// One entry of `AlignResponse::segments`. All three fields are required.
#[derive(Debug, Clone, Deserialize)]
pub struct AlignmentSegment {
/// Text covered by this segment.
pub text: String,
/// Segment start. NOTE(review): presumably seconds from the start of the audio — confirm.
pub start: f64,
/// Segment end, in the same unit as `start`.
pub end: f64,
}
/// One entry of `AlignResponse::alignment`.
///
/// `text`, `start_time` and `end_time` are required; `confidence` is defaulted.
#[derive(Debug, Clone, Deserialize)]
pub struct AlignedWord {
/// Aligned text (a word, per the type name).
pub text: String,
/// Start of the word. NOTE(review): presumably seconds from the start of the audio — confirm.
pub start_time: f64,
/// End of the word, in the same unit as `start_time`.
pub end_time: f64,
/// Confidence value; 0.0 when absent from the payload.
/// NOTE(review): scale (e.g. 0..1) is not visible here — confirm.
#[serde(default)]
pub confidence: f64,
}
/// Response body of the alignment endpoint; returned by `Client::align`.
///
/// Every field is `#[serde(default)]`. The payload may carry `segments`,
/// `alignment`, both, or neither; absent lists deserialize as empty.
#[derive(Debug, Clone, Deserialize)]
pub struct AlignResponse {
/// Segment-level timings; empty when the key is absent.
#[serde(default)]
pub segments: Vec<AlignmentSegment>,
/// Word-level timings; empty when the key is absent.
#[serde(default)]
pub alignment: Vec<AlignedWord>,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent. `Client::align` replaces a 0 here with
/// the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent. `Client::align` replaces an empty
/// value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// Strictly typed dialogue response: `audio_base64` and `format` are required.
///
/// NOTE(review): `Client::dialogue` in this file returns `AudioResponse`, not
/// this type; no use of `DialogueResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct DialogueResponse {
/// Audio payload as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Strictly typed speech-to-speech response: `audio_base64` and `format` are required.
///
/// NOTE(review): `Client::speech_to_speech` in this file returns `AudioResponse`,
/// not this type; no use of `SpeechToSpeechResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct SpeechToSpeechResponse {
/// Audio payload as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Strictly typed voice-isolation response: `audio_base64` and `format` are required.
///
/// NOTE(review): `Client::isolate_voice` in this file returns `AudioResponse`,
/// not this type; no use of `IsolateVoiceResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct IsolateVoiceResponse {
/// Audio payload as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Typed voice-remix response. Only `format` is required; the audio payload
/// and the voice id are both optional.
///
/// NOTE(review): `Client::remix_voice` in this file returns `AudioResponse`,
/// not this type; no use of `RemixVoiceResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct RemixVoiceResponse {
/// Audio payload as a base64 string (per the field name); `None` when absent.
#[serde(default)]
pub audio_base64: Option<String>,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Voice identifier reported by the server; `None` when absent.
#[serde(default)]
pub voice_id: Option<String>,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Strictly typed dubbing response: `dubbing_id`, `audio_base64` and `format`
/// are required.
///
/// NOTE(review): `Client::dub` in this file returns `AudioResponse`, not this
/// type; no use of `DubResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct DubResponse {
/// Identifier of the dubbing job; required.
pub dubbing_id: String,
/// Dubbed media as a base64 string (per the field name); required.
pub audio_base64: String,
/// Format label of the returned media; required.
pub format: String,
/// Target language echoed by the server; empty when absent.
#[serde(default)]
pub target_lang: String,
/// Status string; empty when absent.
/// NOTE(review): the set of possible values is not visible here.
#[serde(default)]
pub status: String,
/// Processing time in seconds (per the field name); 0.0 when absent.
#[serde(default)]
pub processing_time_seconds: f64,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Typed voice-design response: a required list of voice previews.
///
/// NOTE(review): `Client::voice_design` in this file returns `AudioResponse`,
/// not this type; no use of `VoiceDesignResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct VoiceDesignResponse {
/// Generated previews; required (the key must be present, but may be an empty list).
pub previews: Vec<VoicePreview>,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// One entry of `VoiceDesignResponse::previews`. All three fields are required.
#[derive(Debug, Clone, Deserialize)]
pub struct VoicePreview {
/// Identifier of the generated voice.
pub generated_voice_id: String,
/// Preview audio as a base64 string (per the field name).
pub audio_base64: String,
/// Format label of the preview audio.
pub format: String,
}
/// Typed Starfish TTS response. Only `format` is required; the audio may
/// arrive inline (`audio_base64`), by reference (`url`), or not at all as far
/// as this type is concerned.
///
/// NOTE(review): `Client::starfish_tts` in this file returns `AudioResponse`,
/// not this type; no use of `StarfishTTSResponse` is visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct StarfishTTSResponse {
/// Audio payload as a base64 string (per the field name); `None` when absent.
#[serde(default)]
pub audio_base64: Option<String>,
/// URL of the audio; `None` when absent.
#[serde(default)]
pub url: Option<String>,
/// Format label of the returned audio; required.
pub format: String,
/// Size reported by the server; 0 when absent.
#[serde(default)]
pub size_bytes: i64,
/// Duration reported by the server; 0.0 when absent.
/// NOTE(review): unit is presumably seconds — confirm.
#[serde(default)]
pub duration: f64,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Request shape for advanced music generation with an optional finetune.
///
/// NOTE(review): `Client::generate_music_advanced` in this file takes
/// `ElevenMusicRequest`, not this type; no use of `MusicAdvancedRequest` is
/// visible here.
#[derive(Debug, Clone, Serialize, Default)]
pub struct MusicAdvancedRequest {
/// Text prompt; always serialized.
pub prompt: String,
/// Requested duration in seconds (per the field name); omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub duration_seconds: Option<i32>,
/// Model name; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub model: Option<String>,
/// Finetune identifier; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub finetune_id: Option<String>,
}
/// One entry of `MusicAdvancedResponse::clips`. Every field is defaulted, so
/// an empty JSON object deserializes successfully.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicAdvancedClip {
/// Clip payload as a base64 string (per the field name); empty when absent.
#[serde(default)]
pub base64: String,
/// Format label of the clip; empty when absent.
#[serde(default)]
pub format: String,
/// Size reported by the server under the key `size`; 0 when absent.
#[serde(default)]
pub size: i64,
}
/// Response shape paired with `MusicAdvancedRequest`. Every field is defaulted.
///
/// NOTE(review): `Client::generate_music_advanced` in this file returns
/// `ElevenMusicResponse`, not this type; no use of `MusicAdvancedResponse` is
/// visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicAdvancedResponse {
/// Generated clips; empty when the key is absent.
#[serde(default)]
pub clips: Vec<MusicAdvancedClip>,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent.
#[serde(default)]
pub request_id: String,
}
/// Description of a music finetune, as listed in `MusicFinetuneListResponse`.
///
/// `finetune_id` and `name` are required when deserializing. On serialization
/// the `Option` fields are written as `null` when `None` (no `skip_serializing_if`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MusicFinetuneInfo {
/// Finetune identifier; required.
pub finetune_id: String,
/// Finetune name; required.
pub name: String,
/// Free-form description; `None` when absent.
#[serde(default)]
pub description: Option<String>,
/// Status string; empty when absent.
/// NOTE(review): the set of possible values is not visible here.
#[serde(default)]
pub status: String,
/// Associated model identifier; `None` when absent.
#[serde(default)]
pub model_id: Option<String>,
/// Creation timestamp as an unparsed string; `None` when absent.
#[serde(default)]
pub created_at: Option<String>,
}
/// Envelope for a list of `MusicFinetuneInfo` entries.
///
/// NOTE(review): `Client::list_finetunes` in this file returns
/// `ListFinetunesResponse`, not this type.
#[derive(Debug, Clone, Deserialize)]
pub struct MusicFinetuneListResponse {
/// Listed finetunes; the `finetunes` key is required in the payload.
pub finetunes: Vec<MusicFinetuneInfo>,
}
/// JSON request shape for creating a music finetune.
///
/// NOTE(review): `Client::create_finetune` in this file uploads a multipart
/// form instead; no use of this type is visible here.
#[derive(Debug, Clone, Serialize)]
pub struct MusicFinetuneCreateRequest {
/// Finetune name; always serialized.
pub name: String,
/// Free-form description; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Training samples; always serialized, even when empty.
/// NOTE(review): encoding of each entry (base64? URL?) is not visible here — confirm.
pub samples: Vec<String>,
}
/// Request body for the voice-design endpoint; sent by `Client::voice_design`.
///
/// The Rust field names differ from the wire keys; see each field.
#[derive(Debug, Clone, Serialize, Default)]
pub struct VoiceDesignRequest {
/// Description of the desired voice. Serialized under the key `voice_description`.
#[serde(rename = "voice_description")]
pub description: String,
/// Sample text. Serialized under the key `sample_text`.
#[serde(rename = "sample_text")]
pub text: String,
/// Requested output format. Serialized under the key `format`; omitted when `None`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
}
/// Request body for the Starfish TTS endpoint; sent by `Client::starfish_tts`.
///
/// Only `text` is always serialized; every other field is omitted when `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct StarfishTTSRequest {
/// Input text; always serialized.
pub text: String,
/// Voice identifier; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice_id: Option<String>,
/// Voice selection; omitted when `None`.
/// NOTE(review): how `voice` and `voice_id` interact when both are set is not
/// visible here — confirm against the API.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<String>,
/// Requested output format. Serialized under the key `format`.
#[serde(rename = "format", skip_serializing_if = "Option::is_none")]
pub output_format: Option<String>,
/// Input type; omitted when `None`.
/// NOTE(review): accepted values are not visible here.
#[serde(skip_serializing_if = "Option::is_none")]
pub input_type: Option<String>,
/// Speed setting; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub speed: Option<f64>,
/// Language; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Locale; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub locale: Option<String>,
}
/// One section of a structured song, carried in `ElevenMusicRequest::sections`.
///
/// `Option` fields are left out of the serialized payload when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct MusicSection {
/// Kind of section; always serialized.
/// NOTE(review): accepted values (e.g. verse/chorus) are not visible here.
pub section_type: String,
/// Lyrics for this section; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub lyrics: Option<String>,
/// Style to apply to this section; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub style: Option<String>,
/// Style to exclude from this section; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub style_exclude: Option<String>,
}
/// Request body for the advanced music endpoint; sent by
/// `Client::generate_music_advanced`.
///
/// `model` and `prompt` are always serialized; every `Option` field is omitted
/// when `None`.
#[derive(Debug, Clone, Serialize, Default)]
pub struct ElevenMusicRequest {
/// Model name; always serialized.
pub model: String,
/// Text prompt; always serialized.
pub prompt: String,
/// Per-section structure of the track; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub sections: Option<Vec<MusicSection>>,
/// Requested duration in seconds (per the field name); omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub duration_seconds: Option<i32>,
/// Language; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Vocals flag; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub vocals: Option<bool>,
/// Overall style; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub style: Option<String>,
/// Style to exclude; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub style_exclude: Option<String>,
/// Finetune identifier; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub finetune_id: Option<String>,
/// Reference identifier for an edit; omitted when `None`.
/// NOTE(review): presumably the id of a previously generated track — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub edit_reference_id: Option<String>,
/// Instruction describing the edit; omitted when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub edit_instruction: Option<String>,
}
/// One entry of `ElevenMusicResponse::clips`. Every field is defaulted, so an
/// empty JSON object deserializes successfully.
#[derive(Debug, Clone, Deserialize)]
pub struct ElevenMusicClip {
/// Clip payload as a base64 string (per the field name); empty when absent.
#[serde(default)]
pub base64: String,
/// Format label of the clip; empty when absent.
#[serde(default)]
pub format: String,
/// Size reported by the server under the key `size`; 0 when absent.
#[serde(default)]
pub size: i64,
}
/// Response body of the advanced music endpoint; returned by
/// `Client::generate_music_advanced`. Every field is defaulted.
#[derive(Debug, Clone, Deserialize)]
pub struct ElevenMusicResponse {
/// Generated clips; empty when the key is absent.
#[serde(default)]
pub clips: Vec<ElevenMusicClip>,
/// Model name reported by the server; empty when absent.
#[serde(default)]
pub model: String,
/// Cost of the request; 0 when absent. `Client::generate_music_advanced`
/// replaces a 0 here with the `cost_ticks` from the response metadata.
#[serde(default)]
pub cost_ticks: i64,
/// Request identifier; empty when absent. `Client::generate_music_advanced`
/// replaces an empty value with the `request_id` from the response metadata.
#[serde(default)]
pub request_id: String,
}
/// Description of a finetune; returned by `Client::create_finetune` and listed
/// in `ListFinetunesResponse`.
///
/// `finetune_id` and `name` are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FinetuneInfo {
/// Finetune identifier; required.
pub finetune_id: String,
/// Finetune name; required.
pub name: String,
/// Status string; empty when absent.
/// NOTE(review): the set of possible values is not visible here.
#[serde(default)]
pub status: String,
/// Creation timestamp as an unparsed string; `None` when absent.
#[serde(default)]
pub created_at: Option<String>,
}
/// Response body of the finetune listing; returned by `Client::list_finetunes`.
#[derive(Debug, Clone, Deserialize)]
pub struct ListFinetunesResponse {
/// Listed finetunes; the `finetunes` key is required in the payload.
pub finetunes: Vec<FinetuneInfo>,
}
/// Backfills `cost_ticks` and `request_id` on a decoded response body.
///
/// Takes two identifiers: the mutable response binding and the metadata
/// binding returned next to it by the transport helper. A body `cost_ticks`
/// of 0 or an empty body `request_id` is treated as "not provided" and is
/// replaced with the metadata value. `request_id` is moved out of the
/// metadata, so the metadata binding must not be read afterwards.
macro_rules! fill_from_meta {
    ($resp:ident, $meta:ident) => {{
        if $resp.cost_ticks == 0 {
            $resp.cost_ticks = $meta.cost_ticks;
        }
        if $resp.request_id.is_empty() {
            $resp.request_id = $meta.request_id;
        }
    }};
}

/// Audio endpoints of the API client.
///
/// Every JSON-posting method below follows the same shape: send the request
/// through `post_json`, then backfill `cost_ticks` / `request_id` from the
/// response metadata when the body did not carry them. Errors from the
/// transport helpers are propagated unchanged via `?`.
impl Client {
    /// Text-to-speech: sends `req` via `post_json` to `/qai/v1/audio/tts`.
    pub async fn speak(&self, req: &TextToSpeechRequest) -> Result<TextToSpeechResponse> {
        let (mut resp, meta) = self
            .post_json::<TextToSpeechRequest, TextToSpeechResponse>("/qai/v1/audio/tts", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Speech-to-text: sends `req` via `post_json` to `/qai/v1/audio/stt`.
    pub async fn transcribe(&self, req: &SpeechToTextRequest) -> Result<SpeechToTextResponse> {
        let (mut resp, meta) = self
            .post_json::<SpeechToTextRequest, SpeechToTextResponse>("/qai/v1/audio/stt", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Sound effects: sends `req` via `post_json` to `/qai/v1/audio/sound-effects`.
    pub async fn sound_effects(&self, req: &SoundEffectRequest) -> Result<SoundEffectResponse> {
        let (mut resp, meta) = self
            .post_json::<SoundEffectRequest, SoundEffectResponse>(
                "/qai/v1/audio/sound-effects",
                req,
            )
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Music generation: sends `req` via `post_json` to `/qai/v1/audio/music`.
    pub async fn generate_music(&self, req: &MusicRequest) -> Result<MusicResponse> {
        let (mut resp, meta) = self
            .post_json::<MusicRequest, MusicResponse>("/qai/v1/audio/music", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Multi-speaker dialogue: sends `req` via `post_json` to `/qai/v1/audio/dialogue`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn dialogue(&self, req: &DialogueRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<DialogueRequest, AudioResponse>("/qai/v1/audio/dialogue", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Speech-to-speech: sends `req` via `post_json` to `/qai/v1/audio/speech-to-speech`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn speech_to_speech(
        &self,
        req: &SpeechToSpeechRequest,
    ) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<SpeechToSpeechRequest, AudioResponse>(
                "/qai/v1/audio/speech-to-speech",
                req,
            )
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Voice isolation: sends `req` via `post_json` to `/qai/v1/audio/isolate`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn isolate_voice(&self, req: &IsolateVoiceRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<IsolateVoiceRequest, AudioResponse>("/qai/v1/audio/isolate", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Voice remix: sends `req` via `post_json` to `/qai/v1/audio/remix`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn remix_voice(&self, req: &RemixVoiceRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<RemixVoiceRequest, AudioResponse>("/qai/v1/audio/remix", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Dubbing: sends `req` via `post_json` to `/qai/v1/audio/dub`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn dub(&self, req: &DubRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<DubRequest, AudioResponse>("/qai/v1/audio/dub", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Alignment: sends `req` via `post_json` to `/qai/v1/audio/align`.
    pub async fn align(&self, req: &AlignRequest) -> Result<AlignResponse> {
        let (mut resp, meta) = self
            .post_json::<AlignRequest, AlignResponse>("/qai/v1/audio/align", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Voice design: sends `req` via `post_json` to `/qai/v1/audio/voice-design`.
    ///
    /// Returns the loosely typed `AudioResponse`; endpoint-specific keys end up
    /// in `AudioResponse::extra`.
    pub async fn voice_design(&self, req: &VoiceDesignRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<VoiceDesignRequest, AudioResponse>("/qai/v1/audio/voice-design", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Starfish TTS: sends `req` via `post_json` to `/qai/v1/audio/starfish-tts`.
    ///
    /// Returns the loosely typed `AudioResponse`.
    pub async fn starfish_tts(&self, req: &StarfishTTSRequest) -> Result<AudioResponse> {
        let (mut resp, meta) = self
            .post_json::<StarfishTTSRequest, AudioResponse>("/qai/v1/audio/starfish-tts", req)
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Advanced music generation: sends `req` via `post_json` to
    /// `/qai/v1/audio/music/advanced`.
    pub async fn generate_music_advanced(
        &self,
        req: &ElevenMusicRequest,
    ) -> Result<ElevenMusicResponse> {
        let (mut resp, meta) = self
            .post_json::<ElevenMusicRequest, ElevenMusicResponse>(
                "/qai/v1/audio/music/advanced",
                req,
            )
            .await?;
        fill_from_meta!(resp, meta);
        Ok(resp)
    }

    /// Lists finetunes via `get_json` on `/qai/v1/audio/finetunes`.
    ///
    /// The response metadata is discarded.
    pub async fn list_finetunes(&self) -> Result<ListFinetunesResponse> {
        let (resp, _) = self
            .get_json::<ListFinetunesResponse>("/qai/v1/audio/finetunes")
            .await?;
        Ok(resp)
    }

    /// Creates a finetune by uploading a multipart form to `/qai/v1/audio/finetunes`.
    ///
    /// The form carries a text field `name` and one `files` part per entry of
    /// `files`, each with the entry's file name and MIME type.
    ///
    /// # Errors
    /// Returns `Error::Http` when a file's `mime_type` is rejected by
    /// `Part::mime_str`, and propagates any error from `post_multipart`.
    pub async fn create_finetune(
        &self,
        name: &str,
        files: Vec<crate::voices::CloneVoiceFile>,
    ) -> Result<FinetuneInfo> {
        let mut form = reqwest::multipart::Form::new().text("name", name.to_string());
        for file in files {
            let part = reqwest::multipart::Part::bytes(file.data)
                .file_name(file.filename)
                .mime_str(&file.mime_type)
                .map_err(|e| crate::error::Error::Http(e.into()))?;
            // Every file is attached under the same field name.
            form = form.part("files", part);
        }
        let (resp, _) = self
            .post_multipart::<FinetuneInfo>("/qai/v1/audio/finetunes", form)
            .await?;
        Ok(resp)
    }

    /// Deletes the finetune `id` via `delete_json` on `/qai/v1/audio/finetunes/{id}`.
    ///
    /// Returns the raw JSON body of the response; the metadata is discarded.
    pub async fn delete_finetune(&self, id: &str) -> Result<serde_json::Value> {
        // NOTE(review): `id` is interpolated verbatim. If it can come from
        // untrusted input, characters such as `/`, `?` or `#` would alter the
        // request path — consider validating or percent-encoding it.
        let path = format!("/qai/v1/audio/finetunes/{id}");
        let (resp, _) = self.delete_json::<serde_json::Value>(&path).await?;
        Ok(resp)
    }
}