1use crate::shared::FileUpload;
2use bytes::Bytes;
3use derive_builder::Builder;
4use serde::{Deserialize, Serialize};
5use serde_json::Value;
6use std::fmt::Display;
7
8#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
9#[builder(name = "AudioSpeechParametersBuilder")]
10#[builder(setter(into, strip_option), default)]
11pub struct AudioSpeechParameters {
12 pub model: String,
14 pub input: String,
16 #[serde(skip_serializing_if = "Option::is_none")]
18 pub voice: Option<String>,
19 #[serde(skip_serializing_if = "Option::is_none")]
21 pub voice_text: Option<String>,
22 #[serde(skip_serializing_if = "Option::is_none")]
24 pub instructions: Option<String>,
25 #[serde(skip_serializing_if = "Option::is_none")]
27 pub response_format: Option<AudioSpeechResponseFormat>,
28 #[serde(skip_serializing_if = "Option::is_none")]
30 pub speed: Option<f32>,
31}
32
33#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
34#[builder(name = "AudioTranscriptionParametersBuilder")]
35#[builder(setter(into, strip_option), default)]
36pub struct AudioTranscriptionParameters {
37 pub file: FileUpload,
39 pub model: String,
41 #[serde(skip_serializing_if = "Option::is_none")]
43 pub language: Option<String>,
44 #[serde(skip_serializing_if = "Option::is_none")]
46 pub chunking_strategy: Option<TranscriptionChunkingStrategy>,
47 #[serde(skip_serializing_if = "Option::is_none")]
49 pub prompt: Option<String>,
50 #[serde(skip_serializing_if = "Option::is_none")]
52 pub response_format: Option<AudioOutputFormat>,
53 #[serde(skip_serializing_if = "Option::is_none")]
55 pub stream: Option<bool>,
56 #[serde(skip_serializing_if = "Option::is_none")]
60 pub temperature: Option<f32>,
61 #[serde(skip_serializing_if = "Option::is_none")]
64 pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
65 #[serde(flatten)]
67 #[serde(skip_serializing_if = "Option::is_none")]
68 pub extra_body: Option<Value>,
69}
70
71#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
72#[builder(name = "AudioTranslationParametersBuilder")]
73#[builder(setter(into, strip_option), default)]
74pub struct AudioTranslationParameters {
75 pub file: FileUpload,
77 pub model: String,
79 #[serde(skip_serializing_if = "Option::is_none")]
81 pub prompt: Option<String>,
82 #[serde(skip_serializing_if = "Option::is_none")]
84 pub response_format: Option<AudioOutputFormat>,
85 #[serde(skip_serializing_if = "Option::is_none")]
89 pub temperature: Option<f32>,
90}
91
92#[derive(Debug, Clone)]
93pub struct AudioSpeechResponse {
94 pub bytes: Bytes,
95}
96
/// Parameters of an audio speech request that also carries a `stream` flag.
///
/// Only compiled when the `stream` feature is enabled. Unlike
/// [`AudioSpeechParameters`], this type derives neither `Default` nor a
/// builder, so it has to be constructed with a struct literal.
///
/// Every `Option` field is left out of the serialized output when it is `None`;
/// `stream` is always serialized.
#[cfg(feature = "stream")]
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct StreamAudioSpeechParameters {
    /// Model identifier sent with the request.
    pub model: String,
    /// Input string for the request (presumably the text to be spoken — TODO confirm
    /// against the API reference).
    pub input: String,
    /// Voice to use, passed as a free-form string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    /// NOTE(review): the meaning of this field is not evident from this file —
    /// confirm against the target API before relying on it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice_text: Option<String>,
    /// Requested audio format of the response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<AudioSpeechResponseFormat>,
    /// Speed value; the accepted range is not visible in this file.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
    /// Streaming flag, forwarded as-is.
    pub stream: bool,
}
118
/// One chunk of a streamed audio speech response, as raw bytes.
///
/// Only compiled when the `stream` feature is enabled.
#[cfg(feature = "stream")]
#[derive(Debug, Clone, PartialEq)]
pub struct AudioSpeechResponseChunkResponse {
    /// The raw bytes of this chunk.
    pub bytes: Bytes,
}
124
125#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
126#[serde(rename_all = "snake_case")]
127pub enum AudioOutputFormat {
128 Json,
129 Text,
130 Srt,
131 VerboseJson,
132 Vtt,
133}
134
135#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
136#[serde(rename_all = "snake_case")]
137pub enum AudioSpeechResponseFormat {
138 Mp3,
139 Opus,
140 Aac,
141 Flac,
142 Wav,
143 Pcm,
144}
145
146#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq)]
147#[serde(rename_all = "snake_case")]
148pub enum AudioVoice {
149 #[default]
150 Alloy,
151 Ash,
152 Coral,
153 Echo,
154 Fable,
155 Onyx,
156 Nova,
157 Sage,
158 Shimmer,
159}
160
161#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
162#[serde(rename_all = "snake_case")]
163pub enum TimestampGranularity {
164 Word,
165 Segment,
166}
167
168#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
169#[serde(rename_all = "snake_case")]
170pub enum TranscriptionChunkingStrategy {
171 Auto,
172 #[serde(untagged)]
173 VadConfig(VadConfig),
174}
175
176#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
177pub struct VadConfig {
178 pub r#type: VadConfigType,
180 pub prefix_padding_ms: Option<usize>,
182 pub silence_duration_ms: Option<usize>,
184 pub threshold: Option<f32>,
186}
187
188#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
189#[serde(rename_all = "snake_case")]
190pub enum VadConfigType {
191 ServerVad,
192}
193
194impl Display for AudioOutputFormat {
195 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196 write!(
197 f,
198 "{}",
199 match self {
200 AudioOutputFormat::Json => "json",
201 AudioOutputFormat::Text => "text",
202 AudioOutputFormat::Srt => "srt",
203 AudioOutputFormat::VerboseJson => "verbose_json",
204 AudioOutputFormat::Vtt => "vtt",
205 }
206 )
207 }
208}
209
210impl Display for TimestampGranularity {
211 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
212 write!(
213 f,
214 "{}",
215 match self {
216 TimestampGranularity::Word => "word",
217 TimestampGranularity::Segment => "segment",
218 }
219 )
220 }
221}
222
223impl Display for TranscriptionChunkingStrategy {
224 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
225 match self {
226 TranscriptionChunkingStrategy::Auto => "auto".fmt(f),
227 TranscriptionChunkingStrategy::VadConfig(vad_config) => vad_config.fmt(f),
228 }
229 }
230}
231
232impl Display for VadConfig {
233 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
234 let s = serde_json::to_string(self).map_err(|_| std::fmt::Error)?;
235 write!(f, "{s}")
236 }
237}
238
#[cfg(test)]
mod tests {
    use crate::audio::{
        AudioTranscriptionParameters, AudioTranscriptionParametersBuilder,
        TranscriptionChunkingStrategy, VadConfig, VadConfigType,
    };
    use crate::shared::FileUpload;

    // `Auto` must round-trip through the bare JSON string "auto".
    #[test]
    fn test_audio_transcription_chunking_strategy_auto_serialization_deserialization() {
        let chunking_strategy = TranscriptionChunkingStrategy::Auto;

        let serialized = serde_json::to_string(&chunking_strategy).unwrap();
        assert_eq!(serialized, "\"auto\"");

        let deserialized: TranscriptionChunkingStrategy =
            serde_json::from_str(serialized.as_str()).unwrap();
        assert_eq!(deserialized, chunking_strategy)
    }

    // The untagged variant must round-trip as the plain `VadConfig` object.
    #[test]
    fn test_audio_transcription_chunking_strategy_vad_config_serialization_deserialization() {
        let chunking_strategy = TranscriptionChunkingStrategy::VadConfig(VadConfig {
            r#type: VadConfigType::ServerVad,
            prefix_padding_ms: Some(10),
            silence_duration_ms: Some(20),
            threshold: Some(0.5),
        });

        let serialized = serde_json::to_string(&chunking_strategy).unwrap();
        assert_eq!(serialized, "{\"type\":\"server_vad\",\"prefix_padding_ms\":10,\"silence_duration_ms\":20,\"threshold\":0.5}");

        let deserialized: TranscriptionChunkingStrategy =
            serde_json::from_str(serialized.as_str()).unwrap();
        assert_eq!(deserialized, chunking_strategy)
    }

    // `extra_body` is flattened: its keys must appear at the top level of the
    // serialized object and be collected back into `extra_body` when parsing.
    #[test]
    fn test_audio_transcription_extra_body_serialization_deserialization() {
        let extra = serde_json::json!({
            "enable_my_feature": true,
            "my_param": 10
        });

        let params: AudioTranscriptionParameters = AudioTranscriptionParametersBuilder::default()
            .file(FileUpload::File("test.wav".to_string()))
            .model("test")
            .extra_body(extra)
            .build()
            .unwrap();

        let serialized = serde_json::to_string(&params).unwrap();
        assert_eq!(serialized, "{\"file\":{\"File\":\"test.wav\"},\"model\":\"test\",\"enable_my_feature\":true,\"my_param\":10}");

        let deserialized: AudioTranscriptionParameters =
            serde_json::from_str(serialized.as_str()).unwrap();
        assert_eq!(deserialized, params)
    }
}