1use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
9#[serde(rename_all = "lowercase")]
10#[non_exhaustive]
11pub enum Voice {
12 #[default]
14 Alloy,
15 Ash,
17 Ballad,
19 Coral,
21 Echo,
23 Fable,
25 Nova,
27 Onyx,
29 Sage,
31 Shimmer,
33 Verse,
35}
36
37impl Voice {
38 pub fn as_str(&self) -> &'static str {
40 match self {
41 Voice::Alloy => "alloy",
42 Voice::Ash => "ash",
43 Voice::Ballad => "ballad",
44 Voice::Coral => "coral",
45 Voice::Echo => "echo",
46 Voice::Fable => "fable",
47 Voice::Nova => "nova",
48 Voice::Onyx => "onyx",
49 Voice::Sage => "sage",
50 Voice::Shimmer => "shimmer",
51 Voice::Verse => "verse",
52 }
53 }
54}
55
56impl std::fmt::Display for Voice {
57 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58 write!(f, "{}", self.as_str())
59 }
60}
61
62#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
64#[non_exhaustive]
65pub enum TtsModel {
66 #[default]
68 #[serde(rename = "gpt-4o-mini-tts")]
69 Gpt4oMiniTts,
70 #[serde(rename = "tts-1")]
72 Tts1,
73 #[serde(rename = "tts-1-hd")]
75 Tts1Hd,
76}
77
78impl TtsModel {
79 pub fn as_str(&self) -> &'static str {
81 match self {
82 TtsModel::Gpt4oMiniTts => "gpt-4o-mini-tts",
83 TtsModel::Tts1 => "tts-1",
84 TtsModel::Tts1Hd => "tts-1-hd",
85 }
86 }
87
88 pub fn supports_instructions(&self) -> bool {
90 matches!(self, TtsModel::Gpt4oMiniTts)
91 }
92}
93
94impl std::fmt::Display for TtsModel {
95 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96 write!(f, "{}", self.as_str())
97 }
98}
99
100#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
102#[serde(rename_all = "lowercase")]
103#[non_exhaustive]
104pub enum AudioFormat {
105 #[default]
107 Mp3,
108 Opus,
110 Aac,
112 Flac,
114 Wav,
116 Pcm,
118}
119
120impl AudioFormat {
121 pub fn as_str(&self) -> &'static str {
123 match self {
124 AudioFormat::Mp3 => "mp3",
125 AudioFormat::Opus => "opus",
126 AudioFormat::Aac => "aac",
127 AudioFormat::Flac => "flac",
128 AudioFormat::Wav => "wav",
129 AudioFormat::Pcm => "pcm",
130 }
131 }
132
133 pub fn mime_type(&self) -> &'static str {
135 match self {
136 AudioFormat::Mp3 => "audio/mpeg",
137 AudioFormat::Opus => "audio/opus",
138 AudioFormat::Aac => "audio/aac",
139 AudioFormat::Flac => "audio/flac",
140 AudioFormat::Wav => "audio/wav",
141 AudioFormat::Pcm => "audio/pcm",
142 }
143 }
144
145 pub fn extension(&self) -> &'static str {
147 match self {
148 AudioFormat::Mp3 => "mp3",
149 AudioFormat::Opus => "opus",
150 AudioFormat::Aac => "aac",
151 AudioFormat::Flac => "flac",
152 AudioFormat::Wav => "wav",
153 AudioFormat::Pcm => "pcm",
154 }
155 }
156}
157
158impl std::fmt::Display for AudioFormat {
159 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160 write!(f, "{}", self.as_str())
161 }
162}
163
164#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
166#[non_exhaustive]
167pub enum TranscriptionModel {
168 #[default]
170 #[serde(rename = "gpt-4o-transcribe")]
171 Gpt4oTranscribe,
172 #[serde(rename = "gpt-4o-mini-transcribe")]
174 Gpt4oMiniTranscribe,
175 #[serde(rename = "whisper-1")]
177 Whisper1,
178 #[serde(rename = "gpt-4o-transcribe-diarize")]
180 Gpt4oTranscribeDiarize,
181}
182
183impl TranscriptionModel {
184 pub fn as_str(&self) -> &'static str {
186 match self {
187 TranscriptionModel::Gpt4oTranscribe => "gpt-4o-transcribe",
188 TranscriptionModel::Gpt4oMiniTranscribe => "gpt-4o-mini-transcribe",
189 TranscriptionModel::Whisper1 => "whisper-1",
190 TranscriptionModel::Gpt4oTranscribeDiarize => "gpt-4o-transcribe-diarize",
191 }
192 }
193
194 pub fn supports_streaming(&self) -> bool {
196 !matches!(self, TranscriptionModel::Whisper1)
197 }
198
199 pub fn supports_prompt(&self) -> bool {
201 !matches!(self, TranscriptionModel::Gpt4oTranscribeDiarize)
202 }
203
204 pub fn supports_diarization(&self) -> bool {
206 matches!(self, TranscriptionModel::Gpt4oTranscribeDiarize)
207 }
208}
209
210impl std::fmt::Display for TranscriptionModel {
211 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
212 write!(f, "{}", self.as_str())
213 }
214}
215
216#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
218#[serde(rename_all = "snake_case")]
219#[non_exhaustive]
220pub enum TranscriptionResponseFormat {
221 #[default]
223 Json,
224 Text,
226 Srt,
228 VerboseJson,
230 Vtt,
232 DiarizedJson,
234}
235
236impl TranscriptionResponseFormat {
237 pub fn as_str(&self) -> &'static str {
239 match self {
240 TranscriptionResponseFormat::Json => "json",
241 TranscriptionResponseFormat::Text => "text",
242 TranscriptionResponseFormat::Srt => "srt",
243 TranscriptionResponseFormat::VerboseJson => "verbose_json",
244 TranscriptionResponseFormat::Vtt => "vtt",
245 TranscriptionResponseFormat::DiarizedJson => "diarized_json",
246 }
247 }
248}
249
250impl std::fmt::Display for TranscriptionResponseFormat {
251 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
252 write!(f, "{}", self.as_str())
253 }
254}
255
256#[derive(Debug, Clone, Deserialize)]
258pub struct Usage {
259 #[serde(rename = "type")]
261 pub usage_type: Option<String>,
262 pub input_tokens: Option<u32>,
264 pub output_tokens: Option<u32>,
266 pub total_tokens: Option<u32>,
268 pub seconds: Option<u32>,
270 pub input_token_details: Option<InputTokenDetails>,
272}
273
274#[derive(Debug, Clone, Deserialize)]
276pub struct InputTokenDetails {
277 pub text_tokens: Option<u32>,
279 pub audio_tokens: Option<u32>,
281}
282
283#[derive(Debug, Clone, Deserialize)]
285pub struct TranscriptionResponse {
286 pub text: String,
288 #[serde(default)]
290 pub usage: Option<Usage>,
291 #[serde(default)]
293 pub language: Option<String>,
294 #[serde(default)]
296 pub duration: Option<f64>,
297 #[serde(default)]
299 pub segments: Option<Vec<TranscriptionSegment>>,
300 #[serde(default)]
302 pub words: Option<Vec<TranscriptionWord>>,
303 #[serde(default)]
305 pub task: Option<String>,
306}
307
308#[derive(Debug, Clone, Deserialize)]
310pub struct TranscriptionSegment {
311 pub id: Option<serde_json::Value>, pub start: Option<f64>,
315 pub end: Option<f64>,
317 pub text: String,
319 #[serde(default)]
321 pub speaker: Option<String>,
322 #[serde(default)]
324 pub seek: Option<u32>,
325 #[serde(default)]
327 pub tokens: Option<Vec<u32>>,
328 #[serde(default)]
330 pub temperature: Option<f64>,
331 #[serde(default)]
333 pub avg_logprob: Option<f64>,
334 #[serde(default)]
336 pub compression_ratio: Option<f64>,
337 #[serde(default)]
339 pub no_speech_prob: Option<f64>,
340}
341
342#[derive(Debug, Clone, Deserialize)]
344pub struct TranscriptionWord {
345 pub word: String,
347 pub start: f64,
349 pub end: f64,
351}
352
353#[allow(dead_code)]
355#[derive(Debug, Clone, Deserialize)]
356pub struct LogProb {
357 pub token: String,
359 pub logprob: f64,
361 #[serde(default)]
363 pub bytes: Option<Vec<u8>>,
364}
365
366#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
368#[serde(rename_all = "lowercase")]
369pub enum TimestampGranularity {
370 Word,
372 Segment,
374}
375
376impl TimestampGranularity {
377 pub fn as_str(&self) -> &'static str {
379 match self {
380 TimestampGranularity::Word => "word",
381 TimestampGranularity::Segment => "segment",
382 }
383 }
384}
385
/// Input audio format, identified by file extension.
///
/// This enum has no serde derives; use [`InputAudioFormat::from_extension`]
/// to obtain a value from an extension string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum InputAudioFormat {
    /// `flac` extension.
    Flac,
    /// `mp3` extension.
    Mp3,
    /// `mp4` extension.
    Mp4,
    /// `mpeg` extension.
    Mpeg,
    /// `mpga` extension.
    Mpga,
    /// `m4a` extension.
    M4a,
    /// `ogg` extension.
    Ogg,
    /// `wav` extension.
    Wav,
    /// `webm` extension.
    Webm,
}
409
410impl InputAudioFormat {
411 pub fn mime_type(&self) -> &'static str {
413 match self {
414 InputAudioFormat::Flac => "audio/flac",
415 InputAudioFormat::Mp3 => "audio/mpeg",
416 InputAudioFormat::Mp4 => "audio/mp4",
417 InputAudioFormat::Mpeg => "audio/mpeg",
418 InputAudioFormat::Mpga => "audio/mpeg",
419 InputAudioFormat::M4a => "audio/mp4",
420 InputAudioFormat::Ogg => "audio/ogg",
421 InputAudioFormat::Wav => "audio/wav",
422 InputAudioFormat::Webm => "audio/webm",
423 }
424 }
425
426 pub fn from_extension(ext: &str) -> Option<Self> {
428 match ext.to_lowercase().as_str() {
429 "flac" => Some(InputAudioFormat::Flac),
430 "mp3" => Some(InputAudioFormat::Mp3),
431 "mp4" => Some(InputAudioFormat::Mp4),
432 "mpeg" => Some(InputAudioFormat::Mpeg),
433 "mpga" => Some(InputAudioFormat::Mpga),
434 "m4a" => Some(InputAudioFormat::M4a),
435 "ogg" => Some(InputAudioFormat::Ogg),
436 "wav" => Some(InputAudioFormat::Wav),
437 "webm" => Some(InputAudioFormat::Webm),
438 _ => None,
439 }
440 }
441}
442
#[cfg(test)]
mod tests {
    use super::*;

    /// `Voice` serializes to its lowercase name as a JSON string.
    #[test]
    fn test_voice_serialization() {
        let json = serde_json::to_string(&Voice::Alloy).unwrap();
        assert_eq!(json, "\"alloy\"");
    }

    /// `TtsModel` serializes using its explicit serde rename.
    #[test]
    fn test_tts_model_serialization() {
        let json = serde_json::to_string(&TtsModel::Gpt4oMiniTts).unwrap();
        assert_eq!(json, "\"gpt-4o-mini-tts\"");
    }

    /// Spot-checks the MIME type table of `AudioFormat`.
    #[test]
    fn test_audio_format_mime_types() {
        let cases = [
            (AudioFormat::Mp3, "audio/mpeg"),
            (AudioFormat::Wav, "audio/wav"),
        ];
        for (format, expected) in cases {
            assert_eq!(format.mime_type(), expected);
        }
    }

    /// Spot-checks the capability predicates of `TranscriptionModel`.
    #[test]
    fn test_transcription_model_features() {
        let streaming = TranscriptionModel::Gpt4oTranscribe;
        let non_streaming = TranscriptionModel::Whisper1;
        let diarizing = TranscriptionModel::Gpt4oTranscribeDiarize;

        assert!(streaming.supports_streaming());
        assert!(!non_streaming.supports_streaming());
        assert!(diarizing.supports_diarization());
    }
}