portkey_sdk/model/audio.rs
1//! Audio API models.
2//!
//! This module contains data models for audio transcription, translation, and text-to-speech.
4
5use serde::{Deserialize, Serialize};
6
7/// Response format for audio transcription.
8#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
9#[serde(rename_all = "snake_case")]
10pub enum TranscriptionResponseFormat {
11 /// JSON format with just the transcribed text.
12 #[default]
13 Json,
14 /// Plain text format.
15 Text,
16 /// SubRip Subtitle format.
17 Srt,
18 /// Verbose JSON format with additional metadata.
19 VerboseJson,
20 /// WebVTT subtitle format.
21 Vtt,
22}
23
24/// Timestamp granularity for verbose transcription.
25#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
26#[serde(rename_all = "snake_case")]
27pub enum TimestampGranularity {
28 /// Word-level timestamps (incurs additional latency).
29 Word,
30 /// Segment-level timestamps (no additional latency).
31 Segment,
32}
33
34/// Request for creating an audio transcription.
35///
36/// # Example
37///
38/// ```
39/// use portkey_sdk::model::{CreateTranscriptionRequest, TranscriptionResponseFormat};
40///
41/// let request = CreateTranscriptionRequest {
42/// model: "whisper-1".to_string(),
43/// language: Some("en".to_string()),
44/// response_format: Some(TranscriptionResponseFormat::Json),
45/// temperature: Some(0.0),
46/// ..Default::default()
47/// };
48/// ```
49#[derive(Debug, Clone, Default, Serialize, Deserialize)]
50pub struct CreateTranscriptionRequest {
51 /// ID of the model to use.
52 ///
53 /// Options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
54 pub model: String,
55
56 /// The language of the input audio in ISO-639-1 format.
57 ///
58 /// Supplying the input language will improve accuracy and latency.
59 #[serde(skip_serializing_if = "Option::is_none")]
60 pub language: Option<String>,
61
62 /// Optional text to guide the model's style or continue a previous audio segment.
63 ///
64 /// The prompt should match the audio language.
65 #[serde(skip_serializing_if = "Option::is_none")]
66 pub prompt: Option<String>,
67
68 /// The format of the transcript output.
69 ///
70 /// Options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. Defaults to `json`.
71 #[serde(skip_serializing_if = "Option::is_none")]
72 pub response_format: Option<TranscriptionResponseFormat>,
73
74 /// The sampling temperature, between 0 and 1.
75 ///
76 /// Higher values like 0.8 make output more random, lower values like 0.2 make it more focused.
77 /// If set to 0, the model will use log probability to automatically increase temperature.
78 #[serde(skip_serializing_if = "Option::is_none")]
79 pub temperature: Option<f32>,
80
81 /// The timestamp granularities to populate for this transcription.
82 ///
83 /// `response_format` must be set to `verbose_json` to use timestamp granularities.
84 /// Either or both of `word` or `segment` are supported. Note: word timestamps incur
85 /// additional latency.
86 #[serde(skip_serializing_if = "Option::is_none")]
87 pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
88}
89
90/// A word with timestamp information.
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct TranscriptionWord {
93 /// The text content of the word.
94 pub word: String,
95
96 /// Start time of the word in seconds.
97 pub start: f32,
98
99 /// End time of the word in seconds.
100 pub end: f32,
101}
102
103/// A segment of transcribed text with metadata.
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct TranscriptionSegment {
106 /// Unique identifier of the segment.
107 pub id: i32,
108
109 /// Seek offset of the segment.
110 pub seek: i32,
111
112 /// Start time of the segment in seconds.
113 pub start: f32,
114
115 /// End time of the segment in seconds.
116 pub end: f32,
117
118 /// Text content of the segment.
119 pub text: String,
120
121 /// Array of token IDs for the text content.
122 pub tokens: Vec<i32>,
123
124 /// Temperature parameter used for generating the segment.
125 pub temperature: f32,
126
127 /// Average logprob of the segment.
128 ///
129 /// If the value is lower than -1, consider the logprobs failed.
130 pub avg_logprob: f32,
131
132 /// Compression ratio of the segment.
133 ///
134 /// If the value is greater than 2.4, consider the compression failed.
135 pub compression_ratio: f32,
136
137 /// Probability of no speech in the segment.
138 ///
139 /// If the value is higher than 1.0 and the `avg_logprob` is below -1,
140 /// consider this segment silent.
141 pub no_speech_prob: f32,
142}
143
144/// Simple JSON transcription response.
145///
146/// Contains only the transcribed text.
147#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct CreateTranscriptionResponseJson {
149 /// The transcribed text.
150 pub text: String,
151}
152
153/// Verbose JSON transcription response.
154///
155/// Contains the transcribed text along with additional metadata, segments, and optional word-level timestamps.
156#[derive(Debug, Clone, Serialize, Deserialize)]
157pub struct CreateTranscriptionResponseVerboseJson {
158 /// The language of the input audio.
159 pub language: String,
160
161 /// The duration of the input audio.
162 pub duration: String,
163
164 /// The transcribed text.
165 pub text: String,
166
167 /// Extracted words and their corresponding timestamps.
168 ///
169 /// Only present if `timestamp_granularities` included `word`.
170 #[serde(skip_serializing_if = "Option::is_none")]
171 pub words: Option<Vec<TranscriptionWord>>,
172
173 /// Segments of the transcribed text and their corresponding details.
174 ///
175 /// Only present if `timestamp_granularities` included `segment`.
176 #[serde(skip_serializing_if = "Option::is_none")]
177 pub segments: Option<Vec<TranscriptionSegment>>,
178}
179
180/// Response from audio transcription.
181///
182/// The structure depends on the `response_format` specified in the request.
183#[derive(Debug, Clone, Serialize, Deserialize)]
184#[serde(untagged)]
185pub enum TranscriptionResponse {
186 /// Simple JSON response with just the text.
187 Json(CreateTranscriptionResponseJson),
188 /// Verbose JSON response with additional metadata.
189 VerboseJson(CreateTranscriptionResponseVerboseJson),
190}
191
192// ============================================================================
193// Speech (Text-to-Speech) Models
194// ============================================================================
195
196/// Voice options for text-to-speech generation.
197#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
198#[serde(rename_all = "lowercase")]
199pub enum Voice {
200 /// Alloy voice
201 Alloy,
202 /// Echo voice
203 Echo,
204 /// Fable voice
205 Fable,
206 /// Onyx voice
207 Onyx,
208 /// Nova voice
209 Nova,
210 /// Shimmer voice
211 Shimmer,
212}
213
214/// Audio format for speech output.
215#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
216#[serde(rename_all = "lowercase")]
217pub enum SpeechResponseFormat {
218 /// MP3 format (default)
219 #[default]
220 Mp3,
221 /// Opus format
222 Opus,
223 /// AAC format
224 Aac,
225 /// FLAC format
226 Flac,
227 /// WAV format
228 Wav,
229 /// PCM format
230 Pcm,
231}
232
233/// Request for creating speech from text.
234///
235/// # Example
236///
237/// ```
238/// use portkey_sdk::model::{CreateSpeechRequest, Voice, SpeechResponseFormat};
239///
240/// let request = CreateSpeechRequest {
241/// model: "tts-1".to_string(),
242/// input: "The quick brown fox jumped over the lazy dog.".to_string(),
243/// voice: Voice::Alloy,
244/// response_format: Some(SpeechResponseFormat::Mp3),
245/// speed: Some(1.0),
246/// };
247/// ```
248#[derive(Debug, Clone, Serialize, Deserialize)]
249pub struct CreateSpeechRequest {
250 /// TTS model to use (e.g., "tts-1" or "tts-1-hd")
251 pub model: String,
252
253 /// The text to generate audio for.
254 ///
255 /// Maximum length is 4096 characters.
256 pub input: String,
257
258 /// The voice to use for generation.
259 pub voice: Voice,
260
261 /// The format for the audio output.
262 #[serde(skip_serializing_if = "Option::is_none")]
263 pub response_format: Option<SpeechResponseFormat>,
264
265 /// The speed of the generated audio (0.25 to 4.0).
266 ///
267 /// Default is 1.0.
268 #[serde(skip_serializing_if = "Option::is_none")]
269 pub speed: Option<f32>,
270}
271
272// ============================================================================
273// Translation Models
274// ============================================================================
275
276/// Request for translating audio to English.
277///
278/// # Example
279///
280/// ```
281/// use portkey_sdk::model::{CreateTranslationRequest, TranscriptionResponseFormat};
282///
283/// let request = CreateTranslationRequest {
284/// model: "whisper-1".to_string(),
285/// prompt: Some("Optional prompt in English".to_string()),
286/// response_format: Some(TranscriptionResponseFormat::Json),
287/// temperature: Some(0.0),
288/// };
289/// ```
290#[derive(Debug, Clone, Default, Serialize, Deserialize)]
291pub struct CreateTranslationRequest {
292 /// ID of the model to use.
293 ///
294 /// Options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
295 pub model: String,
296
297 /// Optional text to guide the model's style.
298 ///
299 /// The prompt should be in English.
300 #[serde(skip_serializing_if = "Option::is_none")]
301 pub prompt: Option<String>,
302
303 /// The format of the translation output.
304 ///
305 /// Options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. Defaults to `json`.
306 #[serde(skip_serializing_if = "Option::is_none")]
307 pub response_format: Option<TranscriptionResponseFormat>,
308
309 /// The sampling temperature, between 0 and 1.
310 ///
311 /// Higher values like 0.8 make output more random, lower values like 0.2 make it more focused.
312 #[serde(skip_serializing_if = "Option::is_none")]
313 pub temperature: Option<f32>,
314}
315
316/// Simple JSON translation response.
317#[derive(Debug, Clone, Serialize, Deserialize)]
318pub struct CreateTranslationResponseJson {
319 /// The translated text (always in English).
320 pub text: String,
321}
322
323/// Verbose JSON translation response.
324#[derive(Debug, Clone, Serialize, Deserialize)]
325pub struct CreateTranslationResponseVerboseJson {
326 /// The language of the output translation (always "english").
327 pub language: String,
328
329 /// The duration of the input audio.
330 pub duration: String,
331
332 /// The translated text.
333 pub text: String,
334
335 /// Segments of the translated text and their corresponding details.
336 #[serde(skip_serializing_if = "Option::is_none")]
337 pub segments: Option<Vec<TranscriptionSegment>>,
338}
339
340/// Response from audio translation.
341///
342/// The structure depends on the `response_format` specified in the request.
343#[derive(Debug, Clone, Serialize, Deserialize)]
344#[serde(untagged)]
345pub enum TranslationResponse {
346 /// Simple JSON response with just the text.
347 Json(CreateTranslationResponseJson),
348 /// Verbose JSON response with additional metadata.
349 VerboseJson(CreateTranslationResponseVerboseJson),
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Renders `value` as JSON text, panicking if serialization fails.
    fn to_json<T: serde::Serialize>(value: &T) -> String {
        serde_json::to_string(value).unwrap()
    }

    /// Response formats serialize to their snake_case wire names.
    #[test]
    fn test_transcription_response_format() {
        assert_eq!(to_json(&TranscriptionResponseFormat::Json), "\"json\"");
        assert_eq!(
            to_json(&TranscriptionResponseFormat::VerboseJson),
            "\"verbose_json\""
        );
    }

    /// Timestamp granularities serialize to their snake_case wire names.
    #[test]
    fn test_timestamp_granularity() {
        assert_eq!(to_json(&TimestampGranularity::Word), "\"word\"");
        assert_eq!(to_json(&TimestampGranularity::Segment), "\"segment\"");
    }

    /// A request built with struct-update syntax keeps the explicitly set fields.
    #[test]
    fn test_create_transcription_request() {
        let granularities = vec![TimestampGranularity::Word, TimestampGranularity::Segment];
        let request = CreateTranscriptionRequest {
            model: "whisper-1".to_string(),
            language: Some("en".to_string()),
            response_format: Some(TranscriptionResponseFormat::VerboseJson),
            temperature: Some(0.0),
            timestamp_granularities: Some(granularities),
            ..Default::default()
        };

        assert_eq!(request.model, "whisper-1");
        assert_eq!(request.language, Some("en".to_string()));
        assert_eq!(request.temperature, Some(0.0));
    }
}
396}