portkey_sdk/model/
audio.rs

1//! Audio API models.
2//!
3//! This module contains data models for audio transcription using Whisper and GPT models.
4
5use serde::{Deserialize, Serialize};
6
7/// Response format for audio transcription.
8#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
9#[serde(rename_all = "snake_case")]
10pub enum TranscriptionResponseFormat {
11    /// JSON format with just the transcribed text.
12    #[default]
13    Json,
14    /// Plain text format.
15    Text,
16    /// SubRip Subtitle format.
17    Srt,
18    /// Verbose JSON format with additional metadata.
19    VerboseJson,
20    /// WebVTT subtitle format.
21    Vtt,
22}
23
24/// Timestamp granularity for verbose transcription.
25#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
26#[serde(rename_all = "snake_case")]
27pub enum TimestampGranularity {
28    /// Word-level timestamps (incurs additional latency).
29    Word,
30    /// Segment-level timestamps (no additional latency).
31    Segment,
32}
33
/// Request for creating an audio transcription.
///
/// All fields except `model` are optional; `None` fields are omitted from the
/// serialized request body (via `skip_serializing_if`). The audio file itself
/// is supplied separately as multipart form data, not through this struct.
///
/// # Example
///
/// ```
/// use portkey_sdk::model::{CreateTranscriptionRequest, TranscriptionResponseFormat};
///
/// let request = CreateTranscriptionRequest {
///     model: "whisper-1".to_string(),
///     language: Some("en".to_string()),
///     response_format: Some(TranscriptionResponseFormat::Json),
///     temperature: Some(0.0),
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CreateTranscriptionRequest {
    /// ID of the model to use.
    ///
    /// Options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
    pub model: String,

    /// The language of the input audio in ISO-639-1 format.
    ///
    /// Supplying the input language will improve accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,

    /// Optional text to guide the model's style or continue a previous audio segment.
    ///
    /// The prompt should match the audio language.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,

    /// The format of the transcript output.
    ///
    /// Options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. Defaults to `json`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<TranscriptionResponseFormat>,

    /// The sampling temperature, between 0 and 1.
    ///
    /// Higher values like 0.8 make output more random, lower values like 0.2 make it more focused.
    /// If set to 0, the model will use log probability to automatically increase temperature.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// The timestamp granularities to populate for this transcription.
    ///
    /// `response_format` must be set to `verbose_json` to use timestamp granularities.
    /// Either or both of `word` or `segment` are supported. Note: word timestamps incur
    /// additional latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}
89
/// A word with timestamp information.
///
/// Appears in [`CreateTranscriptionResponseVerboseJson::words`] when the
/// request's `timestamp_granularities` includes `word`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}
102
/// A segment of transcribed text with metadata.
///
/// Appears in verbose (`verbose_json`) transcription and translation
/// responses. The quality-threshold guidance on the fields below
/// (`avg_logprob`, `compression_ratio`, `no_speech_prob`) is taken from the
/// upstream API documentation — this struct only carries the values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment.
    ///
    /// If the value is lower than -1, consider the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment.
    ///
    /// If the value is greater than 2.4, consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment.
    ///
    /// If the value is higher than 1.0 and the `avg_logprob` is below -1,
    /// consider this segment silent. (Thresholds as documented upstream —
    /// NOTE(review): a probability above 1.0 is unusual; verify against the
    /// provider's current docs before relying on this heuristic.)
    pub no_speech_prob: f32,
}
143
/// Simple JSON transcription response.
///
/// Contains only the transcribed text. Returned when the request's
/// `response_format` is `json` (the default).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}
152
/// Verbose JSON transcription response.
///
/// Contains the transcribed text along with additional metadata, segments, and optional word-level timestamps.
/// Returned when the request's `response_format` is `verbose_json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    ///
    /// NOTE(review): the upstream API reports `duration` as a number of
    /// seconds in `verbose_json` responses; deserializing into `String` would
    /// fail on a numeric JSON value. Confirm against a live response before
    /// relying on this field (changing the type is a breaking change).
    pub duration: String,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    ///
    /// Only present if `timestamp_granularities` included `word`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    ///
    /// Only present if `timestamp_granularities` included `segment`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}
179
180/// Response from audio transcription.
181///
182/// The structure depends on the `response_format` specified in the request.
183#[derive(Debug, Clone, Serialize, Deserialize)]
184#[serde(untagged)]
185pub enum TranscriptionResponse {
186    /// Simple JSON response with just the text.
187    Json(CreateTranscriptionResponseJson),
188    /// Verbose JSON response with additional metadata.
189    VerboseJson(CreateTranscriptionResponseVerboseJson),
190}
191
192// ============================================================================
193// Speech (Text-to-Speech) Models
194// ============================================================================
195
196/// Voice options for text-to-speech generation.
197#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
198#[serde(rename_all = "lowercase")]
199pub enum Voice {
200    /// Alloy voice
201    Alloy,
202    /// Echo voice
203    Echo,
204    /// Fable voice
205    Fable,
206    /// Onyx voice
207    Onyx,
208    /// Nova voice
209    Nova,
210    /// Shimmer voice
211    Shimmer,
212}
213
214/// Audio format for speech output.
215#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
216#[serde(rename_all = "lowercase")]
217pub enum SpeechResponseFormat {
218    /// MP3 format (default)
219    #[default]
220    Mp3,
221    /// Opus format
222    Opus,
223    /// AAC format
224    Aac,
225    /// FLAC format
226    Flac,
227    /// WAV format
228    Wav,
229    /// PCM format
230    Pcm,
231}
232
/// Request for creating speech from text.
///
/// Unlike the transcription/translation requests, this struct does not derive
/// `Default` — `voice` has no meaningful default — so every field must be
/// supplied explicitly (optional ones as `Some(..)`/`None`).
///
/// # Example
///
/// ```
/// use portkey_sdk::model::{CreateSpeechRequest, Voice, SpeechResponseFormat};
///
/// let request = CreateSpeechRequest {
///     model: "tts-1".to_string(),
///     input: "The quick brown fox jumped over the lazy dog.".to_string(),
///     voice: Voice::Alloy,
///     response_format: Some(SpeechResponseFormat::Mp3),
///     speed: Some(1.0),
/// };
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateSpeechRequest {
    /// TTS model to use (e.g., "tts-1" or "tts-1-hd")
    pub model: String,

    /// The text to generate audio for.
    ///
    /// Maximum length is 4096 characters.
    pub input: String,

    /// The voice to use for generation.
    pub voice: Voice,

    /// The format for the audio output.
    ///
    /// Omitted from the request body when `None` (server defaults to MP3).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio (0.25 to 4.0).
    ///
    /// Default is 1.0.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
}
271
272// ============================================================================
273// Translation Models
274// ============================================================================
275
/// Request for translating audio to English.
///
/// The output language is always English, so unlike transcription there is no
/// `language` or `timestamp_granularities` field.
///
/// # Example
///
/// ```
/// use portkey_sdk::model::{CreateTranslationRequest, TranscriptionResponseFormat};
///
/// let request = CreateTranslationRequest {
///     model: "whisper-1".to_string(),
///     prompt: Some("Optional prompt in English".to_string()),
///     response_format: Some(TranscriptionResponseFormat::Json),
///     temperature: Some(0.0),
/// };
/// ```
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CreateTranslationRequest {
    /// ID of the model to use.
    ///
    /// NOTE(review): the upstream translations endpoint documents only
    /// `whisper-1` as supported — the `gpt-4o-transcribe` models listed here
    /// previously apply to transcription, not translation. Confirm against
    /// the Portkey gateway's supported model list.
    pub model: String,

    /// Optional text to guide the model's style.
    ///
    /// The prompt should be in English.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,

    /// The format of the translation output.
    ///
    /// Options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. Defaults to `json`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<TranscriptionResponseFormat>,

    /// The sampling temperature, between 0 and 1.
    ///
    /// Higher values like 0.8 make output more random, lower values like 0.2 make it more focused.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
}
315
/// Simple JSON translation response.
///
/// Returned when the request's `response_format` is `json` (the default).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text (always in English).
    pub text: String,
}
322
/// Verbose JSON translation response.
///
/// Returned when the request's `response_format` is `verbose_json`. There is
/// no `words` field here — word-level timestamps are a transcription-only
/// feature in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always "english").
    pub language: String,

    /// The duration of the input audio.
    ///
    /// NOTE(review): the upstream API reports `duration` as a number of
    /// seconds; deserializing into `String` would fail on a numeric JSON
    /// value. Confirm against a live response (changing the type is a
    /// breaking change).
    pub duration: String,

    /// The translated text.
    pub text: String,

    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}
339
340/// Response from audio translation.
341///
342/// The structure depends on the `response_format` specified in the request.
343#[derive(Debug, Clone, Serialize, Deserialize)]
344#[serde(untagged)]
345pub enum TranslationResponse {
346    /// Simple JSON response with just the text.
347    Json(CreateTranslationResponseJson),
348    /// Verbose JSON response with additional metadata.
349    VerboseJson(CreateTranslationResponseVerboseJson),
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Response formats must serialize to their snake_case wire names.
    #[test]
    fn test_transcription_response_format() {
        let cases = [
            (TranscriptionResponseFormat::Json, "\"json\""),
            (TranscriptionResponseFormat::VerboseJson, "\"verbose_json\""),
        ];
        for (format, expected) in cases {
            assert_eq!(serde_json::to_string(&format).unwrap(), expected);
        }
    }

    /// Timestamp granularities must serialize to their snake_case wire names.
    #[test]
    fn test_timestamp_granularity() {
        let cases = [
            (TimestampGranularity::Word, "\"word\""),
            (TimestampGranularity::Segment, "\"segment\""),
        ];
        for (granularity, expected) in cases {
            assert_eq!(serde_json::to_string(&granularity).unwrap(), expected);
        }
    }

    /// A fully-populated request keeps the field values it was built with;
    /// struct-update syntax fills any remaining optional fields with `None`.
    #[test]
    fn test_create_transcription_request() {
        let request = CreateTranscriptionRequest {
            model: String::from("whisper-1"),
            language: Some(String::from("en")),
            response_format: Some(TranscriptionResponseFormat::VerboseJson),
            temperature: Some(0.0),
            timestamp_granularities: Some(vec![
                TimestampGranularity::Word,
                TimestampGranularity::Segment,
            ]),
            ..Default::default()
        };

        assert_eq!(request.model, "whisper-1");
        assert_eq!(request.language.as_deref(), Some("en"));
        assert_eq!(request.temperature, Some(0.0));
    }
}