dynamo_async_openai/types/audio.rs
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Based on https://github.com/64bit/async-openai/ by Himanshu Neema
// Original Copyright (c) 2022 Himanshu Neema
// Licensed under MIT License (see ATTRIBUTIONS-Rust.md)
//
// Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
// Licensed under Apache 2.0

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

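/// Serialization follows the `#[serde(rename)]` attributes below: the unit
/// variants map to the literal model ids `"tts-1"` and `"tts-1-hd"`, and the
/// untagged `Other` variant round-trips any other model id as a bare JSON
/// string. A minimal sketch (assuming `serde_json` is available and the type
/// is re-exported at `dynamo_async_openai::types`, mirroring upstream
/// async-openai's layout):
///
/// ```no_run
/// use dynamo_async_openai::types::SpeechModel;
///
/// let json = serde_json::to_string(&SpeechModel::Tts1).unwrap();
/// assert_eq!(json, "\"tts-1\"");
///
/// // Ids that match no renamed variant fall through to the untagged variant.
/// let model: SpeechModel = serde_json::from_str("\"my-custom-tts\"").unwrap();
/// assert_eq!(model, SpeechModel::Other("my-custom-tts".to_string()));
/// ```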
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

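/// Request body for audio transcription. The `Builder` derive generates
/// `CreateTranscriptionRequestArgs`; a minimal usage sketch (assuming, as in
/// upstream async-openai, that `AudioInput` implements `From<impl AsRef<Path>>`
/// and that these types are re-exported under `dynamo_async_openai::types`):
///
/// ```no_run
/// use dynamo_async_openai::types::{
///     AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity,
/// };
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("audio.mp3") // converted into `AudioInput` via `Into`
///     .model("whisper-1")
///     // Word-level timestamps require the verbose_json response format.
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok(())
/// # }
/// ```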
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose json transcription response returned by the model,
/// based on the provided input.
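///
/// A minimal deserialization sketch (assuming `serde_json` is available; the
/// payload shape follows OpenAI's verbose_json transcription response):
///
/// ```no_run
/// use dynamo_async_openai::types::CreateTranscriptionResponseVerboseJson;
///
/// let payload = r#"{"language":"english","duration":1.25,"text":"Hello there."}"#;
/// let response: CreateTranscriptionResponseVerboseJson =
///     serde_json::from_str(payload).unwrap();
/// // `words` and `segments` are only present for the matching
/// // timestamp_granularities, so they deserialize as `None` here.
/// assert!(response.words.is_none() && response.segments.is_none());
/// ```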
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

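/// Request body for text-to-speech generation. The `Builder` derive generates
/// `CreateSpeechRequestArgs`; a minimal usage sketch (assuming these types are
/// re-exported under `dynamo_async_openai::types`, as in upstream async-openai):
///
/// ```no_run
/// use dynamo_async_openai::types::{
///     CreateSpeechRequestArgs, SpeechModel, SpeechResponseFormat, Voice,
/// };
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateSpeechRequestArgs::default()
///     .input("The quick brown fox jumped over the lazy dog.")
///     .model(SpeechModel::Tts1Hd)
///     .voice(Voice::Nova)
///     .response_format(SpeechResponseFormat::Wav)
///     .speed(1.0)
///     .build()?;
/// # Ok(())
/// # }
/// ```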
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, and `shimmer`.
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

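/// Request body for audio translation (into English). The `Builder` derive
/// generates `CreateTranslationRequestArgs`; a minimal usage sketch (assuming,
/// as in upstream async-openai, that `AudioInput` implements
/// `From<impl AsRef<Path>>` and that the types are re-exported under
/// `dynamo_async_openai::types`):
///
/// ```no_run
/// use dynamo_async_openai::types::CreateTranslationRequestArgs;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranslationRequestArgs::default()
///     .file("speech_in_french.m4a") // converted into `AudioInput` via `Into`
///     .model("whisper-1")
///     .prompt("Translate into plain, idiomatic English.")
///     .build()?;
/// # Ok(())
/// # }
/// ```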
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

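/// Raw audio bytes returned by the speech endpoint, in the requested
/// `SpeechResponseFormat`. A minimal sketch of saving the audio to disk
/// (assuming a `response: CreateSpeechResponse` already obtained from the
/// client):
///
/// ```no_run
/// use std::io::Write;
///
/// use dynamo_async_openai::types::CreateSpeechResponse;
///
/// # fn save(response: CreateSpeechResponse) -> std::io::Result<()> {
/// let mut file = std::fs::File::create("speech.mp3")?;
/// file.write_all(&response.bytes)?;
/// # Ok(())
/// # }
/// ```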
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}