dynamo_async_openai/types/audio.rs

// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Based on https://github.com/64bit/async-openai/ by Himanshu Neema
// Original Copyright (c) 2022 Himanshu Neema
// Licensed under MIT License (see ATTRIBUTIONS-Rust.md)
//
// Modifications Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
// Licensed under Apache 2.0

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}
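
// Construction sketch (hedged): `InputSource` is re-exported from the parent
// module; in upstream async-openai it wraps a file path, `Bytes`, or a
// `Vec<u8>`. Assuming that shape, an on-disk file would be wrapped roughly
// like this:
//
//     let input = AudioInput {
//         source: InputSource::Path { path: "speech.mp3".into() },
//     };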

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}
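
// Serialization sketch: the renames map `Tts1`/`Tts1Hd` to their wire names,
// while the untagged `Other` variant passes any other model id through as a
// plain JSON string (assuming `serde_json` is available):
//
//     assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
//     let other = SpeechModel::Other("my-custom-tts".to_string());
//     assert_eq!(serde_json::to_string(&other).unwrap(), "\"my-custom-tts\"");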

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}
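
// Builder sketch (hedged): `derive_builder` generates
// `CreateTranscriptionRequestArgs` with `into`/`strip_option` setters, and
// `build()` returns `Result<CreateTranscriptionRequest, OpenAIError>`. The
// `InputSource::Path` shape below assumes the upstream async-openai definition:
//
//     let request = CreateTranscriptionRequestArgs::default()
//         .file(AudioInput { source: InputSource::Path { path: "audio.mp3".into() } })
//         .model("whisper-1")
//         .response_format(AudioResponseFormat::VerboseJson)
//         .timestamp_granularities(vec![TimestampGranularity::Word])
//         .build()?;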

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose json transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio, in seconds.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}
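
// Deserialization sketch: a `verbose_json` payload maps straight onto
// `CreateTranscriptionResponseVerboseJson`; `words` and `segments` come back
// as `None` when the matching timestamp granularity was not requested
// (assuming `serde_json` is available):
//
//     let response: CreateTranscriptionResponseVerboseJson = serde_json::from_str(
//         r#"{"language":"english","duration":1.25,"text":"Hi there."}"#,
//     ).unwrap();
//     assert!(response.words.is_none() && response.segments.is_none());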

#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`.
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return the audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}
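
// Builder sketch (hedged): the same builder pattern applies to speech
// synthesis; `build()` returns `Result<CreateSpeechRequest, OpenAIError>`:
//
//     let request = CreateSpeechRequestArgs::default()
//         .input("Hello from Dynamo")
//         .model(SpeechModel::Tts1Hd)
//         .voice(Voice::Nova)
//         .response_format(SpeechResponseFormat::Wav)
//         .build()?;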

#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    /// The raw audio bytes returned by the endpoint.
    pub bytes: Bytes,
}