async_openai/types/audio.rs

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

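/// # Example
///
/// The wire format follows the `snake_case` rename below, so `VerboseJson`
/// serializes to `"verbose_json"`. A small sketch using `serde_json`, and
/// assuming these types are re-exported from `async_openai::types` as the
/// rest of this crate does:
///
/// ```
/// # use async_openai::types::AudioResponseFormat;
/// let s = serde_json::to_string(&AudioResponseFormat::VerboseJson).unwrap();
/// assert_eq!(s, "\"verbose_json\"");
/// ```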
#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

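/// # Example
///
/// Because this enum is `#[non_exhaustive]`, downstream code should keep a
/// wildcard arm so new voices can be added without a breaking change (a
/// sketch, not a recommendation of any particular voice):
///
/// ```
/// # use async_openai::types::Voice;
/// let label = match Voice::default() {
///     Voice::Alloy => "alloy",
///     _ => "other",
/// };
/// assert_eq!(label, "alloy");
/// ```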
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

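/// # Example
///
/// Known models round-trip through their `rename`d strings, while
/// `#[serde(untagged)]` lets any other model id fall through to `Other`
/// (the custom id below is illustrative only):
///
/// ```
/// # use async_openai::types::SpeechModel;
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
/// let m: SpeechModel = serde_json::from_str("\"my-custom-tts\"").unwrap();
/// assert_eq!(m, SpeechModel::Other("my-custom-tts".into()));
/// ```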
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

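/// # Example
///
/// A minimal sketch of building a transcription request with the generated
/// builder. It assumes, as elsewhere in this crate, that `AudioInput` can be
/// created from a file path via `InputSource`; the path and model id are
/// illustrative:
///
/// ```no_run
/// # use async_openai::types::{
/// #     AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity,
/// # };
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("./audio/sample.mp3")
///     .model("whisper-1")
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok(())
/// # }
/// ```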
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: there is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose json transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

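/// # Example
///
/// The thresholds mentioned in the field docs below can be combined into a
/// simple per-segment quality check (a sketch, not an official heuristic):
///
/// ```
/// # use async_openai::types::TranscriptionSegment;
/// fn is_probably_silent(seg: &TranscriptionSegment) -> bool {
///     seg.no_speech_prob > 1.0 && seg.avg_logprob < -1.0
/// }
/// ```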
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

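/// # Example
///
/// A minimal sketch of building a speech request with the generated builder
/// (the input text is illustrative):
///
/// ```
/// # use async_openai::types::{CreateSpeechRequestArgs, SpeechModel, SpeechResponseFormat, Voice};
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateSpeechRequestArgs::default()
///     .input("Hello, world!")
///     .model(SpeechModel::Tts1)
///     .voice(Voice::Alloy)
///     .response_format(SpeechResponseFormat::Mp3)
///     .build()?;
/// # Ok(())
/// # }
/// ```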
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`.
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

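/// # Example
///
/// A minimal sketch of building a translation request with the generated
/// builder (again assuming `AudioInput` can be created from a file path;
/// the path is illustrative):
///
/// ```no_run
/// # use async_openai::types::CreateTranslationRequestArgs;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranslationRequestArgs::default()
///     .file("./audio/sample.mp3")
///     .model("whisper-1")
///     .build()?;
/// # Ok(())
/// # }
/// ```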
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

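/// # Example
///
/// The raw audio bytes can be written straight to disk; the file extension
/// should match the requested `SpeechResponseFormat` (a sketch, with an
/// illustrative filename):
///
/// ```no_run
/// # use async_openai::types::CreateSpeechResponse;
/// # fn save(response: &CreateSpeechResponse) -> std::io::Result<()> {
/// std::fs::write("speech.mp3", &response.bytes)?;
/// # Ok(())
/// # }
/// ```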
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}