async_openai/types/audio.rs
use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

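/// Audio input for transcription and translation requests, wrapping an
/// [`InputSource`] from the parent module.
///
/// A minimal construction sketch, assuming `InputSource` exposes a `Path`
/// variant (its definition is not shown in this file):
///
/// ```ignore
/// use std::path::PathBuf;
///
/// // Hypothetical: relies on an assumed `InputSource::Path` variant.
/// let input = AudioInput {
///     source: InputSource::Path {
///         path: PathBuf::from("audio.mp3"),
///     },
/// };
/// ```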
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

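/// Output format for transcription and translation responses. With
/// `rename_all = "snake_case"`, variants serialize to the wire values the
/// API expects, e.g. `VerboseJson` becomes `"verbose_json"`.
///
/// A quick serialization sketch (assumes `serde_json` is available):
///
/// ```ignore
/// let fmt = AudioResponseFormat::VerboseJson;
/// assert_eq!(serde_json::to_string(&fmt).unwrap(), "\"verbose_json\"");
/// ```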
#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

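/// The TTS model to use. Because `Other` is marked `#[serde(untagged)]`,
/// any model ID that is not `tts-1` or `tts-1-hd` round-trips as a plain
/// string, so newer models can be used without adding an enum variant.
///
/// A serialization sketch (assumes `serde_json` is available; the model ID
/// passed to `Other` is illustrative):
///
/// ```ignore
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
///
/// // Hypothetical model ID, carried through the untagged `Other` variant.
/// let other = SpeechModel::Other("my-custom-tts".to_string());
/// assert_eq!(serde_json::to_string(&other).unwrap(), "\"my-custom-tts\"");
/// ```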
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

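/// Request body for the audio transcription endpoint.
///
/// A builder sketch using the generated `CreateTranscriptionRequestArgs`
/// (the file path and model ID are illustrative; `InputSource::Path` is
/// assumed from the parent module):
///
/// ```ignore
/// let request = CreateTranscriptionRequestArgs::default()
///     .file(AudioInput {
///         source: InputSource::Path {
///             path: "audio.mp3".into(),
///         },
///     })
///     .model("whisper-1")
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// ```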
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose JSON transcription response returned by the model,
/// based on the provided input.
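///
/// A deserialization sketch (assumes `serde_json` is available; the field
/// values are illustrative):
///
/// ```ignore
/// let json = r#"{
///     "language": "english",
///     "duration": 1.2,
///     "text": "Hello there.",
///     "segments": []
/// }"#;
/// let response: CreateTranscriptionResponseVerboseJson =
///     serde_json::from_str(json)?;
/// assert_eq!(response.text, "Hello there.");
/// ```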
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

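/// A single segment of transcribed text with timing, token, and
/// quality-related details.
///
/// A sketch of the quality heuristics described on the fields below (the
/// thresholds come from the field documentation; the helper itself is
/// illustrative, not part of this crate):
///
/// ```ignore
/// fn looks_reliable(segment: &TranscriptionSegment) -> bool {
///     // Logprobs failed if avg_logprob is lower than -1; compression
///     // failed if compression_ratio is greater than 2.4.
///     segment.avg_logprob >= -1.0 && segment.compression_ratio <= 2.4
/// }
/// ```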
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

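/// Request body for the speech (text-to-speech) endpoint.
///
/// A builder sketch using the generated `CreateSpeechRequestArgs` (the input
/// text is illustrative):
///
/// ```ignore
/// let request = CreateSpeechRequestArgs::default()
///     .input("Hello, world!")
///     .model(SpeechModel::Tts1Hd)
///     .voice(Voice::Alloy)
///     .response_format(SpeechResponseFormat::Mp3)
///     .speed(1.0)
///     .build()?;
/// ```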
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`.
    ///
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

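/// Request body for the audio translation (to English) endpoint.
///
/// A builder sketch using the generated `CreateTranslationRequestArgs` (the
/// file path and model ID are illustrative; `InputSource::Path` is assumed
/// from the parent module):
///
/// ```ignore
/// let request = CreateTranslationRequestArgs::default()
///     .file(AudioInput {
///         source: InputSource::Path {
///             path: "speech_german.m4a".into(),
///         },
///     })
///     .model("whisper-1")
///     .build()?;
/// ```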
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

/// The raw audio bytes returned by the speech endpoint.
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}