async_openai/types/audio.rs
use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

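/// Audio input for transcription and translation requests.
///
/// A minimal construction sketch (the file path is illustrative, and this
/// assumes `InputSource` exposes a `Path` variant as in the sibling module):
///
/// ```no_run
/// use async_openai::types::{AudioInput, InputSource};
///
/// // Wrap a local file path; bytes-backed sources work the same way.
/// let input = AudioInput {
///     source: InputSource::Path {
///         path: "./audio/sample.mp3".into(),
///     },
/// };
/// ```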
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

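/// A sketch of how the serde attributes below map variants onto the wire
/// format (assumes `serde_json` is available in the dependency tree):
///
/// ```
/// use async_openai::types::SpeechModel;
///
/// // Named variants serialize to their renamed API identifiers ...
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
/// // ... and the untagged `Other` variant passes any model id through as-is.
/// assert_eq!(
///     serde_json::to_string(&SpeechModel::Other("my-model".into())).unwrap(),
///     "\"my-model\""
/// );
/// ```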
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

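/// Request to transcribe audio into text in the input language.
///
/// A minimal usage sketch of the generated builder (the file path and model
/// id are illustrative):
///
/// ```no_run
/// use async_openai::types::{
///     AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity,
/// };
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("./audio/sample.mp3")
///     .model("whisper-1")
///     // Word-level timestamps require the verbose_json response format.
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok(())
/// # }
/// ```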
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose json transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

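/// Segment of transcribed audio and its timing and quality metadata.
///
/// A sketch of applying the quality thresholds documented on the fields below
/// to flag suspect segments (the helper and its cutoffs mirror the field
/// docs; it is illustrative, not part of this crate's API):
///
/// ```
/// use async_openai::types::TranscriptionSegment;
///
/// // True when the segment looks unreliable per the field-level guidance:
/// // logprobs below -1, compression ratio above 2.4, or likely silence.
/// fn looks_suspect(seg: &TranscriptionSegment) -> bool {
///     seg.avg_logprob < -1.0
///         || seg.compression_ratio > 2.4
///         || (seg.no_speech_prob > 1.0 && seg.avg_logprob < -1.0)
/// }
/// ```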
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

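/// Request to generate audio from input text.
///
/// A minimal usage sketch of the generated builder (the input text is
/// illustrative):
///
/// ```no_run
/// use async_openai::types::{CreateSpeechRequestArgs, SpeechModel, SpeechResponseFormat, Voice};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateSpeechRequestArgs::default()
///     .input("Hello from async-openai!")
///     .model(SpeechModel::Tts1)
///     .voice(Voice::Alloy)
///     .response_format(SpeechResponseFormat::Mp3)
///     .build()?;
/// # Ok(())
/// # }
/// ```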
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, and `shimmer`.
    ///
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

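/// Request to translate audio into English text.
///
/// A minimal usage sketch of the generated builder (the file path is
/// illustrative):
///
/// ```no_run
/// use async_openai::types::CreateTranslationRequestArgs;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranslationRequestArgs::default()
///     .file("./audio/sample.mp3")
///     .model("whisper-1")
///     .build()?;
/// # Ok(())
/// # }
/// ```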
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

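/// Raw audio bytes returned by the speech endpoint.
///
/// A sketch of persisting the payload (the output path is illustrative;
/// `Bytes` implements `AsRef<[u8]>`, so `std::fs::write` accepts it directly):
///
/// ```no_run
/// use async_openai::types::CreateSpeechResponse;
///
/// fn save(response: &CreateSpeechResponse) -> std::io::Result<()> {
///     std::fs::write("./speech.mp3", &response.bytes)
/// }
/// ```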
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}