async_openai/types/audio.rs
use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

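/// The audio file input for transcription and translation requests, wrapping
/// an [`InputSource`] (a file path, in-memory bytes, or a `Vec<u8>`).
///
/// A minimal construction sketch; the `InputSource::Path` variant shape is
/// assumed from the crate's shared input types, and the file name is a
/// placeholder:
///
/// ```no_run
/// use async_openai::types::{AudioInput, InputSource};
///
/// // Point at an audio file on disk; no I/O happens until the request is sent.
/// let input = AudioInput {
///     source: InputSource::Path {
///         path: "audio.mp3".into(),
///     },
/// };
/// ```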
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

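/// The TTS model to use. The `Other` variant is marked `#[serde(untagged)]`,
/// so any model ID without a named variant round-trips as a plain JSON string.
///
/// A small sketch of that behavior; it assumes `serde_json` is available and
/// uses a hypothetical model ID:
///
/// ```no_run
/// use async_openai::types::SpeechModel;
///
/// // Named variants serialize to their wire names...
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
///
/// // ...while unrecognized IDs fall through to `Other` unchanged.
/// let model: SpeechModel = serde_json::from_str("\"my-custom-tts\"").unwrap();
/// assert_eq!(model, SpeechModel::Other("my-custom-tts".to_string()));
/// ```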
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

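/// Request body for creating a transcription.
///
/// A minimal usage sketch with the generated builder. The string-path `file`
/// setter relies on the crate's `Into<AudioInput>` conversion, and the file
/// name is a placeholder:
///
/// ```no_run
/// use async_openai::types::{
///     AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity,
/// };
///
/// # fn main() -> Result<(), async_openai::error::OpenAIError> {
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("./audio/recording.m4a")
///     .model("whisper-1")
///     .response_format(AudioResponseFormat::VerboseJson)
///     // Word-level timestamps require `verbose_json` and add some latency.
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok(())
/// # }
/// ```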
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose json transcription response returned by the model,
/// based on the provided input.
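///
/// A parsing sketch; it assumes `serde_json` is available, and the JSON body
/// is a hand-written illustration rather than a captured API response:
///
/// ```no_run
/// use async_openai::types::CreateTranscriptionResponseVerboseJson;
///
/// let json = r#"{"language": "english", "duration": 1.2, "text": "Hello there."}"#;
/// let response: CreateTranscriptionResponseVerboseJson =
///     serde_json::from_str(json).unwrap();
/// // `words` and `segments` are omitted unless the matching timestamp
/// // granularities were requested.
/// assert!(response.words.is_none() && response.segments.is_none());
/// ```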
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

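/// Request body for generating speech from text.
///
/// A minimal usage sketch with the generated builder (the input text is a
/// placeholder):
///
/// ```no_run
/// use async_openai::types::{CreateSpeechRequestArgs, SpeechModel, Voice};
///
/// # fn main() -> Result<(), async_openai::error::OpenAIError> {
/// let request = CreateSpeechRequestArgs::default()
///     .input("Today is a wonderful day to build something people love!")
///     .model(SpeechModel::Tts1)
///     .voice(Voice::Alloy)
///     .speed(1.0)
///     .build()?;
/// # Ok(())
/// # }
/// ```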
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, and `shimmer`.
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

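/// Request body for translating audio into English text.
///
/// A minimal usage sketch with the generated builder; as above, the
/// string-path `file` setter relies on the crate's `Into<AudioInput>`
/// conversion, and the file name is a placeholder:
///
/// ```no_run
/// use async_openai::types::CreateTranslationRequestArgs;
///
/// # fn main() -> Result<(), async_openai::error::OpenAIError> {
/// let request = CreateTranslationRequestArgs::default()
///     .file("./audio/interview_de.mp3")
///     .model("whisper-1")
///     // Prompts for translation should be in English.
///     .prompt("A casual conversation about open source software.")
///     .build()?;
/// # Ok(())
/// # }
/// ```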
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    /// The raw audio bytes returned by the speech endpoint.
    pub bytes: Bytes,
}