async_openai/types/
audio.rs

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

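/// Audio file input for transcription and translation requests.
///
/// A minimal construction sketch. It assumes the path-based `From` conversion
/// provided alongside [`InputSource`] elsewhere in this crate; building the
/// struct directly from an `InputSource` variant works as well. The file path
/// is hypothetical.
///
/// ```no_run
/// use async_openai::types::AudioInput;
///
/// let input = AudioInput::from("./audio/sample.mp3");
/// ```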
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

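/// The transcript output format. With `rename_all = "snake_case"`, variants
/// serialize to the wire values the API expects, e.g. `VerboseJson` becomes
/// `"verbose_json"`:
///
/// ```
/// use async_openai::types::AudioResponseFormat;
///
/// let wire = serde_json::to_string(&AudioResponseFormat::VerboseJson).unwrap();
/// assert_eq!(wire, "\"verbose_json\"");
/// ```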
#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

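/// Voices available for speech generation. The enum is `#[non_exhaustive]`, so
/// downstream `match` expressions need a wildcard arm to stay compatible as
/// voices are added; a sketch with a hypothetical labeling function:
///
/// ```
/// use async_openai::types::Voice;
///
/// fn label(voice: &Voice) -> &'static str {
///     match voice {
///         Voice::Alloy => "alloy",
///         // `Voice` is non-exhaustive, so a catch-all arm is required.
///         _ => "other",
///     }
/// }
///
/// assert_eq!(label(&Voice::Alloy), "alloy");
/// ```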
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Ballad,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

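/// The model to use for speech generation. The `untagged` `Other` variant lets
/// model IDs without a dedicated variant pass through as plain strings (the
/// `"tts-2"` ID below is hypothetical):
///
/// ```
/// use async_openai::types::SpeechModel;
///
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), "\"tts-1\"");
///
/// let other = SpeechModel::Other("tts-2".to_string());
/// assert_eq!(serde_json::to_string(&other).unwrap(), "\"tts-2\"");
/// ```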
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

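/// Request body for audio transcription. A usage sketch with the generated
/// builder (the file path is hypothetical; word-level timestamps require the
/// `verbose_json` response format):
///
/// ```no_run
/// use async_openai::types::{
///     AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity,
/// };
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("./audio/sample.mp3")
///     .model("whisper-1")
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok(())
/// # }
/// ```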
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose JSON transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

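/// One segment of a verbose transcription, carrying Whisper's per-segment
/// quality signals. A sketch of the screening heuristic described by the field
/// docs below (the thresholds are heuristic, not hard guarantees):
///
/// ```
/// use async_openai::types::TranscriptionSegment;
///
/// fn looks_reliable(segment: &TranscriptionSegment) -> bool {
///     // Per the field docs: an avg_logprob below -1 or a compression ratio
///     // above 2.4 suggests the segment failed.
///     segment.avg_logprob >= -1.0 && segment.compression_ratio <= 2.4
/// }
/// ```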
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

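/// Request body for speech generation. A usage sketch with the generated
/// builder (the input text and choices here are illustrative):
///
/// ```no_run
/// use async_openai::types::{CreateSpeechRequestArgs, SpeechModel, Voice};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateSpeechRequestArgs::default()
///     .input("The quick brown fox jumped over the lazy dog.")
///     .model(SpeechModel::Tts1)
///     .voice(Voice::Alloy)
///     .build()?;
/// # Ok(())
/// # }
/// ```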
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`.
    ///
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// Control the voice of your generated audio with additional instructions.
    /// Does not work with `tts-1` or `tts-1-hd`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

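/// Request body for audio translation (the output is always English). A usage
/// sketch with the generated builder; the file path is hypothetical:
///
/// ```no_run
/// use async_openai::types::CreateTranslationRequestArgs;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let request = CreateTranslationRequestArgs::default()
///     .file("./audio/sample.m4a")
///     .model("whisper-1")
///     .build()?;
/// # Ok(())
/// # }
/// ```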
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

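/// Raw audio bytes returned by the speech endpoint. A minimal sketch of
/// persisting them with the standard library (the output path is
/// hypothetical; the crate may also provide its own save helper):
///
/// ```no_run
/// use async_openai::types::CreateSpeechResponse;
///
/// fn save(response: &CreateSpeechResponse) -> std::io::Result<()> {
///     // `Bytes` implements `AsRef<[u8]>`, so it can be written directly.
///     std::fs::write("./data/audio.mp3", &response.bytes)
/// }
/// ```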
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}