async_openai/types/audio.rs

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

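/// The audio file input for transcription and translation requests; see
/// [`InputSource`] for the supported source kinds.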
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Ash,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}

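/// The TTS model id. Known ids get dedicated variants; any other id
/// round-trips through the untagged `Other` variant.
///
/// A minimal (de)serialization sketch, assuming `serde_json` as the
/// serializer; `"my-custom-tts"` is a hypothetical model id:
///
/// ```no_run
/// # use async_openai::types::SpeechModel;
/// assert_eq!(serde_json::to_string(&SpeechModel::Tts1).unwrap(), r#""tts-1""#);
/// let other: SpeechModel = serde_json::from_str(r#""my-custom-tts""#).unwrap();
/// assert_eq!(other, SpeechModel::Other("my-custom-tts".into()));
/// ```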
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

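/// Request to transcribe audio into text in the source language.
///
/// A minimal builder sketch; it assumes the path-to-[`AudioInput`]
/// conversion the crate defines alongside [`InputSource`], and
/// `./audio.mp3` is a hypothetical file:
///
/// ```no_run
/// # use async_openai::types::{CreateTranscriptionRequestArgs, AudioResponseFormat, TimestampGranularity};
/// let request = CreateTranscriptionRequestArgs::default()
///     .file("./audio.mp3")
///     .model("whisper-1")
///     .response_format(AudioResponseFormat::VerboseJson)
///     .timestamp_granularities(vec![TimestampGranularity::Word])
///     .build()?;
/// # Ok::<(), async_openai::error::OpenAIError>(())
/// ```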
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` and `segment`. Note: there is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose JSON transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}

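/// Request to generate spoken audio from input text.
///
/// A minimal builder sketch using the enums defined above:
///
/// ```no_run
/// # use async_openai::types::{CreateSpeechRequestArgs, SpeechModel, SpeechResponseFormat, Voice};
/// let request = CreateSpeechRequestArgs::default()
///     .input("Hello, world!")
///     .model(SpeechModel::Tts1)
///     .voice(Voice::Alloy)
///     .response_format(SpeechResponseFormat::Mp3)
///     .speed(1.0)
///     .build()?;
/// # Ok::<(), async_openai::error::OpenAIError>(())
/// ```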
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage` and `shimmer`.
    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
    pub voice: Voice,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}

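/// Request to translate audio into English text. The builder works the same
/// way as [`CreateTranscriptionRequest`]'s (see the sketch above).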
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

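/// Raw audio bytes returned by the speech endpoint, encoded in the requested
/// [`SpeechResponseFormat`] (`mp3` by default).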
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}