//! Types for the OpenAI audio endpoints (speech, transcription, translation).
1use bytes::Bytes;
2use derive_builder::Builder;
3use serde::{Deserialize, Serialize};
4
5use super::InputSource;
6use crate::error::OpenAIError;
7
/// The audio payload attached to transcription and translation requests.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    /// Where the audio data comes from; see [`InputSource`] for the supported
    /// variants (defined elsewhere in this crate).
    pub source: InputSource,
}
12
/// The format of the transcript output. Serialized in `snake_case`, i.e. as
/// `json`, `text`, `srt`, `verbose_json`, or `vtt`.
#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    /// `json` — the default output format.
    #[default]
    Json,
    /// `text` — plain-text transcript.
    Text,
    /// `srt` — SubRip subtitle format.
    Srt,
    /// `verbose_json` — JSON including timing metadata (words/segments).
    VerboseJson,
    /// `vtt` — WebVTT subtitle format.
    Vtt,
}
23
/// The audio container/codec for generated speech. Serialized in lowercase,
/// i.e. `mp3`, `opus`, `aac`, `flac`, `pcm`, or `wav`.
#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    /// `mp3` — the default format.
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}
35
/// The voice used for generated speech. Serialized in lowercase (`alloy`,
/// `echo`, ...). Marked `#[non_exhaustive]` so new voices can be added
/// without a breaking change; downstream `match`es need a catch-all arm.
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    /// The default voice.
    #[default]
    Alloy,
    Echo,
    Fable,
    Onyx,
    Nova,
    Shimmer,
}
48
/// The text-to-speech model to use.
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    /// `tts-1` — the default model.
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    /// `tts-1-hd`.
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    /// Any other model id, passed through verbatim via `#[serde(untagged)]`
    /// (escape hatch for models not yet covered by the named variants).
    #[serde(untagged)]
    Other(String),
}
59
/// Timestamp granularity for verbose transcription output. Serialized in
/// lowercase as `word` or `segment`.
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    /// Per-word timestamps (incurs additional latency).
    Word,
    /// Per-segment timestamps — the default; no additional latency.
    #[default]
    Segment,
}
67
/// Request to transcribe audio into text in the input language.
///
/// Construct via the generated [`CreateTranscriptionRequestArgs`] builder.
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}
96
97/// Represents a transcription response returned by model, based on the provided
98/// input.
/// Represents a transcription response returned by model, based on the provided
/// input (the `json` response format).
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}
104
/// Represents a verbose json transcription response returned by model, based on
/// the provided input (the `verbose_json` response format).
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio, in seconds.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    /// Only present when `word` timestamp granularity was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    /// Only present when `segment` timestamp granularity was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}
126
/// A single word with timing information, from a verbose transcription.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}
138
/// A contiguous segment of transcribed (or translated) audio with timing and
/// quality metadata, from a verbose response.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}
174
/// Request to generate audio from input text.
///
/// Construct via the generated [`CreateSpeechRequestArgs`] builder.
#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`. Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech/voice-options).
    pub voice: Voice,

    /// The format to return the audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}
199
/// Request to translate audio into English text.
///
/// Construct via the generated [`CreateTranslationRequestArgs`] builder.
#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}
223
/// Represents a translation response returned by model (the `json` response
/// format).
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}
228
229#[derive(Debug, Deserialize, Clone, Serialize)]
230pub struct CreateTranslationResponseVerboseJson {
231 /// The language of the output translation (always `english`).
232 pub language: String,
233 /// The duration of the input audio.
234 pub duration: String,
235 /// The translated text.
236 pub text: String,
237 /// Segments of the translated text and their corresponding details.
238 #[serde(skip_serializing_if = "Option::is_none")]
239 pub segments: Option<Vec<TranscriptionSegment>>,
240}
241
/// The raw audio returned by the create-speech endpoint.
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    /// The generated audio data, in the requested [`SpeechResponseFormat`].
    pub bytes: Bytes,
}