async_openai_wasm/types/audio.rs

use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use super::InputSource;
use crate::error::OpenAIError;

#[derive(Debug, Default, Clone, PartialEq)]
pub struct AudioInput {
    pub source: InputSource,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioResponseFormat {
    #[default]
    Json,
    Text,
    Srt,
    VerboseJson,
    Vtt,
}

#[derive(Debug, Serialize, Deserialize, Default, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum SpeechResponseFormat {
    #[default]
    Mp3,
    Opus,
    Aac,
    Flac,
    Pcm,
    Wav,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum Voice {
    #[default]
    Alloy,
    Echo,
    Fable,
    Onyx,
    Nova,
    Shimmer,
}

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub enum SpeechModel {
    #[default]
    #[serde(rename = "tts-1")]
    Tts1,
    #[serde(rename = "tts-1-hd")]
    Tts1Hd,
    #[serde(untagged)]
    Other(String),
}
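
// A small sketch (assuming `serde_json` as a dev-dependency) of how
// `SpeechModel` round-trips through serde: the known variants use their
// `#[serde(rename = ...)]` model ids, and any unrecognized id falls through
// to the `#[serde(untagged)]` catch-all `Other` variant.
#[cfg(test)]
mod speech_model_serde_example {
    use super::*;

    #[test]
    fn known_and_unknown_model_ids_round_trip() {
        // Known variants serialize to their renamed strings.
        assert_eq!(
            serde_json::to_string(&SpeechModel::Tts1Hd).unwrap(),
            "\"tts-1-hd\""
        );

        // An id with no matching variant deserializes into `Other`.
        let model: SpeechModel = serde_json::from_str("\"my-custom-tts\"").unwrap();
        assert_eq!(model, SpeechModel::Other("my-custom-tts".to_string()));
    }
}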

#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum TimestampGranularity {
    Word,
    #[default]
    Segment,
}

#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranscriptionRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranscriptionRequest {
    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should match the audio language.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0

    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
    pub language: Option<String>,

    /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
}
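
// A minimal builder sketch (test-only, not part of the public API) showing
// how `CreateTranscriptionRequestArgs` assembles a request. The
// `InputSource::Bytes` variant used here is assumed to match upstream
// `async-openai`; adjust if this crate's `InputSource` definition differs.
#[cfg(test)]
mod transcription_request_example {
    use super::*;

    #[test]
    fn builds_a_transcription_request() {
        let file = AudioInput {
            source: InputSource::Bytes {
                filename: "speech.mp3".to_string(),
                bytes: Bytes::from_static(b"fake audio payload"),
            },
        };

        let request = CreateTranscriptionRequestArgs::default()
            .file(file)
            .model("whisper-1")
            .response_format(AudioResponseFormat::VerboseJson)
            // Word-level timestamps require `verbose_json` above.
            .timestamp_granularities(vec![TimestampGranularity::Word])
            .build()
            .expect("all required fields are set");

        assert_eq!(request.model, "whisper-1");
        assert!(request.language.is_none());
    }
}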

/// Represents a transcription response returned by the model, based on the
/// provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseJson {
    /// The transcribed text.
    pub text: String,
}

/// Represents a verbose JSON transcription response returned by the model,
/// based on the provided input.
#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranscriptionResponseVerboseJson {
    /// The language of the input audio.
    pub language: String,

    /// The duration of the input audio, in seconds.
    pub duration: f32,

    /// The transcribed text.
    pub text: String,

    /// Extracted words and their corresponding timestamps.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,

    /// Segments of the transcribed text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}
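
// A minimal sketch (assuming `serde_json`) of deserializing a verbose
// transcription response; absent optional arrays become `None`, since serde
// treats `Option` fields as implicitly optional during deserialization.
#[cfg(test)]
mod verbose_transcription_response_example {
    use super::*;

    #[test]
    fn parses_a_verbose_json_response() {
        let raw = r#"{
            "language": "english",
            "duration": 1.2,
            "text": "Hello there.",
            "words": [{ "word": "Hello", "start": 0.0, "end": 0.6 }]
        }"#;

        let response: CreateTranscriptionResponseVerboseJson =
            serde_json::from_str(raw).unwrap();

        assert_eq!(response.text, "Hello there.");
        assert_eq!(response.words.as_ref().map(|w| w.len()), Some(1));
        // No "segments" key in the payload, so the field is `None`.
        assert!(response.segments.is_none());
    }
}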

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionWord {
    /// The text content of the word.
    pub word: String,

    /// Start time of the word in seconds.
    pub start: f32,

    /// End time of the word in seconds.
    pub end: f32,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct TranscriptionSegment {
    /// Unique identifier of the segment.
    pub id: i32,

    /// Seek offset of the segment.
    pub seek: i32,

    /// Start time of the segment in seconds.
    pub start: f32,

    /// End time of the segment in seconds.
    pub end: f32,

    /// Text content of the segment.
    pub text: String,

    /// Array of token IDs for the text content.
    pub tokens: Vec<i32>,

    /// Temperature parameter used for generating the segment.
    pub temperature: f32,

    /// Average logprob of the segment. If the value is lower than -1, consider
    /// the logprobs failed.
    pub avg_logprob: f32,

    /// Compression ratio of the segment. If the value is greater than 2.4,
    /// consider the compression failed.
    pub compression_ratio: f32,

    /// Probability of no speech in the segment. If the value is higher than 1.0
    /// and the `avg_logprob` is below -1, consider this segment silent.
    pub no_speech_prob: f32,
}
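
// A hedged helper sketch applying the quality heuristics quoted in the field
// docs above. The function names and the exact thresholds mirror the doc
// comments; they are illustrative, not part of the crate.
#[cfg(test)]
mod segment_heuristics_example {
    use super::*;

    // Per the docs: a high `no_speech_prob` combined with `avg_logprob`
    // below -1 suggests the segment is silent.
    #[allow(dead_code)]
    fn is_probably_silent(segment: &TranscriptionSegment) -> bool {
        segment.no_speech_prob > 1.0 && segment.avg_logprob < -1.0
    }

    // Per the docs: logprobs below -1 or a compression ratio above 2.4
    // suggest the decode failed.
    #[allow(dead_code)]
    fn is_suspect_decode(segment: &TranscriptionSegment) -> bool {
        segment.avg_logprob < -1.0 || segment.compression_ratio > 2.4
    }
}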

#[derive(Clone, Default, Debug, Builder, PartialEq, Serialize, Deserialize)]
#[builder(name = "CreateSpeechRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateSpeechRequest {
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,

    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`.
    pub model: SpeechModel,

    /// The voice to use when generating the audio. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`. Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech/voice-options).
    pub voice: Voice,

    /// The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<SpeechResponseFormat>,

    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>, // default: 1.0
}
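
// A minimal sketch (assuming `serde_json`) of building a speech request and
// inspecting its wire format: optional fields that were set serialize, while
// unset ones are omitted via `skip_serializing_if`.
#[cfg(test)]
mod speech_request_example {
    use super::*;

    #[test]
    fn builds_and_serializes_a_speech_request() {
        let request = CreateSpeechRequestArgs::default()
            .input("Hello, world!")
            .model(SpeechModel::Tts1)
            .voice(Voice::Nova)
            .response_format(SpeechResponseFormat::Wav)
            .speed(1.25_f32)
            .build()
            .expect("all required fields are set");

        let json = serde_json::to_value(&request).unwrap();
        assert_eq!(json["model"], "tts-1");
        assert_eq!(json["voice"], "nova");
        assert_eq!(json["response_format"], "wav");
        // `speed` was set, so it appears; an unset `speed` would be omitted.
        assert_eq!(json["speed"], 1.25);
    }
}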

#[derive(Clone, Default, Debug, Builder, PartialEq)]
#[builder(name = "CreateTranslationRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateTranslationRequest {
    /// The audio file object (not file name) to translate, in one of these
    /// formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: AudioInput,

    /// ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
    pub model: String,

    /// An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) should be in English.
    pub prompt: Option<String>,

    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    pub response_format: Option<AudioResponseFormat>,

    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
    pub temperature: Option<f32>, // default: 0
}
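
// A compact builder sketch for translations; it mirrors the transcription
// builder, but any prompt should be in English and the output is always
// English text. The `InputSource::VecU8` variant is assumed to match
// upstream `async-openai`.
#[cfg(test)]
mod translation_request_example {
    use super::*;

    #[test]
    fn builds_a_translation_request() {
        let file = AudioInput {
            source: InputSource::VecU8 {
                filename: "interview.ogg".to_string(),
                vec: Vec::new(), // placeholder for real audio bytes
            },
        };

        let request = CreateTranslationRequestArgs::default()
            .file(file)
            .model("whisper-1")
            .response_format(AudioResponseFormat::Text)
            .build()
            .expect("all required fields are set");

        assert_eq!(request.model, "whisper-1");
    }
}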

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct CreateTranslationResponseJson {
    /// The translated text.
    pub text: String,
}

#[derive(Debug, Deserialize, Clone, Serialize)]
pub struct CreateTranslationResponseVerboseJson {
    /// The language of the output translation (always `english`).
    pub language: String,
    /// The duration of the input audio.
    pub duration: String,
    /// The translated text.
    pub text: String,
    /// Segments of the translated text and their corresponding details.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

/// The raw audio bytes returned by the speech endpoint, encoded in the
/// requested `response_format`.
#[derive(Debug, Clone)]
pub struct CreateSpeechResponse {
    pub bytes: Bytes,
}