rs_openai/interfaces/
audio.rs

1use crate::shared::response_wrapper::OpenAIError;
2use crate::shared::types::FileMeta;
3use derive_builder::Builder;
4use serde::{Deserialize, Serialize};
5
6#[derive(Debug, Serialize, Default, Clone, strum::Display)]
7pub enum SttResponseFormat {
8    #[default]
9    #[strum(serialize = "json")]
10    Json,
11    #[strum(serialize = "text")]
12    Text,
13    #[strum(serialize = "srt")]
14    Srt,
15    #[strum(serialize = "verbose_json")]
16    VerboseJson,
17    #[strum(serialize = "vtt")]
18    Vtt,
19}
20
21#[derive(Debug, Serialize, Default, Clone, strum::Display)]
22pub enum TtsResponseFormat {
23    #[default]
24    #[strum(serialize = "mp3")]
25    Mp3,
26    #[strum(serialize = "opus")]
27    Opus,
28    #[strum(serialize = "aac")]
29    Aac,
30    #[strum(serialize = "flac")]
31    Flac,
32    #[strum(serialize = "wav")]
33    Wav,
34    #[strum(serialize = "pcm")]
35    Pcm,
36}
37
38#[derive(Debug, Serialize, Default, Clone, strum::Display)]
39pub enum Language {
40    #[default]
41    #[strum(serialize = "en")]
42    English,
43    #[strum(serialize = "zh")]
44    Chinese,
45    #[strum(serialize = "de")]
46    German,
47    #[strum(serialize = "es")]
48    Spanish,
49    #[strum(serialize = "ru")]
50    Russian,
51    #[strum(serialize = "ko")]
52    Korean,
53    #[strum(serialize = "fr")]
54    French,
55    #[strum(serialize = "ja")]
56    Japanese,
57    #[strum(serialize = "pt")]
58    Portuguese,
59    #[strum(serialize = "tr")]
60    Turkish,
61    #[strum(serialize = "pl")]
62    Polish,
63    #[strum(serialize = "ca")]
64    Catalan,
65    #[strum(serialize = "nl")]
66    Dutch,
67    #[strum(serialize = "ar")]
68    Arabic,
69    #[strum(serialize = "sv")]
70    Swedish,
71    #[strum(serialize = "it")]
72    Italian,
73    #[strum(serialize = "id")]
74    Indonesian,
75    #[strum(serialize = "hi")]
76    Hindi,
77    #[strum(serialize = "fi")]
78    Finnish,
79    #[strum(serialize = "vi")]
80    Vietnamese,
81    #[strum(serialize = "he")]
82    Hebrew,
83    #[strum(serialize = "uk")]
84    Ukrainian,
85    #[strum(serialize = "el")]
86    Greek,
87    #[strum(serialize = "ms")]
88    Malay,
89    #[strum(serialize = "cs")]
90    Czech,
91    #[strum(serialize = "ro")]
92    Romanian,
93    #[strum(serialize = "da")]
94    Danish,
95    #[strum(serialize = "hu")]
96    Hungarian,
97    #[strum(serialize = "ta")]
98    Tamil,
99    #[strum(serialize = "no")]
100    Norwegian,
101    #[strum(serialize = "th")]
102    Thai,
103    #[strum(serialize = "ur")]
104    Urdu,
105    #[strum(serialize = "hr")]
106    Croatian,
107    #[strum(serialize = "bg")]
108    Bulgarian,
109    #[strum(serialize = "lt")]
110    Lithuanian,
111    #[strum(serialize = "la")]
112    Latin,
113    #[strum(serialize = "mi")]
114    Maori,
115    #[strum(serialize = "ml")]
116    Malayalam,
117    #[strum(serialize = "cy")]
118    Welsh,
119    #[strum(serialize = "sk")]
120    Slovak,
121    #[strum(serialize = "te")]
122    Telugu,
123    #[strum(serialize = "fa")]
124    Persian,
125    #[strum(serialize = "lv")]
126    Latvian,
127    #[strum(serialize = "bn")]
128    Bengali,
129    #[strum(serialize = "sr")]
130    Serbian,
131    #[strum(serialize = "az")]
132    Azerbaijani,
133    #[strum(serialize = "sl")]
134    Slovenian,
135    #[strum(serialize = "kn")]
136    Kannada,
137    #[strum(serialize = "et")]
138    Estonian,
139    #[strum(serialize = "mk")]
140    Macedonian,
141    #[strum(serialize = "br")]
142    Breton,
143    #[strum(serialize = "eu")]
144    Basque,
145    #[strum(serialize = "is")]
146    Icelandic,
147    #[strum(serialize = "hy")]
148    Armenian,
149    #[strum(serialize = "ne")]
150    Nepali,
151    #[strum(serialize = "mn")]
152    Mongolian,
153    #[strum(serialize = "bs")]
154    Bosnian,
155    #[strum(serialize = "kk")]
156    Kazakh,
157    #[strum(serialize = "sq")]
158    Albanian,
159    #[strum(serialize = "sw")]
160    Swahili,
161    #[strum(serialize = "gl")]
162    Galician,
163    #[strum(serialize = "mr")]
164    Marathi,
165    #[strum(serialize = "pa")]
166    Punjabi,
167    #[strum(serialize = "si")]
168    Sinhala,
169    #[strum(serialize = "km")]
170    Khmer,
171    #[strum(serialize = "sn")]
172    Shona,
173    #[strum(serialize = "yo")]
174    Yoruba,
175    #[strum(serialize = "so")]
176    Somali,
177    #[strum(serialize = "af")]
178    Afrikaans,
179    #[strum(serialize = "oc")]
180    Occitan,
181    #[strum(serialize = "ka")]
182    Georgian,
183    #[strum(serialize = "be")]
184    Belarusian,
185    #[strum(serialize = "tg")]
186    Tajik,
187    #[strum(serialize = "sd")]
188    Sindhi,
189    #[strum(serialize = "gu")]
190    Gujarati,
191    #[strum(serialize = "am")]
192    Amharic,
193    #[strum(serialize = "yi")]
194    Yiddish,
195    #[strum(serialize = "lo")]
196    Lao,
197    #[strum(serialize = "uz")]
198    Uzbek,
199    #[strum(serialize = "fo")]
200    Faroese,
201    #[strum(serialize = "ht")]
202    HaitianCreole,
203    #[strum(serialize = "ps")]
204    Pashto,
205    #[strum(serialize = "tk")]
206    Turkmen,
207    #[strum(serialize = "nn")]
208    Nynorsk,
209    #[strum(serialize = "mt")]
210    Maltese,
211    #[strum(serialize = "sa")]
212    Sanskrit,
213    #[strum(serialize = "lb")]
214    Luxembourgish,
215    #[strum(serialize = "my")]
216    Myanmar,
217    #[strum(serialize = "bo")]
218    Tibetan,
219    #[strum(serialize = "tl")]
220    Tagalog,
221    #[strum(serialize = "mg")]
222    Malagasy,
223    #[strum(serialize = "as")]
224    Assamese,
225    #[strum(serialize = "tt")]
226    Tatar,
227    #[strum(serialize = "haw")]
228    Hawaiian,
229    #[strum(serialize = "ln")]
230    Lingala,
231    #[strum(serialize = "ha")]
232    Hausa,
233    #[strum(serialize = "ba")]
234    Bashkir,
235    #[strum(serialize = "jw")]
236    Javanese,
237    #[strum(serialize = "su")]
238    Sundanese,
239}
240
241#[derive(Debug, Serialize, Default, Clone, strum::Display)]
242pub enum Voice {
243    #[default]
244    #[strum(serialize = "alloy")]
245    Alloy,
246    #[strum(serialize = "echo")]
247    Echo,
248    #[strum(serialize = "fable")]
249    Fable,
250    #[strum(serialize = "onyx")]
251    Onyx,
252    #[strum(serialize = "nova")]
253    Nova,
254    #[strum(serialize = "shimmer")]
255    Shimmer,
256}
257
258#[derive(Debug, Serialize, Default, Clone, strum::Display)]
259pub enum SttModel {
260    #[default]
261    #[strum(serialize = "whisper-1")]
262    Whisper1,
263}
264
265#[derive(Debug, Serialize, Default, Clone, strum::Display)]
266pub enum AudioSpeechModel {
267    #[default]
268    #[strum(serialize = "tts-1")]
269    Whisper1,
270    #[strum(serialize = "tts-1-hd")]
271    Whisper1Hd,
272}
273
274#[derive(Builder, Clone, Debug, Default, Serialize)]
275#[builder(name = "CreateSpeechRequestBuilder")]
276#[builder(pattern = "mutable")]
277#[builder(setter(into, strip_option), default)]
278#[builder(derive(Debug))]
279#[builder(build_fn(error = "OpenAIError"))]
280pub struct CreateSpeechRequest {
281    /// One of the available [TTS models](https://platform.openai.com/docs/models/tts): `tts-1` or `tts-1-hd`
282    pub model: AudioSpeechModel,
283
284    /// The text to generate audio for. The maximum length is 4096 characters.
285    pub input: String,
286
287    /// The voice to use when generating the audio. Supported voices are `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`.
288    /// Previews of the voices are available in the [Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech/voice-options).
289    pub voice: Voice,
290
291    /// The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`.
292    /// #[serde(skip_serializing_if = "Option::is_none")]
293    pub response_format: Option<SttResponseFormat>, // default: mp3
294
295    /// The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.
296    #[serde(skip_serializing_if = "Option::is_none")]
297    pub speed: Option<f32>, // min: 0.25, max: 4.0, default: 1.0
298}
299
300#[derive(Builder, Clone, Debug, Default, Serialize)]
301#[builder(name = "CreateTranscriptionRequestBuilder")]
302#[builder(pattern = "mutable")]
303#[builder(setter(into, strip_option), default)]
304#[builder(derive(Debug))]
305#[builder(build_fn(error = "OpenAIError"))]
306pub struct CreateTranscriptionRequest {
307    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
308    pub file: FileMeta,
309
310    /// ID of the model to use. Only `whisper-1` is currently available.
311    pub model: SttModel,
312
313    /// An optional text to guide the model's style or continue a previous audio segment.
314    /// The [prompt](https://platform.openai.com/docs/guides/speech-to-text/prompting) should match the audio language.
315    #[serde(skip_serializing_if = "Option::is_none")]
316    pub prompt: Option<String>,
317
318    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
319    #[serde(skip_serializing_if = "Option::is_none")]
320    pub response_format: Option<SttResponseFormat>, // default: "json"
321
322    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
323    /// while lower values like 0.2 will make it more focused and deterministic.
324    /// If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
325    #[serde(skip_serializing_if = "Option::is_none")]
326    pub temperature: Option<f32>, // min: 0, max: 1, default: 0
327
328    /// The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency.
329    #[serde(skip_serializing_if = "Option::is_none")]
330    pub language: Option<Language>,
331}
332
333#[derive(Builder, Clone, Debug, Default, Serialize)]
334#[builder(name = "CreateTranslationRequestBuilder")]
335#[builder(pattern = "mutable")]
336#[builder(setter(into, strip_option), default)]
337#[builder(derive(Debug))]
338#[builder(build_fn(error = "OpenAIError"))]
339pub struct CreateTranslationRequest {
340    /// The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
341    pub file: FileMeta,
342
343    /// ID of the model to use. Only `whisper-1` is currently available.
344    pub model: SttModel,
345
346    /// An optional text to guide the model's style or continue a previous audio segment.
347    /// The [prompt](https://platform.openai.com/docs/guides/speech-to-text/prompting) should be in English.
348    #[serde(skip_serializing_if = "Option::is_none")]
349    pub prompt: Option<String>,
350
351    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
352    #[serde(skip_serializing_if = "Option::is_none")]
353    pub response_format: Option<SttResponseFormat>, // default: json
354
355    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
356    /// while lower values like 0.2 will make it more focused and deterministic.
357    /// If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit.
358    #[serde(skip_serializing_if = "Option::is_none")]
359    pub temperature: Option<f32>, // min: 0, max: 1, default: 0
360}
361
362#[derive(Debug, Deserialize, Clone, Serialize)]
363pub struct VerboseJsonForAudioResponse {
364    pub task: Option<String>,
365    pub language: Option<String>,
366    pub duration: Option<f32>,
367    pub segments: Option<Vec<Segment>>,
368    pub text: String,
369}
370
371#[derive(Debug, Deserialize, Clone, Serialize)]
372pub struct Segment {
373    pub id: u32,
374    pub seek: u32,
375    pub start: f32,
376    pub end: f32,
377    pub text: String,
378    pub tokens: Vec<u32>,
379    pub temperature: f32,
380    pub avg_logprob: f32,
381    pub compression_ratio: f32,
382    pub no_speech_prob: f32,
383}