//! Audio request/response types for the `aiway_model_protocol` crate
//! (`audio.rs`): text-to-speech, transcription, and translation parameters.
use crate::shared::FileUpload;
use bytes::Bytes;
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::fmt::Display;
7
/// Request parameters for generating speech audio from text (TTS).
#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
#[builder(name = "AudioSpeechParametersBuilder")]
#[builder(setter(into, strip_option), default)]
pub struct AudioSpeechParameters {
    /// One of the available TTS models: tts-1 or tts-1-hd.
    pub model: String,
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,
    /// The voice to use when generating the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    /// The original text of the voice reference; some models require it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice_text: Option<String>,
    /// Control the voice of your generated audio with additional instructions. Does not work with tts-1 or tts-1-hd.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,
    /// The format of the generated audio. Supported formats are mp3, opus, aac, flac, wav and pcm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<AudioSpeechResponseFormat>,
    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
}
32
/// Request parameters for transcribing an audio file to text.
#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
#[builder(name = "AudioTranscriptionParametersBuilder")]
#[builder(setter(into, strip_option), default)]
pub struct AudioTranscriptionParameters {
    /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: FileUpload,
    /// ID of the model to use. Only whisper-1 is currently available.
    pub model: String,
    /// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Controls how the audio is cut into chunks. When set to "auto", the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. server_vad object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunking_strategy: Option<TranscriptionChunkingStrategy>,
    /// An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<AudioOutputFormat>,
    /// If set to true, the model response data will be streamed to the client as it is generated using server-sent events. Note: Streaming is not supported for the whisper-1 model and will be ignored.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stream: Option<bool>,
    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
    /// while lower values like 0.2 will make it more focused and deterministic.
    /// If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// The timestamp granularities to populate for this transcription. response_format must be set verbose_json to use timestamp granularities.
    /// Either or both of these options are supported: word, or segment.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timestamp_granularities: Option<Vec<TimestampGranularity>>,
    /// Allows to pass arbitrary json as an extra_body parameter, for specific features/openai-compatible endpoints.
    // `flatten` merges the extra object's keys directly into the top-level request body.
    #[serde(flatten)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extra_body: Option<Value>,
}
70
/// Request parameters for translating an audio file into English text.
#[derive(Serialize, Deserialize, Debug, Default, Builder, Clone, PartialEq)]
#[builder(name = "AudioTranslationParametersBuilder")]
#[builder(setter(into, strip_option), default)]
pub struct AudioTranslationParameters {
    /// The audio file object to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    pub file: FileUpload,
    /// ID of the model to use. Only whisper-1 is currently available.
    pub model: String,
    /// An optional text to guide the model's style or continue a previous audio segment. The prompt should be in English.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<AudioOutputFormat>,
    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random,
    /// while lower values like 0.2 will make it more focused and deterministic.
    /// If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
}
91
/// Non-streaming TTS response: the complete encoded audio payload.
#[derive(Debug, Clone)]
pub struct AudioSpeechResponse {
    /// The raw audio bytes, encoded in the requested response format.
    pub bytes: Bytes,
}
96
/// Request parameters for a streaming TTS call (audio delivered as chunks).
#[cfg(feature = "stream")]
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct StreamAudioSpeechParameters {
    /// One of the available TTS models: tts-1 or tts-1-hd.
    pub model: String,
    /// The text to generate audio for. The maximum length is 4096 characters.
    pub input: String,
    /// The voice to use when generating the audio.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    /// The original text of the voice reference; some models require it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice_text: Option<String>,
    /// The format of the generated audio. Supported formats are mp3, opus, aac, flac, wav and pcm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<AudioSpeechResponseFormat>,
    /// The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
    // Always serialized (no skip attribute); set to true to request streaming.
    pub stream: bool,
}
118
/// One chunk of audio bytes from a streaming TTS response.
#[cfg(feature = "stream")]
#[derive(Debug, Clone, PartialEq)]
pub struct AudioSpeechResponseChunkResponse {
    /// The raw audio bytes for this chunk.
    pub bytes: Bytes,
}
124
/// The format of the transcript output: json, text, srt, verbose_json, or vtt.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioOutputFormat {
    Json,
    Text,
    Srt,
    // Required by `timestamp_granularities` (see AudioTranscriptionParameters).
    VerboseJson,
    Vtt,
}
134
/// The encoding of generated speech audio: mp3, opus, aac, flac, wav, or pcm.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioSpeechResponseFormat {
    Mp3,
    Opus,
    Aac,
    Flac,
    Wav,
    Pcm,
}
145
/// Known TTS voice presets.
///
/// NOTE(review): the request structs take `voice` as a free-form
/// `Option<String>`, so this enum is a convenience list rather than an
/// enforced parameter type.
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum AudioVoice {
    /// Default voice.
    #[default]
    Alloy,
    Ash,
    Coral,
    Echo,
    Fable,
    Onyx,
    Nova,
    Sage,
    Shimmer,
}
160
/// Granularity of timestamps returned with a transcription; requires the
/// verbose_json response format.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum TimestampGranularity {
    Word,
    Segment,
}
167
168#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
169#[serde(rename_all = "snake_case")]
170pub enum TranscriptionChunkingStrategy {
171    Auto,
172    #[serde(untagged)]
173    VadConfig(VadConfig),
174}
175
176#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
177pub struct VadConfig {
178    /// Must be set to "server_vad" to enable manual chunking using server side VAD.
179    pub r#type: VadConfigType,
180    /// Amount of audio to include before the VAD detected speech (in milliseconds).
181    pub prefix_padding_ms: Option<usize>,
182    /// Duration of silence to detect speech stop (in milliseconds). With shorter values the model will respond more quickly, but may jump in on short pauses from the user.
183    pub silence_duration_ms: Option<usize>,
184    /// Sensitivity threshold (0.0 to 1.0) for voice activity detection. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments.
185    pub threshold: Option<f32>,
186}
187
188#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq)]
189#[serde(rename_all = "snake_case")]
190pub enum VadConfigType {
191    ServerVad,
192}
193
194impl Display for AudioOutputFormat {
195    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
196        write!(
197            f,
198            "{}",
199            match self {
200                AudioOutputFormat::Json => "json",
201                AudioOutputFormat::Text => "text",
202                AudioOutputFormat::Srt => "srt",
203                AudioOutputFormat::VerboseJson => "verbose_json",
204                AudioOutputFormat::Vtt => "vtt",
205            }
206        )
207    }
208}
209
210impl Display for TimestampGranularity {
211    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
212        write!(
213            f,
214            "{}",
215            match self {
216                TimestampGranularity::Word => "word",
217                TimestampGranularity::Segment => "segment",
218            }
219        )
220    }
221}
222
223impl Display for TranscriptionChunkingStrategy {
224    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
225        match self {
226            TranscriptionChunkingStrategy::Auto => "auto".fmt(f),
227            TranscriptionChunkingStrategy::VadConfig(vad_config) => vad_config.fmt(f),
228        }
229    }
230}
231
232impl Display for VadConfig {
233    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
234        let s = serde_json::to_string(self).map_err(|_| std::fmt::Error)?;
235        write!(f, "{s}")
236    }
237}
238
#[cfg(test)]
mod tests {
    use crate::audio::{
        AudioTranscriptionParameters, AudioTranscriptionParametersBuilder,
        TranscriptionChunkingStrategy, VadConfig, VadConfigType,
    };
    use crate::shared::FileUpload;

    /// `Auto` must round-trip through serde as the bare string "auto".
    #[test]
    fn test_audio_transcription_chunking_strategy_auto_serialization_deserialization() {
        let strategy = TranscriptionChunkingStrategy::Auto;

        let json = serde_json::to_string(&strategy).unwrap();
        assert_eq!(json, "\"auto\"");

        let roundtripped: TranscriptionChunkingStrategy = serde_json::from_str(&json).unwrap();
        assert_eq!(roundtripped, strategy)
    }

    /// The VAD variant must round-trip as an untagged `server_vad` object.
    #[test]
    fn test_audio_transcription_chunking_strategy_vad_config_serialization_deserialization() {
        let strategy = TranscriptionChunkingStrategy::VadConfig(VadConfig {
            r#type: VadConfigType::ServerVad,
            prefix_padding_ms: Some(10),
            silence_duration_ms: Some(20),
            threshold: Some(0.5),
        });

        let json = serde_json::to_string(&strategy).unwrap();
        assert_eq!(json, "{\"type\":\"server_vad\",\"prefix_padding_ms\":10,\"silence_duration_ms\":20,\"threshold\":0.5}");

        let roundtripped: TranscriptionChunkingStrategy = serde_json::from_str(&json).unwrap();
        assert_eq!(roundtripped, strategy)
    }

    /// `extra_body` keys must be flattened into the top-level JSON object
    /// and survive a serialize/deserialize round trip.
    #[test]
    fn test_audio_transcription_extra_body_serialization_deserialization() {
        let extra = serde_json::json!({
            "enable_my_feature": true,
            "my_param": 10
        });

        let params: AudioTranscriptionParameters = AudioTranscriptionParametersBuilder::default()
            .file(FileUpload::File("test.wav".to_string()))
            .model("test")
            .extra_body(extra)
            .build()
            .unwrap();

        let json = serde_json::to_string(&params).unwrap();
        assert_eq!(json, "{\"file\":{\"File\":\"test.wav\"},\"model\":\"test\",\"enable_my_feature\":true,\"my_param\":10}");

        let roundtripped: AudioTranscriptionParameters = serde_json::from_str(&json).unwrap();
        assert_eq!(roundtripped, params)
    }
}