Skip to main content

mimo_api/types/
audio.rs

//! Audio types for the MiMo API.
//!
//! This module provides types for configuring audio output, reading the audio
//! returned by the API, and building style-tagged input text, particularly for
//! text-to-speech (TTS) synthesis using the `mimo-v2-tts` model.
5
6use serde::{Deserialize, Serialize};
7
8/// Audio output format.
9#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
10#[serde(rename_all = "lowercase")]
11pub enum AudioFormat {
12    /// WAV format (recommended for high quality)
13    #[default]
14    Wav,
15    /// MP3 format (smaller file size)
16    Mp3,
17    /// PCM format (for streaming)
18    Pcm,
19}
20
21/// Available voice options for text-to-speech.
22#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
23#[serde(rename_all = "snake_case")]
24pub enum Voice {
25    /// MiMo default voice - balanced tone
26    #[default]
27    MimoDefault,
28    /// Default English female voice
29    DefaultEn,
30    /// Default Chinese female voice
31    DefaultZh,
32}
33
34/// Audio output configuration for text-to-speech.
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct Audio {
37    /// Output audio format
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub format: Option<AudioFormat>,
40    /// Voice to use for synthesis
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub voice: Option<Voice>,
43}
44
45impl Audio {
46    /// Create a new audio configuration.
47    ///
48    /// # Example
49    ///
50    /// ```rust
51    /// use mimo_api::{Audio, AudioFormat, Voice};
52    ///
53    /// let audio = Audio::new()
54    ///     .format(AudioFormat::Wav)
55    ///     .voice(Voice::MimoDefault);
56    /// ```
57    pub fn new() -> Self {
58        Self {
59            format: None,
60            voice: None,
61        }
62    }
63
64    /// Set the audio format.
65    pub fn format(mut self, format: AudioFormat) -> Self {
66        self.format = Some(format);
67        self
68    }
69
70    /// Set the voice for synthesis.
71    pub fn voice(mut self, voice: Voice) -> Self {
72        self.voice = Some(voice);
73        self
74    }
75
76    /// Create audio configuration with WAV format.
77    pub fn wav() -> Self {
78        Self::new().format(AudioFormat::Wav)
79    }
80
81    /// Create audio configuration with MP3 format.
82    pub fn mp3() -> Self {
83        Self::new().format(AudioFormat::Mp3)
84    }
85
86    /// Create audio configuration with PCM format (for streaming).
87    pub fn pcm() -> Self {
88        Self::new().format(AudioFormat::Pcm)
89    }
90}
91
92impl Default for Audio {
93    fn default() -> Self {
94        Self::new()
95    }
96}
97
98/// Response audio data from text-to-speech.
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct ResponseAudio {
101    /// Audio ID
102    pub id: String,
103    /// Base64 encoded audio data
104    pub data: String,
105    /// Expiration timestamp (Unix timestamp)
106    #[serde(skip_serializing_if = "Option::is_none")]
107    pub expires_at: Option<i64>,
108    /// Audio transcript (text that was synthesized)
109    #[serde(skip_serializing_if = "Option::is_none")]
110    pub transcript: Option<String>,
111}
112
113impl ResponseAudio {
114    /// Decode the base64 audio data to bytes.
115    ///
116    /// # Example
117    ///
118    /// ```rust,no_run
119    /// use mimo_api::{Client, Audio, Voice, Message};
120    ///
121    /// #[tokio::main]
122    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
123    ///     let client = Client::from_env()?;
124    ///     
125    ///     let response = client.tts("Hello, world!")
126    ///         .voice(Voice::DefaultEn)
127    ///         .send()
128    ///         .await?;
129    ///     
130    ///     let audio = response.audio()?;
131    ///     let audio_bytes = audio.decode_data()?;
132    ///     std::fs::write("output.wav", audio_bytes)?;
133    ///     Ok(())
134    /// }
135    /// ```
136    pub fn decode_data(&self) -> Result<Vec<u8>, base64::DecodeError> {
137        use base64::Engine;
138        base64::engine::general_purpose::STANDARD.decode(&self.data)
139    }
140
141    /// Get the transcript of the synthesized text.
142    pub fn transcript(&self) -> Option<&str> {
143        self.transcript.as_deref()
144    }
145
146    /// Check if the audio has expired.
147    pub fn is_expired(&self) -> bool {
148        if let Some(expires_at) = self.expires_at {
149            let now = std::time::SystemTime::now()
150                .duration_since(std::time::UNIX_EPOCH)
151                .unwrap()
152                .as_secs() as i64;
153            now > expires_at
154        } else {
155            false
156        }
157    }
158}
159
160/// Delta audio in a streaming response.
161#[derive(Debug, Clone, Serialize, Deserialize)]
162pub struct DeltaAudio {
163    /// Audio ID
164    pub id: String,
165    /// Base64 encoded audio data
166    pub data: String,
167    /// Expiration timestamp
168    #[serde(skip_serializing_if = "Option::is_none")]
169    pub expires_at: Option<i64>,
170    /// Audio transcript
171    #[serde(skip_serializing_if = "Option::is_none")]
172    pub transcript: Option<String>,
173}
174
175impl DeltaAudio {
176    /// Decode the base64 audio data to bytes.
177    pub fn decode_data(&self) -> Result<Vec<u8>, base64::DecodeError> {
178        use base64::Engine;
179        base64::engine::general_purpose::STANDARD.decode(&self.data)
180    }
181}
182
/// Text-to-speech style control.
///
/// Use the `<style>` tag to control the overall style of the synthesized audio.
/// The style should be placed at the beginning of the text to be synthesized.
///
/// Style descriptors are collected with [`TtsStyle::with_style`] and rendered
/// in front of the text by [`TtsStyle::apply`].
#[derive(Debug, Clone, Default)]
pub struct TtsStyle {
    /// Style descriptors in insertion order; blank entries are never stored.
    styles: Vec<String>,
}

impl TtsStyle {
    /// Create a new TTS style builder with no styles.
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a style to apply.
    ///
    /// A descriptor that is empty or consists only of whitespace is ignored,
    /// so it can never produce an empty `<style></style>` tag.
    ///
    /// # Available Styles
    ///
    /// - **Speed control**: "变快" (faster), "变慢" (slower)
    /// - **Emotion**: "开心" (happy), "悲伤" (sad), "生气" (angry)
    /// - **Role play**: "孙悟空" (Sun Wukong), "林黛玉" (Lin Daiyu)
    /// - **Style change**: "悄悄话" (whisper), "夹子音" (cutesy high-pitched voice),
    ///   "台湾腔" (Taiwanese accent)
    /// - **Dialect**: "东北话" (Northeastern Mandarin), "四川话" (Sichuanese),
    ///   "河南话" (Henan dialect), "粤语" (Cantonese)
    /// - **Singing**: "唱歌" (singing)
    ///
    /// # Example
    ///
    /// ```rust
    /// use mimo_api::TtsStyle;
    ///
    /// let style = TtsStyle::new()
    ///     .with_style("开心")
    ///     .with_style("变快");
    ///
    /// let text = style.apply("明天就是周五了,真开心!");
    /// assert!(text.starts_with("<style>"));
    /// ```
    #[must_use]
    pub fn with_style(mut self, style: impl Into<String>) -> Self {
        let style = style.into();
        // Only the blank check trims; a non-blank descriptor is stored as given.
        if !style.trim().is_empty() {
            self.styles.push(style);
        }
        self
    }

    /// Apply the style to the text to be synthesized.
    ///
    /// Returns `text` unchanged when no style has been added. Otherwise
    /// returns `text` prefixed with a single `<style>…</style>` tag that
    /// holds all added styles separated by one space.
    #[must_use]
    pub fn apply(&self, text: &str) -> String {
        if self.styles.is_empty() {
            return text.to_string();
        }
        format!("<style>{}</style>{}", self.styles.join(" "), text)
    }
}
237
238/// Create styled text for TTS with the given style.
239///
240/// # Example
241///
242/// ```rust
243/// use mimo_api::styled_text;
244///
245/// let text = styled_text("开心", "明天就是周五了,真开心!");
246/// assert!(text.starts_with("<style>开心</style>"));
247/// ```
248pub fn styled_text(style: &str, text: &str) -> String {
249    TtsStyle::new().with_style(style).apply(text)
250}
251
#[cfg(test)]
mod tests {
    use super::*;
    use base64::Engine;

    // WAV is the default output format.
    #[test]
    fn test_audio_format_default() {
        assert_eq!(AudioFormat::default(), AudioFormat::Wav);
    }

    // The MiMo voice is the default voice.
    #[test]
    fn test_voice_default() {
        assert_eq!(Voice::default(), Voice::MimoDefault);
    }

    // A format shortcut combined with a voice keeps both settings.
    #[test]
    fn test_audio_config() {
        let Audio { format, voice } = Audio::wav().voice(Voice::DefaultZh);
        assert_eq!(format, Some(AudioFormat::Wav));
        assert_eq!(voice, Some(Voice::DefaultZh));
    }

    // Enum values serialize under their renamed spellings.
    #[test]
    fn test_audio_serialization() {
        let json = serde_json::to_string(&Audio::mp3().voice(Voice::DefaultEn)).unwrap();
        for fragment in ["\"format\":\"mp3\"", "\"voice\":\"default_en\""] {
            assert!(json.contains(fragment));
        }
    }

    // Each format shortcut selects the matching format.
    #[test]
    fn test_audio_formats() {
        let cases = [
            (Audio::wav(), AudioFormat::Wav),
            (Audio::mp3(), AudioFormat::Mp3),
            (Audio::pcm(), AudioFormat::Pcm),
        ];
        for (audio, expected) in cases {
            assert_eq!(audio.format, Some(expected));
        }
    }

    // One style is wrapped in a single tag in front of the text.
    #[test]
    fn test_tts_style_single() {
        let style = TtsStyle::new().with_style("开心");
        assert_eq!(style.apply("Hello"), "<style>开心</style>Hello");
    }

    // Several styles all appear inside the leading tag.
    #[test]
    fn test_tts_style_multiple() {
        let style = TtsStyle::new().with_style("开心").with_style("变快");
        let text = style.apply("Hello");
        assert!(text.starts_with("<style>"));
        for needle in ["开心", "变快"] {
            assert!(text.contains(needle));
        }
        assert!(text.ends_with("Hello"));
    }

    // Without any style the text passes through untouched.
    #[test]
    fn test_tts_style_empty() {
        assert_eq!(TtsStyle::new().apply("Hello"), "Hello");
    }

    // The free-function helper matches the single-style builder output.
    #[test]
    fn test_styled_text_helper() {
        assert_eq!(
            styled_text("东北话", "哎呀妈呀"),
            "<style>东北话</style>哎呀妈呀"
        );
    }

    // Base64 data round-trips back to the original bytes.
    #[test]
    fn test_response_audio_decode() {
        let payload = b"test audio data";
        let audio = ResponseAudio {
            id: String::from("test-id"),
            data: base64::engine::general_purpose::STANDARD.encode(payload),
            expires_at: None,
            transcript: Some(String::from("test")),
        };

        assert_eq!(audio.decode_data().unwrap(), payload);
    }

    // The transcript accessor borrows the stored transcript.
    #[test]
    fn test_response_audio_transcript() {
        let audio = ResponseAudio {
            id: String::from("test-id"),
            data: String::new(),
            expires_at: None,
            transcript: Some(String::from("Hello world")),
        };

        assert_eq!(audio.transcript(), Some("Hello world"));
    }
}