Skip to main content

mimo_api/types/
audio.rs

//! Audio types for the MiMo API.
//!
//! This module provides types for configuring audio output, particularly for
//! text-to-speech (TTS) synthesis using the `mimo-v2-tts` model, along with
//! the audio payloads returned in responses and helpers for building
//! `<style>` tags.
5
6use crate::error::Result;
7use serde::{Deserialize, Serialize};
8
9/// Audio output format.
10#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
11#[serde(rename_all = "lowercase")]
12pub enum AudioFormat {
13    /// WAV format (recommended for high quality)
14    #[default]
15    Wav,
16    /// MP3 format (smaller file size)
17    Mp3,
18    /// PCM format (for streaming)
19    Pcm,
20}
21
22/// Available voice options for text-to-speech.
23#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
24#[serde(rename_all = "snake_case")]
25pub enum Voice {
26    /// MiMo default voice - balanced tone
27    #[default]
28    MimoDefault,
29    /// Default English female voice
30    DefaultEn,
31    /// Default Chinese female voice
32    DefaultZh,
33}
34
35/// Audio output configuration for text-to-speech.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct Audio {
38    /// Output audio format
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub format: Option<AudioFormat>,
41    /// Voice to use for synthesis
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub voice: Option<Voice>,
44}
45
46impl Audio {
47    /// Create a new audio configuration.
48    ///
49    /// # Example
50    ///
51    /// ```rust
52    /// use mimo_api::{Audio, AudioFormat, Voice};
53    ///
54    /// let audio = Audio::new()
55    ///     .format(AudioFormat::Wav)
56    ///     .voice(Voice::MimoDefault);
57    /// ```
58    pub fn new() -> Self {
59        Self {
60            format: None,
61            voice: None,
62        }
63    }
64
65    /// Set the audio format.
66    pub fn format(mut self, format: AudioFormat) -> Self {
67        self.format = Some(format);
68        self
69    }
70
71    /// Set the voice for synthesis.
72    pub fn voice(mut self, voice: Voice) -> Self {
73        self.voice = Some(voice);
74        self
75    }
76
77    /// Create audio configuration with WAV format.
78    pub fn wav() -> Self {
79        Self::new().format(AudioFormat::Wav)
80    }
81
82    /// Create audio configuration with MP3 format.
83    pub fn mp3() -> Self {
84        Self::new().format(AudioFormat::Mp3)
85    }
86
87    /// Create audio configuration with PCM format (for streaming).
88    pub fn pcm() -> Self {
89        Self::new().format(AudioFormat::Pcm)
90    }
91}
92
93impl Default for Audio {
94    fn default() -> Self {
95        Self::new()
96    }
97}
98
99/// Response audio data from text-to-speech.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct ResponseAudio {
102    /// Audio ID
103    pub id: String,
104    /// Base64 encoded audio data
105    pub data: String,
106    /// Expiration timestamp (Unix timestamp)
107    #[serde(skip_serializing_if = "Option::is_none")]
108    pub expires_at: Option<i64>,
109    /// Audio transcript (text that was synthesized)
110    #[serde(skip_serializing_if = "Option::is_none")]
111    pub transcript: Option<String>,
112}
113
114impl ResponseAudio {
115    /// Decode the base64 audio data to bytes.
116    ///
117    /// # Example
118    ///
119    /// ```rust,no_run
120    /// use mimo_api::{Client, Audio, Voice, Message};
121    ///
122    /// #[tokio::main]
123    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
124    ///     let client = Client::from_env()?;
125    ///
126    ///     let response = client.tts("Hello, world!")
127    ///         .voice(Voice::DefaultEn)
128    ///         .send()
129    ///         .await?;
130    ///
131    ///     let audio = response.audio()?;
132    ///     let audio_bytes = audio.decode_data()?;
133    ///     std::fs::write("output.wav", audio_bytes)?;
134    ///     Ok(())
135    /// }
136    /// ```
137    pub fn decode_data(&self) -> Result<Vec<u8>> {
138        use base64::Engine;
139        base64::engine::general_purpose::STANDARD.decode(&self.data).map_err(Into::into)
140    }
141
142    /// Get the transcript of the synthesized text.
143    pub fn transcript(&self) -> Option<&str> {
144        self.transcript.as_deref()
145    }
146
147    /// Check if the audio has expired.
148    pub fn is_expired(&self) -> bool {
149        if let Some(expires_at) = self.expires_at {
150            let now = std::time::SystemTime::now()
151                .duration_since(std::time::UNIX_EPOCH)
152                .unwrap()
153                .as_secs() as i64;
154            now > expires_at
155        } else {
156            false
157        }
158    }
159}
160
161/// Delta audio in a streaming response.
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct DeltaAudio {
164    /// Audio ID
165    pub id: String,
166    /// Base64 encoded audio data
167    pub data: String,
168    /// Expiration timestamp
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub expires_at: Option<i64>,
171    /// Audio transcript
172    #[serde(skip_serializing_if = "Option::is_none")]
173    pub transcript: Option<String>,
174}
175
176impl DeltaAudio {
177    /// Decode the base64 audio data to bytes.
178    pub fn decode_data(&self) -> Result<Vec<u8>> {
179        use base64::Engine;
180        base64::engine::general_purpose::STANDARD.decode(&self.data).map_err(Into::into)
181    }
182}
183
/// Builder for the `<style>` tag that controls text-to-speech delivery.
///
/// The `<style>` tag sets the overall style of the synthesized audio and is
/// placed at the very beginning of the text to be synthesized.
#[derive(Debug, Clone, Default)]
pub struct TtsStyle {
    styles: Vec<String>,
}

impl TtsStyle {
    /// Start a builder with no styles selected.
    pub fn new() -> Self {
        Self::default()
    }

    /// Append one style to the builder and return it.
    ///
    /// Styles are kept in the order they were added.
    ///
    /// # Available Styles
    ///
    /// - **Speed control**: "变快", "变慢"
    /// - **Emotion**: "开心", "悲伤", "生气"
    /// - **Role play**: "孙悟空", "林黛玉"
    /// - **Style change**: "悄悄话", "夹子音", "台湾腔"
    /// - **Dialect**: "东北话", "四川话", "河南话", "粤语"
    /// - **Singing**: "唱歌"
    ///
    /// # Example
    ///
    /// ```rust
    /// use mimo_api::TtsStyle;
    ///
    /// let style = TtsStyle::new()
    ///     .with_style("开心")
    ///     .with_style("变快");
    ///
    /// let text = style.apply("明天就是周五了,真开心!");
    /// assert!(text.starts_with("<style>"));
    /// ```
    pub fn with_style(self, style: impl Into<String>) -> Self {
        let Self { mut styles } = self;
        styles.push(style.into());
        Self { styles }
    }

    /// Prefix `text` with a `<style>` tag listing the collected styles.
    ///
    /// The styles are separated by single spaces inside the tag. When no
    /// style has been added, `text` is returned unchanged.
    pub fn apply(&self, text: &str) -> String {
        // Nothing to tag: hand the text back as-is rather than emitting an
        // empty `<style></style>` pair.
        if self.styles.is_empty() {
            return text.to_owned();
        }

        let joined = self.styles.join(" ");
        format!("<style>{joined}</style>{text}")
    }
}
238
239/// Create styled text for TTS with the given style.
240///
241/// # Example
242///
243/// ```rust
244/// use mimo_api::styled_text;
245///
246/// let text = styled_text("开心", "明天就是周五了,真开心!");
247/// assert!(text.starts_with("<style>开心</style>"));
248/// ```
249pub fn styled_text(style: &str, text: &str) -> String {
250    TtsStyle::new().with_style(style).apply(text)
251}
252
#[cfg(test)]
mod tests {
    //! Unit tests for the audio configuration, response and style helpers.

    use super::*;
    use base64::Engine;

    /// Base64-encode `bytes` with the same engine the decoders use.
    fn encode(bytes: &[u8]) -> String {
        base64::engine::general_purpose::STANDARD.encode(bytes)
    }

    /// Build a `ResponseAudio` with only the fields the tests care about.
    fn response_audio(data: String, expires_at: Option<i64>) -> ResponseAudio {
        ResponseAudio {
            id: "test-id".to_string(),
            data,
            expires_at,
            transcript: Some("test".to_string()),
        }
    }

    #[test]
    fn test_audio_format_default() {
        let format = AudioFormat::default();
        assert_eq!(format, AudioFormat::Wav);
    }

    #[test]
    fn test_voice_default() {
        let voice = Voice::default();
        assert_eq!(voice, Voice::MimoDefault);
    }

    #[test]
    fn test_audio_config() {
        let audio = Audio::wav().voice(Voice::DefaultZh);
        assert_eq!(audio.format, Some(AudioFormat::Wav));
        assert_eq!(audio.voice, Some(Voice::DefaultZh));
    }

    #[test]
    fn test_audio_default_is_empty() {
        let audio = Audio::default();
        assert_eq!(audio.format, None);
        assert_eq!(audio.voice, None);
    }

    #[test]
    fn test_audio_serialization() {
        let audio = Audio::mp3().voice(Voice::DefaultEn);
        let json = serde_json::to_string(&audio).unwrap();
        assert!(json.contains("\"format\":\"mp3\""));
        assert!(json.contains("\"voice\":\"default_en\""));
    }

    #[test]
    fn test_audio_serialization_skips_unset_fields() {
        // Unset options must be omitted, not written as `null`.
        let json = serde_json::to_string(&Audio::new()).unwrap();
        assert_eq!(json, "{}");
    }

    #[test]
    fn test_audio_deserialization_with_missing_fields() {
        let audio: Audio = serde_json::from_str("{\"format\":\"pcm\"}").unwrap();
        assert_eq!(audio.format, Some(AudioFormat::Pcm));
        assert_eq!(audio.voice, None);
    }

    #[test]
    fn test_audio_formats() {
        assert_eq!(Audio::wav().format, Some(AudioFormat::Wav));
        assert_eq!(Audio::mp3().format, Some(AudioFormat::Mp3));
        assert_eq!(Audio::pcm().format, Some(AudioFormat::Pcm));
    }

    #[test]
    fn test_tts_style_single() {
        let text = TtsStyle::new().with_style("开心").apply("Hello");
        assert_eq!(text, "<style>开心</style>Hello");
    }

    #[test]
    fn test_tts_style_multiple() {
        let text = TtsStyle::new()
            .with_style("开心")
            .with_style("变快")
            .apply("Hello");
        // Pin the exact output so the separator and ordering are covered too.
        assert_eq!(text, "<style>开心 变快</style>Hello");
    }

    #[test]
    fn test_tts_style_empty() {
        let text = TtsStyle::new().apply("Hello");
        assert_eq!(text, "Hello");
    }

    #[test]
    fn test_styled_text_helper() {
        let text = styled_text("东北话", "哎呀妈呀");
        assert_eq!(text, "<style>东北话</style>哎呀妈呀");
    }

    #[test]
    fn test_response_audio_decode() {
        let audio = response_audio(encode(b"test audio data"), None);

        let decoded = audio.decode_data().unwrap();
        assert_eq!(decoded, b"test audio data");
    }

    #[test]
    fn test_response_audio_decode_invalid() {
        let audio = response_audio("not base64!!".to_string(), None);
        assert!(audio.decode_data().is_err());
    }

    #[test]
    fn test_response_audio_transcript() {
        let audio = ResponseAudio {
            id: "test-id".to_string(),
            data: String::new(),
            expires_at: None,
            transcript: Some("Hello world".to_string()),
        };

        assert_eq!(audio.transcript(), Some("Hello world"));
    }

    #[test]
    fn test_response_audio_is_expired() {
        // No expiry recorded: never reported as expired.
        assert!(!response_audio(String::new(), None).is_expired());
        // The epoch itself is long past.
        assert!(response_audio(String::new(), Some(0)).is_expired());
        // The far future has not arrived yet.
        assert!(!response_audio(String::new(), Some(i64::MAX)).is_expired());
    }

    #[test]
    fn test_delta_audio_decode() {
        let delta = DeltaAudio {
            id: "delta-id".to_string(),
            data: encode(b"chunk"),
            expires_at: None,
            transcript: None,
        };

        assert_eq!(delta.decode_data().unwrap(), b"chunk");
    }
}