Skip to main content

mimo_api/types/
audio.rs

1//! Audio types for the MiMo API.
2//!
3//! This module provides types for configuring audio output, particularly for
4//! text-to-speech (TTS) synthesis using the `mimo-v2-tts` model.
5
6use {
7    crate::error::{Error, Result},
8    base64::prelude::*,
9    serde::{Deserialize, Serialize},
10    tokio::fs::read,
11};
12
/// Audio output format.
///
/// Variants are (de)serialized as lowercase strings: `"wav"`, `"mp3"`,
/// `"pcm"` and `"pcm16"`. [`AudioFormat::Wav`] is the default.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "lowercase")]
pub enum AudioFormat {
    /// WAV format (recommended for high quality)
    #[default]
    Wav,
    /// MP3 format (smaller file size)
    Mp3,
    /// PCM format (for streaming, maps to pcm16)
    // NOTE(review): this variant serializes as "pcm"; the "maps to pcm16"
    // behavior is not visible in this file — presumably server-side, confirm.
    Pcm,
    /// PCM16 format (16-bit PCM, for streaming)
    #[serde(rename = "pcm16")]
    Pcm16,
}
28
//noinspection SpellCheckingInspection
/// Available voice options for text-to-speech.
///
/// This enum supports both preset voices and custom voices (for voice cloning).
///
/// `Serialize` and `Deserialize` are implemented by hand in this file: each
/// preset maps to a fixed string, and any other string is carried by
/// [`Voice::Custom`]. [`Voice::MimoDefault`] is the default.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum Voice {
    /// MiMo default voice - balanced tone (wire name `mimo_default`)
    #[default]
    MimoDefault,
    /// Default English female voice (legacy, wire name `default_en`)
    DefaultEn,
    /// Default Chinese female voice (legacy, wire name `default_zh`)
    DefaultZh,
    /// Bingtang (wire name `冰糖`) - Chinese female voice
    Bingtang,
    /// Moli (wire name `茉莉`) - Chinese female voice
    Moli,
    /// Suda (wire name `苏打`) - Chinese male voice
    Suda,
    /// Baihua (wire name `白桦`) - Chinese male voice
    Baihua,
    /// Mia - English female voice
    Mia,
    /// Chloe - English female voice
    Chloe,
    /// Milo - English male voice
    Milo,
    /// Dean - English male voice
    Dean,
    /// Custom voice string (for voice cloning with base64 audio)
    Custom(String),
}
61
62impl Voice {
63    /// Create a custom voice from a string (for voice cloning).
64    ///
65    /// The string should be in the format: `data:{MIME_TYPE};base64,$BASE64_AUDIO`
66    pub fn custom<S: Into<String>>(voice: S) -> Self {
67        Voice::Custom(voice.into())
68    }
69
70    /// Create a voice clone from an audio file path.
71    ///
72    /// Reads the audio file, encodes it as base64, and creates a custom voice string.
73    /// Supported formats: MP3, WAV.
74    pub async fn from_audio_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
75        let path = path.as_ref();
76        let data = read(path).await?;
77
78        let mime_type = match path.extension().and_then(|ext| ext.to_str()) {
79            Some("mp3") => "audio/mpeg",
80            Some("wav") => "audio/wav",
81            _ => return Err(Error::InvalidParameter("Unsupported audio format".into())),
82        };
83
84        let base64_audio = BASE64_STANDARD.encode(&data);
85        let voice_str = format!("data:{};base64,{}", mime_type, base64_audio);
86
87        Ok(Voice::Custom(voice_str))
88    }
89}
90
91// Manual Serialize implementation for Voice
92impl Serialize for Voice {
93    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
94    where
95        S: serde::Serializer,
96    {
97        let s = match self {
98            Voice::MimoDefault => "mimo_default",
99            Voice::DefaultEn => "default_en",
100            Voice::DefaultZh => "default_zh",
101            Voice::Bingtang => "冰糖",
102            Voice::Moli => "茉莉",
103            Voice::Suda => "苏打",
104            Voice::Baihua => "白桦",
105            Voice::Mia => "Mia",
106            Voice::Chloe => "Chloe",
107            Voice::Milo => "Milo",
108            Voice::Dean => "Dean",
109            Voice::Custom(s) => s.as_str(),
110        };
111        serializer.serialize_str(s)
112    }
113}
114
115// Manual Deserialize implementation for Voice
116impl<'de> Deserialize<'de> for Voice {
117    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
118    where
119        D: serde::Deserializer<'de>,
120    {
121        let s = String::deserialize(deserializer)?;
122        Ok(match s.as_str() {
123            "mimo_default" => Voice::MimoDefault,
124            "default_en" => Voice::DefaultEn,
125            "default_zh" => Voice::DefaultZh,
126            "冰糖" => Voice::Bingtang,
127            "茉莉" => Voice::Moli,
128            "苏打" => Voice::Suda,
129            "白桦" => Voice::Baihua,
130            "Mia" => Voice::Mia,
131            "Chloe" => Voice::Chloe,
132            "Milo" => Voice::Milo,
133            "Dean" => Voice::Dean,
134            _ => Voice::Custom(s),
135        })
136    }
137}
138
/// Audio output configuration for text-to-speech.
///
/// Both fields are optional; a field left as `None` is omitted from the
/// serialized output instead of being written as `null`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Audio {
    /// Output audio format
    #[serde(skip_serializing_if = "Option::is_none")]
    pub format: Option<AudioFormat>,
    /// Voice to use for synthesis
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<Voice>,
}
149
150impl Audio {
151    /// Create a new audio configuration.
152    ///
153    /// # Example
154    ///
155    /// ```rust
156    /// use mimo_api::{Audio, AudioFormat, Voice};
157    ///
158    /// let audio = Audio::new()
159    ///     .format(AudioFormat::Wav)
160    ///     .voice(Voice::MimoDefault);
161    /// ```
162    pub fn new() -> Self {
163        Self {
164            format: None,
165            voice: None,
166        }
167    }
168
169    /// Set the audio format.
170    pub fn format(mut self, format: AudioFormat) -> Self {
171        self.format = Some(format);
172        self
173    }
174
175    /// Set the voice for synthesis.
176    pub fn voice(mut self, voice: Voice) -> Self {
177        self.voice = Some(voice);
178        self
179    }
180
181    /// Create audio configuration with WAV format.
182    pub fn wav() -> Self {
183        Self::new().format(AudioFormat::Wav)
184    }
185
186    /// Create audio configuration with MP3 format.
187    pub fn mp3() -> Self {
188        Self::new().format(AudioFormat::Mp3)
189    }
190
191    /// Create audio configuration with PCM format (for streaming).
192    pub fn pcm() -> Self {
193        Self::new().format(AudioFormat::Pcm)
194    }
195
196    /// Create audio configuration with PCM16 format (16-bit PCM, for streaming).
197    pub fn pcm16() -> Self {
198        Self::new().format(AudioFormat::Pcm16)
199    }
200}
201
202impl Default for Audio {
203    fn default() -> Self {
204        Self::new()
205    }
206}
207
/// Response audio data from text-to-speech.
///
/// `expires_at` and `transcript` are optional and are omitted from the
/// serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResponseAudio {
    /// Audio ID
    pub id: String,
    /// Base64 encoded audio data
    pub data: String,
    /// Expiration timestamp (Unix timestamp)
    // NOTE(review): compared against whole seconds since the Unix epoch in
    // `is_expired`, so this is presumably in seconds — confirm against the API.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub expires_at: Option<i64>,
    /// Audio transcript (text that was synthesized)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
}
222
223impl ResponseAudio {
224    /// Decode the base64 audio data to bytes.
225    ///
226    /// # Example
227    ///
228    /// ```rust,no_run
229    /// use mimo_api::{Client, Audio, Voice, Message};
230    ///
231    /// #[tokio::main]
232    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
233    ///     let client = Client::from_env()?;
234    ///
235    ///     let response = client.tts("Hello, world!")
236    ///         .voice(Voice::DefaultEn)
237    ///         .send()
238    ///         .await?;
239    ///
240    ///     let audio = response.audio()?;
241    ///     let audio_bytes = audio.decode_data()?;
242    ///     std::fs::write("output.wav", audio_bytes)?;
243    ///     Ok(())
244    /// }
245    /// ```
246    pub fn decode_data(&self) -> Result<Vec<u8>> {
247        use base64::Engine;
248        base64::engine::general_purpose::STANDARD
249            .decode(&self.data)
250            .map_err(Into::into)
251    }
252
253    /// Get the transcript of the synthesized text.
254    pub fn transcript(&self) -> Option<&str> {
255        self.transcript.as_deref()
256    }
257
258    /// Check if the audio has expired.
259    pub fn is_expired(&self) -> bool {
260        if let Some(expires_at) = self.expires_at {
261            let now = std::time::SystemTime::now()
262                .duration_since(std::time::UNIX_EPOCH)
263                .unwrap()
264                .as_secs() as i64;
265            now > expires_at
266        } else {
267            false
268        }
269    }
270}
271
/// Delta audio in a streaming response.
///
/// `expires_at` and `transcript` are optional and are omitted from the
/// serialized output when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeltaAudio {
    /// Audio ID
    pub id: String,
    /// Base64 encoded audio data
    pub data: String,
    /// Expiration timestamp
    #[serde(skip_serializing_if = "Option::is_none")]
    pub expires_at: Option<i64>,
    /// Audio transcript
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
}
286
287impl DeltaAudio {
288    /// Decode the base64 audio data to bytes.
289    pub fn decode_data(&self) -> Result<Vec<u8>> {
290        use base64::Engine;
291        base64::engine::general_purpose::STANDARD
292            .decode(&self.data)
293            .map_err(Into::into)
294    }
295}
296
/// Text-to-speech style control.
///
/// Use the `<style>` tag to control the overall style of the synthesized audio.
/// The style should be placed at the beginning of the text to be synthesized.
///
/// Styles are collected with [`TtsStyle::with_style`] and rendered in front of
/// the text by [`TtsStyle::apply`].
#[derive(Debug, Clone, Default)]
pub struct TtsStyle {
    /// Styles in the order they were added.
    styles: Vec<String>,
}

impl TtsStyle {
    /// Create a new TTS style builder with no styles.
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a style to apply.
    ///
    /// # Available Styles
    ///
    /// - **Speed control**: "变快", "变慢"
    /// - **Emotion**: "开心", "悲伤", "生气"
    /// - **Role play**: "孙悟空", "林黛玉"
    /// - **Style change**: "悄悄话", "夹子音", "台湾腔"
    /// - **Dialect**: "东北话", "四川话", "河南话", "粤语"
    /// - **Singing**: "唱歌"
    ///
    /// # Example
    ///
    /// ```rust
    /// use mimo_api::TtsStyle;
    ///
    /// let style = TtsStyle::new()
    ///     .with_style("开心")
    ///     .with_style("变快");
    ///
    /// let text = style.apply("明天就是周五了,真开心!");
    /// assert!(text.starts_with("<style>"));
    /// ```
    pub fn with_style(mut self, style: impl Into<String>) -> Self {
        self.styles.push(style.into());
        self
    }

    /// Apply the style to the text to be synthesized.
    ///
    /// Returns `text` prefixed with `<style>…</style>`, where the tag holds the
    /// added styles separated by single spaces, in insertion order.
    ///
    /// Styles that are empty or whitespace-only are skipped so they cannot
    /// produce an empty tag or stray separators. If no usable style remains,
    /// `text` is returned unchanged.
    pub fn apply(&self, text: &str) -> String {
        let tag_body = self
            .styles
            .iter()
            .map(String::as_str)
            .filter(|style| !style.trim().is_empty())
            .collect::<Vec<_>>()
            .join(" ");

        if tag_body.is_empty() {
            text.to_string()
        } else {
            format!("<style>{}</style>{}", tag_body, text)
        }
    }
}
351
352/// Create styled text for TTS with the given style.
353///
354/// # Example
355///
356/// ```rust
357/// use mimo_api::styled_text;
358///
359/// let text = styled_text("开心", "明天就是周五了,真开心!");
360/// assert!(text.starts_with("<style>开心</style>"));
361/// ```
362pub fn styled_text(style: &str, text: &str) -> String {
363    TtsStyle::new().with_style(style).apply(text)
364}
365
/// Unit tests for the audio types: defaults, builder helpers, JSON
/// serialization, style tagging and base64 decoding.
#[cfg(test)]
mod tests {
    use super::*;
    use base64::Engine;

    /// WAV is the default output format.
    #[test]
    fn test_audio_format_default() {
        let format = AudioFormat::default();
        assert_eq!(format, AudioFormat::Wav);
    }

    /// The MiMo default voice is the default voice.
    #[test]
    fn test_voice_default() {
        let voice = Voice::default();
        assert_eq!(voice, Voice::MimoDefault);
    }

    /// Builder helpers set both the format and the voice.
    #[test]
    fn test_audio_config() {
        let audio = Audio::wav().voice(Voice::DefaultZh);
        assert_eq!(audio.format, Some(AudioFormat::Wav));
        assert_eq!(audio.voice, Some(Voice::DefaultZh));
    }

    /// Format and voice serialize to their lowercase / wire-name strings.
    #[test]
    fn test_audio_serialization() {
        let audio = Audio::mp3().voice(Voice::DefaultEn);
        let json = serde_json::to_string(&audio).unwrap();
        assert!(json.contains("\"format\":\"mp3\""));
        assert!(json.contains("\"voice\":\"default_en\""));
    }

    /// Each format shortcut sets the matching format.
    // NOTE(review): `Audio::pcm16()` is not covered here.
    #[test]
    fn test_audio_formats() {
        assert_eq!(Audio::wav().format, Some(AudioFormat::Wav));
        assert_eq!(Audio::mp3().format, Some(AudioFormat::Mp3));
        assert_eq!(Audio::pcm().format, Some(AudioFormat::Pcm));
    }

    /// A single style is wrapped in one `<style>` tag before the text.
    #[test]
    fn test_tts_style_single() {
        let text = TtsStyle::new().with_style("开心").apply("Hello");
        assert_eq!(text, "<style>开心</style>Hello");
    }

    /// Multiple styles all appear inside the tag, followed by the text.
    #[test]
    fn test_tts_style_multiple() {
        let text = TtsStyle::new()
            .with_style("开心")
            .with_style("变快")
            .apply("Hello");
        assert!(text.starts_with("<style>"));
        assert!(text.contains("开心"));
        assert!(text.contains("变快"));
        assert!(text.ends_with("Hello"));
    }

    /// With no styles the text is returned unchanged.
    #[test]
    fn test_tts_style_empty() {
        let text = TtsStyle::new().apply("Hello");
        assert_eq!(text, "Hello");
    }

    /// The `styled_text` helper produces the same output as a one-style builder.
    #[test]
    fn test_styled_text_helper() {
        let text = styled_text("东北话", "哎呀妈呀");
        assert_eq!(text, "<style>东北话</style>哎呀妈呀");
    }

    /// Base64 data round-trips back to the original bytes.
    #[test]
    fn test_response_audio_decode() {
        let audio = ResponseAudio {
            id: "test-id".to_string(),
            data: base64::engine::general_purpose::STANDARD.encode(b"test audio data"),
            expires_at: None,
            transcript: Some("test".to_string()),
        };

        let decoded = audio.decode_data().unwrap();
        assert_eq!(decoded, b"test audio data");
    }

    /// The transcript accessor exposes the stored transcript as `&str`.
    #[test]
    fn test_response_audio_transcript() {
        let audio = ResponseAudio {
            id: "test-id".to_string(),
            data: String::new(),
            expires_at: None,
            transcript: Some("Hello world".to_string()),
        };

        assert_eq!(audio.transcript(), Some("Hello world"));
    }
}