use std::path::{Path, PathBuf};
use anyhow::Context as _;
use async_trait::async_trait;
use base64::{Engine as _, engine::general_purpose};
use serde_json::{Value, json};
use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
use crate::config::{ApiProvider, normalize_model_name_for_provider};
use crate::network_policy::{Decision, host_from_url};
use super::spec::{
ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
optional_bool, optional_str, required_str,
};
/// Audio format used when a request does not specify `format`, and the fallback
/// extension for the default output file name.
pub(crate) const DEFAULT_FORMAT: &str = "wav";
/// Built-in voice ID sent when the request names no voice and the selected model
/// is neither a voice-design nor a voice-clone model.
pub(crate) const DEFAULT_VOICE: &str = "mimo_default";
/// Maximum accepted length, in bytes, of a base64-encoded voice clone sample (10 MiB).
const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024;
/// Canonical format names accepted by the speech tool; published as the `format`
/// enum in the tool's input schema and echoed in the tool result.
pub(crate) const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"];
/// Xiaomi MiMo TTS model names reported in the tool result under
/// `supported_xiaomi_mimo_models`.
pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[
"mimo-v2.5-tts-voiceclone",
"mimo-v2.5-tts-voicedesign",
"mimo-v2.5-tts",
"mimo-v2-tts",
];
/// Model names published as the `model` enum in the input schema and quoted in the
/// "requires a TTS model" error message.
pub(crate) const SPEECH_MODEL_EXAMPLES: &[&str] = &[
"mimo-v2.5-tts",
"mimo-v2.5-tts-voicedesign",
"mimo-v2.5-tts-voiceclone",
"mimo-v2-tts",
];
/// Tool that synthesizes speech through a Xiaomi MiMo API client and writes the
/// resulting audio bytes to a file.
pub struct SpeechTool {
/// Name this tool reports through `ToolSpec::name`.
name: &'static str,
/// API client used for synthesis; when `None`, `execute` fails with a
/// "not available" error.
client: Option<DeepSeekClient>,
/// Configured fallback directory for the default output file, used when the
/// request supplies neither `output` nor `output_dir`.
output_dir: Option<PathBuf>,
}
impl SpeechTool {
#[must_use]
pub fn new(
name: &'static str,
client: Option<DeepSeekClient>,
output_dir: Option<PathBuf>,
) -> Self {
Self {
name,
client,
output_dir,
}
}
}
#[async_trait]
impl ToolSpec for SpeechTool {
/// Returns the name this tool was constructed with.
fn name(&self) -> &str {
self.name
}
/// Returns the fixed, model-facing description of when to use this tool.
fn description(&self) -> &str {
"Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning."
}
/// Returns the JSON schema for the tool input. Only `text` is required; the
/// `model` and `format` enums come from `SPEECH_MODEL_EXAMPLES` and
/// `SUPPORTED_SPEECH_FORMATS`.
fn input_schema(&self) -> Value {
json!({
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here."
},
"output": {
"type": "string",
"description": "Audio file path to write, relative to the workspace unless absolute. Default: speech.<format> in output_dir, configured [speech].output_dir, or the workspace."
},
"output_dir": {
"type": "string",
"description": "Directory for the default speech.<format> output file when output is omitted. Relative paths stay inside the workspace."
},
"model": {
"type": "string",
"description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.",
"enum": SPEECH_MODEL_EXAMPLES
},
"voice": {
"type": "string",
"description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone."
},
"instruction": {
"type": "string",
"description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim."
},
"voice_prompt": {
"type": "string",
"description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign."
},
"clone_voice": {
"type": "string",
"description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone."
},
"format": {
"type": "string",
"description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.",
"enum": SUPPORTED_SPEECH_FORMATS
},
"stream": {
"type": "boolean",
"description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false."
}
},
"required": ["text"]
})
}
/// Declares that this tool writes files, uses the network, and is sandboxable.
fn capabilities(&self) -> Vec<ToolCapability> {
vec![
ToolCapability::WritesFiles,
ToolCapability::Network,
ToolCapability::Sandboxable,
]
}
/// Returns `ApprovalRequirement::Auto` for this tool.
fn approval_requirement(&self) -> ApprovalRequirement {
ApprovalRequirement::Auto
}
/// Validates the request, synthesizes speech through the client, writes the
/// audio to the resolved output path, and returns a JSON summary.
///
/// All input validation and the network-policy check run before the synthesis
/// request is sent; the output file is written only after synthesis succeeds.
///
/// # Errors
///
/// - invalid input: empty `text`, unsupported `format`, `stream = true`, both
///   `clone_voice` and `voice` supplied, a model name without "tts", a
///   voice-design model without any instruction, a voice-clone model without a
///   voice, or an unusable clone sample;
/// - not available: the tool was constructed without a client;
/// - permission denied: the network policy denies or requires approval for the host;
/// - execution failed: reading the clone sample, the synthesis call, creating
///   the output directory, writing the file, or serializing the result fails.
async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> {
// The spoken text is trimmed; a blank value is rejected up front.
let text = required_str(&input, "text")?.trim().to_string();
if text.is_empty() {
return Err(ToolError::invalid_input("speech text cannot be empty"));
}
let client = self.client.clone().ok_or_else(|| {
ToolError::not_available(
"speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first",
)
})?;
// A missing or blank `format` falls back to DEFAULT_FORMAT; the raw value is
// kept only so the error message can quote what the caller sent.
let requested_format_raw = optional_str(&input, "format")
.map(str::trim)
.filter(|value| !value.is_empty())
.unwrap_or(DEFAULT_FORMAT);
let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| {
ToolError::invalid_input(format!(
"unsupported speech format '{requested_format_raw}' (allowed: {})",
SUPPORTED_SPEECH_FORMATS.join(", ")
))
})?;
// Streaming is advertised in the schema but not implemented here.
if optional_bool(&input, "stream", false) {
return Err(ToolError::invalid_input(
"stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file",
));
}
let output_raw = optional_str(&input, "output")
.map(str::trim)
.filter(|value| !value.is_empty());
// Resolve the destination before any network traffic so a bad path fails early.
let output_path = resolve_speech_output_path(
&input,
context,
output_raw,
&requested_format,
self.output_dir.as_ref(),
)?;
// Report the path as the caller wrote it when given, otherwise the resolved path.
let output_label = output_raw
.map(str::to_string)
.unwrap_or_else(|| output_path.display().to_string());
// Optional string inputs are trimmed and treated as absent when blank.
let raw_voice = optional_str(&input, "voice")
.map(str::trim)
.filter(|value| !value.is_empty())
.map(str::to_string);
let raw_instruction = optional_str(&input, "instruction")
.map(str::trim)
.filter(|value| !value.is_empty())
.map(str::to_string);
let voice_prompt = optional_str(&input, "voice_prompt")
.map(str::trim)
.filter(|value| !value.is_empty())
.map(str::to_string);
let clone_voice = optional_str(&input, "clone_voice")
.map(str::trim)
.filter(|value| !value.is_empty())
.map(str::to_string);
// A `voice` that is an audio data URI counts as a clone request for model inference.
let voice_is_data_uri = raw_voice
.as_deref()
.is_some_and(|value| value.starts_with("data:audio/"));
// NOTE: this rejects any `voice` value alongside `clone_voice`, including
// built-in voice IDs, not only data URIs.
if clone_voice.is_some() && raw_voice.is_some() {
return Err(ToolError::invalid_input(
"use either clone_voice or voice for cloned voice data, not both",
));
}
let model = infer_speech_model(
optional_str(&input, "model"),
clone_voice.is_some() || voice_is_data_uri,
voice_prompt.is_some(),
);
// Mode detection is a case-insensitive substring match on the model name.
let model_lower = model.to_ascii_lowercase();
if !model_lower.contains("tts") {
return Err(ToolError::invalid_input(format!(
"speech tool requires a TTS model (examples: {}), got '{model}'",
SPEECH_MODEL_EXAMPLES.join(", ")
)));
}
let is_voice_design = model_lower.contains("voicedesign");
let is_voice_clone = model_lower.contains("voiceclone");
let instruction = combine_speech_instructions(raw_instruction, voice_prompt);
if is_voice_design
&& instruction
.as_deref()
.is_none_or(|value| value.trim().is_empty())
{
return Err(ToolError::invalid_input(
"mimo-v2.5-tts-voicedesign requires voice_prompt or instruction",
));
}
// Voice precedence: encoded clone sample, then no voice for voice-design
// models, then the caller-supplied voice, then an error for voice-clone
// models, and finally the default built-in voice.
let voice = if let Some(clone_path) = clone_voice {
let clone_path = context.resolve_path(&clone_path)?;
Some(encode_voice_clone_data_uri(&clone_path).await?)
} else if is_voice_design {
None
} else if let Some(value) = raw_voice {
Some(value)
} else if is_voice_clone {
return Err(ToolError::invalid_input(
"mimo-v2.5-tts-voiceclone requires clone_voice <mp3|wav> or voice <data-uri>",
));
} else {
Some(DEFAULT_VOICE.to_string())
};
// The network policy is consulted immediately before the request is issued.
check_network_policy(context, client.base_url())?;
let response = client
.synthesize_speech(SpeechSynthesisRequest {
model: model.clone(),
text,
instruction,
audio_format: requested_format,
voice,
})
.await
.map_err(|err| {
ToolError::execution_failed(format!("speech synthesis failed: {err}"))
})?;
// Create the parent directory if the path has a non-empty one.
if let Some(parent) = output_path
.parent()
.filter(|path| !path.as_os_str().is_empty())
{
tokio::fs::create_dir_all(parent).await.map_err(|err| {
ToolError::execution_failed(format!(
"failed to create output directory {}: {err}",
parent.display()
))
})?;
}
tokio::fs::write(&output_path, &response.audio_bytes)
.await
.map_err(|err| {
ToolError::execution_failed(format!(
"failed to write audio file {}: {err}",
output_path.display()
))
})?;
// The summary reports model, format, and voice from the response rather than
// the request; the voice goes through `describe_speech_voice`, which replaces
// data URIs with a short label instead of echoing the payload.
let result = json!({
"mode": "speech",
"success": true,
"api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis",
"base_url": openai_compatible_base_url(client.base_url()),
"model": response.model,
"format": response.audio_format,
"stream": false,
"output": output_label,
"absolute_output": output_path.display().to_string(),
"bytes": response.audio_bytes.len(),
"voice": response.voice.as_deref().map(describe_speech_voice),
"transcript": response.transcript,
"supported_formats": SUPPORTED_SPEECH_FORMATS,
"supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS,
});
ToolResult::json(&result).map_err(|err| {
ToolError::execution_failed(format!("failed to serialize result: {err}"))
})
}
}
/// Picks the TTS model name for a request.
///
/// A non-blank `model` (after trimming) is passed through
/// `normalize_model_name_for_provider` for the Xiaomi MiMo provider; if that
/// yields nothing, the trimmed name is used as-is. Without an explicit model the
/// choice follows the request mode: voice clone takes priority over voice
/// design, and the plain TTS model is the default.
pub(crate) fn infer_speech_model(
    model: Option<&str>,
    has_clone_voice: bool,
    has_voice_prompt: bool,
) -> String {
    let explicit = model.map(str::trim).filter(|name| !name.is_empty());
    if let Some(name) = explicit {
        return normalize_model_name_for_provider(ApiProvider::XiaomiMimo, name)
            .unwrap_or_else(|| name.into());
    }

    let inferred = if has_clone_voice {
        "mimo-v2.5-tts-voiceclone"
    } else if has_voice_prompt {
        "mimo-v2.5-tts-voicedesign"
    } else {
        "mimo-v2.5-tts"
    };
    inferred.to_string()
}
/// Merges the style `instruction` and the `voice_prompt` into one instruction string.
///
/// Both inputs are trimmed and blank values are treated as absent. When both
/// remain, the voice prompt comes first, followed by a blank line and the
/// instruction. When only one remains it is returned trimmed; when neither
/// remains the result is `None`.
pub(crate) fn combine_speech_instructions(
    instruction: Option<String>,
    voice_prompt: Option<String>,
) -> Option<String> {
    let style = instruction
        .as_deref()
        .map(str::trim)
        .filter(|part| !part.is_empty());
    let design = voice_prompt
        .as_deref()
        .map(str::trim)
        .filter(|part| !part.is_empty());

    match (design, style) {
        (Some(design), Some(style)) => Some(format!("{design}\n\n{style}")),
        (Some(only), None) | (None, Some(only)) => Some(only.to_string()),
        (None, None) => None,
    }
}
/// Maps a user-supplied format name to its canonical form.
///
/// Matching ignores surrounding whitespace and ASCII case. `pcm` is accepted as
/// an alias for `pcm16`. Returns `None` for anything other than `wav`, `mp3`,
/// `pcm16`, or `pcm`.
pub(crate) fn normalize_speech_format(format: &str) -> Option<String> {
    let lowered = format.trim().to_ascii_lowercase();
    let canonical = match lowered.as_str() {
        "pcm" | "pcm16" => "pcm16",
        "wav" => "wav",
        "mp3" => "mp3",
        _ => return None,
    };
    Some(canonical.to_string())
}
/// Builds the default output file name `speech.<ext>` for `format`.
///
/// The extension is the canonical format name from `normalize_speech_format`;
/// an unrecognized format falls back to `DEFAULT_FORMAT`.
pub(crate) fn default_speech_output_name(format: &str) -> String {
    let extension =
        normalize_speech_format(format).unwrap_or_else(|| DEFAULT_FORMAT.to_string());
    format!("speech.{extension}")
}
/// Decides where the synthesized audio file is written.
///
/// Precedence:
/// 1. `output_raw`, resolved through `context.resolve_path`;
/// 2. the request's non-blank `output_dir`, resolved through
///    `context.resolve_path`, plus the default file name;
/// 3. `configured_output_dir` (joined as-is, without going through
///    `context.resolve_path`) plus the default file name;
/// 4. the workspace directory plus the default file name.
///
/// # Errors
///
/// Propagates any error from `context.resolve_path`.
fn resolve_speech_output_path(
    input: &Value,
    context: &ToolContext,
    output_raw: Option<&str>,
    format: &str,
    configured_output_dir: Option<&PathBuf>,
) -> Result<PathBuf, ToolError> {
    if let Some(explicit) = output_raw {
        return context.resolve_path(explicit);
    }

    let file_name = default_speech_output_name(format);
    let requested_dir = optional_str(input, "output_dir")
        .map(str::trim)
        .filter(|dir| !dir.is_empty());

    match (requested_dir, configured_output_dir) {
        (Some(dir), _) => Ok(context.resolve_path(dir)?.join(file_name)),
        (None, Some(dir)) => Ok(dir.join(file_name)),
        (None, None) => Ok(context.workspace.join(file_name)),
    }
}
/// Reads the voice clone sample at `path` asynchronously and converts it into a
/// base64 data URI.
///
/// # Errors
///
/// Returns an "execution failed" error when the file cannot be read, and an
/// "invalid input" error when the sample is rejected by
/// `voice_clone_data_uri_from_bytes`.
async fn encode_voice_clone_data_uri(path: &Path) -> Result<String, ToolError> {
    let sample = match tokio::fs::read(path).await {
        Ok(sample) => sample,
        Err(err) => {
            return Err(ToolError::execution_failed(format!(
                "failed to read voice clone sample {}: {err}",
                path.display()
            )));
        }
    };

    match voice_clone_data_uri_from_bytes(path, &sample) {
        Ok(data_uri) => Ok(data_uri),
        Err(err) => Err(ToolError::invalid_input(err.to_string())),
    }
}
/// Blocking counterpart of the async encoder: reads the voice clone sample at
/// `path` with `std::fs` and converts it into a base64 data URI.
///
/// # Errors
///
/// Fails when the file cannot be read (with the path attached as context) or
/// when `voice_clone_data_uri_from_bytes` rejects the sample.
pub(crate) fn encode_voice_clone_sample_data_uri(path: &Path) -> anyhow::Result<String> {
    std::fs::read(path)
        .with_context(|| format!("Failed to read voice clone sample {}", path.display()))
        .and_then(|sample| voice_clone_data_uri_from_bytes(path, &sample))
}
/// Converts raw voice sample `bytes` into a `data:<mime>;base64,<payload>` URI.
///
/// The MIME type is chosen from the extension of `path` (case-insensitive):
/// `.mp3` maps to `audio/mpeg` and `.wav` maps to `audio/wav`.
///
/// Both validations run before any encoding work, so an oversized or
/// unsupported sample is rejected without allocating the encoded payload.
///
/// # Errors
///
/// Fails when the base64-encoded sample would exceed
/// `VOICE_CLONE_BASE64_MAX_BYTES`, or when the extension is neither `mp3` nor
/// `wav`. The size check takes precedence when both apply.
fn voice_clone_data_uri_from_bytes(path: &Path, bytes: &[u8]) -> anyhow::Result<String> {
    // Padded standard base64 emits exactly four characters per started group of
    // three input bytes, so the encoded length is known without encoding.
    let encoded_len = bytes.len().div_ceil(3).saturating_mul(4);
    if encoded_len > VOICE_CLONE_BASE64_MAX_BYTES {
        anyhow::bail!(
            "voice clone sample is too large after base64 encoding ({} bytes > 10 MB)",
            encoded_len
        );
    }

    let extension = path
        .extension()
        .and_then(|value| value.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();
    let mime = match extension.as_str() {
        "mp3" => "audio/mpeg",
        "wav" => "audio/wav",
        other => {
            anyhow::bail!("unsupported voice clone sample extension '{other}'. Use .mp3 or .wav.");
        }
    };

    // Encode only once the sample is known to be acceptable.
    let base64_audio = general_purpose::STANDARD.encode(bytes);
    Ok(format!("data:{mime};base64,{base64_audio}"))
}
/// Produces a display label for a voice value.
///
/// Anything starting with `data:` is reported as a fixed placeholder instead of
/// the (potentially very long) embedded payload; any other value is returned
/// unchanged.
pub(crate) fn describe_speech_voice(voice: &str) -> String {
    let label = match voice.strip_prefix("data:") {
        Some(_) => "embedded voice clone sample",
        None => voice,
    };
    label.to_string()
}
/// Returns `base_url` in the form used for the OpenAI-compatible endpoint.
///
/// Trailing slashes are removed. A URL that already ends in `/v1` or `/beta` is
/// returned as-is; otherwise `/v1` is appended.
fn openai_compatible_base_url(base_url: &str) -> String {
    let root = base_url.trim_end_matches('/');
    let has_version_suffix = ["/v1", "/beta"]
        .iter()
        .any(|suffix| root.ends_with(*suffix));
    if has_version_suffix {
        return root.to_string();
    }
    format!("{root}/v1")
}
/// Checks the speech endpoint's host against the context's network policy.
///
/// Returns `Ok(())` when no policy is configured, when no host can be extracted
/// from the endpoint URL, or when the policy allows the host.
///
/// # Errors
///
/// Returns a "permission denied" error when the policy denies the host or
/// requires approval for it.
fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> {
    let decider = match context.network_policy.as_ref() {
        Some(decider) => decider,
        None => return Ok(()),
    };

    let endpoint = openai_compatible_base_url(base_url);
    let host = match host_from_url(&endpoint) {
        Some(host) => host,
        None => return Ok(()),
    };

    let denial = match decider.evaluate(&host, "speech") {
        Decision::Allow => return Ok(()),
        Decision::Deny => format!("speech network call to '{host}' blocked by network policy"),
        Decision::Prompt => format!(
            "speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config"
        ),
    };
    Err(ToolError::permission_denied(denial))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Model inference follows the request mode and normalizes explicit names.
    #[test]
    fn infers_speech_model_from_requested_mode() {
        let cases = [
            (None, false, false, "mimo-v2.5-tts"),
            (None, false, true, "mimo-v2.5-tts-voicedesign"),
            (None, true, false, "mimo-v2.5-tts-voiceclone"),
            (Some("mimo-tts"), false, false, "mimo-v2.5-tts"),
            (Some("mimo-v2-tts"), false, false, "mimo-v2-tts"),
        ];
        for (model, has_clone_voice, has_voice_prompt, expected) in cases {
            assert_eq!(
                infer_speech_model(model, has_clone_voice, has_voice_prompt),
                expected
            );
        }
    }

    /// The voice prompt is placed ahead of the style instruction.
    #[test]
    fn combines_voice_prompt_before_instruction() {
        let combined = combine_speech_instructions(
            Some("Speak warmly.".to_string()),
            Some("Young Chinese female voice".to_string()),
        );
        assert_eq!(
            combined.as_deref(),
            Some("Young Chinese female voice\n\nSpeak warmly.")
        );

        let trimmed = combine_speech_instructions(Some(" calm ".to_string()), None);
        assert_eq!(trimmed.as_deref(), Some("calm"));
    }

    /// Format names are case-insensitive and `pcm` aliases `pcm16`.
    #[test]
    fn normalizes_documented_speech_formats() {
        let cases = [
            ("WAV", Some("wav")),
            ("pcm16", Some("pcm16")),
            ("pcm", Some("pcm16")),
            ("flac", None),
        ];
        for (raw, expected) in cases {
            assert_eq!(normalize_speech_format(raw).as_deref(), expected);
        }
    }

    /// The advertised model list contains TTS models only.
    #[test]
    fn supported_xiaomi_mimo_speech_models_are_tts_only() {
        for model in SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS {
            assert!(
                model.to_ascii_lowercase().contains("tts"),
                "model-visible speech list must not include chat-only MiMo models"
            );
        }
        assert!(SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-tts"));
        for chat_only in ["mimo-v2.5-pro", "mimo-v2.5"] {
            assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&chat_only));
        }
    }

    /// With no `output` or `output_dir` in the request, the configured directory wins.
    #[test]
    fn configured_output_dir_is_used_for_default_tool_output() {
        let workspace = tempfile::tempdir().expect("tempdir");
        let context = ToolContext::new(workspace.path().to_path_buf());
        let configured = workspace.path().join("speech-artifacts");
        let request = json!({"text": "hello"});

        let resolved =
            resolve_speech_output_path(&request, &context, None, "pcm", Some(&configured))
                .expect("output path");

        assert_eq!(resolved, configured.join("speech.pcm16"));
    }

    /// The displayed base URL always carries the `/v1` suffix exactly once.
    #[test]
    fn displays_openai_compatible_base_url() {
        for input in [
            "https://api.xiaomimimo.com",
            "https://api.xiaomimimo.com/v1",
        ] {
            assert_eq!(
                openai_compatible_base_url(input),
                "https://api.xiaomimimo.com/v1"
            );
        }
    }

    /// The tool is auto-approved yet still reported as writing state.
    #[test]
    fn speech_tool_is_auto_approved_but_not_read_only() {
        let tool = SpeechTool::new("speech", None, None);
        assert_eq!(tool.name(), "speech");
        assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto);
        assert!(!tool.is_read_only());

        let schema = tool.input_schema().to_string();
        for needle in ["mimo-v2.5-tts-voiceclone", "pcm16", "stream"] {
            assert!(schema.contains(needle));
        }
    }
}