use crate::handler::{
Pronunciation, SpeechHandler, SpeechSynthesizeParams, SpeechSynthesizeResult,
};
use adk_rust_mcp_common::config::Config;
use adk_rust_mcp_common::error::Error;
use rmcp::{
model::{
CallToolResult, Content, ListResourcesResult, ReadResourceResult, ServerCapabilities,
ServerInfo,
},
ErrorData as McpError, ServerHandler,
};
use schemars::JsonSchema;
use serde::Deserialize;
use std::borrow::Cow;
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, info};
/// Server type that exposes the speech tools through rmcp's `ServerHandler`.
///
/// Clones share the same `Arc`-wrapped handler slot, so a handler created
/// through one clone is visible to every other clone.
#[derive(Clone)]
pub struct SpeechServer {
/// Lazily populated handler slot: `new` stores `None` and `ensure_handler`
/// fills it on demand.
handler: Arc<RwLock<Option<SpeechHandler>>>,
/// Configuration that is cloned into `SpeechHandler::new` when the handler is created.
config: Config,
}
// NOTE: this type derives `JsonSchema` and its schema is published as the
// `speech_synthesize` tool's input schema, so the `///` comments below are
// client-facing: schemars copies them into the schema's `description` fields.
/// Input parameters for the `speech_synthesize` tool. Only `text` is required.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct SpeechSynthesizeToolParams {
    /// Text to convert to speech.
    pub text: String,
    /// Name of the voice to use, for example `en-US-Chirp3-HD-Achernar`.
    /// Use the `speech_list_voices` tool to see the available voices.
    #[serde(default)]
    pub voice: Option<String>,
    /// Language code of the speech. Defaults to `en-US` when omitted.
    #[serde(default)]
    pub language_code: Option<String>,
    /// Speaking rate. Defaults to 1.0 when omitted.
    #[serde(default)]
    pub speaking_rate: Option<f32>,
    /// Pitch adjustment. Defaults to 0.0 when omitted.
    #[serde(default)]
    pub pitch: Option<f32>,
    /// Custom pronunciations to apply to specific words in `text`.
    #[serde(default)]
    pub pronunciations: Option<Vec<PronunciationToolParam>>,
    /// Local file path to save the generated audio to.
    #[serde(default)]
    pub output_file: Option<String>,
}
// NOTE: this type is nested in the published `speech_synthesize` input schema,
// so the `///` comments below are client-facing: schemars copies them into the
// schema's `description` fields.
/// Custom pronunciation for a single word. All three fields are required.
#[derive(Debug, Deserialize, JsonSchema)]
pub struct PronunciationToolParam {
    /// Word whose pronunciation is being customized.
    pub word: String,
    /// Phonetic spelling of `word`, written in the alphabet named by `alphabet`.
    pub phonetic: String,
    /// Phonetic alphabet that `phonetic` is written in (IPA or X-SAMPA), for example `ipa`.
    pub alphabet: String,
}
impl From<PronunciationToolParam> for Pronunciation {
fn from(p: PronunciationToolParam) -> Self {
Self {
word: p.word,
phonetic: p.phonetic,
alphabet: p.alphabet,
}
}
}
impl From<SpeechSynthesizeToolParams> for SpeechSynthesizeParams {
fn from(params: SpeechSynthesizeToolParams) -> Self {
Self {
text: params.text,
voice: params.voice,
language_code: params
.language_code
.unwrap_or_else(|| "en-US".to_string()),
speaking_rate: params.speaking_rate.unwrap_or(1.0),
pitch: params.pitch.unwrap_or(0.0),
pronunciations: params
.pronunciations
.map(|p| p.into_iter().map(Into::into).collect()),
output_file: params.output_file,
}
}
}
impl SpeechServer {
pub fn new(config: Config) -> Self {
Self {
handler: Arc::new(RwLock::new(None)),
config,
}
}
async fn ensure_handler(&self) -> Result<(), Error> {
let mut handler = self.handler.write().await;
if handler.is_none() {
*handler = Some(SpeechHandler::new(self.config.clone()).await?);
}
Ok(())
}
pub async fn synthesize(
&self,
params: SpeechSynthesizeToolParams,
) -> Result<CallToolResult, McpError> {
info!(text_len = params.text.len(), "Synthesizing speech");
self.ensure_handler().await.map_err(|e| {
McpError::internal_error(format!("Failed to initialize handler: {}", e), None)
})?;
let handler_guard = self.handler.read().await;
let handler = handler_guard
.as_ref()
.ok_or_else(|| McpError::internal_error("Handler not initialized", None))?;
let synth_params: SpeechSynthesizeParams = params.into();
let result = handler.synthesize(synth_params).await.map_err(|e| {
McpError::internal_error(format!("Speech synthesis failed: {}", e), None)
})?;
let content = match result {
SpeechSynthesizeResult::Base64(audio) => {
vec![Content::text(format!(
"data:{};base64,{}",
audio.mime_type, audio.data
))]
}
SpeechSynthesizeResult::LocalFile(path) => {
vec![Content::text(format!("Audio saved to: {}", path))]
}
};
Ok(CallToolResult::success(content))
}
pub async fn list_voices(&self) -> Result<CallToolResult, McpError> {
info!("Listing available voices");
self.ensure_handler().await.map_err(|e| {
McpError::internal_error(format!("Failed to initialize handler: {}", e), None)
})?;
let handler_guard = self.handler.read().await;
let handler = handler_guard
.as_ref()
.ok_or_else(|| McpError::internal_error("Handler not initialized", None))?;
let voices = handler.list_voices().await.map_err(|e| {
McpError::internal_error(format!("Failed to list voices: {}", e), None)
})?;
let voices_json = serde_json::to_string_pretty(&voices).map_err(|e| {
McpError::internal_error(format!("Failed to serialize voices: {}", e), None)
})?;
Ok(CallToolResult::success(vec![Content::text(voices_json)]))
}
}
/// rmcp integration: advertises the two speech tools and dispatches calls to
/// the inherent `synthesize` / `list_voices` methods.
impl ServerHandler for SpeechServer {
    /// Returns the server description: usage instructions plus capabilities
    /// with tools enabled. All remaining fields use their defaults.
    fn get_info(&self) -> ServerInfo {
        ServerInfo {
            instructions: Some(
                "Text-to-speech server using Google Cloud TTS Chirp3-HD API. \
                 Use the speech_synthesize tool to convert text to speech, \
                 and speech_list_voices to see available voices."
                    .to_string(),
            ),
            capabilities: ServerCapabilities::builder().enable_tools().build(),
            ..Default::default()
        }
    }

    /// Lists the `speech_synthesize` and `speech_list_voices` tools.
    ///
    /// The request parameters are ignored and the whole list is returned in a
    /// single page (`next_cursor` is `None`).
    fn list_tools(
        &self,
        _params: Option<rmcp::model::PaginatedRequestParams>,
        _context: rmcp::service::RequestContext<rmcp::service::RoleServer>,
    ) -> impl std::future::Future<Output = Result<rmcp::model::ListToolsResult, McpError>> + Send + '_
    {
        async move {
            use rmcp::model::{ListToolsResult, Tool};
            use schemars::schema_for;

            // Input schema for `speech_synthesize`, derived from the parameter
            // struct. If serialization fails or does not yield a JSON object,
            // an empty schema map is used instead.
            let synth_schema = schema_for!(SpeechSynthesizeToolParams);
            let synth_schema_value = serde_json::to_value(&synth_schema).unwrap_or_default();
            let synth_input_schema = match synth_schema_value {
                serde_json::Value::Object(map) => Arc::new(map),
                _ => Arc::new(serde_json::Map::new()),
            };

            // `speech_list_voices` takes no arguments: `{"type": "object"}`.
            let mut empty_schema_map = serde_json::Map::new();
            empty_schema_map.insert(
                "type".to_string(),
                serde_json::Value::String("object".to_string()),
            );
            let empty_schema = Arc::new(empty_schema_map);

            Ok(ListToolsResult {
                tools: vec![
                    Tool {
                        name: Cow::Borrowed("speech_synthesize"),
                        description: Some(Cow::Borrowed(
                            "Convert text to speech using Google Cloud TTS Chirp3-HD voices. \
                             Returns base64-encoded WAV audio or saves to a local file. \
                             Supports custom pronunciations using IPA or X-SAMPA phonetic alphabets.",
                        )),
                        input_schema: synth_input_schema,
                        annotations: None,
                        icons: None,
                        meta: None,
                        output_schema: None,
                        title: None,
                    },
                    Tool {
                        name: Cow::Borrowed("speech_list_voices"),
                        description: Some(Cow::Borrowed(
                            "List available Chirp3-HD voices with their supported languages.",
                        )),
                        input_schema: empty_schema,
                        annotations: None,
                        icons: None,
                        meta: None,
                        output_schema: None,
                        title: None,
                    },
                ],
                next_cursor: None,
                meta: None,
            })
        }
    }

    /// Dispatches a tool call by name.
    ///
    /// # Errors
    ///
    /// Returns an invalid-params error when the tool name is unknown, or when
    /// the `speech_synthesize` arguments are missing or fail to deserialize.
    /// Errors from the tool implementations are passed through unchanged.
    fn call_tool(
        &self,
        params: rmcp::model::CallToolRequestParams,
        _context: rmcp::service::RequestContext<rmcp::service::RoleServer>,
    ) -> impl std::future::Future<Output = Result<CallToolResult, McpError>> + Send + '_ {
        async move {
            match params.name.as_ref() {
                "speech_synthesize" => {
                    // `transpose` surfaces a deserialization failure first;
                    // absent arguments are reported separately afterwards.
                    let tool_params: SpeechSynthesizeToolParams = params
                        .arguments
                        .map(|args| serde_json::from_value(serde_json::Value::Object(args)))
                        .transpose()
                        .map_err(|e| {
                            McpError::invalid_params(format!("Invalid parameters: {}", e), None)
                        })?
                        .ok_or_else(|| McpError::invalid_params("Missing parameters", None))?;
                    self.synthesize(tool_params).await
                }
                "speech_list_voices" => self.list_voices().await,
                _ => Err(McpError::invalid_params(
                    format!("Unknown tool: {}", params.name),
                    None,
                )),
            }
        }
    }

    /// Lists resources; this server exposes none, so the list is always empty.
    fn list_resources(
        &self,
        _params: Option<rmcp::model::PaginatedRequestParams>,
        _context: rmcp::service::RequestContext<rmcp::service::RoleServer>,
    ) -> impl std::future::Future<Output = Result<ListResourcesResult, McpError>> + Send + '_ {
        async move {
            debug!("Listing resources (none available for speech server)");
            Ok(ListResourcesResult {
                resources: vec![],
                next_cursor: None,
                meta: None,
            })
        }
    }

    /// Reads a resource; always fails because this server exposes none.
    ///
    /// # Errors
    ///
    /// Always returns a resource-not-found error naming the requested URI.
    fn read_resource(
        &self,
        params: rmcp::model::ReadResourceRequestParams,
        _context: rmcp::service::RequestContext<rmcp::service::RoleServer>,
    ) -> impl std::future::Future<Output = Result<ReadResourceResult, McpError>> + Send + '_ {
        async move {
            let uri = &params.uri;
            debug!(uri = %uri, "Reading resource");
            Err(McpError::resource_not_found(
                format!("Unknown resource: {}", uri),
                None,
            ))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the configuration shared by the tests in this module.
    fn test_config() -> Config {
        Config {
            project_id: String::from("test-project"),
            location: String::from("us-central1"),
            gcs_bucket: None,
            port: 8080,
            ..Default::default()
        }
    }

    /// The server info carries usage instructions.
    #[test]
    fn test_server_info() {
        let info = SpeechServer::new(test_config()).get_info();
        assert!(info.instructions.is_some());
    }

    /// Explicitly supplied values pass through the conversion unchanged.
    #[test]
    fn test_tool_params_conversion() {
        let hello = PronunciationToolParam {
            word: String::from("hello"),
            phonetic: String::from("həˈloʊ"),
            alphabet: String::from("ipa"),
        };
        let tool_params = SpeechSynthesizeToolParams {
            text: String::from("Hello world"),
            voice: Some(String::from("en-US-Chirp3-HD-Achernar")),
            language_code: Some(String::from("en-US")),
            speaking_rate: Some(1.5),
            pitch: Some(2.0),
            pronunciations: Some(vec![hello]),
            output_file: None,
        };

        let converted = SpeechSynthesizeParams::from(tool_params);

        assert_eq!(converted.text, "Hello world");
        assert_eq!(
            converted.voice,
            Some(String::from("en-US-Chirp3-HD-Achernar"))
        );
        assert_eq!(converted.language_code, "en-US");
        assert_eq!(converted.speaking_rate, 1.5);
        assert_eq!(converted.pitch, 2.0);
        assert!(converted.pronunciations.is_some());
    }

    /// Omitted optional values fall back to the documented defaults.
    #[test]
    fn test_tool_params_defaults() {
        let text_only = SpeechSynthesizeToolParams {
            text: String::from("Hello"),
            voice: None,
            language_code: None,
            speaking_rate: None,
            pitch: None,
            pronunciations: None,
            output_file: None,
        };

        let converted = SpeechSynthesizeParams::from(text_only);

        assert_eq!(converted.language_code, "en-US");
        assert_eq!(converted.speaking_rate, 1.0);
        assert_eq!(converted.pitch, 0.0);
    }

    /// Every pronunciation field is carried over as-is.
    #[test]
    fn test_pronunciation_conversion() {
        let converted = Pronunciation::from(PronunciationToolParam {
            word: String::from("tomato"),
            phonetic: String::from("təˈmeɪtoʊ"),
            alphabet: String::from("ipa"),
        });

        assert_eq!(converted.word, "tomato");
        assert_eq!(converted.phonetic, "təˈmeɪtoʊ");
        assert_eq!(converted.alphabet, "ipa");
    }
}