1use super::{Tool, ToolResult};
9use anyhow::{Context, Result};
10use async_trait::async_trait;
11use serde::Deserialize;
12use serde_json::{Value, json};
13use std::time::Duration;
14
/// Upper bound applied to every HTTP request made through the shared client
/// (two minutes).
const REQUEST_TIMEOUT: Duration = Duration::from_secs(2 * 60);
16
/// Resolves the base URL of the voice API.
///
/// Reads the `CODETETHER_VOICE_API_URL` environment variable. The value is
/// trimmed of surrounding whitespace and of trailing `/` characters so that
/// callers can append `/path` segments without producing `//`. When the
/// variable is unset, not valid Unicode, or blank after trimming, the built-in
/// default endpoint is returned instead.
fn default_voice_api_url() -> String {
    const DEFAULT_URL: &str = "https://voice.quantum-forge.io";

    std::env::var("CODETETHER_VOICE_API_URL")
        .ok()
        .map(|raw| raw.trim().trim_end_matches('/').to_string())
        // A blank override would otherwise yield relative URLs like "/voices".
        .filter(|url| !url.is_empty())
        .unwrap_or_else(|| DEFAULT_URL.to_string())
}
21
/// Tool that talks to the voice HTTP API (speech synthesis, transcription,
/// voice listing and health checks).
pub struct VoiceTool {
    /// HTTP client reused for every request issued by this tool.
    client: reqwest::Client,
}
25
26impl Default for VoiceTool {
27 fn default() -> Self {
28 Self::new()
29 }
30}
31
32impl VoiceTool {
33 pub fn new() -> Self {
34 let client = reqwest::Client::builder()
35 .timeout(REQUEST_TIMEOUT)
36 .user_agent("CodeTether-Agent/1.0")
37 .build()
38 .expect("Failed to build HTTP client");
39 Self { client }
40 }
41
42 async fn speak(&self, params: &SpeakParams) -> Result<ToolResult> {
43 let base_url = default_voice_api_url();
44 let voice_id = params.voice_id.as_deref().unwrap_or("960f89fc");
45
46 let url = format!("{base_url}/voices/{voice_id}/speak");
47
48 let lang = params.language.clone().unwrap_or_else(|| "english".into());
49
50 let form = reqwest::multipart::Form::new()
51 .text("text", params.text.clone())
52 .text("language", lang);
53
54 let resp = self
55 .client
56 .post(&url)
57 .multipart(form)
58 .send()
59 .await
60 .map_err(|e| anyhow::anyhow!("Voice API request failed: {e}"))?;
61
62 if !resp.status().is_success() {
63 let status = resp.status();
64 let body = resp.text().await.unwrap_or_default();
65 return Ok(ToolResult::error(format!(
66 "Voice API returned {status}: {body}"
67 )));
68 }
69
70 let job_id = resp
71 .headers()
72 .get("x-job-id")
73 .and_then(|v| v.to_str().ok())
74 .unwrap_or("unknown")
75 .to_string();
76
77 let bytes = resp
79 .bytes()
80 .await
81 .context("Failed to read audio response")?;
82 let output_dir = std::env::current_dir().unwrap_or_else(|_| ".".into());
83 let output_path = output_dir.join(format!("voice_{job_id}.wav"));
84 tokio::fs::write(&output_path, &bytes)
85 .await
86 .context("Failed to save audio file")?;
87
88 let duration_secs = bytes.len() as f64 / (24000.0 * 2.0); Ok(ToolResult::success(format!(
91 "Generated speech saved to: {}\nJob ID: {job_id}\nApprox duration: {duration_secs:.1}s\nSize: {} bytes",
92 output_path.display(),
93 bytes.len()
94 ))
95 .with_metadata("job_id", json!(job_id))
96 .with_metadata("output_path", json!(output_path.to_string_lossy()))
97 .with_metadata("size_bytes", json!(bytes.len())))
98 }
99
100 async fn transcribe(&self, params: &TranscribeParams) -> Result<ToolResult> {
101 let base_url = default_voice_api_url();
102 let url = format!("{base_url}/transcribe");
103
104 let file_path = std::path::Path::new(¶ms.file_path);
105 if !file_path.exists() {
106 return Ok(ToolResult::error(format!(
107 "File not found: {}",
108 params.file_path
109 )));
110 }
111
112 let file_bytes = tokio::fs::read(file_path)
113 .await
114 .context("Failed to read audio file")?;
115
116 let file_name = file_path
117 .file_name()
118 .unwrap_or_default()
119 .to_string_lossy()
120 .to_string();
121
122 let part = reqwest::multipart::Part::bytes(file_bytes)
123 .file_name(file_name)
124 .mime_str("application/octet-stream")?;
125
126 let form = reqwest::multipart::Form::new().part("audio_file", part);
127
128 let resp = self
129 .client
130 .post(&url)
131 .multipart(form)
132 .send()
133 .await
134 .map_err(|e| anyhow::anyhow!("Transcription request failed: {e}"))?;
135
136 if !resp.status().is_success() {
137 let status = resp.status();
138 let body = resp.text().await.unwrap_or_default();
139 return Ok(ToolResult::error(format!(
140 "Transcription API returned {status}: {body}"
141 )));
142 }
143
144 let body: Value = resp.json().await.context("Failed to parse response")?;
145 let text = body["transcription"]
146 .as_str()
147 .unwrap_or("(no transcription returned)");
148
149 Ok(ToolResult::success(text).with_metadata("file", json!(params.file_path)))
150 }
151
152 async fn list_voices(&self) -> Result<ToolResult> {
153 let base_url = default_voice_api_url();
154 let url = format!("{base_url}/voices");
155
156 let resp = self
157 .client
158 .get(&url)
159 .send()
160 .await
161 .map_err(|e| anyhow::anyhow!("Voice API request failed: {e}"))?;
162
163 if !resp.status().is_success() {
164 let status = resp.status();
165 let body = resp.text().await.unwrap_or_default();
166 return Ok(ToolResult::error(format!(
167 "Voice API returned {status}: {body}"
168 )));
169 }
170
171 let body: Value = resp.json().await.context("Failed to parse response")?;
172 let voices = body["voices"].as_array();
173
174 match voices {
175 Some(voices) if !voices.is_empty() => {
176 let mut output = String::from("Available voices:\n\n");
177 for v in voices {
178 let id = v["voice_id"].as_str().unwrap_or("?");
179 let name = v["name"].as_str().unwrap_or("?");
180 let dur = v["duration_seconds"].as_f64().unwrap_or(0.0);
181 let created = v["created_at"].as_str().unwrap_or("?");
182 output.push_str(&format!(
183 "- {name} (id: {id}, sample: {dur:.1}s, created: {created})\n"
184 ));
185 }
186 Ok(ToolResult::success(output).with_metadata("count", json!(voices.len())))
187 }
188 _ => Ok(ToolResult::success(
189 "No voices found. Create one by uploading a voice sample.",
190 )),
191 }
192 }
193
194 async fn health(&self) -> Result<ToolResult> {
195 let base_url = default_voice_api_url();
196 let url = format!("{base_url}/health");
197
198 let resp = self
199 .client
200 .get(&url)
201 .timeout(Duration::from_secs(5))
202 .send()
203 .await
204 .map_err(|e| anyhow::anyhow!("Voice API health check failed: {e}"))?;
205
206 let body: Value = resp
207 .json()
208 .await
209 .context("Failed to parse health response")?;
210 let status = body["status"].as_str().unwrap_or("unknown");
211 let tts_loaded = body["tts_model_loaded"].as_bool().unwrap_or(false);
212 let whisper_loaded = body["whisper_model_loaded"].as_bool().unwrap_or(false);
213
214 Ok(ToolResult::success(format!(
215 "Voice API Status: {status}\nTTS model: {}\nWhisper model: {}",
216 if tts_loaded { "loaded" } else { "not loaded" },
217 if whisper_loaded {
218 "loaded"
219 } else {
220 "not loaded"
221 },
222 )))
223 }
224}
225
/// Raw arguments of the `voice` tool as deserialized from the JSON value
/// handed to `execute`. Only `action` is mandatory at this level; the
/// per-action requirements are checked in `execute`.
#[derive(Deserialize)]
struct Params {
    /// Name of the operation to run; `execute` dispatches on this string.
    action: String,
    /// Text to synthesize; `execute` requires it to be non-blank for `speak`.
    #[serde(default)]
    text: Option<String>,
    /// Voice profile to use for `speak`; forwarded unchanged when present.
    #[serde(default)]
    voice_id: Option<String>,
    /// Language for `speak`; forwarded unchanged when present.
    #[serde(default)]
    language: Option<String>,
    /// File to upload; `execute` requires it to be non-blank for `transcribe`.
    #[serde(default)]
    file_path: Option<String>,
}
238
/// Validated arguments for the `speak` action.
#[derive(Deserialize)]
struct SpeakParams {
    /// Text to convert to speech.
    text: String,
    /// Voice profile ID; `speak` falls back to `960f89fc` when `None`.
    voice_id: Option<String>,
    /// Speech language; `speak` falls back to `english` when `None`.
    language: Option<String>,
}
245
/// Validated arguments for the `transcribe` action.
#[derive(Deserialize)]
struct TranscribeParams {
    /// Path of the file whose contents are uploaded for transcription.
    file_path: String,
}
250
251#[async_trait]
252impl Tool for VoiceTool {
253 fn id(&self) -> &str {
254 "voice"
255 }
256 fn name(&self) -> &str {
257 "Voice"
258 }
259 fn description(&self) -> &str {
260 "Text-to-speech, transcription, and voice management via Qwen TTS. Actions: speak (text to speech with cloned voice), transcribe (audio file to text), list_voices (show available voice profiles), health (check API status). Set CODETETHER_VOICE_API_URL to override the default endpoint."
261 }
262 fn parameters(&self) -> Value {
263 json!({
264 "type": "object",
265 "properties": {
266 "action": {
267 "type": "string",
268 "enum": ["speak", "transcribe", "list_voices", "health"],
269 "description": "Action to perform"
270 },
271 "text": {
272 "type": "string",
273 "description": "Text to speak (required for 'speak' action)"
274 },
275 "voice_id": {
276 "type": "string",
277 "description": "Voice profile ID (optional for 'speak', defaults to Riley voice)"
278 },
279 "language": {
280 "type": "string",
281 "description": "Language for speech (default: english)",
282 "default": "english"
283 },
284 "file_path": {
285 "type": "string",
286 "description": "Path to audio/video file (required for 'transcribe' action)"
287 }
288 },
289 "required": ["action"]
290 })
291 }
292
293 async fn execute(&self, params: Value) -> Result<ToolResult> {
294 let p: Params = serde_json::from_value(params).context("Invalid params")?;
295
296 match p.action.as_str() {
297 "speak" => {
298 let text = match p.text {
299 Some(t) if !t.trim().is_empty() => t,
300 _ => {
301 return Ok(ToolResult::structured_error(
302 "MISSING_PARAM",
303 "voice",
304 "The 'text' parameter is required for the 'speak' action",
305 Some(vec!["text"]),
306 Some(json!({"action": "speak", "text": "Hello world"})),
307 ));
308 }
309 };
310 self.speak(&SpeakParams {
311 text,
312 voice_id: p.voice_id,
313 language: p.language,
314 })
315 .await
316 }
317 "transcribe" => {
318 let file_path = match p.file_path {
319 Some(f) if !f.trim().is_empty() => f,
320 _ => {
321 return Ok(ToolResult::structured_error(
322 "MISSING_PARAM",
323 "voice",
324 "The 'file_path' parameter is required for the 'transcribe' action",
325 Some(vec!["file_path"]),
326 Some(
327 json!({"action": "transcribe", "file_path": "/path/to/audio.wav"}),
328 ),
329 ));
330 }
331 };
332 self.transcribe(&TranscribeParams { file_path }).await
333 }
334 "list_voices" => self.list_voices().await,
335 "health" => self.health().await,
336 other => Ok(ToolResult::structured_error(
337 "INVALID_ACTION",
338 "voice",
339 &format!("Unknown action '{other}'. Use: speak, transcribe, list_voices, health"),
340 None,
341 Some(json!({"action": "speak", "text": "Hello world"})),
342 )),
343 }
344 }
345}