Skip to main content

codetether_agent/tool/
voice.rs

1//! Voice Tool - Text-to-speech, transcription, and voice cloning via Qwen TTS API.
2//!
//! Connects to the Voice Cloning Service (Qwen3-TTS) for:
//! - Speaking text with a saved voice profile
//! - Transcribing audio files to text
//! - Listing available voice profiles
//! - Checking the health of the voice API
7
8use super::{Tool, ToolResult};
9use anyhow::{Context, Result};
10use async_trait::async_trait;
11use serde::Deserialize;
12use serde_json::{Value, json};
13use std::time::Duration;
14
/// Timeout (120 seconds) configured on the shared HTTP client built in
/// `VoiceTool::new`; individual requests may set a shorter one.
const REQUEST_TIMEOUT: Duration = Duration::from_secs(120);
16
/// Resolves the base URL of the voice API.
///
/// Reads the `CODETETHER_VOICE_API_URL` environment variable. Surrounding
/// whitespace and trailing slashes are stripped so callers can safely append
/// `/path` segments. If the variable is unset, not valid Unicode, or blank
/// after trimming, the built-in default endpoint is returned instead.
fn default_voice_api_url() -> String {
    const FALLBACK_URL: &str = "https://voice.quantum-forge.io";

    std::env::var("CODETETHER_VOICE_API_URL")
        .ok()
        // Normalize so `format!("{base_url}/voices")` never yields `//voices`.
        .map(|raw| raw.trim().trim_end_matches('/').to_string())
        // A blank override would produce a relative URL that cannot be requested.
        .filter(|url| !url.is_empty())
        .unwrap_or_else(|| FALLBACK_URL.to_string())
}
21
/// Tool that talks to the voice API over HTTP (speak, transcribe, list voices,
/// health check).
pub struct VoiceTool {
    /// HTTP client reused for every request made by this tool.
    client: reqwest::Client,
}
25
26impl Default for VoiceTool {
27    fn default() -> Self {
28        Self::new()
29    }
30}
31
32impl VoiceTool {
33    pub fn new() -> Self {
34        let client = reqwest::Client::builder()
35            .timeout(REQUEST_TIMEOUT)
36            .user_agent("CodeTether-Agent/1.0")
37            .build()
38            .expect("Failed to build HTTP client");
39        Self { client }
40    }
41
42    async fn speak(&self, params: &SpeakParams) -> Result<ToolResult> {
43        let base_url = default_voice_api_url();
44        let voice_id = params.voice_id.as_deref().unwrap_or("960f89fc");
45
46        let url = format!("{base_url}/voices/{voice_id}/speak");
47
48        let lang = params.language.clone().unwrap_or_else(|| "english".into());
49
50        let form = reqwest::multipart::Form::new()
51            .text("text", params.text.clone())
52            .text("language", lang);
53
54        let resp = self
55            .client
56            .post(&url)
57            .multipart(form)
58            .send()
59            .await
60            .map_err(|e| anyhow::anyhow!("Voice API request failed: {e}"))?;
61
62        if !resp.status().is_success() {
63            let status = resp.status();
64            let body = resp.text().await.unwrap_or_default();
65            return Ok(ToolResult::error(format!(
66                "Voice API returned {status}: {body}"
67            )));
68        }
69
70        let job_id = resp
71            .headers()
72            .get("x-job-id")
73            .and_then(|v| v.to_str().ok())
74            .unwrap_or("unknown")
75            .to_string();
76
77        // Save the WAV to a temp file
78        let bytes = resp
79            .bytes()
80            .await
81            .context("Failed to read audio response")?;
82        let output_dir = std::env::current_dir().unwrap_or_else(|_| ".".into());
83        let output_path = output_dir.join(format!("voice_{job_id}.wav"));
84        tokio::fs::write(&output_path, &bytes)
85            .await
86            .context("Failed to save audio file")?;
87
88        let duration_secs = bytes.len() as f64 / (24000.0 * 2.0); // 24kHz, 16-bit mono
89
90        Ok(ToolResult::success(format!(
91            "Generated speech saved to: {}\nJob ID: {job_id}\nApprox duration: {duration_secs:.1}s\nSize: {} bytes",
92            output_path.display(),
93            bytes.len()
94        ))
95        .with_metadata("job_id", json!(job_id))
96        .with_metadata("output_path", json!(output_path.to_string_lossy()))
97        .with_metadata("size_bytes", json!(bytes.len())))
98    }
99
100    async fn transcribe(&self, params: &TranscribeParams) -> Result<ToolResult> {
101        let base_url = default_voice_api_url();
102        let url = format!("{base_url}/transcribe");
103
104        let file_path = std::path::Path::new(&params.file_path);
105        if !file_path.exists() {
106            return Ok(ToolResult::error(format!(
107                "File not found: {}",
108                params.file_path
109            )));
110        }
111
112        let file_bytes = tokio::fs::read(file_path)
113            .await
114            .context("Failed to read audio file")?;
115
116        let file_name = file_path
117            .file_name()
118            .unwrap_or_default()
119            .to_string_lossy()
120            .to_string();
121
122        let part = reqwest::multipart::Part::bytes(file_bytes)
123            .file_name(file_name)
124            .mime_str("application/octet-stream")?;
125
126        let form = reqwest::multipart::Form::new().part("audio_file", part);
127
128        let resp = self
129            .client
130            .post(&url)
131            .multipart(form)
132            .send()
133            .await
134            .map_err(|e| anyhow::anyhow!("Transcription request failed: {e}"))?;
135
136        if !resp.status().is_success() {
137            let status = resp.status();
138            let body = resp.text().await.unwrap_or_default();
139            return Ok(ToolResult::error(format!(
140                "Transcription API returned {status}: {body}"
141            )));
142        }
143
144        let body: Value = resp.json().await.context("Failed to parse response")?;
145        let text = body["transcription"]
146            .as_str()
147            .unwrap_or("(no transcription returned)");
148
149        Ok(ToolResult::success(text).with_metadata("file", json!(params.file_path)))
150    }
151
152    async fn list_voices(&self) -> Result<ToolResult> {
153        let base_url = default_voice_api_url();
154        let url = format!("{base_url}/voices");
155
156        let resp = self
157            .client
158            .get(&url)
159            .send()
160            .await
161            .map_err(|e| anyhow::anyhow!("Voice API request failed: {e}"))?;
162
163        if !resp.status().is_success() {
164            let status = resp.status();
165            let body = resp.text().await.unwrap_or_default();
166            return Ok(ToolResult::error(format!(
167                "Voice API returned {status}: {body}"
168            )));
169        }
170
171        let body: Value = resp.json().await.context("Failed to parse response")?;
172        let voices = body["voices"].as_array();
173
174        match voices {
175            Some(voices) if !voices.is_empty() => {
176                let mut output = String::from("Available voices:\n\n");
177                for v in voices {
178                    let id = v["voice_id"].as_str().unwrap_or("?");
179                    let name = v["name"].as_str().unwrap_or("?");
180                    let dur = v["duration_seconds"].as_f64().unwrap_or(0.0);
181                    let created = v["created_at"].as_str().unwrap_or("?");
182                    output.push_str(&format!(
183                        "- {name} (id: {id}, sample: {dur:.1}s, created: {created})\n"
184                    ));
185                }
186                Ok(ToolResult::success(output).with_metadata("count", json!(voices.len())))
187            }
188            _ => Ok(ToolResult::success(
189                "No voices found. Create one by uploading a voice sample.",
190            )),
191        }
192    }
193
194    async fn health(&self) -> Result<ToolResult> {
195        let base_url = default_voice_api_url();
196        let url = format!("{base_url}/health");
197
198        let resp = self
199            .client
200            .get(&url)
201            .timeout(Duration::from_secs(5))
202            .send()
203            .await
204            .map_err(|e| anyhow::anyhow!("Voice API health check failed: {e}"))?;
205
206        let body: Value = resp
207            .json()
208            .await
209            .context("Failed to parse health response")?;
210        let status = body["status"].as_str().unwrap_or("unknown");
211        let tts_loaded = body["tts_model_loaded"].as_bool().unwrap_or(false);
212        let whisper_loaded = body["whisper_model_loaded"].as_bool().unwrap_or(false);
213
214        Ok(ToolResult::success(format!(
215            "Voice API Status: {status}\nTTS model: {}\nWhisper model: {}",
216            if tts_loaded { "loaded" } else { "not loaded" },
217            if whisper_loaded {
218                "loaded"
219            } else {
220                "not loaded"
221            },
222        )))
223    }
224}
225
/// Raw arguments accepted by the tool's `execute` entry point, deserialized
/// from the incoming JSON value. Only `action` is mandatory; the other fields
/// are validated per action.
#[derive(Deserialize)]
struct Params {
    /// Which operation to run: `speak`, `transcribe`, `list_voices` or `health`.
    action: String,
    /// Text to synthesize (needed by `speak`).
    #[serde(default)]
    text: Option<String>,
    /// Voice profile to use for `speak`.
    #[serde(default)]
    voice_id: Option<String>,
    /// Language passed to the API for `speak`.
    #[serde(default)]
    language: Option<String>,
    /// Path of the file to upload (needed by `transcribe`).
    #[serde(default)]
    file_path: Option<String>,
}
238
/// Validated arguments for the `speak` action.
#[derive(Deserialize)]
struct SpeakParams {
    /// Text to synthesize.
    text: String,
    /// Voice profile id; `speak` falls back to a built-in id when `None`.
    voice_id: Option<String>,
    /// Language name; `speak` falls back to `"english"` when `None`.
    language: Option<String>,
}
245
/// Validated arguments for the `transcribe` action.
#[derive(Deserialize)]
struct TranscribeParams {
    /// Path of the local file to upload for transcription.
    file_path: String,
}
250
251#[async_trait]
252impl Tool for VoiceTool {
    /// Identifier of this tool: `"voice"`.
    fn id(&self) -> &str {
        "voice"
    }
    /// Display name of this tool: `"Voice"`.
    fn name(&self) -> &str {
        "Voice"
    }
    /// Human-readable summary of the supported actions and of the
    /// `CODETETHER_VOICE_API_URL` override.
    fn description(&self) -> &str {
        "Text-to-speech, transcription, and voice management via Qwen TTS. Actions: speak (text to speech with cloned voice), transcribe (audio file to text), list_voices (show available voice profiles), health (check API status). Set CODETETHER_VOICE_API_URL to override the default endpoint."
    }
    /// Describes the accepted arguments as a JSON-Schema-style object.
    ///
    /// Only `action` is listed as required; `text` and `file_path` are
    /// enforced per action inside `execute`.
    ///
    /// NOTE(review): the `voice_id` description says the default is the
    /// "Riley" voice, while `speak` falls back to id `960f89fc` — presumably
    /// the same profile; confirm against the service.
    fn parameters(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": ["speak", "transcribe", "list_voices", "health"],
                    "description": "Action to perform"
                },
                "text": {
                    "type": "string",
                    "description": "Text to speak (required for 'speak' action)"
                },
                "voice_id": {
                    "type": "string",
                    "description": "Voice profile ID (optional for 'speak', defaults to Riley voice)"
                },
                "language": {
                    "type": "string",
                    "description": "Language for speech (default: english)",
                    "default": "english"
                },
                "file_path": {
                    "type": "string",
                    "description": "Path to audio/video file (required for 'transcribe' action)"
                }
            },
            "required": ["action"]
        })
    }
292
293    async fn execute(&self, params: Value) -> Result<ToolResult> {
294        let p: Params = serde_json::from_value(params).context("Invalid params")?;
295
296        match p.action.as_str() {
297            "speak" => {
298                let text = match p.text {
299                    Some(t) if !t.trim().is_empty() => t,
300                    _ => {
301                        return Ok(ToolResult::structured_error(
302                            "MISSING_PARAM",
303                            "voice",
304                            "The 'text' parameter is required for the 'speak' action",
305                            Some(vec!["text"]),
306                            Some(json!({"action": "speak", "text": "Hello world"})),
307                        ));
308                    }
309                };
310                self.speak(&SpeakParams {
311                    text,
312                    voice_id: p.voice_id,
313                    language: p.language,
314                })
315                .await
316            }
317            "transcribe" => {
318                let file_path = match p.file_path {
319                    Some(f) if !f.trim().is_empty() => f,
320                    _ => {
321                        return Ok(ToolResult::structured_error(
322                            "MISSING_PARAM",
323                            "voice",
324                            "The 'file_path' parameter is required for the 'transcribe' action",
325                            Some(vec!["file_path"]),
326                            Some(
327                                json!({"action": "transcribe", "file_path": "/path/to/audio.wav"}),
328                            ),
329                        ));
330                    }
331                };
332                self.transcribe(&TranscribeParams { file_path }).await
333            }
334            "list_voices" => self.list_voices().await,
335            "health" => self.health().await,
336            other => Ok(ToolResult::structured_error(
337                "INVALID_ACTION",
338                "voice",
339                &format!("Unknown action '{other}'. Use: speak, transcribe, list_voices, health"),
340                None,
341                Some(json!({"action": "speak", "text": "Hello world"})),
342            )),
343        }
344    }
345}