use crate::error::{NpcError, Result};
use std::collections::HashMap;
/// Transcribes `audio_data` using the speech-to-text backend named by `engine`.
///
/// `engine` is matched case-insensitively against `"whisper"`, `"faster-whisper"`,
/// `"openai"`, `"gemini"`, `"groq"` and `"elevenlabs"`. The two whisper names are
/// served by a hosted Whisper model: Groq when `GROQ_API_KEY` is set, otherwise
/// OpenAI when `OPENAI_API_KEY` is set.
///
/// `language`, when given, is forwarded to the selected backend.
///
/// # Errors
///
/// Returns `NpcError::Shell` for an unrecognised engine name,
/// `NpcError::LlmRequest` when the whisper engines find no usable API key, and
/// otherwise whatever the selected backend returns.
pub async fn speech_to_text(audio_data: &[u8], engine: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    match engine.to_lowercase().as_str() {
        "whisper" | "faster-whisper" => {
            // Resolve the fallback here and await it directly instead of going
            // through the synchronous `stt_whisper` wrapper: that wrapper blocks
            // the calling thread on a runtime, which is unsuitable inside a future.
            if std::env::var("GROQ_API_KEY").is_ok() {
                stt_groq(audio_data, None, "whisper-large-v3", language).await
            } else if std::env::var("OPENAI_API_KEY").is_ok() {
                stt_openai(audio_data, None, "whisper-1", language).await
            } else {
                Err(NpcError::LlmRequest("No STT API key available. Set GROQ_API_KEY or OPENAI_API_KEY.".into()))
            }
        }
        "openai" => stt_openai(audio_data, None, "whisper-1", language).await,
        "gemini" => stt_gemini(audio_data, None, "gemini-1.5-flash", language).await,
        "groq" => stt_groq(audio_data, None, "whisper-large-v3", language).await,
        "elevenlabs" => stt_elevenlabs(audio_data, None, "scribe_v1", language).await,
        other => Err(NpcError::Shell(format!("Unknown STT engine: {}", other))),
    }
}
/// Blocking Whisper transcription backed by a hosted API.
///
/// Uses Groq (`whisper-large-v3`) when `GROQ_API_KEY` is set, otherwise OpenAI
/// (`whisper-1`) when `OPENAI_API_KEY` is set. `_model_size` is accepted for
/// interface compatibility and is not used. `language`, when given, is
/// forwarded to the selected backend.
///
/// The request is driven on a dedicated current-thread Tokio runtime owned by
/// this call. When the caller is already inside a Tokio runtime, that runtime
/// is created on a short-lived helper thread, because blocking on a runtime
/// from within an async execution context panics. The calling thread is
/// blocked until the request finishes either way, so async callers should
/// prefer the async backends.
///
/// # Errors
///
/// Returns `NpcError::Other` if the runtime cannot be built or the helper
/// thread panics, `NpcError::LlmRequest` if neither API key is set, and
/// otherwise whatever the selected backend returns.
pub fn stt_whisper(audio_data: &[u8], _model_size: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    // Builds a runtime that stays alive for the whole request and runs the
    // transcription on it. The future is created inside the closure, so only
    // the borrowed inputs and the final result cross the thread boundary.
    let run_to_completion = move || -> Result<HashMap<String, serde_json::Value>> {
        let runtime = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .map_err(|e| NpcError::Other(format!("No tokio runtime: {}", e)))?;
        runtime.block_on(async {
            if std::env::var("GROQ_API_KEY").is_ok() {
                return stt_groq(audio_data, None, "whisper-large-v3", language).await;
            }
            if std::env::var("OPENAI_API_KEY").is_ok() {
                return stt_openai(audio_data, None, "whisper-1", language).await;
            }
            Err(NpcError::LlmRequest("No STT API key available. Set GROQ_API_KEY or OPENAI_API_KEY.".into()))
        })
    };

    if tokio::runtime::Handle::try_current().is_ok() {
        // Already inside a runtime: a nested block_on on this thread would
        // panic, so drive the request from a scoped helper thread instead.
        std::thread::scope(|scope| scope.spawn(run_to_completion).join())
            .unwrap_or_else(|_| Err(NpcError::Other("STT worker thread panicked".into())))
    } else {
        run_to_completion()
    }
}
/// Transcribes audio through OpenAI's `/v1/audio/transcriptions` endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`, requesting the `verbose_json` response format.
/// `api_key` takes precedence over the `OPENAI_API_KEY` environment variable.
/// `language`, when given, is sent as the `language` form field.
///
/// Returns a map with two entries: `"text"` (empty string if the response has
/// no `text` field) and `"language"` (`"en"` if the response has no `language`
/// field).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME type
/// is rejected, or when the server answers with a non-success status (the
/// response body is included in the message). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_openai(audio_data: &[u8], api_key: Option<&str>, model: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("OPENAI_API_KEY")
            .map_err(|_| NpcError::LlmRequest("OPENAI_API_KEY not set".into()))?,
    };

    let audio = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|err| NpcError::LlmRequest(format!("MIME: {}", err)))?;
    let base_form = reqwest::multipart::Form::new()
        .part("file", audio)
        .text("model", model.to_owned())
        .text("response_format", "verbose_json".to_owned());
    let form = match language {
        Some(code) => base_form.text("language", code.to_owned()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.openai.com/v1/audio/transcriptions")
        .header("Authorization", format!("Bearer {}", token))
        .multipart(form)
        .send()
        .await?;
    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("OpenAI STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let text = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));
    let detected = payload
        .get("language")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String("en".into()));
    Ok(HashMap::from([
        ("text".to_string(), text),
        ("language".to_string(), detected),
    ]))
}
/// Transcribes audio by prompting a Gemini model through `generateContent`.
///
/// The audio is base64-encoded and sent inline with MIME type `audio/wav`,
/// alongside a text prompt asking for a transcription only (in `language`
/// when one is given). `api_key` takes precedence over the `GOOGLE_API_KEY`
/// environment variable, which in turn takes precedence over `GEMINI_API_KEY`.
///
/// Returns a map with a single `"text"` entry holding the trimmed text of the
/// first part of the first candidate, or an empty string if the response does
/// not contain one.
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available or when the server
/// answers with a non-success status (the response body is included in the
/// message). Transport and JSON decoding failures are propagated via `?`.
pub async fn stt_gemini(audio_data: &[u8], api_key: Option<&str>, model: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    use base64::Engine;

    let key = api_key
        .map(String::from)
        .or_else(|| std::env::var("GOOGLE_API_KEY").ok())
        .or_else(|| std::env::var("GEMINI_API_KEY").ok())
        .ok_or_else(|| NpcError::LlmRequest("GOOGLE_API_KEY not set".into()))?;

    let b64 = base64::engine::general_purpose::STANDARD.encode(audio_data);
    let prompt = language
        .map(|l| format!("Transcribe in {}. Output only transcription.", l))
        .unwrap_or_else(|| "Transcribe exactly. Output only transcription.".into());

    // The API key travels in the `x-goog-api-key` header rather than the query
    // string, so the secret is not part of a URL that may be echoed in error
    // messages or request logs.
    let url = format!("https://generativelanguage.googleapis.com/v1beta/models/{}:generateContent", model);
    let body = serde_json::json!({"contents": [{"parts": [{"text": prompt}, {"inlineData": {"mimeType": "audio/wav", "data": b64}}]}]});

    let resp = reqwest::Client::new()
        .post(&url)
        .header("x-goog-api-key", &key)
        .json(&body)
        .send()
        .await?;
    if !resp.status().is_success() {
        return Err(NpcError::LlmRequest(format!("Gemini STT: {}", resp.text().await.unwrap_or_default())));
    }

    let json: serde_json::Value = resp.json().await?;
    // Indexing a `serde_json::Value` with a missing key or index yields `Null`,
    // so an unexpected response shape degrades to an empty transcript.
    let text = json["candidates"][0]["content"]["parts"][0]["text"]
        .as_str()
        .unwrap_or("")
        .trim()
        .to_string();

    let mut r = HashMap::new();
    r.insert("text".into(), serde_json::Value::String(text));
    Ok(r)
}
/// Transcribes audio through Groq's OpenAI-compatible transcription endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`. `api_key` takes precedence over the
/// `GROQ_API_KEY` environment variable. `language`, when given, is sent as the
/// `language` form field.
///
/// Returns a map with a single `"text"` entry (empty string if the response
/// has no `text` field).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME type
/// is rejected, or when the server answers with a non-success status (the
/// response body is included in the message). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_groq(audio_data: &[u8], api_key: Option<&str>, model: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("GROQ_API_KEY")
            .map_err(|_| NpcError::LlmRequest("GROQ_API_KEY not set".into()))?,
    };

    let audio = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|err| NpcError::LlmRequest(format!("MIME: {}", err)))?;
    let base_form = reqwest::multipart::Form::new()
        .part("file", audio)
        .text("model", model.to_owned());
    let form = match language {
        Some(code) => base_form.text("language", code.to_owned()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.groq.com/openai/v1/audio/transcriptions")
        .header("Authorization", format!("Bearer {}", token))
        .multipart(form)
        .send()
        .await?;
    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("Groq STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let text = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));
    Ok(HashMap::from([("text".to_string(), text)]))
}
/// Transcribes audio through ElevenLabs' `/v1/speech-to-text` endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`, and the model is sent as the `model_id` field.
/// `api_key` takes precedence over the `ELEVENLABS_API_KEY` environment
/// variable and is sent in the `xi-api-key` header. `language`, when given,
/// is sent as the `language_code` form field.
///
/// Returns a map with two entries: `"text"` (empty string if the response has
/// no `text` field) and `"language"` (the response's `language_code`, or JSON
/// `null` if absent).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME type
/// is rejected, or when the server answers with a non-success status (the
/// response body is included in the message). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_elevenlabs(audio_data: &[u8], api_key: Option<&str>, model_id: &str, language: Option<&str>) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("ELEVENLABS_API_KEY")
            .map_err(|_| NpcError::LlmRequest("ELEVENLABS_API_KEY not set".into()))?,
    };

    let audio = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|err| NpcError::LlmRequest(format!("MIME: {}", err)))?;
    let base_form = reqwest::multipart::Form::new()
        .part("file", audio)
        .text("model_id", model_id.to_owned());
    let form = match language {
        Some(code) => base_form.text("language_code", code.to_owned()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.elevenlabs.io/v1/speech-to-text")
        .header("xi-api-key", &token)
        .multipart(form)
        .send()
        .await?;
    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("ElevenLabs STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let text = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));
    let detected = payload
        .get("language_code")
        .cloned()
        .unwrap_or(serde_json::Value::Null);
    Ok(HashMap::from([
        ("text".to_string(), text),
        ("language".to_string(), detected),
    ]))
}
/// Reports which hosted STT engines have an API key in the environment.
///
/// The returned map always contains the keys `"openai"`, `"groq"`, `"gemini"`
/// and `"elevenlabs"`. A value is `true` when the corresponding environment
/// variable can be read (`OPENAI_API_KEY`, `GROQ_API_KEY`, either of
/// `GOOGLE_API_KEY` / `GEMINI_API_KEY`, `ELEVENLABS_API_KEY`). The key itself
/// is not validated.
pub fn get_available_stt_engines() -> HashMap<String, bool> {
    let has_key = |name: &str| std::env::var(name).is_ok();

    HashMap::from([
        ("openai".to_string(), has_key("OPENAI_API_KEY")),
        ("groq".to_string(), has_key("GROQ_API_KEY")),
        (
            "gemini".to_string(),
            has_key("GOOGLE_API_KEY") || has_key("GEMINI_API_KEY"),
        ),
        ("elevenlabs".to_string(), has_key("ELEVENLABS_API_KEY")),
    ])
}
pub fn transcribe_audio_file(file_path: &str, language: Option<&str>) -> Result<String> {
let data = std::fs::read(file_path).map_err(|e| NpcError::FileLoad { path: file_path.into(), source: e })?;
let result = stt_whisper(&data, "small", language)?;
Ok(result.get("text").and_then(|v| v.as_str()).unwrap_or("").to_string())
}