use crate::error::{NpcError, Result};
use std::collections::HashMap;
/// Transcribes `audio_data` with the STT backend named by `engine`.
///
/// `engine` is matched case-insensitively. Each backend is called with a
/// fixed default model and with no explicit API key, so the backend reads its
/// key from the environment.
///
/// `"whisper"` and `"faster-whisper"` are served by a hosted Whisper model:
/// Groq is used when `GROQ_API_KEY` is set, otherwise OpenAI when
/// `OPENAI_API_KEY` is set.
///
/// # Errors
///
/// Returns `NpcError::Shell` for an unrecognised engine name,
/// `NpcError::LlmRequest` when the whisper route finds neither API key, and
/// otherwise whatever error the selected backend returns.
pub async fn speech_to_text(
    audio_data: &[u8],
    engine: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    match engine.to_lowercase().as_str() {
        "whisper" | "faster-whisper" => {
            // Await the hosted fallbacks directly. The synchronous
            // `stt_whisper` helper blocks on a runtime handle, which is not
            // permitted from inside an async context such as this function.
            if std::env::var("GROQ_API_KEY").is_ok() {
                stt_groq(audio_data, None, "whisper-large-v3", language).await
            } else if std::env::var("OPENAI_API_KEY").is_ok() {
                stt_openai(audio_data, None, "whisper-1", language).await
            } else {
                Err(NpcError::LlmRequest(
                    "No STT API key available. Set GROQ_API_KEY or OPENAI_API_KEY.".into(),
                ))
            }
        }
        "openai" => stt_openai(audio_data, None, "whisper-1", language).await,
        "gemini" => stt_gemini(audio_data, None, "gemini-1.5-flash", language).await,
        "groq" => stt_groq(audio_data, None, "whisper-large-v3", language).await,
        "elevenlabs" => stt_elevenlabs(audio_data, None, "scribe_v1", language).await,
        other => Err(NpcError::Shell(format!("Unknown STT engine: {}", other))),
    }
}
/// Blocking Whisper transcription backed by a hosted API.
///
/// No local model is run and `_model_size` is ignored. The request goes to
/// Groq when `GROQ_API_KEY` is set, otherwise to OpenAI when `OPENAI_API_KEY`
/// is set.
///
/// The call blocks the current thread until the request finishes. When it is
/// invoked from inside a Tokio runtime the work is driven on a scoped helper
/// thread with its own runtime, because blocking on a runtime handle from
/// within an async context panics. Async callers should prefer awaiting
/// `stt_groq` / `stt_openai` directly.
///
/// # Errors
///
/// Returns `NpcError::Other` if a Tokio runtime cannot be created or the
/// helper thread panics, `NpcError::LlmRequest` when neither API key is set,
/// and otherwise whatever error the selected backend returns.
pub fn stt_whisper(
    audio_data: &[u8],
    _model_size: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    // Builds a runtime, keeps it alive for the whole request, and drives the
    // hosted fallback chain to completion on the calling thread.
    let transcribe = || -> Result<HashMap<String, serde_json::Value>> {
        let runtime = tokio::runtime::Runtime::new()
            .map_err(|e| NpcError::Other(format!("No tokio runtime: {}", e)))?;
        runtime.block_on(async {
            if std::env::var("GROQ_API_KEY").is_ok() {
                return stt_groq(audio_data, None, "whisper-large-v3", language).await;
            }
            if std::env::var("OPENAI_API_KEY").is_ok() {
                return stt_openai(audio_data, None, "whisper-1", language).await;
            }
            Err(NpcError::LlmRequest(
                "No STT API key available. Set GROQ_API_KEY or OPENAI_API_KEY.".into(),
            ))
        })
    };

    if tokio::runtime::Handle::try_current().is_ok() {
        // Already inside a runtime: creating, blocking on, or dropping a
        // runtime on this thread would panic, so do all of it on a helper
        // thread and wait for the result.
        std::thread::scope(|scope| {
            scope
                .spawn(transcribe)
                .join()
                .unwrap_or_else(|_| Err(NpcError::Other("STT worker thread panicked".into())))
        })
    } else {
        transcribe()
    }
}
/// Transcribes audio through OpenAI's `/v1/audio/transcriptions` endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`, and the response is requested as
/// `verbose_json`. `api_key` takes precedence over the `OPENAI_API_KEY`
/// environment variable. `language`, when given, is forwarded as the
/// `language` form field.
///
/// The returned map holds `"text"` (empty string if the response has none)
/// and `"language"` (`"en"` if the response has none).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME
/// type is rejected, or when the server answers with a non-success status
/// (the message carries the response body). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_openai(
    audio_data: &[u8],
    api_key: Option<&str>,
    model: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("OPENAI_API_KEY")
            .map_err(|_| NpcError::LlmRequest("OPENAI_API_KEY not set".into()))?,
    };

    let audio_part = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|e| NpcError::LlmRequest(format!("MIME: {}", e)))?;

    let base_form = reqwest::multipart::Form::new()
        .part("file", audio_part)
        .text("model", model.to_string())
        .text("response_format", "verbose_json".to_string());
    let form = match language {
        Some(code) => base_form.text("language", code.to_string()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.openai.com/v1/audio/transcriptions")
        .header("Authorization", format!("Bearer {}", token))
        .multipart(form)
        .send()
        .await?;

    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("OpenAI STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let transcript = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));
    let detected = payload
        .get("language")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String("en".into()));

    let mut out = HashMap::with_capacity(2);
    out.insert("text".into(), transcript);
    out.insert("language".into(), detected);
    Ok(out)
}
/// Transcribes audio by prompting a Gemini model via `generateContent`.
///
/// The audio is base64-encoded and sent inline with MIME type `audio/wav`
/// next to a text prompt asking for a transcription only. When `language` is
/// given, the prompt asks for a transcription in that language.
///
/// `api_key` takes precedence over the `GOOGLE_API_KEY` environment variable,
/// which in turn takes precedence over `GEMINI_API_KEY`.
///
/// The returned map holds only `"text"`: the trimmed text of the first part
/// of the first candidate, or an empty string if the response has no such
/// field.
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available or when the server
/// answers with a non-success status (the message carries the response body).
/// Transport and JSON decoding failures are propagated via `?`.
pub async fn stt_gemini(
    audio_data: &[u8],
    api_key: Option<&str>,
    model: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    let key = api_key
        .map(String::from)
        .or_else(|| std::env::var("GOOGLE_API_KEY").ok())
        .or_else(|| std::env::var("GEMINI_API_KEY").ok())
        .ok_or_else(|| {
            NpcError::LlmRequest("GOOGLE_API_KEY or GEMINI_API_KEY not set".into())
        })?;

    use base64::Engine;
    let b64 = base64::engine::general_purpose::STANDARD.encode(audio_data);

    let prompt = language
        .map(|l| format!("Transcribe in {}. Output only transcription.", l))
        .unwrap_or_else(|| "Transcribe exactly. Output only transcription.".into());

    // The key travels in a request header rather than the query string so it
    // cannot end up in URLs that get logged or embedded in transport errors.
    let url = format!(
        "https://generativelanguage.googleapis.com/v1beta/models/{}:generateContent",
        model
    );
    let body = serde_json::json!({
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": "audio/wav", "data": b64}}
            ]
        }]
    });

    let resp = reqwest::Client::new()
        .post(&url)
        .header("x-goog-api-key", &key)
        .json(&body)
        .send()
        .await?;
    if !resp.status().is_success() {
        return Err(NpcError::LlmRequest(format!(
            "Gemini STT: {}",
            resp.text().await.unwrap_or_default()
        )));
    }

    let json: serde_json::Value = resp.json().await?;
    // Indexing a `serde_json::Value` with a missing key or index yields
    // `Null`, so a response without candidates falls through to "".
    let text = json["candidates"][0]["content"]["parts"][0]["text"]
        .as_str()
        .unwrap_or("")
        .trim()
        .to_string();

    let mut r = HashMap::new();
    r.insert("text".into(), serde_json::Value::String(text));
    Ok(r)
}
/// Transcribes audio through Groq's OpenAI-compatible transcription endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`. `api_key` takes precedence over the
/// `GROQ_API_KEY` environment variable. `language`, when given, is forwarded
/// as the `language` form field.
///
/// The returned map holds only `"text"` (empty string if the response has
/// none).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME
/// type is rejected, or when the server answers with a non-success status
/// (the message carries the response body). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_groq(
    audio_data: &[u8],
    api_key: Option<&str>,
    model: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("GROQ_API_KEY")
            .map_err(|_| NpcError::LlmRequest("GROQ_API_KEY not set".into()))?,
    };

    let audio_part = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|e| NpcError::LlmRequest(format!("MIME: {}", e)))?;

    let base_form = reqwest::multipart::Form::new()
        .part("file", audio_part)
        .text("model", model.to_string());
    let form = match language {
        Some(code) => base_form.text("language", code.to_string()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.groq.com/openai/v1/audio/transcriptions")
        .header("Authorization", format!("Bearer {}", token))
        .multipart(form)
        .send()
        .await?;

    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("Groq STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let transcript = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));

    let mut out = HashMap::with_capacity(1);
    out.insert("text".into(), transcript);
    Ok(out)
}
/// Transcribes audio through the ElevenLabs `/v1/speech-to-text` endpoint.
///
/// The audio bytes are uploaded as a multipart file part named `audio.wav`
/// with MIME type `audio/wav`, and the key is sent in the `xi-api-key`
/// header. `api_key` takes precedence over the `ELEVENLABS_API_KEY`
/// environment variable. `language`, when given, is forwarded as the
/// `language_code` form field.
///
/// The returned map holds `"text"` (empty string if the response has none)
/// and `"language"`, copied from the response's `language_code` field (JSON
/// `null` if absent).
///
/// # Errors
///
/// Returns `NpcError::LlmRequest` when no key is available, when the MIME
/// type is rejected, or when the server answers with a non-success status
/// (the message carries the response body). Transport and JSON decoding
/// failures are propagated via `?`.
pub async fn stt_elevenlabs(
    audio_data: &[u8],
    api_key: Option<&str>,
    model_id: &str,
    language: Option<&str>,
) -> Result<HashMap<String, serde_json::Value>> {
    let token = match api_key {
        Some(explicit) => explicit.to_owned(),
        None => std::env::var("ELEVENLABS_API_KEY")
            .map_err(|_| NpcError::LlmRequest("ELEVENLABS_API_KEY not set".into()))?,
    };

    let audio_part = reqwest::multipart::Part::bytes(audio_data.to_vec())
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .map_err(|e| NpcError::LlmRequest(format!("MIME: {}", e)))?;

    let base_form = reqwest::multipart::Form::new()
        .part("file", audio_part)
        .text("model_id", model_id.to_string());
    let form = match language {
        Some(code) => base_form.text("language_code", code.to_string()),
        None => base_form,
    };

    let response = reqwest::Client::new()
        .post("https://api.elevenlabs.io/v1/speech-to-text")
        .header("xi-api-key", &token)
        .multipart(form)
        .send()
        .await?;

    if !response.status().is_success() {
        let detail = response.text().await.unwrap_or_default();
        return Err(NpcError::LlmRequest(format!("ElevenLabs STT: {}", detail)));
    }

    let payload: serde_json::Value = response.json().await?;
    let transcript = payload
        .get("text")
        .cloned()
        .unwrap_or_else(|| serde_json::Value::String(String::new()));
    let detected = payload
        .get("language_code")
        .cloned()
        .unwrap_or(serde_json::Value::Null);

    let mut out = HashMap::with_capacity(2);
    out.insert("text".into(), transcript);
    out.insert("language".into(), detected);
    Ok(out)
}
/// Reports which hosted STT engines have an API key in the environment.
///
/// The map always contains the keys `"openai"`, `"groq"`, `"gemini"` and
/// `"elevenlabs"`. A value is `true` when the matching environment variable
/// can be read: `OPENAI_API_KEY`, `GROQ_API_KEY`, either `GOOGLE_API_KEY` or
/// `GEMINI_API_KEY`, and `ELEVENLABS_API_KEY` respectively.
pub fn get_available_stt_engines() -> HashMap<String, bool> {
    let has_key = |var: &str| std::env::var(var).is_ok();

    let availability = [
        ("openai", has_key("OPENAI_API_KEY")),
        ("groq", has_key("GROQ_API_KEY")),
        (
            "gemini",
            has_key("GOOGLE_API_KEY") || has_key("GEMINI_API_KEY"),
        ),
        ("elevenlabs", has_key("ELEVENLABS_API_KEY")),
    ];

    availability
        .iter()
        .map(|&(engine, present)| (engine.to_string(), present))
        .collect()
}
pub fn transcribe_audio_file(file_path: &str, language: Option<&str>) -> Result<String> {
let data = std::fs::read(file_path).map_err(|e| NpcError::FileLoad {
path: file_path.into(),
source: e,
})?;
let result = stt_whisper(&data, "small", language)?;
Ok(result
.get("text")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string())
}