Skip to main content

sparrow/tools/
stt.rs

1//! Speech-to-Text tool for Sparrow.
2//!
3//! Transcribes audio files to text using Whisper (local or API).
4
5use std::path::{Path, PathBuf};
6use std::process::Command;
7
/// Speech-to-text engines that [`transcribe`] can dispatch to.
///
/// The enum carries no data, so values are cheap to copy and can be compared
/// directly with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SttBackend {
    /// OpenAI's hosted Whisper API (reads the key from `OPENAI_API_KEY`).
    OpenAIApi,
    /// Local `whisper` command-line tool (the `openai-whisper` Python package).
    WhisperCli,
    /// Local whisper.cpp, invoked as the `whisper-cpp` executable.
    WhisperCpp,
}
18
19/// Transcribe an audio file to text.
20///
21/// Returns the transcription text.
22pub fn transcribe(
23    audio_path: &Path,
24    language: Option<&str>,
25    backend: SttBackend,
26) -> anyhow::Result<String> {
27    if !audio_path.exists() {
28        anyhow::bail!("Audio file not found: {}", audio_path.display());
29    }
30
31    match backend {
32        SttBackend::OpenAIApi => transcribe_openai(audio_path, language),
33        SttBackend::WhisperCli => transcribe_whisper_cli(audio_path, language),
34        SttBackend::WhisperCpp => transcribe_whisper_cpp(audio_path, language),
35    }
36}
37
38/// Use OpenAI Whisper API.
39fn transcribe_openai(audio_path: &Path, language: Option<&str>) -> anyhow::Result<String> {
40    let api_key = std::env::var("OPENAI_API_KEY")
41        .map_err(|_| anyhow::anyhow!("OPENAI_API_KEY environment variable not set"))?;
42
43    let client = reqwest::blocking::Client::new();
44    let form = reqwest::blocking::multipart::Form::new()
45        .file("file", audio_path)?
46        .text("model", "whisper-1");
47
48    let form = if let Some(lang) = language {
49        form.text("language", lang.to_string())
50    } else {
51        form
52    };
53
54    let resp = client
55        .post("https://api.openai.com/v1/audio/transcriptions")
56        .header("Authorization", format!("Bearer {}", api_key))
57        .multipart(form)
58        .send()?;
59
60    if !resp.status().is_success() {
61        let body = resp.text()?;
62        anyhow::bail!("OpenAI Whisper failed: {body}");
63    }
64
65    let json: serde_json::Value = resp.json()?;
66    json.get("text")
67        .and_then(|t| t.as_str())
68        .map(|t| t.to_string())
69        .ok_or_else(|| anyhow::anyhow!("No transcription in response"))
70}
71
72/// Use local whisper CLI.
73fn transcribe_whisper_cli(audio_path: &Path, language: Option<&str>) -> anyhow::Result<String> {
74    let mut cmd = Command::new("whisper");
75    cmd.arg(audio_path);
76
77    if let Some(lang) = language {
78        cmd.args(["--language", lang]);
79    }
80
81    cmd.arg("--output_format").arg("txt");
82    cmd.arg("--output_dir")
83        .arg(audio_path.parent().unwrap_or(Path::new(".")));
84
85    let output = cmd.output()?;
86
87    if !output.status.success() {
88        let stderr = String::from_utf8_lossy(&output.stderr);
89        anyhow::bail!("whisper CLI failed: {stderr}\nInstall: pip install openai-whisper");
90    }
91
92    // Read the generated .txt file
93    let txt_path = audio_path.with_extension("txt");
94    if txt_path.exists() {
95        Ok(std::fs::read_to_string(&txt_path)?)
96    } else {
97        // Output is in stdout
98        Ok(String::from_utf8_lossy(&output.stdout).to_string())
99    }
100}
101
102/// Use local whisper.cpp.
103fn transcribe_whisper_cpp(audio_path: &Path, language: Option<&str>) -> anyhow::Result<String> {
104    let mut cmd = Command::new("whisper-cpp");
105    cmd.arg("-f").arg(audio_path);
106
107    if let Some(lang) = language {
108        cmd.arg("-l").arg(lang);
109    }
110
111    cmd.arg("-otxt");
112
113    let output = cmd.output()?;
114
115    if !output.status.success() {
116        let stderr = String::from_utf8_lossy(&output.stderr);
117        anyhow::bail!("whisper-cpp failed: {stderr}");
118    }
119
120    // Read the generated .txt file
121    let txt_path = audio_path.with_extension("txt");
122    if txt_path.exists() {
123        Ok(std::fs::read_to_string(&txt_path)?)
124    } else {
125        Ok(String::from_utf8_lossy(&output.stdout).to_string())
126    }
127}