whisper/
whisper.rs

//! Use OpenAI whisper to transcribe audio from the FFmpeg CLI in realtime.
//! Caution: hacky prototype code.
3
4use ffmpeg_sidecar::{command::FfmpegCommand, event::FfmpegEvent};
5use std::env;
6use std::fs;
7use std::io::{self, Write};
8use std::path::Path;
9use std::process::Command;
10use std::time::{Duration, Instant};
11
12fn main() -> anyhow::Result<()> {
13  let _guard = temporarily_use_ffmpeg_from_system_path()?;
14
15  // Download whisper model if it doesn't exist
16  download_whisper_model()?;
17
18  // Find default audio input device
19  let audio_device = find_default_audio_device()?;
20  println!("Listening to audio device: {}", audio_device);
21  println!("Starting real-time transcription... (Say 'stop recording' or press Ctrl+C to stop)");
22
23  // Run Whisper transcription with microphone input
24  // `destination=-` uses FFmpeg AVIO syntax to direct output to stdout
25  // `queue` sets the seconds buffered before processing, affecting both latency and transcription
26  let whisper_filter = "whisper=model=./whisper.cpp/models/ggml-base.en.bin:destination=-:queue=2";
27
28  let mut command = FfmpegCommand::new();
29
30  // Configure audio input based on platform
31  if cfg!(windows) {
32    command
33      .format("dshow")
34      .args("-audio_buffer_size 50".split(' ')) // reduces latency to 50ms
35      .input(format!("audio={}", audio_device));
36  } else {
37    // For Linux/Mac - this is a simplified approach, may need adjustment
38    command
39      .format("pulse") // or "alsa" on Linux
40      .input("default");
41  }
42
43  let iter = command
44    .arg("-af")
45    .arg(&whisper_filter)
46    .format("null")
47    .output("-")
48    .spawn()?
49    .iter()?;
50
51  let mut transcription_parts = Vec::new();
52  let mut last_transcription_time = Instant::now();
53  let pause_threshold = Duration::from_secs(2); // 2 seconds of silence = line break
54
55  for event in iter {
56    match event {
57      FfmpegEvent::ParsedConfiguration(config) => {
58        if !config
59          .configuration
60          .contains(&"--enable-whisper".to_string())
61        {
62          anyhow::bail!("FFmpeg was not built with Whisper support (--enable-whisper)");
63        }
64      }
65      FfmpegEvent::OutputChunk(chunk) => {
66        // Convert raw bytes to text and collect transcription parts
67        if let Ok(text) = String::from_utf8(chunk) {
68          let trimmed = text.trim();
69          if !trimmed.is_empty() {
70            let now = Instant::now();
71
72            // Check if there's been a pause since last transcription
73            if now.duration_since(last_transcription_time) > pause_threshold
74              && !transcription_parts.is_empty()
75            {
76              // Start a new line after a pause
77              println!(); // Just add a newline
78              transcription_parts.clear();
79            }
80
81            // Check for stop command before adding new text
82            let test_text = format!("{} {}", transcription_parts.join(" "), trimmed).to_lowercase();
83            if test_text.contains("stop recording") {
84              print!("{} {}", transcription_parts.join(" "), trimmed);
85              println!("\nStop command detected. Ending transcription session.");
86              break;
87            }
88
89            transcription_parts.push(trimmed.to_string());
90
91            // Print just the new word with a space
92            print!(" {}", trimmed);
93            io::stdout().flush().unwrap();
94
95            last_transcription_time = now;
96          }
97        }
98      }
99      FfmpegEvent::Done => {
100        println!("\nTranscription complete!");
101        break;
102      }
103      _ => {}
104    }
105  }
106
107  Ok(())
108}
109
110fn find_default_audio_device() -> anyhow::Result<String> {
111  if cfg!(windows) {
112    // Windows: Use dshow to find audio devices
113    let audio_device = FfmpegCommand::new()
114      .hide_banner()
115      .args(["-list_devices", "true"])
116      .format("dshow")
117      .input("dummy")
118      .spawn()?
119      .iter()?
120      .into_ffmpeg_stderr()
121      .find(|line| line.contains("(audio)"))
122      .and_then(|line| line.split('\"').nth(1).map(|s| s.to_string()))
123      .ok_or_else(|| anyhow::anyhow!("No audio device found on Windows"))?;
124
125    Ok(audio_device)
126  } else {
127    // Linux/Mac: Use default device (could be improved with proper device detection)
128    println!("Note: Using default audio device. On Linux/Mac, you may need to adjust audio format and device.");
129    Ok("default".to_string())
130  }
131}
132
133fn download_whisper_model() -> anyhow::Result<()> {
134  let model_path = Path::new("whisper.cpp/models/ggml-base.en.bin");
135
136  // Check if model already exists
137  if model_path.exists() {
138    println!("Whisper model already exists at {}", model_path.display());
139    return Ok(());
140  }
141
142  println!("Downloading whisper.cpp and base.en model...");
143
144  // Clone whisper.cpp repository if it doesn't exist
145  if !Path::new("whisper.cpp").exists() {
146    println!("Cloning whisper.cpp repository...");
147    let output = Command::new("git")
148      .args(&["clone", "https://github.com/ggml-org/whisper.cpp.git"])
149      .output()?;
150
151    if !output.status.success() {
152      anyhow::bail!(
153        "Failed to clone whisper.cpp: {}",
154        String::from_utf8_lossy(&output.stderr)
155      );
156    }
157  }
158
159  // Download the model using the provided script
160  println!("Downloading base.en model...");
161  let output = Command::new("sh")
162    .args(&["./models/download-ggml-model.sh", "base.en"])
163    .current_dir("whisper.cpp")
164    .output()?;
165
166  if !output.status.success() {
167    anyhow::bail!(
168      "Failed to download model: {}",
169      String::from_utf8_lossy(&output.stderr)
170    );
171  }
172
173  println!(
174    "Successfully downloaded whisper model to {}",
175    model_path.display()
176  );
177  Ok(())
178}
179
180/// The `essentials` binary downloaded by the library doesn't have `whisper`
181/// Temporarily hide local ffmpeg binaries to force system path usage
182/// Requires ffmpeg-8-full to be installed in system PATH
183fn temporarily_use_ffmpeg_from_system_path() -> anyhow::Result<RestoreGuard> {
184  // Get the directory where the current executable is located
185  let exe_dir = env::current_exe()?.parent().unwrap().to_path_buf();
186
187  // Temporarily rename local ffmpeg binaries to force system path usage
188  let ffmpeg_names = ["ffmpeg", "ffmpeg.exe"];
189  let mut renamed_paths = Vec::new();
190
191  // Rename any local ffmpeg binaries in the executable directory
192  for name in &ffmpeg_names {
193    let ffmpeg_path = exe_dir.join(name);
194    if ffmpeg_path.exists() {
195      let backup_path = exe_dir.join(format!("{}.backup", name));
196      fs::rename(&ffmpeg_path, &backup_path)?;
197      println!(
198        "Temporarily renamed {} to {}",
199        ffmpeg_path.display(),
200        backup_path.display()
201      );
202      renamed_paths.push((ffmpeg_path, backup_path));
203    }
204  }
205
206  Ok(RestoreGuard { renamed_paths })
207}
208
/// Undoes temporary file renames when dropped.
///
/// Each entry is an `(original, backup)` pair: on drop, `backup` is renamed
/// back to `original`.
#[must_use = "the renamed files are restored as soon as the guard is dropped"]
struct RestoreGuard {
  renamed_paths: Vec<(std::path::PathBuf, std::path::PathBuf)>,
}

impl Drop for RestoreGuard {
  /// Renames every backup back to its original path, reporting each outcome.
  ///
  /// A failed rename is printed to stderr and does not stop the remaining
  /// entries from being restored.
  fn drop(&mut self) {
    for (original, backup) in &self.renamed_paths {
      match fs::rename(backup, original) {
        Ok(()) => println!("Restored {}", original.display()),
        Err(e) => eprintln!("Failed to restore {}: {}", original.display(), e),
      }
    }
  }
}