Skip to main content

opencode_voice/audio/
capture.rs

//! cpal-based microphone capture: records 16kHz mono i16 PCM audio.
//!
//! The ideal config (16kHz mono) is tried first.  When the device doesn't
//! support it — common on macOS — capture falls back to the device's native
//! sample rate and channel count, and the audio callback downmixes and
//! resamples to 16kHz mono.
6
7use anyhow::{Context, Result};
8use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
9use cpal::{SampleFormat, StreamConfig};
10use std::sync::{Arc, Mutex};
11use std::time::Instant;
12
13/// Lists all available audio input device names.
14pub fn list_devices() -> Result<Vec<String>> {
15    let host = cpal::default_host();
16    let devices = host
17        .input_devices()
18        .context("Failed to enumerate input devices")?;
19    Ok(devices.filter_map(|d| d.name().ok()).collect())
20}
21
/// Bookkeeping for the linear resampler that has to survive from one audio
/// callback to the next.
struct ResampleState {
    /// Input samples consumed per output sample (`native_sample_rate / 16_000`).
    ratio: f64,
    /// Read position of the next output sample, in input samples, measured
    /// from the start of the next callback's buffer.
    phase: f64,
}
29
30/// Records microphone audio via cpal, always producing 16kHz mono i16 output.
31pub struct CpalRecorder {
32    device_name: Option<String>,
33    samples: Arc<Mutex<Vec<i16>>>,
34    stream: Option<cpal::Stream>,
35    start_time: Option<Instant>,
36    energy_tx: Option<tokio::sync::mpsc::UnboundedSender<f32>>,
37}
38
39impl CpalRecorder {
40    /// Creates a new recorder for the given device (or default if None).
41    pub fn new(device: Option<&str>) -> Result<Self> {
42        Ok(CpalRecorder {
43            device_name: device.map(|s| s.to_string()),
44            samples: Arc::new(Mutex::new(Vec::new())),
45            stream: None,
46            start_time: None,
47            energy_tx: None,
48        })
49    }
50
51    /// Starts recording. Returns a receiver for RMS energy updates (0.0–1.0).
52    pub fn start(&mut self) -> Result<tokio::sync::mpsc::UnboundedReceiver<f32>> {
53        let host = cpal::default_host();
54
55        // Find device
56        let device = if let Some(ref name) = self.device_name {
57            host.input_devices()
58                .context("Failed to enumerate devices")?
59                .find(|d| d.name().map(|n| n == *name).unwrap_or(false))
60                .with_context(|| format!("Audio device '{}' not found", name))?
61        } else {
62            host.default_input_device().context(
63                "No default audio input device found. Please check microphone connection.",
64            )?
65        };
66
67        let (energy_tx, energy_rx) = tokio::sync::mpsc::unbounded_channel::<f32>();
68
69        let stream = self.build_stream(&device, energy_tx.clone())?;
70
71        stream.play().context("Failed to start audio stream")?;
72
73        self.stream = Some(stream);
74        self.start_time = Some(Instant::now());
75        self.energy_tx = Some(energy_tx);
76
77        Ok(energy_rx)
78    }
79
80    /// Builds the cpal input stream.
81    ///
82    /// 1. Try 16kHz mono i16 (zero conversion — ideal).
83    /// 2. Try 16kHz mono f32 (format conversion only).
84    /// 3. Fall back to the device's native config and resample in the callback.
85    fn build_stream(
86        &self,
87        device: &cpal::Device,
88        energy_tx: tokio::sync::mpsc::UnboundedSender<f32>,
89    ) -> Result<cpal::Stream> {
90        let ideal_config = StreamConfig {
91            channels: 1,
92            sample_rate: cpal::SampleRate(16_000),
93            buffer_size: cpal::BufferSize::Default,
94        };
95
96        let debug = std::env::var("RUST_LOG").is_ok();
97
98        // --- Strategy 1: 16kHz mono i16 (ideal) ---
99        if let Ok(stream) = self.build_direct_i16_stream(device, &ideal_config, energy_tx.clone()) {
100            if debug {
101                eprintln!("[audio] Using 16kHz mono i16 (ideal)");
102            }
103            return Ok(stream);
104        }
105
106        // --- Strategy 2: 16kHz mono f32 ---
107        if let Ok(stream) = self.build_direct_f32_stream(device, &ideal_config, energy_tx.clone()) {
108            if debug {
109                eprintln!("[audio] Using 16kHz mono f32");
110            }
111            return Ok(stream);
112        }
113
114        // --- Strategy 3: native config + resample ---
115        let default_config = device
116            .default_input_config()
117            .context("Failed to get any supported input config from audio device")?;
118
119        let native_rate = default_config.sample_rate().0;
120        let native_channels = default_config.channels();
121        let native_format = default_config.sample_format();
122
123        if debug {
124            eprintln!(
125                "[audio] Capturing at native {}Hz {}ch {:?}, resampling to 16kHz",
126                native_rate, native_channels, native_format
127            );
128        }
129
130        let stream_config = StreamConfig {
131            channels: native_channels,
132            sample_rate: cpal::SampleRate(native_rate),
133            buffer_size: cpal::BufferSize::Default,
134        };
135
136        match native_format {
137            SampleFormat::I16 => self.build_resampling_i16_stream(
138                device,
139                &stream_config,
140                native_rate,
141                native_channels,
142                energy_tx,
143            ),
144            _ => self.build_resampling_f32_stream(
145                device,
146                &stream_config,
147                native_rate,
148                native_channels,
149                energy_tx,
150            ),
151        }
152        .context("Failed to build audio input stream with any supported configuration. Check microphone permissions.")
153    }
154
155    // ---------------------------------------------------------------
156    //  Direct streams (16kHz mono, no resampling)
157    // ---------------------------------------------------------------
158
159    /// 16kHz mono i16 — no conversion needed.
160    fn build_direct_i16_stream(
161        &self,
162        device: &cpal::Device,
163        config: &StreamConfig,
164        energy_tx: tokio::sync::mpsc::UnboundedSender<f32>,
165    ) -> Result<cpal::Stream> {
166        let samples_arc = Arc::clone(&self.samples);
167
168        let stream = device
169            .build_input_stream(
170                config,
171                move |data: &[i16], _: &cpal::InputCallbackInfo| {
172                    if !data.is_empty() {
173                        let sum_sq: f64 = data
174                            .iter()
175                            .map(|&s| {
176                                let f = s as f64 / 32768.0;
177                                f * f
178                            })
179                            .sum();
180                        let rms = (sum_sq / data.len() as f64).sqrt() as f32;
181                        let _ = energy_tx.send(rms.min(1.0));
182                    }
183                    if let Ok(mut guard) = samples_arc.try_lock() {
184                        guard.extend_from_slice(data);
185                    }
186                },
187                |err| eprintln!("Audio stream error: {}", err),
188                None,
189            )
190            .map_err(|e| anyhow::anyhow!("i16 stream: {}", e))?;
191
192        Ok(stream)
193    }
194
195    /// 16kHz mono f32 — format conversion only (f32 → i16).
196    fn build_direct_f32_stream(
197        &self,
198        device: &cpal::Device,
199        config: &StreamConfig,
200        energy_tx: tokio::sync::mpsc::UnboundedSender<f32>,
201    ) -> Result<cpal::Stream> {
202        let samples_arc = Arc::clone(&self.samples);
203
204        let stream = device
205            .build_input_stream(
206                config,
207                move |data: &[f32], _: &cpal::InputCallbackInfo| {
208                    if !data.is_empty() {
209                        let sum_sq: f64 = data.iter().map(|&s| (s as f64) * (s as f64)).sum();
210                        let rms = (sum_sq / data.len() as f64).sqrt() as f32;
211                        let _ = energy_tx.send(rms.min(1.0));
212                    }
213                    if let Ok(mut guard) = samples_arc.try_lock() {
214                        for &s in data {
215                            let clamped = s.clamp(-1.0, 1.0);
216                            guard.push((clamped * 32767.0) as i16);
217                        }
218                    }
219                },
220                |err| eprintln!("Audio stream error: {}", err),
221                None,
222            )
223            .map_err(|e| anyhow::anyhow!("f32 stream: {}", e))?;
224
225        Ok(stream)
226    }
227
228    // ---------------------------------------------------------------
229    //  Resampling streams (native rate/channels → 16kHz mono)
230    // ---------------------------------------------------------------
231
232    /// Native-rate f32 stream with downmix + resample to 16kHz mono i16.
233    fn build_resampling_f32_stream(
234        &self,
235        device: &cpal::Device,
236        config: &StreamConfig,
237        native_rate: u32,
238        native_channels: u16,
239        energy_tx: tokio::sync::mpsc::UnboundedSender<f32>,
240    ) -> Result<cpal::Stream> {
241        let samples_arc = Arc::clone(&self.samples);
242        let state = Arc::new(Mutex::new(ResampleState {
243            ratio: native_rate as f64 / 16_000.0,
244            phase: 0.0,
245        }));
246
247        let stream = device
248            .build_input_stream(
249                config,
250                move |data: &[f32], _: &cpal::InputCallbackInfo| {
251                    let ch = native_channels as usize;
252
253                    // --- Downmix to mono ---
254                    let mono: Vec<f32> = if ch > 1 {
255                        data.chunks(ch)
256                            .map(|frame| frame.iter().sum::<f32>() / ch as f32)
257                            .collect()
258                    } else {
259                        data.to_vec()
260                    };
261
262                    // --- RMS energy ---
263                    if !mono.is_empty() {
264                        let sum_sq: f64 = mono.iter().map(|&s| (s as f64) * (s as f64)).sum();
265                        let rms = (sum_sq / mono.len() as f64).sqrt() as f32;
266                        let _ = energy_tx.send(rms.min(1.0));
267                    }
268
269                    // --- Resample (linear interpolation) ---
270                    if let Ok(mut st) = state.lock() {
271                        let ratio = st.ratio;
272                        let mut phase = st.phase;
273                        let len = mono.len() as f64;
274                        let mut resampled = Vec::new();
275
276                        while phase < len {
277                            let idx = phase as usize;
278                            let frac = (phase - idx as f64) as f32;
279                            let a = mono[idx];
280                            let b = if idx + 1 < mono.len() {
281                                mono[idx + 1]
282                            } else {
283                                a
284                            };
285                            let sample = a + (b - a) * frac;
286                            let clamped = sample.clamp(-1.0, 1.0);
287                            resampled.push((clamped * 32767.0) as i16);
288                            phase += ratio;
289                        }
290
291                        st.phase = phase - len;
292
293                        if let Ok(mut guard) = samples_arc.try_lock() {
294                            guard.extend_from_slice(&resampled);
295                        }
296                    }
297                },
298                |err| eprintln!("Audio stream error: {}", err),
299                None,
300            )
301            .map_err(|e| anyhow::anyhow!("Resampling f32 stream: {}", e))?;
302
303        Ok(stream)
304    }
305
306    /// Native-rate i16 stream with downmix + resample to 16kHz mono i16.
307    fn build_resampling_i16_stream(
308        &self,
309        device: &cpal::Device,
310        config: &StreamConfig,
311        native_rate: u32,
312        native_channels: u16,
313        energy_tx: tokio::sync::mpsc::UnboundedSender<f32>,
314    ) -> Result<cpal::Stream> {
315        let samples_arc = Arc::clone(&self.samples);
316        let state = Arc::new(Mutex::new(ResampleState {
317            ratio: native_rate as f64 / 16_000.0,
318            phase: 0.0,
319        }));
320
321        let stream = device
322            .build_input_stream(
323                config,
324                move |data: &[i16], _: &cpal::InputCallbackInfo| {
325                    let ch = native_channels as usize;
326
327                    // --- Convert to f32 and downmix to mono ---
328                    let mono: Vec<f32> = if ch > 1 {
329                        data.chunks(ch)
330                            .map(|frame| {
331                                let sum: f32 = frame.iter().map(|&s| s as f32 / 32768.0).sum();
332                                sum / ch as f32
333                            })
334                            .collect()
335                    } else {
336                        data.iter().map(|&s| s as f32 / 32768.0).collect()
337                    };
338
339                    // --- RMS energy ---
340                    if !mono.is_empty() {
341                        let sum_sq: f64 = mono.iter().map(|&s| (s as f64) * (s as f64)).sum();
342                        let rms = (sum_sq / mono.len() as f64).sqrt() as f32;
343                        let _ = energy_tx.send(rms.min(1.0));
344                    }
345
346                    // --- Resample (linear interpolation) ---
347                    if let Ok(mut st) = state.lock() {
348                        let ratio = st.ratio;
349                        let mut phase = st.phase;
350                        let len = mono.len() as f64;
351                        let mut resampled = Vec::new();
352
353                        while phase < len {
354                            let idx = phase as usize;
355                            let frac = (phase - idx as f64) as f32;
356                            let a = mono[idx];
357                            let b = if idx + 1 < mono.len() {
358                                mono[idx + 1]
359                            } else {
360                                a
361                            };
362                            let sample = a + (b - a) * frac;
363                            let clamped = sample.clamp(-1.0, 1.0);
364                            resampled.push((clamped * 32767.0) as i16);
365                            phase += ratio;
366                        }
367
368                        st.phase = phase - len;
369
370                        if let Ok(mut guard) = samples_arc.try_lock() {
371                            guard.extend_from_slice(&resampled);
372                        }
373                    }
374                },
375                |err| eprintln!("Audio stream error: {}", err),
376                None,
377            )
378            .map_err(|e| anyhow::anyhow!("Resampling i16 stream: {}", e))?;
379
380        Ok(stream)
381    }
382
383    /// Stops recording and returns all captured samples (16kHz mono i16).
384    pub fn stop(&mut self) -> Result<Vec<i16>> {
385        // Drop the stream to stop recording
386        self.stream = None;
387        self.energy_tx = None;
388
389        let samples = {
390            let guard = self
391                .samples
392                .lock()
393                .map_err(|_| anyhow::anyhow!("Failed to lock samples buffer"))?;
394            guard.clone()
395        };
396
397        // Clear for next use
398        if let Ok(mut guard) = self.samples.lock() {
399            guard.clear();
400        }
401
402        Ok(samples)
403    }
404
405    /// Returns the elapsed recording duration in seconds.
406    pub fn duration(&self) -> f64 {
407        self.start_time
408            .map(|t| t.elapsed().as_secs_f64())
409            .unwrap_or(0.0)
410    }
411}
412
413// Safety: CpalRecorder is Send because Arc<Mutex<>> handles shared state.
414// cpal::Stream is not Send on all platforms (e.g. macOS CoreAudio), but we
415// manage it carefully: the stream is only dropped (in stop()), never accessed
416// from another thread after creation.
417unsafe impl Send for CpalRecorder {}