Skip to main content

omni_dev/voice/
capture.rs

1//! End-to-end capture pipeline orchestrator.
2//!
3//! Glues an [`AudioSource`] through the write-path stages (mixdown →
4//! resample → idle detection → trailing-silence trim → WAV write) and
5//! reports a structured [`CaptureSummary`] when done. Signal-driven
6//! termination is wired up via [`install_ctrl_c_handler`], which the CLI
7//! entry point calls before delegating to [`run_capture`].
8
9use std::path::PathBuf;
10use std::sync::atomic::{AtomicBool, Ordering};
11use std::sync::Arc;
12
13use anyhow::{anyhow, Context, Result};
14
15use super::audio::AudioSource;
16use super::idle::{trim_trailing_silence, IdleDetector};
17use super::wav::{mono_mixdown, Resampler, WavWriter};
18
/// Per-session settings consumed by [`run_capture`].
#[derive(Clone, Debug)]
pub struct CaptureOpts {
    /// Path the WAV file is written to.
    pub output: PathBuf,
    /// Idle budget in seconds: how much trailing silence auto-stops the
    /// capture. `0` turns auto-stop off, leaving source exhaustion and
    /// the stop signal as the only ways for capture to end.
    pub idle_after_secs: u32,
}

impl CaptureOpts {
    /// Builds options from anything convertible into a [`PathBuf`] plus
    /// the idle budget in seconds.
    #[must_use]
    pub fn new(output: impl Into<PathBuf>, idle_after_secs: u32) -> Self {
        let output = output.into();
        Self {
            output,
            idle_after_secs,
        }
    }
}
41
/// The condition that ended a capture session.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TerminationReason {
    /// The idle (silence) detector reported idle, i.e. the
    /// `idle_after_secs` budget was used up.
    Idle,
    /// The [`AudioSource`] had no more chunks to give (`next_chunk`
    /// returned `None`), e.g. end of file or a closed stream.
    SourceExhausted,
    /// The shared stop flag (`AtomicBool`) was observed as `true`; in
    /// production that flag is set by Ctrl-C.
    Signal,
}
53
54/// Structured summary of a capture session for tests and logging.
55#[derive(Debug, Clone)]
56pub struct CaptureSummary {
57    /// Path the WAV was written to.
58    pub output: PathBuf,
59    /// Samples actually written to disk (post-trim).
60    pub samples_written: u64,
61    /// Samples that were dropped from the tail by trailing-silence trim.
62    /// Always 0 for [`TerminationReason::Signal`] (user-driven stops are
63    /// not trimmed — the user chose where to stop).
64    pub trimmed_samples: u64,
65    /// Why capture stopped.
66    pub terminated_by: TerminationReason,
67}
68
69/// Drives `source` through the pipeline and writes a 16 kHz mono 16-bit
70/// PCM WAV at `opts.output`.
71///
72/// Termination conditions, in priority order each loop iteration:
73///
74/// 1. `stop_signal` set to true → [`TerminationReason::Signal`].
75/// 2. Idle detector fires → [`TerminationReason::Idle`].
76/// 3. Source returns `None` → [`TerminationReason::SourceExhausted`].
77///
78/// On [`TerminationReason::Idle`] the trailing silence that caused the
79/// trigger is trimmed before the WAV header is finalised. On
80/// [`TerminationReason::Signal`] nothing is trimmed (the user chose the
81/// cutoff). On [`TerminationReason::SourceExhausted`] the entire
82/// resampled stream is written verbatim.
83///
84/// Returns an error if no voiced window was ever observed — emitting a
85/// near-empty WAV would just crash downstream Whisper anyway, so we
86/// fail loudly instead. The output file is removed on this path so the
87/// caller never sees a malformed WAV on disk.
88pub fn run_capture<S: AudioSource>(
89    mut source: S,
90    opts: CaptureOpts,
91    stop_signal: Arc<AtomicBool>,
92) -> Result<CaptureSummary> {
93    let mut resampler = Resampler::new(source.sample_rate())
94        .with_context(|| format!("Failed to build resampler at {} Hz", source.sample_rate()))?;
95    let mut detector = IdleDetector::new(opts.idle_after_secs);
96    let mut buffer: Vec<f32> = Vec::new();
97    let channels = source.channels();
98    let termination = loop {
99        if stop_signal.load(Ordering::Relaxed) {
100            break TerminationReason::Signal;
101        }
102        let Some(chunk) = source.next_chunk() else {
103            break TerminationReason::SourceExhausted;
104        };
105        let mono = mono_mixdown(&chunk, channels);
106        let resampled = resampler.push(&mono)?;
107        detector.push(&resampled);
108        buffer.extend_from_slice(&resampled);
109        if detector.is_idle() {
110            break TerminationReason::Idle;
111        }
112    };
113
114    // Drain the resampler tail when the source exhausted naturally. We
115    // skip this on Signal-driven shutdown to keep the cut crisp.
116    if matches!(
117        termination,
118        TerminationReason::SourceExhausted | TerminationReason::Idle
119    ) {
120        let tail = resampler.flush()?;
121        detector.push(&tail);
122        buffer.extend_from_slice(&tail);
123    }
124
125    // Trim trailing silence only when the idle detector fired.
126    let (samples_to_write, trimmed): (&[f32], u64) = if termination == TerminationReason::Idle {
127        let tail = detector.trailing_silence_samples();
128        let trimmed = trim_trailing_silence(&buffer, tail);
129        let dropped = (buffer.len() - trimmed.len()) as u64;
130        (trimmed, dropped)
131    } else {
132        (buffer.as_slice(), 0)
133    };
134
135    // No-audio guard: on natural termination (idle / source exhausted) we
136    // require at least one voiced window. On signal-driven termination the
137    // user explicitly stopped, so we instead just require *some* samples
138    // to write — Ctrl-C before anything was captured is also a hard error,
139    // but a quiet recording the user chose to keep is not.
140    match termination {
141        TerminationReason::Idle | TerminationReason::SourceExhausted => {
142            if !detector.has_any_voice() {
143                return Err(anyhow!(
144                    "No audio detected — every window of the {:.1}s capture was below the \
145                     silence threshold. Is the microphone muted or routed to a different device?",
146                    elapsed_seconds(buffer.len())
147                ));
148            }
149        }
150        TerminationReason::Signal => {
151            if samples_to_write.is_empty() {
152                return Err(anyhow!(
153                    "Stopped before any audio was captured — the stop signal arrived before \
154                     the first sample reached the writer."
155                ));
156            }
157        }
158    }
159
160    let mut writer = WavWriter::create(&opts.output)?;
161    writer.write_samples(samples_to_write)?;
162    let samples_written = writer.samples_written();
163    writer.finalize()?;
164
165    Ok(CaptureSummary {
166        output: opts.output,
167        samples_written,
168        trimmed_samples: trimmed,
169        terminated_by: termination,
170    })
171}
172
173fn elapsed_seconds(samples_at_16k: usize) -> f32 {
174    samples_at_16k as f32 / super::wav::TARGET_SAMPLE_RATE as f32
175}
176
177/// Installs a SIGINT (Ctrl-C) handler that flips the returned flag on
178/// receipt, and returns a fresh `Arc<AtomicBool>` initialised to false.
179///
180/// The flag is the one [`run_capture`] polls each iteration; flipping it
181/// causes the pipeline to terminate with [`TerminationReason::Signal`].
182/// On Unix this uses `signal-hook` (safe, no global state hijack). On
183/// Windows the same `signal-hook` API targets `SIGINT` via the
184/// console-control handler; the call is portable.
185///
186/// Safe to call once per process. A second call adds a *second* handler
187/// that also flips the flag — harmless but redundant. The capture loop
188/// already terminates on the first flip.
189pub fn install_ctrl_c_handler() -> Result<Arc<AtomicBool>> {
190    let flag = Arc::new(AtomicBool::new(false));
191    signal_hook::flag::register(signal_hook::consts::SIGINT, flag.clone())
192        .context("Failed to register SIGINT handler")?;
193    Ok(flag)
194}
195
#[cfg(test)]
// Test code: a panic from `unwrap`/`expect` is an acceptable failure mode.
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    use std::f32::consts::TAU;

    use crate::voice::audio::FileAudioSource;

    /// Builds `voiced_s` seconds of a 440 Hz sine at `amplitude`,
    /// followed by `silent_s` seconds of exact zeros, sampled at `rate`.
    fn voiced_then_silent(rate: u32, voiced_s: f32, silent_s: f32, amplitude: f32) -> Vec<f32> {
        let voiced_n = (rate as f32 * voiced_s) as usize;
        let silent_n = (rate as f32 * silent_s) as usize;
        let mut out: Vec<f32> = (0..voiced_n)
            .map(|i| amplitude * (TAU * 440.0 * i as f32 / rate as f32).sin())
            .collect();
        out.extend(std::iter::repeat_n(0.0, silent_n));
        out
    }

    /// Returns a fresh temp dir plus a `{prefix}.wav` path inside it. The
    /// path is not created; keep the `TempDir` alive for as long as the
    /// path is used.
    fn write_to_temp(prefix: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let tmp = tempfile::TempDir::new().unwrap();
        let path = tmp.path().join(format!("{prefix}.wav"));
        (tmp, path)
    }

    /// Creates a stop flag preset to `value`.
    fn stop_flag(value: bool) -> Arc<AtomicBool> {
        Arc::new(AtomicBool::new(value))
    }

    #[test]
    fn idle_termination_trims_trailing_silence() -> Result<()> {
        // 1 s voiced @ 0.4 amp, then 3 s silence, with idle_after_secs=2.
        // The capture loop breaks as soon as the detector reports idle, so
        // the rest of the source is never read; the silence that was
        // buffered up to that point is what the trim must remove.
        let source_rate = 48_000;
        let samples = voiced_then_silent(source_rate, 1.0, 3.0, 0.4);
        let source = FileAudioSource::from_samples(samples, source_rate, 1, 4800);
        let (_tmp, path) = write_to_temp("idle");
        let summary = run_capture(source, CaptureOpts::new(&path, 2), stop_flag(false))?;

        assert_eq!(summary.terminated_by, TerminationReason::Idle);
        assert!(
            summary.trimmed_samples > 0,
            "tail silence should be trimmed"
        );

        let reader = hound::WavReader::open(&path)?;
        let spec = reader.spec();
        assert_eq!(spec.channels, 1);
        assert_eq!(spec.sample_rate, 16_000);
        assert_eq!(spec.bits_per_sample, 16);

        let frame_count = reader.duration() as usize;
        // Expected ≈ 1 s of voiced content at 16 kHz (~16_000 samples).
        // Sinc warm-up and window-alignment slack mean ±2_000 frames is
        // routine. The hard contract is that the trailing silence was
        // trimmed rather than written out.
        assert!(
            (14_000..=20_000).contains(&frame_count),
            "unexpected frame count after trim: {frame_count}"
        );
        Ok(())
    }

    #[test]
    fn source_exhausted_writes_everything() -> Result<()> {
        // 0.5 s voiced @ 0.4 amp. idle_after_secs huge → never fires; source
        // runs out naturally.
        let source_rate = 16_000; // identity path through resampler
        let voiced_n = 8000;
        let samples: Vec<f32> = (0..voiced_n)
            .map(|i| 0.4 * (TAU * 440.0 * i as f32 / source_rate as f32).sin())
            .collect();
        let source = FileAudioSource::from_samples(samples, source_rate, 1, 1024);
        let (_tmp, path) = write_to_temp("exhausted");
        let summary = run_capture(source, CaptureOpts::new(&path, 60), stop_flag(false))?;

        assert_eq!(summary.terminated_by, TerminationReason::SourceExhausted);
        assert_eq!(summary.trimmed_samples, 0);
        assert_eq!(summary.samples_written, voiced_n as u64);
        Ok(())
    }

    /// Test-only [`AudioSource`] that wraps another source and sets the
    /// supplied stop signal once `flip_after_chunks` chunks have been
    /// returned. Lets us exercise the signal-termination path
    /// deterministically without race conditions.
    struct SignalFlippingSource<S: AudioSource> {
        inner: S,
        stop: Arc<AtomicBool>,
        chunks_returned: u32,
        flip_after_chunks: u32,
    }

    impl<S: AudioSource> AudioSource for SignalFlippingSource<S> {
        fn next_chunk(&mut self) -> Option<Vec<f32>> {
            let chunk = self.inner.next_chunk();
            if chunk.is_some() {
                self.chunks_returned += 1;
                if self.chunks_returned >= self.flip_after_chunks {
                    self.stop.store(true, Ordering::Relaxed);
                }
            }
            chunk
        }
        fn sample_rate(&self) -> u32 {
            self.inner.sample_rate()
        }
        fn channels(&self) -> u16 {
            self.inner.channels()
        }
    }

    #[test]
    fn signal_termination_does_not_trim() -> Result<()> {
        // Long voiced source; the flag is set while the first chunk is
        // being handed out, so that chunk is still processed and the
        // *next* iteration's top-of-loop check ends the capture.
        let source_rate = 16_000;
        let samples: Vec<f32> = (0..160_000)
            .map(|i| 0.4 * (TAU * 440.0 * i as f32 / source_rate as f32).sin())
            .collect();
        let inner = FileAudioSource::from_samples(samples, source_rate, 1, 4000);
        let stop = stop_flag(false);
        let source = SignalFlippingSource {
            inner,
            stop: stop.clone(),
            chunks_returned: 0,
            flip_after_chunks: 1,
        };
        let (_tmp, path) = write_to_temp("signal");
        let summary = run_capture(source, CaptureOpts::new(&path, 5), stop)?;

        assert_eq!(summary.terminated_by, TerminationReason::Signal);
        assert_eq!(
            summary.trimmed_samples, 0,
            "signal termination must not trim"
        );
        assert!(
            summary.samples_written > 0,
            "should have captured something"
        );
        // File exists and is readable.
        let reader = hound::WavReader::open(&path)?;
        assert_eq!(reader.spec().sample_rate, 16_000);
        Ok(())
    }

    #[test]
    fn signal_termination_with_no_captured_audio_fails_loudly() {
        // Pre-flip the signal: the loop's top-of-iteration check exits
        // before reading anything. The orchestrator must error rather
        // than write a zero-sample WAV.
        let source_rate = 16_000;
        let source = FileAudioSource::from_samples(vec![0.0; 16_000], source_rate, 1, 16);
        let stop = stop_flag(true);
        let (_tmp, path) = write_to_temp("signal_empty");
        let err = run_capture(source, CaptureOpts::new(&path, 5), stop).unwrap_err();
        assert!(
            err.to_string()
                .contains("Stopped before any audio was captured"),
            "expected loud failure, got: {err}"
        );
        // The empty-capture guard returns before the WAV writer is
        // created, so nothing may exist at the output path.
        assert!(
            !path.exists(),
            "no WAV should be created when nothing was captured"
        );
    }

    #[test]
    fn silence_only_input_fails_loudly() {
        let source_rate = 16_000;
        let silence = vec![0.0_f32; 16_000 * 6]; // 6 s of silence
        let source = FileAudioSource::from_samples(silence, source_rate, 1, 1024);
        let (_tmp, path) = write_to_temp("silent");
        let err = run_capture(source, CaptureOpts::new(&path, 2), stop_flag(false)).unwrap_err();
        assert!(
            err.to_string().contains("No audio detected"),
            "expected loud failure, got: {err}"
        );
        // The no-voice guard returns before the WAV writer is created, so
        // the caller must never find a file at the output path.
        assert!(
            !path.exists(),
            "no WAV should be created when no voice was detected"
        );
    }
}