// wavekat_vad/lib.rs

//! WaveKat VAD — Unified voice activity detection with multiple backends.
//!
//! This crate provides a common [`VoiceActivityDetector`] trait with
//! implementations for different VAD backends, enabling experimentation
//! and benchmarking across technologies.
//!
//! # Backends
//!
//! | Backend | Feature | Description |
//! |---------|---------|-------------|
//! | WebRTC | `webrtc` (default) | Google's WebRTC VAD — fast, binary output |
//! | Silero | `silero` | Neural network via ONNX Runtime — higher accuracy, continuous probability |
//! | TEN-VAD | `ten-vad` | Agora's TEN-VAD via ONNX — pure Rust, no C dependency |
//!
//! # Feature flags
//!
//! - **`webrtc`** *(default)* — WebRTC VAD backend
//! - **`silero`** — Silero VAD backend (downloads ONNX model at build time)
//! - **`ten-vad`** — TEN-VAD backend (downloads ONNX model at build time)
//! - **`denoise`** — RNNoise-based noise suppression in the preprocessing pipeline
//! - **`serde`** — `Serialize`/`Deserialize` impls for config types
//!
//! # TEN-VAD model license
//!
//! The TEN-VAD ONNX model is licensed by TEN-framework / Agora under Apache-2.0
//! with an additional non-compete clause that restricts deployments competing
//! with Agora's offerings. Review the
//! [TEN-VAD license](https://github.com/TEN-framework/ten-vad) before using it
//! in production.
//!
//! # Example
//!
//! ```no_run
//! # #[cfg(feature = "webrtc")]
//! # {
//! use wavekat_vad::VoiceActivityDetector;
//! use wavekat_vad::backends::webrtc::{WebRtcVad, WebRtcVadMode};
//!
//! let mut vad = WebRtcVad::new(16000, WebRtcVadMode::Quality).unwrap();
//! let samples = vec![0i16; 160]; // 10ms at 16kHz
//! let probability = vad.process(&samples, 16000).unwrap();
//! println!("Speech probability: {probability}");
//! # }
//! ```

45pub mod adapter;
46pub mod backends;
47pub mod error;
48pub mod frame;
49pub mod preprocessing;
50
51pub use adapter::FrameAdapter;
52
53pub use error::VadError;
54
/// Describes the audio requirements of a VAD backend.
///
/// Every field is a plain integer, so the value is `Copy` and `Hash`: it can
/// be passed around by value and used as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct VadCapabilities {
    /// Sample rate in Hz.
    pub sample_rate: u32,
    /// Required frame size in samples.
    pub frame_size: usize,
    /// Frame duration in milliseconds (derived from `sample_rate` and `frame_size`).
    ///
    /// Nothing in this type enforces that relationship; whoever constructs the
    /// value is responsible for keeping the three fields consistent.
    pub frame_duration_ms: u32,
}

66/// Common interface for voice activity detection backends.
67///
68/// Each backend implements this trait, allowing callers to swap
69/// implementations without changing their processing logic.
70pub trait VoiceActivityDetector: Send {
71    /// Returns the audio requirements of this detector.
72    ///
73    /// Use this to determine the expected sample rate and frame size
74    /// before calling [`process`](Self::process).
75    fn capabilities(&self) -> VadCapabilities;
76
77    /// Process an audio frame and return the probability of speech.
78    ///
79    /// Returns a value between `0.0` (silence) and `1.0` (speech).
80    /// Some backends (e.g. WebRTC) return only binary values (`0.0` or `1.0`),
81    /// while others (e.g. Silero) return continuous probabilities.
82    ///
83    /// # Arguments
84    ///
85    /// * `samples` — Audio samples as 16-bit signed integers, mono channel.
86    ///   Must match the `frame_size` from [`capabilities`](Self::capabilities).
87    /// * `sample_rate` — Sample rate in Hz (must match the rate the detector was created with).
88    ///
89    /// # Errors
90    ///
91    /// Returns [`VadError`] if the sample rate or frame size is invalid,
92    /// or if the backend encounters a processing error.
93    fn process(&mut self, samples: &[i16], sample_rate: u32) -> Result<f32, VadError>;
94
95    /// Reset the detector's internal state.
96    ///
97    /// Call this when starting a new audio stream or after a long pause.
98    fn reset(&mut self);
99}