wavekat_vad/lib.rs
1//! WaveKat VAD — Unified voice activity detection with multiple backends.
2//!
3//! This crate provides a common [`VoiceActivityDetector`] trait with
4//! implementations for different VAD backends, enabling experimentation
5//! and benchmarking across technologies.
6//!
7//! # Backends
8//!
9//! | Backend | Feature | Description |
10//! |---------|---------|-------------|
11//! | WebRTC | `webrtc` (default) | Google's WebRTC VAD — fast, binary output |
12//! | Silero | `silero` | Neural network via ONNX Runtime — higher accuracy, continuous probability |
13//! | TEN-VAD | `ten-vad` | Agora's TEN-VAD via ONNX — pure Rust, no C dependency |
14//!
15//! # Feature flags
16//!
17//! - **`webrtc`** *(default)* — WebRTC VAD backend
18//! - **`silero`** — Silero VAD backend (downloads ONNX model at build time)
19//! - **`ten-vad`** — TEN-VAD backend (downloads ONNX model at build time)
20//! - **`denoise`** — RNNoise-based noise suppression in the preprocessing pipeline
21//! - **`serde`** — `Serialize`/`Deserialize` impls for config types
22//!
23//! # TEN-VAD model license
24//!
//! The TEN-VAD ONNX model is distributed by the TEN-framework / Agora under
//! Apache-2.0 with an additional non-compete clause that restricts deployments
//! competing with Agora's offerings. Review the
//! [TEN-VAD license](https://github.com/TEN-framework/ten-vad) before using it
//! in production.
29//!
30//! # Example
31//!
32//! ```no_run
33//! # #[cfg(feature = "webrtc")]
34//! # {
35//! use wavekat_vad::VoiceActivityDetector;
36//! use wavekat_vad::backends::webrtc::{WebRtcVad, WebRtcVadMode};
37//!
38//! let mut vad = WebRtcVad::new(16000, WebRtcVadMode::Quality).unwrap();
39//! let samples = vec![0i16; 160]; // 10ms at 16kHz
40//! let probability = vad.process(&samples, 16000).unwrap();
41//! println!("Speech probability: {probability}");
42//! # }
43//! ```
44
45pub mod adapter;
46pub mod backends;
47pub mod error;
48pub mod frame;
49pub mod preprocessing;
50
51pub use adapter::FrameAdapter;
52
53pub use error::VadError;
54
/// Audio format a VAD backend expects its input frames to have.
///
/// Returned by [`VoiceActivityDetector::capabilities`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct VadCapabilities {
    /// Sampling rate of the expected audio, in Hz.
    pub sample_rate: u32,
    /// Number of samples each frame must contain.
    pub frame_size: usize,
    /// Length of one frame in milliseconds; follows from `frame_size` and
    /// `sample_rate` (e.g. 160 samples at 16 kHz is 10 ms).
    pub frame_duration_ms: u32,
}
65
66/// Common interface for voice activity detection backends.
67///
68/// Each backend implements this trait, allowing callers to swap
69/// implementations without changing their processing logic.
70pub trait VoiceActivityDetector: Send {
71 /// Returns the audio requirements of this detector.
72 ///
73 /// Use this to determine the expected sample rate and frame size
74 /// before calling [`process`](Self::process).
75 fn capabilities(&self) -> VadCapabilities;
76
77 /// Process an audio frame and return the probability of speech.
78 ///
79 /// Returns a value between `0.0` (silence) and `1.0` (speech).
80 /// Some backends (e.g. WebRTC) return only binary values (`0.0` or `1.0`),
81 /// while others (e.g. Silero) return continuous probabilities.
82 ///
83 /// # Arguments
84 ///
85 /// * `samples` — Audio samples as 16-bit signed integers, mono channel.
86 /// Must match the `frame_size` from [`capabilities`](Self::capabilities).
87 /// * `sample_rate` — Sample rate in Hz (must match the rate the detector was created with).
88 ///
89 /// # Errors
90 ///
91 /// Returns [`VadError`] if the sample rate or frame size is invalid,
92 /// or if the backend encounters a processing error.
93 fn process(&mut self, samples: &[i16], sample_rate: u32) -> Result<f32, VadError>;
94
95 /// Reset the detector's internal state.
96 ///
97 /// Call this when starting a new audio stream or after a long pause.
98 fn reset(&mut self);
99}