// wavekat_vad/lib.rs
//! WaveKat VAD — Unified voice activity detection with multiple backends.
//!
//! This crate provides a common [`VoiceActivityDetector`] trait with
//! implementations for different VAD backends, enabling experimentation
//! and benchmarking across technologies.
//!
//! # Backends
//!
//! | Backend | Feature | Description |
//! |---------|---------|-------------|
//! | WebRTC | `webrtc` (default) | Google's WebRTC VAD — fast, binary output |
//! | Silero | `silero` | Neural network via ONNX Runtime — higher accuracy, continuous probability |
//! | TEN-VAD | `ten-vad` | Agora's TEN-VAD via ONNX — pure Rust, no C dependency |
//!
//! # Feature flags
//!
//! - **`webrtc`** *(default)* — WebRTC VAD backend
//! - **`silero`** — Silero VAD backend (downloads ONNX model at build time)
//! - **`ten-vad`** — TEN-VAD backend (downloads ONNX model at build time)
//! - **`denoise`** — RNNoise-based noise suppression in the preprocessing pipeline
//! - **`serde`** — `Serialize`/`Deserialize` impls for config types
//!
//! # TEN-VAD model license
//!
//! The TEN-VAD ONNX model is licensed under Apache-2.0 with a non-compete clause
//! by the TEN-framework / Agora. It restricts deployment that competes with Agora's
//! offerings. Review the [TEN-VAD license](https://github.com/TEN-framework/ten-vad)
//! before using in production.
//!
//! # Example
//!
//! ```no_run
//! use wavekat_vad::VoiceActivityDetector;
//! use wavekat_vad::backends::webrtc::{WebRtcVad, WebRtcVadMode};
//!
//! let mut vad = WebRtcVad::new(16000, WebRtcVadMode::Quality).unwrap();
//! let samples = vec![0i16; 160]; // 10ms at 16kHz
//! let probability = vad.process(&samples, 16000).unwrap();
//! println!("Speech probability: {probability}");
//! ```

42pub mod adapter;
43pub mod backends;
44pub mod error;
45pub mod frame;
46pub mod preprocessing;
47
48pub use adapter::FrameAdapter;
49
50pub use error::VadError;
51
/// Describes the audio requirements of a VAD backend.
///
/// All fields are plain integers, so the type is `Copy` and cheap to pass
/// by value; it is also `Eq + Hash` so it can key a map of detectors.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct VadCapabilities {
    /// Sample rate in Hz.
    pub sample_rate: u32,
    /// Required frame size in samples.
    pub frame_size: usize,
    /// Frame duration in milliseconds (derived from sample_rate and frame_size).
    pub frame_duration_ms: u32,
}

impl VadCapabilities {
    /// Build capabilities from a sample rate and frame size, deriving
    /// `frame_duration_ms` from the two so the three fields cannot disagree.
    ///
    /// # Panics
    ///
    /// Panics if `sample_rate` is zero.
    pub fn new(sample_rate: u32, frame_size: usize) -> Self {
        assert!(sample_rate > 0, "sample_rate must be non-zero");
        // duration_ms = frame_size / (sample_rate / 1000); widen to u64 so the
        // intermediate `frame_size * 1000` cannot overflow.
        let frame_duration_ms = (frame_size as u64 * 1000 / u64::from(sample_rate)) as u32;
        Self {
            sample_rate,
            frame_size,
            frame_duration_ms,
        }
    }
}

63/// Common interface for voice activity detection backends.
64///
65/// Each backend implements this trait, allowing callers to swap
66/// implementations without changing their processing logic.
67pub trait VoiceActivityDetector: Send {
68 /// Returns the audio requirements of this detector.
69 ///
70 /// Use this to determine the expected sample rate and frame size
71 /// before calling [`process`](Self::process).
72 fn capabilities(&self) -> VadCapabilities;
73
74 /// Process an audio frame and return the probability of speech.
75 ///
76 /// Returns a value between `0.0` (silence) and `1.0` (speech).
77 /// Some backends (e.g. WebRTC) return only binary values (`0.0` or `1.0`),
78 /// while others (e.g. Silero) return continuous probabilities.
79 ///
80 /// # Arguments
81 ///
82 /// * `samples` — Audio samples as 16-bit signed integers, mono channel.
83 /// Must match the `frame_size` from [`capabilities`](Self::capabilities).
84 /// * `sample_rate` — Sample rate in Hz (must match the rate the detector was created with).
85 ///
86 /// # Errors
87 ///
88 /// Returns [`VadError`] if the sample rate or frame size is invalid,
89 /// or if the backend encounters a processing error.
90 fn process(&mut self, samples: &[i16], sample_rate: u32) -> Result<f32, VadError>;
91
92 /// Reset the detector's internal state.
93 ///
94 /// Call this when starting a new audio stream or after a long pause.
95 fn reset(&mut self);
96}