// speech_prep/vad/config.rs
//! VAD configuration and validation.

use crate::error::{Error, Result};
use crate::time::{AudioDuration, AudioTimestamp};
5
/// Number of nanoseconds in one second, used when converting
/// `AudioDuration` nanosecond counts into sample counts.
const NANOS_PER_SECOND: u128 = 1_000_000_000;
8
/// Configuration for the voice activity detector.
///
/// # Performance Characteristics
///
/// - **Latency**: Typically <2ms per 20ms frame (10% overhead)
/// - **Memory**: ~10KB per detector instance (FFT buffers + state)
/// - **Accuracy**: >95% speech detection on clean audio
///
/// # Configuration Guidelines
///
/// ## Quick Start (Use Defaults)
///
/// ```rust,no_run
/// use speech_prep::VadConfig;
///
/// let config = VadConfig::default(); // Optimized for 16kHz mono speech
/// ```
///
/// ## Advanced Tuning
///
/// **For Noisy Environments**: Increase `activation_margin` to 1.3-1.5
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// let config = VadConfig {
///     activation_margin: 1.4, // Require stronger signal
///     hangover_frames: 5,     // Longer trailing silence tolerance
///     ..VadConfig::default()
/// };
/// ```
///
/// **For Low-Latency Applications**: Reduce `frame_duration`
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// # use speech_prep::time::AudioDuration;
/// let config = VadConfig {
///     frame_duration: AudioDuration::from_millis(10), // 10ms frames
///     ..VadConfig::default()
/// };
/// ```
///
/// **For Soft/Quiet Speech**: Lower `activation_margin`
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// let config = VadConfig {
///     activation_margin: 1.05, // More sensitive
///     min_speech_frames: 2,    // Faster activation
///     ..VadConfig::default()
/// };
/// ```
///
/// Call [`VadConfig::validate`] after construction to check invariants.
#[derive(Debug, Clone, Copy)]
pub struct VadConfig {
    /// Expected audio sample rate in Hz.
    ///
    /// **Default**: 16000 (16kHz - optimal for speech)
    ///
    /// **Valid Range**: 8000-48000 Hz
    ///
    /// **Performance Impact**: Higher rates increase FFT computation cost.
    /// At 48kHz, expect ~3x slower processing vs 16kHz.
    ///
    /// **Recommendation**: Use 16kHz unless your audio pipeline requires
    /// otherwise.
    pub sample_rate: u32,

    /// Frame duration used for analysis.
    ///
    /// **Default**: 20ms (320 samples at 16kHz)
    ///
    /// **Valid Range**: 10-50ms
    ///
    /// **Trade-offs**:
    /// - Shorter (10ms): Lower latency, less robust to noise
    /// - Longer (50ms): Higher latency, more stable detection
    ///
    /// **Performance Impact**: 20ms frame = ~1.5ms processing time.
    /// Linear scaling: 10ms → ~0.75ms, 50ms → ~3.75ms.
    pub frame_duration: AudioDuration,

    /// Fractional overlap between adjacent frames.
    ///
    /// **Default**: 0.5 (50% overlap)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Higher overlap increases temporal resolution but adds
    /// computation cost. 50% overlap means processing 2x frames for same
    /// audio duration.
    ///
    /// **Recommendation**: 0.5 for balanced accuracy/performance, 0.75 for
    /// critical applications requiring precise boundary detection.
    pub frame_overlap: f32,

    /// Smoothing factor for rolling energy baseline (exponential moving
    /// average).
    ///
    /// **Default**: 0.85 (85% history, 15% new observation)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls adaptation speed to background noise changes.
    /// - Higher (0.9-0.95): Slower adaptation, stable in constant noise
    /// - Lower (0.7-0.8): Faster adaptation, handles dynamic noise
    ///
    /// **Half-Life**: At 0.85, baseline half-life ≈ 4.3 frames (86ms at
    /// 20ms/frame).
    pub energy_smoothing: f32,

    /// Smoothing factor for rolling spectral flux baseline.
    ///
    /// **Default**: 0.8 (80% history, 20% new observation)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls adaptation to spectral change patterns.
    /// Flux typically more variable than energy, so slightly lower smoothing.
    ///
    /// **Half-Life**: At 0.8, baseline half-life ≈ 3.1 frames (62ms at
    /// 20ms/frame).
    pub flux_smoothing: f32,

    /// Minimum energy floor to prevent division by zero in normalization.
    ///
    /// **Default**: 1e-4 (0.0001)
    ///
    /// **Valid Range**: >0.0 (typically 1e-6 to 1e-3)
    ///
    /// **Effect**: Prevents numerical instability when audio is completely
    /// silent. Value is small enough to not affect real audio.
    pub energy_floor: f32,

    /// Minimum spectral flux floor to prevent division by zero.
    ///
    /// **Default**: 1e-4 (0.0001)
    ///
    /// **Valid Range**: >0.0 (typically 1e-6 to 1e-3)
    ///
    /// **Effect**: Prevents numerical instability in flux calculations.
    pub flux_floor: f32,

    /// Smoothing factor for the dynamic decision threshold.
    ///
    /// **Default**: 0.9 (90% history, 10% new)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls how quickly the detector adapts its sensitivity.
    /// Higher values make the threshold more stable, preventing rapid
    /// oscillations in marginal cases.
    pub threshold_smoothing: f32,

    /// Multiplier applied to dynamic threshold to activate speech detection.
    ///
    /// **Default**: 1.1 (110% of baseline threshold)
    ///
    /// **Valid Range**: ≥1.0
    ///
    /// **Effect**: Creates hysteresis to prevent chattering at boundaries.
    /// - 1.05-1.1: High sensitivity (detects soft speech, more false
    ///   positives)
    /// - 1.2-1.5: Low sensitivity (robust to noise, may miss quiet speech)
    ///
    /// **Recommendation**: Start with 1.1, increase if too many false
    /// activations.
    pub activation_margin: f32,

    /// Multiplier applied to dynamic threshold when releasing to silence.
    ///
    /// **Default**: 0.9 (90% of baseline threshold)
    ///
    /// **Valid Range**: >0.0, must be ≤ `activation_margin`
    ///
    /// **Effect**: Creates hysteresis to maintain speech state during brief
    /// pauses. Difference between activation and release margins prevents
    /// rapid toggling.
    ///
    /// **Typical Gap**: 0.1-0.3 between margins (e.g., activate=1.2,
    /// release=0.9).
    pub release_margin: f32,

    /// Initial baseline threshold before dynamic adaptation kicks in.
    ///
    /// **Default**: 0.4 (40% of normalized scale)
    ///
    /// **Valid Range**: 0.0-1.0
    ///
    /// **Effect**: Starting point for adaptive threshold. After 10-20
    /// frames, adaptive algorithm takes over and this value becomes less
    /// relevant.
    ///
    /// **Recommendation**: Leave at default unless you know audio
    /// characteristics.
    pub base_threshold: f32,

    /// Weight applied to normalized energy when combining dual metrics.
    ///
    /// **Default**: 0.6 (60% energy, 40% flux)
    ///
    /// **Valid Range**: 0.0-1.0 (combined with `flux_weight` should sum to
    /// 1.0)
    ///
    /// **Effect**: Energy detects signal presence, flux detects spectral
    /// changes. Higher energy weight emphasizes volume-based detection.
    ///
    /// **Use Cases**:
    /// - 0.7-0.8: Emphasize loudness (good for clean recordings)
    /// - 0.5-0.6: Balanced (default, works well generally)
    /// - 0.3-0.4: Emphasize spectral change (noisy environments)
    pub energy_weight: f32,

    /// Weight applied to normalized spectral flux when combining metrics.
    ///
    /// **Default**: 0.4 (40% flux, 60% energy)
    ///
    /// **Valid Range**: 0.0-1.0 (combined with `energy_weight` should sum
    /// to 1.0)
    ///
    /// **Effect**: Flux is more robust to constant background noise but can
    /// be fooled by music or non-speech sounds with spectral variation.
    pub flux_weight: f32,

    /// Number of trailing silent frames retained at the end of a speech
    /// segment.
    ///
    /// **Default**: 3 frames (60ms at 20ms/frame)
    ///
    /// **Valid Range**: 0-10 frames (typically)
    ///
    /// **Effect**: Prevents premature cutoff of speech segments during brief
    /// pauses (e.g., between words). Too high causes long trailing silence.
    ///
    /// **Recommendation**:
    /// - 2-3: Normal speech (default)
    /// - 5-8: Slow/hesitant speech
    /// - 0-1: Real-time applications requiring minimal latency
    pub hangover_frames: usize,

    /// Minimum number of speech frames required to emit a segment.
    ///
    /// **Default**: 3 frames (60ms at 20ms/frame)
    ///
    /// **Valid Range**: 1-10 frames (typically)
    ///
    /// **Effect**: Filters out brief noise spikes mistaken for speech.
    /// Too high causes missed short utterances (e.g., "yes", "no").
    ///
    /// **Recommendation**:
    /// - 2-3: Balanced (default)
    /// - 1: Detect very short sounds
    /// - 5+: Only long speech segments
    pub min_speech_frames: usize,

    /// Absolute start time for the first sample processed by this detector.
    ///
    /// **Default**: `AudioTimestamp::EPOCH` (zero-based stream time)
    ///
    /// **Effect**: Used for timestamping detected speech segments. Set this
    /// to the origin you want segment timestamps to use, or leave it as
    /// `EPOCH` for timestamps relative to the start of processing.
    ///
    /// **Use Cases**:
    /// - Live streams: Set to a shared stream origin
    /// - Batch processing: Keep `EPOCH` or provide a known offset
    /// - Testing: Leave as `EPOCH` for deterministic timestamps
    pub stream_start_time: AudioTimestamp,

    /// Optional pre-emphasis coefficient applied before analysis (high-pass
    /// filter).
    ///
    /// **Default**: `Some(0.97)` (standard speech pre-emphasis)
    ///
    /// **Valid Range**: `None` or `Some(0.9-0.99)`
    ///
    /// **Effect**: Applies first-order high-pass filter:
    /// `y[n] = x[n] - α*x[n-1]`
    /// - Boosts high frequencies relative to low frequencies
    /// - Compensates for typical speech spectral tilt (more energy in low
    ///   freqs)
    /// - Improves robustness to low-frequency rumble/hum
    ///
    /// **Recommendation**:
    /// - `Some(0.97)`: Standard for speech (default)
    /// - `Some(0.95)`: More aggressive high-pass (very noisy low-freq
    ///   environment)
    /// - `None`: Disable if audio already pre-emphasized or for
    ///   music/non-speech
    pub pre_emphasis: Option<f32>,
}
296
297impl Default for VadConfig {
298 fn default() -> Self {
299 Self {
300 sample_rate: 16_000,
301 frame_duration: AudioDuration::from_millis(20),
302 frame_overlap: 0.5,
303 energy_smoothing: 0.85,
304 flux_smoothing: 0.8,
305 energy_floor: 1e-4,
306 flux_floor: 1e-4,
307 threshold_smoothing: 0.9,
308 activation_margin: 1.1,
309 release_margin: 0.9,
310 base_threshold: 0.4,
311 energy_weight: 0.6,
312 flux_weight: 0.4,
313 hangover_frames: 3,
314 min_speech_frames: 3,
315 stream_start_time: AudioTimestamp::EPOCH,
316 pre_emphasis: Some(0.97),
317 }
318 }
319}
320
321impl VadConfig {
322 /// Validate configuration invariants.
323 pub fn validate(&self) -> Result<()> {
324 const EPSILON: f32 = 1e-6;
325
326 if self.sample_rate == 0 {
327 return Err(invalid_input("sample_rate must be greater than zero"));
328 }
329
330 if self.frame_duration.as_nanos() as u64 == 0 {
331 return Err(invalid_input("frame_duration must be non-zero"));
332 }
333
334 if !(0.0..1.0).contains(&self.frame_overlap) {
335 return Err(invalid_input("frame_overlap must be within [0.0, 1.0)"));
336 }
337
338 if !(0.0..1.0).contains(&self.energy_smoothing) {
339 return Err(invalid_input("energy_smoothing must be within [0.0, 1.0)"));
340 }
341
342 if !(0.0..1.0).contains(&self.flux_smoothing) {
343 return Err(invalid_input("flux_smoothing must be within [0.0, 1.0)"));
344 }
345
346 if !(0.0..1.0).contains(&self.threshold_smoothing) {
347 return Err(invalid_input(
348 "threshold_smoothing must be within [0.0, 1.0)",
349 ));
350 }
351
352 if self.activation_margin < 1.0 {
353 return Err(invalid_input("activation_margin must be >= 1.0"));
354 }
355
356 if self.release_margin <= 0.0 {
357 return Err(invalid_input("release_margin must be positive"));
358 }
359
360 if self.release_margin > self.activation_margin {
361 return Err(invalid_input("release_margin must be <= activation_margin"));
362 }
363
364 if self.base_threshold <= 0.0 {
365 return Err(invalid_input("base_threshold must be positive"));
366 }
367
368 if self.energy_weight < 0.0 || self.flux_weight < 0.0 {
369 return Err(invalid_input("metric weights must be non-negative"));
370 }
371
372 let weight_sum = self.energy_weight + self.flux_weight;
373 if weight_sum.abs() < EPSILON {
374 return Err(invalid_input("metric weights must not both be zero"));
375 }
376
377 if self.min_speech_frames == 0 {
378 return Err(invalid_input("min_speech_frames must be greater than zero"));
379 }
380
381 if let Some(coeff) = self.pre_emphasis {
382 if !(0.0..1.0).contains(&coeff) {
383 return Err(invalid_input(
384 "pre_emphasis coefficient must be in [0.0, 1.0)",
385 ));
386 }
387 }
388
389 Ok(())
390 }
391
392 /// Frame length in samples derived from the configured duration and sample
393 /// rate.
394 /// Returns an error if the computed frame length exceeds platform limits.
395 pub fn frame_length_samples(&self) -> Result<usize> {
396 let sr = u128::from(self.sample_rate);
397 let nanos = self.frame_duration.as_nanos();
398 let numerator = nanos
399 .saturating_mul(sr)
400 .saturating_add(NANOS_PER_SECOND / 2);
401 let samples = usize::try_from(numerator / NANOS_PER_SECOND)
402 .map_err(|_| invalid_input("frame duration too large for platform"))?;
403 // Ensure minimum of 1 sample to prevent division by zero downstream
404 Ok(samples.max(1))
405 }
406
407 /// Hop size in samples considering the configured frame overlap.
408 pub fn hop_length_samples(&self) -> Result<usize> {
409 let frame_length = self.frame_length_samples()?;
410 let hop = (frame_length as f32 * (1.0 - self.frame_overlap)).round() as usize;
411 Ok(hop.max(1))
412 }
413
414 /// FFT size for spectral analysis (next power of two of the frame length).
415 pub fn fft_size(&self) -> Result<usize> {
416 Ok(self.frame_length_samples()?.next_power_of_two())
417 }
418}
419
420fn invalid_input(message: impl Into<String>) -> Error {
421 Error::InvalidInput(message.into())
422}