// speech_prep/vad/config.rs
//! VAD configuration and validation.

use crate::error::{Error, Result};
use crate::time::{AudioDuration, AudioTimestamp};

/// Number of nanoseconds in one second, used when converting nanosecond
/// durations to whole sample counts.
const NANOS_PER_SECOND: u128 = 1_000_000_000;
8
/// Configuration for the voice activity detector.
///
/// # Performance Characteristics
///
/// - **Latency**: Typically <2ms per 20ms frame (10% overhead)
/// - **Memory**: ~10KB per detector instance (FFT buffers + state)
/// - **Accuracy**: >95% speech detection on clean audio
///
/// # Configuration Guidelines
///
/// ## Quick Start (Use Defaults)
///
/// ```rust,no_run
/// use speech_prep::VadConfig;
///
/// let config = VadConfig::default(); // Optimized for 16kHz mono speech
/// ```
///
/// ## Advanced Tuning
///
/// **For Noisy Environments**: Increase `activation_margin` to 1.3-1.5
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// let config = VadConfig {
///     activation_margin: 1.4, // Require stronger signal
///     hangover_frames: 5,     // Longer trailing silence tolerance
///     ..VadConfig::default()
/// };
/// ```
///
/// **For Low-Latency Applications**: Reduce `frame_duration`
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// # use speech_prep::time::AudioDuration;
/// let config = VadConfig {
///     frame_duration: AudioDuration::from_millis(10), // 10ms frames
///     ..VadConfig::default()
/// };
/// ```
///
/// **For Soft/Quiet Speech**: Lower `activation_margin`
///
/// ```rust,no_run
/// # use speech_prep::VadConfig;
/// let config = VadConfig {
///     activation_margin: 1.05, // More sensitive
///     min_speech_frames: 2,    // Faster activation
///     ..VadConfig::default()
/// };
/// ```
#[derive(Debug, Clone, Copy)]
pub struct VadConfig {
    /// Expected audio sample rate in Hz.
    ///
    /// **Default**: 16000 (16kHz - optimal for speech)
    ///
    /// **Valid Range**: 8000-48000 Hz
    ///
    /// **Performance Impact**: Higher rates increase FFT computation cost.
    /// At 48kHz, expect ~3x slower processing vs 16kHz.
    ///
    /// **Recommendation**: Use 16kHz unless your audio pipeline requires
    /// otherwise.
    pub sample_rate: u32,

    /// Frame duration used for analysis.
    ///
    /// **Default**: 20ms (320 samples at 16kHz)
    ///
    /// **Valid Range**: 10-50ms
    ///
    /// **Trade-offs**:
    /// - Shorter (10ms): Lower latency, less robust to noise
    /// - Longer (50ms): Higher latency, more stable detection
    ///
    /// **Performance Impact**: 20ms frame = ~1.5ms processing time.
    /// Linear scaling: 10ms → ~0.75ms, 50ms → ~3.75ms.
    pub frame_duration: AudioDuration,

    /// Fractional overlap between adjacent frames.
    ///
    /// **Default**: 0.5 (50% overlap)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Higher overlap increases temporal resolution but adds
    /// computation cost. 50% overlap means processing 2x frames for same audio
    /// duration.
    ///
    /// **Recommendation**: 0.5 for balanced accuracy/performance, 0.75 for
    /// critical applications requiring precise boundary detection.
    pub frame_overlap: f32,

    /// Smoothing factor for rolling energy baseline (exponential moving
    /// average).
    ///
    /// **Default**: 0.85 (85% history, 15% new observation)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls adaptation speed to background noise changes.
    /// - Higher (0.9-0.95): Slower adaptation, stable in constant noise
    /// - Lower (0.7-0.8): Faster adaptation, handles dynamic noise
    ///
    /// **Half-Life**: At 0.85, baseline half-life ≈ 4.3 frames (86ms at
    /// 20ms/frame).
    pub energy_smoothing: f32,

    /// Smoothing factor for rolling spectral flux baseline.
    ///
    /// **Default**: 0.8 (80% history, 20% new observation)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls adaptation to spectral change patterns.
    /// Flux typically more variable than energy, so slightly lower smoothing.
    ///
    /// **Half-Life**: At 0.8, baseline half-life ≈ 3.1 frames (62ms at
    /// 20ms/frame).
    pub flux_smoothing: f32,

    /// Minimum energy floor to prevent division by zero in normalization.
    ///
    /// **Default**: 1e-4 (0.0001)
    ///
    /// **Valid Range**: >0.0 (typically 1e-6 to 1e-3)
    ///
    /// **Effect**: Prevents numerical instability when audio is completely
    /// silent. Value is small enough to not affect real audio.
    pub energy_floor: f32,

    /// Minimum spectral flux floor to prevent division by zero.
    ///
    /// **Default**: 1e-4 (0.0001)
    ///
    /// **Valid Range**: >0.0 (typically 1e-6 to 1e-3)
    ///
    /// **Effect**: Prevents numerical instability in flux calculations.
    pub flux_floor: f32,

    /// Smoothing factor for the dynamic decision threshold.
    ///
    /// **Default**: 0.9 (90% history, 10% new)
    ///
    /// **Valid Range**: [0.0, 1.0)
    ///
    /// **Effect**: Controls how quickly the detector adapts its sensitivity.
    /// Higher values make the threshold more stable, preventing rapid
    /// oscillations in marginal cases.
    pub threshold_smoothing: f32,

    /// Multiplier applied to dynamic threshold to activate speech detection.
    ///
    /// **Default**: 1.1 (110% of baseline threshold)
    ///
    /// **Valid Range**: ≥1.0
    ///
    /// **Effect**: Creates hysteresis to prevent chattering at boundaries.
    /// - 1.05-1.1: High sensitivity (detects soft speech, more false positives)
    /// - 1.2-1.5: Low sensitivity (robust to noise, may miss quiet speech)
    ///
    /// **Recommendation**: Start with 1.1, increase if too many false
    /// activations.
    pub activation_margin: f32,

    /// Multiplier applied to dynamic threshold when releasing to silence.
    ///
    /// **Default**: 0.9 (90% of baseline threshold)
    ///
    /// **Valid Range**: >0.0, must be ≤ `activation_margin`
    ///
    /// **Effect**: Creates hysteresis to maintain speech state during brief
    /// pauses. Difference between activation and release margins prevents
    /// rapid toggling.
    ///
    /// **Typical Gap**: 0.1-0.3 between margins (e.g., activate=1.2,
    /// release=0.9).
    pub release_margin: f32,

    /// Initial baseline threshold before dynamic adaptation kicks in.
    ///
    /// **Default**: 0.4 (40% of normalized scale)
    ///
    /// **Valid Range**: 0.0-1.0
    ///
    /// **Effect**: Starting point for adaptive threshold. After 10-20 frames,
    /// adaptive algorithm takes over and this value becomes less relevant.
    ///
    /// **Recommendation**: Leave at default unless you know audio
    /// characteristics.
    pub base_threshold: f32,

    /// Weight applied to normalized energy when combining dual metrics.
    ///
    /// **Default**: 0.6 (60% energy, 40% flux)
    ///
    /// **Valid Range**: 0.0-1.0 (combined with `flux_weight` should sum to 1.0)
    ///
    /// **Effect**: Energy detects signal presence, flux detects spectral
    /// changes. Higher energy weight emphasizes volume-based detection.
    ///
    /// **Use Cases**:
    /// - 0.7-0.8: Emphasize loudness (good for clean recordings)
    /// - 0.5-0.6: Balanced (default, works well generally)
    /// - 0.3-0.4: Emphasize spectral change (noisy environments)
    pub energy_weight: f32,

    /// Weight applied to normalized spectral flux when combining metrics.
    ///
    /// **Default**: 0.4 (40% flux, 60% energy)
    ///
    /// **Valid Range**: 0.0-1.0 (combined with `energy_weight` should sum to
    /// 1.0)
    ///
    /// **Effect**: Flux is more robust to constant background noise but can
    /// be fooled by music or non-speech sounds with spectral variation.
    pub flux_weight: f32,

    /// Number of trailing silent frames retained at the end of a speech
    /// segment.
    ///
    /// **Default**: 3 frames (60ms at 20ms/frame)
    ///
    /// **Valid Range**: 0-10 frames (typically)
    ///
    /// **Effect**: Prevents premature cutoff of speech segments during brief
    /// pauses (e.g., between words). Too high causes long trailing silence.
    ///
    /// **Recommendation**:
    /// - 2-3: Normal speech (default)
    /// - 5-8: Slow/hesitant speech
    /// - 0-1: Real-time applications requiring minimal latency
    pub hangover_frames: usize,

    /// Minimum number of speech frames required to emit a segment.
    ///
    /// **Default**: 3 frames (60ms at 20ms/frame)
    ///
    /// **Valid Range**: 1-10 frames (typically)
    ///
    /// **Effect**: Filters out brief noise spikes mistaken for speech.
    /// Too high causes missed short utterances (e.g., "yes", "no").
    ///
    /// **Recommendation**:
    /// - 2-3: Balanced (default)
    /// - 1: Detect very short sounds
    /// - 5+: Only long speech segments
    pub min_speech_frames: usize,

    /// Absolute start time for the first sample processed by this detector.
    ///
    /// **Default**: `AudioTimestamp::EPOCH` (zero-based stream time)
    ///
    /// **Effect**: Used for timestamping detected speech segments. Set this
    /// to the origin you want segment timestamps to use, or leave it as
    /// `EPOCH` for timestamps relative to the start of processing.
    ///
    /// **Use Cases**:
    /// - Live streams: Set to a shared stream origin
    /// - Batch processing: Keep `EPOCH` or provide a known offset
    /// - Testing: Leave as `EPOCH` for deterministic timestamps
    pub stream_start_time: AudioTimestamp,

    /// Optional pre-emphasis coefficient applied before analysis (high-pass
    /// filter).
    ///
    /// **Default**: `Some(0.97)` (standard speech pre-emphasis)
    ///
    /// **Valid Range**: `None` or `Some(0.9-0.99)`
    ///
    /// **Effect**: Applies first-order high-pass filter: `y[n] = x[n] -
    /// α*x[n-1]`
    /// - Boosts high frequencies relative to low frequencies
    /// - Compensates for typical speech spectral tilt (more energy in low
    ///   freqs)
    /// - Improves robustness to low-frequency rumble/hum
    ///
    /// **Recommendation**:
    /// - `Some(0.97)`: Standard for speech (default)
    /// - `Some(0.95)`: More aggressive high-pass (very noisy low-freq
    ///   environment)
    /// - `None`: Disable if audio already pre-emphasized or for
    ///   music/non-speech
    pub pre_emphasis: Option<f32>,
}
296
297impl Default for VadConfig {
298    fn default() -> Self {
299        Self {
300            sample_rate: 16_000,
301            frame_duration: AudioDuration::from_millis(20),
302            frame_overlap: 0.5,
303            energy_smoothing: 0.85,
304            flux_smoothing: 0.8,
305            energy_floor: 1e-4,
306            flux_floor: 1e-4,
307            threshold_smoothing: 0.9,
308            activation_margin: 1.1,
309            release_margin: 0.9,
310            base_threshold: 0.4,
311            energy_weight: 0.6,
312            flux_weight: 0.4,
313            hangover_frames: 3,
314            min_speech_frames: 3,
315            stream_start_time: AudioTimestamp::EPOCH,
316            pre_emphasis: Some(0.97),
317        }
318    }
319}
320
321impl VadConfig {
322    /// Validate configuration invariants.
323    pub fn validate(&self) -> Result<()> {
324        const EPSILON: f32 = 1e-6;
325
326        if self.sample_rate == 0 {
327            return Err(invalid_input("sample_rate must be greater than zero"));
328        }
329
330        if self.frame_duration.as_nanos() as u64 == 0 {
331            return Err(invalid_input("frame_duration must be non-zero"));
332        }
333
334        if !(0.0..1.0).contains(&self.frame_overlap) {
335            return Err(invalid_input("frame_overlap must be within [0.0, 1.0)"));
336        }
337
338        if !(0.0..1.0).contains(&self.energy_smoothing) {
339            return Err(invalid_input("energy_smoothing must be within [0.0, 1.0)"));
340        }
341
342        if !(0.0..1.0).contains(&self.flux_smoothing) {
343            return Err(invalid_input("flux_smoothing must be within [0.0, 1.0)"));
344        }
345
346        if !(0.0..1.0).contains(&self.threshold_smoothing) {
347            return Err(invalid_input(
348                "threshold_smoothing must be within [0.0, 1.0)",
349            ));
350        }
351
352        if self.activation_margin < 1.0 {
353            return Err(invalid_input("activation_margin must be >= 1.0"));
354        }
355
356        if self.release_margin <= 0.0 {
357            return Err(invalid_input("release_margin must be positive"));
358        }
359
360        if self.release_margin > self.activation_margin {
361            return Err(invalid_input("release_margin must be <= activation_margin"));
362        }
363
364        if self.base_threshold <= 0.0 {
365            return Err(invalid_input("base_threshold must be positive"));
366        }
367
368        if self.energy_weight < 0.0 || self.flux_weight < 0.0 {
369            return Err(invalid_input("metric weights must be non-negative"));
370        }
371
372        let weight_sum = self.energy_weight + self.flux_weight;
373        if weight_sum.abs() < EPSILON {
374            return Err(invalid_input("metric weights must not both be zero"));
375        }
376
377        if self.min_speech_frames == 0 {
378            return Err(invalid_input("min_speech_frames must be greater than zero"));
379        }
380
381        if let Some(coeff) = self.pre_emphasis {
382            if !(0.0..1.0).contains(&coeff) {
383                return Err(invalid_input(
384                    "pre_emphasis coefficient must be in [0.0, 1.0)",
385                ));
386            }
387        }
388
389        Ok(())
390    }
391
392    /// Frame length in samples derived from the configured duration and sample
393    /// rate.
394    /// Returns an error if the computed frame length exceeds platform limits.
395    pub fn frame_length_samples(&self) -> Result<usize> {
396        let sr = u128::from(self.sample_rate);
397        let nanos = self.frame_duration.as_nanos();
398        let numerator = nanos
399            .saturating_mul(sr)
400            .saturating_add(NANOS_PER_SECOND / 2);
401        let samples = usize::try_from(numerator / NANOS_PER_SECOND)
402            .map_err(|_| invalid_input("frame duration too large for platform"))?;
403        // Ensure minimum of 1 sample to prevent division by zero downstream
404        Ok(samples.max(1))
405    }
406
407    /// Hop size in samples considering the configured frame overlap.
408    pub fn hop_length_samples(&self) -> Result<usize> {
409        let frame_length = self.frame_length_samples()?;
410        let hop = (frame_length as f32 * (1.0 - self.frame_overlap)).round() as usize;
411        Ok(hop.max(1))
412    }
413
414    /// FFT size for spectral analysis (next power of two of the frame length).
415    pub fn fft_size(&self) -> Result<usize> {
416        Ok(self.frame_length_samples()?.next_power_of_two())
417    }
418}
419
420fn invalid_input(message: impl Into<String>) -> Error {
421    Error::InvalidInput(message.into())
422}