oximedia-codec 0.1.7

Video codec implementations for OxiMedia
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
//! Voice Activity Detection (VAD) for the Opus SILK encoder.
//!
//! This module implements a multi-stage VAD pipeline suitable for use inside
//! the SILK encoder path of the Opus codec.  The design is inspired by the
//! WebRTC VAD (GMM-based + energy) and the ITU-T G.729B Annex B VAD, but is
//! entirely independent of those implementations.
//!
//! # Algorithm
//!
//! 1. **Energy gate** – Treat a frame as a speech candidate only when its
//!    short-term energy exceeds the noise-floor estimate by more than
//!    `energy_threshold_db` dB.
//! 2. **Spectral flatness** – Compute the geometric-to-arithmetic mean ratio
//!    of the sub-band energies.  A flat spectrum (close to 1) is characteristic
//!    of noise; a peaked spectrum indicates voiced/unvoiced speech.
//! 3. **Zero-crossing rate** – A ZCR above `zcr_high_threshold` marks the
//!    frame as noise-like; a lower ZCR counts as evidence for speech.
//! 4. **Hangover** – Maintain a voice hangover counter so that short gaps
//!    (e.g. stop consonants) are not misclassified as silence.
//!
//! # References
//!
//! - RFC 6716, §3 (Opus SILK mode description)
//! - ITU-T G.729 Annex B – "A silence compression scheme"
//! - "A Computationally Efficient VAD for Narrowband Speech", Sohn et al.

#![forbid(unsafe_code)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_lossless)]

// ─────────────────────────────────────────────────────────────────────────────
// Constants
// ─────────────────────────────────────────────────────────────────────────────

/// Number of sub-bands used for spectral analysis.
///
/// Sizes the per-frame sub-band energy array and the averages taken by the
/// spectral-flatness measure.
const NUM_SUBBANDS: usize = 4;

/// Default energy threshold (dB above estimated noise floor).
const DEFAULT_ENERGY_THRESHOLD_DB: f32 = 10.0;

/// Default hangover duration in frames.
///
/// The detector keeps reporting voice for this many frames after the last
/// frame that was classified as speech.
const DEFAULT_HANGOVER_FRAMES: u32 = 8;

/// Smoothing factor for the noise floor tracker (first-order IIR).
/// At α = 0.995 the time constant is 1 / (1 − α) = 200 frames ≈ 4 s at 20 ms/frame.
const NOISE_TRACK_ALPHA: f32 = 0.995;

/// Smoothing factor used when the frame energy exceeds the current noise
/// floor, i.e. the floor is rising (fast adaptation).
const NOISE_TRACK_ALPHA_RISE: f32 = 0.90;

/// Spectral flatness threshold: values below this indicate peaky (speech-like) spectra.
const SPECTRAL_FLATNESS_THRESHOLD: f32 = 0.70;

/// Zero-crossing rate threshold per 160 samples (10 ms @ 16 kHz).
/// Above this value the frame is considered noise-like.
const ZCR_HIGH_THRESHOLD: f32 = 60.0;

// ─────────────────────────────────────────────────────────────────────────────
// VAD decision
// ─────────────────────────────────────────────────────────────────────────────

/// Per-frame verdict produced by the voice activity detector.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VadDecision {
    /// Speech is present in the frame.
    Voice,
    /// No speech — the encoder may apply comfort noise or DTX.
    Silence,
}

impl VadDecision {
    /// Reports whether this decision is [`VadDecision::Voice`].
    #[must_use]
    pub fn is_voice(self) -> bool {
        matches!(self, Self::Voice)
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Per-frame features
// ─────────────────────────────────────────────────────────────────────────────

/// Acoustic measurements extracted from one analysis frame.
#[derive(Debug, Clone)]
pub struct FrameFeatures {
    /// Short-term energy: the sum of the squared samples (not normalised by
    /// frame length).
    pub energy: f32,
    /// Spectral flatness measure (0 = tonal/speech, 1 = flat/noise).
    pub spectral_flatness: f32,
    /// Zero-crossing count scaled to crossings per 160 samples.
    pub zcr: f32,
    /// Per-band energies as returned by `compute_subband_energies`.
    pub subband_energy: [f32; NUM_SUBBANDS],
}

impl FrameFeatures {
    /// Extract features from 16-bit mono PCM.
    ///
    /// `sample_rate` is forwarded unchanged to the sub-band analysis.
    /// An empty `samples` slice yields zero energy, zero ZCR, zero sub-band
    /// energies and a spectral flatness of `1.0`.
    #[must_use]
    pub fn from_pcm_i16(samples: &[i16], sample_rate: u32) -> Self {
        if samples.is_empty() {
            return Self::zeroed();
        }

        // Frame energy: accumulate squares in sample order.
        let energy: f32 = samples
            .iter()
            .map(|&sample| {
                let value = f32::from(sample);
                value * value
            })
            .sum();

        // A crossing is any adjacent pair where exactly one sample is
        // negative; zero is treated as non-negative.
        let crossings = samples
            .windows(2)
            .filter(|pair| (pair[0] >= 0) != (pair[1] >= 0))
            .count();
        // Rescale so frames of any length report crossings per 160 samples.
        let per_160 = 160.0 / samples.len() as f32;
        let zcr = crossings as f32 * per_160;

        // Band energies first, then the flatness derived from them.
        let subband_energy = compute_subband_energies(samples, sample_rate);
        let spectral_flatness = spectral_flatness_from_bands(&subband_energy);

        Self {
            energy,
            spectral_flatness,
            zcr,
            subband_energy,
        }
    }

    /// Extract features from f32 PCM nominally in [-1, 1].
    ///
    /// Samples are clamped to [-1, 1], scaled by 32767 and truncated to
    /// `i16`, then handed to [`FrameFeatures::from_pcm_i16`].
    #[must_use]
    pub fn from_pcm_f32(samples: &[f32], sample_rate: u32) -> Self {
        let mut quantised: Vec<i16> = Vec::with_capacity(samples.len());
        for &sample in samples {
            let scaled = sample.clamp(-1.0, 1.0) * 32767.0;
            quantised.push(scaled as i16);
        }
        Self::from_pcm_i16(&quantised, sample_rate)
    }

    /// Feature set reported for an empty frame: everything zero except the
    /// spectral flatness, which is `1.0`.
    fn zeroed() -> Self {
        Self {
            subband_energy: [0.0; NUM_SUBBANDS],
            zcr: 0.0,
            spectral_flatness: 1.0,
            energy: 0.0,
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Sub-band and spectral helpers
// ─────────────────────────────────────────────────────────────────────────────

/// Compute approximate per-band mean energies using a four-way interleaved
/// (decimation-by-`NUM_SUBBANDS`) split of the frame.
///
/// Band `k` collects the samples at positions `k`, `k + NUM_SUBBANDS`,
/// `k + 2 * NUM_SUBBANDS`, … and reports the mean of their squares. Each band
/// is normalised by the number of samples it actually received, so frames
/// whose length is not a multiple of `NUM_SUBBANDS` are not biased towards the
/// leading bands. A band that received no samples (frames shorter than
/// `NUM_SUBBANDS`) reports `0.0`.
///
/// NOTE(review): this is an unfiltered sample-domain split, not a frequency
/// filter bank, so the bands do not isolate the frequency ranges
/// 0–Fs/8 … 3Fs/8–Fs/2 that the design aims for. `_sample_rate` is accepted
/// for interface stability but is currently unused.
fn compute_subband_energies(samples: &[i16], _sample_rate: u32) -> [f32; NUM_SUBBANDS] {
    let mut bands = [0.0f32; NUM_SUBBANDS];
    for (i, &s) in samples.iter().enumerate() {
        let value = f32::from(s);
        bands[i % NUM_SUBBANDS] += value * value;
    }

    // Every band gets `full_rounds` samples; the first `leftover` bands get
    // one extra from the trailing partial round.
    let full_rounds = samples.len() / NUM_SUBBANDS;
    let leftover = samples.len() % NUM_SUBBANDS;
    for (k, band) in bands.iter_mut().enumerate() {
        let count = full_rounds + usize::from(k < leftover);
        if count > 0 {
            *band /= count as f32;
        }
    }
    bands
}

/// Spectral flatness of the sub-band energies: the ratio of their geometric
/// mean to their arithmetic mean, capped at `1.0`.
///
/// Equal band energies give a value near 1 (flat, noise-like); energy
/// concentrated in one band gives a value near 0 (peaky). A small bias is
/// added to every band before averaging so that silent bands do not send the
/// logarithm to negative infinity. If the biased arithmetic mean is not
/// positive the function returns `1.0`.
fn spectral_flatness_from_bands(bands: &[f32; NUM_SUBBANDS]) -> f32 {
    const ENERGY_BIAS: f32 = 1e-6;
    let band_count = NUM_SUBBANDS as f32;

    let mut biased = *bands;
    for value in &mut biased {
        *value += ENERGY_BIAS;
    }

    let arithmetic = biased.iter().sum::<f32>() / band_count;
    // Geometric mean via the mean of logarithms.
    let mean_log = biased.iter().map(|value| value.ln()).sum::<f32>() / band_count;
    let geometric = mean_log.exp();

    if arithmetic > 0.0 {
        (geometric / arithmetic).min(1.0)
    } else {
        1.0
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// VAD configuration
// ─────────────────────────────────────────────────────────────────────────────

/// Tunable parameters for the [`VoiceActivityDetector`].
#[derive(Debug, Clone)]
pub struct VadConfig {
    /// How far (in dB) the frame energy must exceed the noise floor for the
    /// energy cue to vote for speech.
    pub energy_threshold_db: f32,
    /// Number of frames for which the detector keeps reporting voice after
    /// the last frame that was classified as speech.
    pub hangover_frames: u32,
    /// Spectral flatness limit: frames below it count as speech-like, frames
    /// at or above it as noise-like.
    pub spectral_flatness_threshold: f32,
    /// Zero-crossing rate above which the frame is considered noise-like.
    pub zcr_high_threshold: f32,
    /// Weight of the energy cue in the fused score (0–1). The remainder is
    /// split equally between the spectral-flatness and ZCR cues.
    pub energy_weight: f32,
}

impl Default for VadConfig {
    /// Build the configuration from the module-level default constants, with
    /// an energy weight of `0.6`.
    fn default() -> Self {
        let energy_threshold_db = DEFAULT_ENERGY_THRESHOLD_DB;
        let hangover_frames = DEFAULT_HANGOVER_FRAMES;
        let spectral_flatness_threshold = SPECTRAL_FLATNESS_THRESHOLD;
        let zcr_high_threshold = ZCR_HIGH_THRESHOLD;

        Self {
            energy_threshold_db,
            hangover_frames,
            spectral_flatness_threshold,
            zcr_high_threshold,
            energy_weight: 0.6,
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// VAD state machine
// ─────────────────────────────────────────────────────────────────────────────

/// Multi-feature voice activity detector.
///
/// Processes PCM frames and emits one [`VadDecision`] per frame. Three binary
/// cues (energy above the tracked noise floor, low spectral flatness, low
/// zero-crossing rate) are fused by a weighted vote; a hangover counter then
/// keeps the output at `Voice` for a few frames after speech ends.
///
/// # Example
///
/// ```
/// use oximedia_codec::opus::vad::{VoiceActivityDetector, VadConfig, VadDecision};
///
/// let mut vad = VoiceActivityDetector::new(VadConfig::default());
/// let silence = vec![0i16; 160];
/// let decision = vad.process_i16(&silence, 16000);
/// assert_eq!(decision, VadDecision::Silence);
/// ```
#[derive(Debug, Clone)]
pub struct VoiceActivityDetector {
    config: VadConfig,
    /// Exponentially smoothed noise floor (energy units, not dB).
    noise_floor: f32,
    /// Hangover counter: number of remaining hangover frames.
    hangover: u32,
    /// Total frames processed.
    frame_count: u64,
    /// Smoothed energy for UI/diagnostics.
    smoothed_energy: f32,
}

impl VoiceActivityDetector {
    /// Initial noise-floor estimate, and the lower bound applied to frame
    /// energies fed into the tracker (so the estimate never drops below it).
    const MIN_NOISE_FLOOR: f32 = 1.0;

    /// Create a new VAD with the given configuration.
    ///
    /// The detector starts with the minimum noise floor, no pending hangover
    /// and zeroed counters.
    #[must_use]
    pub fn new(config: VadConfig) -> Self {
        Self {
            config,
            noise_floor: Self::MIN_NOISE_FLOOR,
            hangover: 0,
            frame_count: 0,
            smoothed_energy: 0.0,
        }
    }

    /// Process a frame of 16-bit PCM samples at `sample_rate` Hz.
    ///
    /// Returns `VadDecision::Voice` if speech is likely present in this frame.
    pub fn process_i16(&mut self, samples: &[i16], sample_rate: u32) -> VadDecision {
        let features = FrameFeatures::from_pcm_i16(samples, sample_rate);
        self.process_features(&features)
    }

    /// Process a frame of f32 PCM samples (range [-1, 1]) at `sample_rate` Hz.
    pub fn process_f32(&mut self, samples: &[f32], sample_rate: u32) -> VadDecision {
        let features = FrameFeatures::from_pcm_f32(samples, sample_rate);
        self.process_features(&features)
    }

    /// Process pre-computed [`FrameFeatures`] and return the decision for
    /// that frame.
    ///
    /// Updates the frame counter, the smoothed energy, the noise-floor
    /// estimate (only on frames whose raw, pre-hangover classification is
    /// silence) and the hangover counter.
    pub fn process_features(&mut self, features: &FrameFeatures) -> VadDecision {
        self.frame_count += 1;

        // Update smoothed energy (EMA)
        self.smoothed_energy = 0.9 * self.smoothed_energy + 0.1 * features.energy;

        // ── 1. Energy cue ────────────────────────────────────────────────────
        let threshold_linear = db_to_linear_energy(self.config.energy_threshold_db);
        let energy_above_noise = features.energy > self.noise_floor * threshold_linear;

        // ── 2. Spectral flatness cue ─────────────────────────────────────────
        // Low flatness (peaky) → speech-like
        let spectral_speech = features.spectral_flatness < self.config.spectral_flatness_threshold;

        // ── 3. ZCR cue ───────────────────────────────────────────────────────
        let zcr_noise = features.zcr > self.config.zcr_high_threshold;

        // ── 4. Fuse cues ─────────────────────────────────────────────────────
        // Weighted vote. The energy weight is clamped to its documented 0–1
        // range so a misconfigured value cannot produce negative weights for
        // the other two cues (or for the energy cue itself).
        let w_e = self.config.energy_weight.clamp(0.0, 1.0);
        let w_s = (1.0 - w_e) * 0.5;
        let w_z = (1.0 - w_e) * 0.5;

        let vote = |cue: bool| f32::from(u8::from(cue));
        let speech_score =
            w_e * vote(energy_above_noise) + w_s * vote(spectral_speech) + w_z * vote(!zcr_noise);

        let raw_voice = speech_score >= 0.5;

        // ── 5. Update noise floor tracker ────────────────────────────────────
        // Driven by the raw classification, i.e. before hangover is applied:
        // hangover frames are raw silence and therefore do adapt the floor.
        if !raw_voice {
            let alpha = if features.energy > self.noise_floor {
                NOISE_TRACK_ALPHA_RISE
            } else {
                NOISE_TRACK_ALPHA
            };
            self.noise_floor = alpha * self.noise_floor
                + (1.0 - alpha) * features.energy.max(Self::MIN_NOISE_FLOOR);
        }

        // ── 6. Hangover logic ─────────────────────────────────────────────────
        if raw_voice {
            // Re-arm the hangover on every speech frame.
            self.hangover = self.config.hangover_frames;
            VadDecision::Voice
        } else if self.hangover > 0 {
            self.hangover -= 1;
            VadDecision::Voice
        } else {
            VadDecision::Silence
        }
    }

    /// Reset internal state (use between calls with different audio streams).
    ///
    /// The configuration is kept; everything else returns to the values set
    /// by [`VoiceActivityDetector::new`].
    pub fn reset(&mut self) {
        self.noise_floor = Self::MIN_NOISE_FLOOR;
        self.hangover = 0;
        self.frame_count = 0;
        self.smoothed_energy = 0.0;
    }

    /// Current estimated noise floor (energy units).
    #[must_use]
    pub fn noise_floor(&self) -> f32 {
        self.noise_floor
    }

    /// Exponentially smoothed frame energy (energy units), intended for
    /// UI/diagnostics. It does not influence the decision.
    #[must_use]
    pub fn smoothed_energy(&self) -> f32 {
        self.smoothed_energy
    }

    /// Total number of frames processed since creation / last reset.
    #[must_use]
    pub fn frame_count(&self) -> u64 {
        self.frame_count
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────

/// Translate a threshold expressed in decibels into a linear energy ratio.
///
/// Uses the power-domain relation `ratio = 10^(db / 10)`, so 10 dB maps to a
/// factor of 10 and 0 dB to a factor of 1.
#[inline]
fn db_to_linear_energy(db: f32) -> f32 {
    let exponent = db / 10.0;
    f32::powf(10.0, exponent)
}

// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// All-zero frame of `len` samples.
    fn silence_frame(len: usize) -> Vec<i16> {
        vec![0i16; len]
    }

    /// Square wave at the Nyquist rate: alternating +amplitude / -amplitude,
    /// so every adjacent pair of samples is a zero crossing.
    fn alternating_frame(len: usize, amplitude: i16) -> Vec<i16> {
        (0..len)
            .map(|i| if i % 2 == 0 { amplitude } else { -amplitude })
            .collect()
    }

    /// Frame whose only non-zero sample is `2` (energy 4): above the initial
    /// noise floor of 1 but below the default 10 dB energy gate.
    fn faint_click_frame(len: usize) -> Vec<i16> {
        let mut frame = vec![0i16; len];
        if let Some(first) = frame.first_mut() {
            *first = 2;
        }
        frame
    }

    fn loud_sine(len: usize) -> Vec<i16> {
        // 200 Hz sine at 16 kHz, ~80% of full scale
        let freq = 200.0f32;
        let sr = 16000.0f32;
        (0..len)
            .map(|i| {
                let t = i as f32 / sr;
                ((2.0 * std::f32::consts::PI * freq * t).sin() * 26000.0) as i16
            })
            .collect()
    }

    #[test]
    fn test_silence_classified_as_silence() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        // Feed many silence frames to let noise floor settle
        for _ in 0..30 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        let decision = vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(decision, VadDecision::Silence);
    }

    #[test]
    fn test_loud_speech_classified_as_voice() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        // Warm up with silence
        for _ in 0..10 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        // Feed a loud speech frame
        let frame = loud_sine(160);
        let decision = vad.process_i16(&frame, 16000);
        assert_eq!(decision, VadDecision::Voice);
    }

    #[test]
    fn test_hangover_extends_voice() {
        let cfg = VadConfig {
            hangover_frames: 5,
            ..Default::default()
        };
        let mut vad = VoiceActivityDetector::new(cfg);
        // Warm up
        for _ in 0..10 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        // One loud frame → triggers voice + hangover
        vad.process_i16(&loud_sine(160), 16000);
        // Next frame is silence but should still be Voice due to hangover
        let d = vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(
            d,
            VadDecision::Voice,
            "hangover should keep decision as Voice"
        );
    }

    #[test]
    fn test_frame_count_increments() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        assert_eq!(vad.frame_count(), 0);
        vad.process_i16(&silence_frame(160), 16000);
        vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(vad.frame_count(), 2);
    }

    #[test]
    fn test_reset_clears_state() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        // Quiet-but-nonzero frames are classified as silence, so the noise
        // floor tracker actually moves away from its initial value.
        for _ in 0..20 {
            vad.process_i16(&faint_click_frame(160), 16000);
        }
        assert!(
            vad.noise_floor() > 1.0,
            "noise floor should have adapted before reset"
        );
        // A loud frame arms the hangover counter.
        assert_eq!(vad.process_i16(&loud_sine(160), 16000), VadDecision::Voice);

        vad.reset();
        assert_eq!(vad.frame_count(), 0);
        assert_eq!(vad.noise_floor(), 1.0);
        // Hangover must not survive the reset.
        assert_eq!(
            vad.process_i16(&silence_frame(160), 16000),
            VadDecision::Silence
        );
    }

    #[test]
    fn test_f32_processing() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        // Warm up
        for _ in 0..10 {
            vad.process_f32(&[0.0f32; 160], 16000);
        }
        // Loud speech
        let loud: Vec<f32> = (0..160)
            .map(|i| {
                let t = i as f32 / 16000.0;
                (2.0 * std::f32::consts::PI * 200.0 * t).sin() * 0.8
            })
            .collect();
        let d = vad.process_f32(&loud, 16000);
        assert_eq!(d, VadDecision::Voice);
    }

    #[test]
    fn test_spectral_flatness_flat_is_close_to_one() {
        // White-ish noise has flat spectrum → flatness close to 1
        let bands = [1000.0f32, 1100.0, 950.0, 1050.0];
        let sf = spectral_flatness_from_bands(&bands);
        assert!(
            sf > 0.90,
            "flat-spectrum flatness should be > 0.90, got {sf}"
        );
    }

    #[test]
    fn test_spectral_flatness_peaky_is_low() {
        // One dominant band → flatness should be much lower than 1
        let bands = [10000.0f32, 10.0, 10.0, 10.0];
        let sf = spectral_flatness_from_bands(&bands);
        assert!(
            sf < 0.50,
            "peaky-spectrum flatness should be < 0.50, got {sf}"
        );
    }

    #[test]
    fn test_frame_features_zero_input() {
        let feats = FrameFeatures::from_pcm_i16(&[], 16000);
        assert_eq!(feats.energy, 0.0);
        assert_eq!(feats.zcr, 0.0);
        assert_eq!(feats.spectral_flatness, 1.0);
    }

    #[test]
    fn test_alternating_frame_has_maximal_zcr() {
        // 160 alternating samples → 159 sign changes, already per-160 scaled
        let feats = FrameFeatures::from_pcm_i16(&alternating_frame(160, 1000), 16000);
        assert!(
            (feats.zcr - 159.0).abs() < 1e-3,
            "expected 159 crossings, got {}",
            feats.zcr
        );
    }

    #[test]
    fn test_db_to_linear_energy_10db() {
        let ratio = db_to_linear_energy(10.0);
        // 10 dB ≈ 10.0 in power
        assert!((ratio - 10.0).abs() < 0.01, "expected ~10.0 got {ratio}");
    }
}