blip25-mbe 0.1.0

Research MBE / IMBE / AMBE+2 vocoder family: P25 wire formats, multi-generation codecs, and parametric rate conversion. Educational use; see PATENT_NOTICE.md.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
//! Spectral subtraction denoiser — opt-in input-side noise suppression
//! applied to `signal_spectrum()` output before §0.5 amplitude
//! estimation.
//!
//! Different from the retired `--harmonic-bin-pad` and
//! `--noise-floor-beta` directions: those operated on per-bin /
//! per-harmonic math inside the §0.5 estimator. This operates on the
//! input spectrum BEFORE the harmonic estimator sees it.
//!
//! Reference: Boll, *Suppression of Acoustic Noise in Speech Using
//! Spectral Subtraction*, IEEE TASSP 1979 (classic spectral
//! subtraction); Martin, *Noise Power Spectral Density Estimation
//! Based on Optimal Smoothing and Minimum Statistics*, IEEE TSAP
//! 2001 (per-bin running minimum noise tracker).
//!
//! Design:
//! - Per-bin running noise estimate `N(m)` updated via EMA on
//!   silence-flagged frames (the §0.8.4 silence detector is the
//!   speech-activity gate).
//! - On every frame, compute suppression gain
//!   `G(m) = sqrt(max(|S(m)|² - β·N(m), α·|S(m)|²) / |S(m)|²)`
//!   with `α` an over-subtraction floor preventing musical noise.
//! - Apply phase-preserving: `S'(m) = G(m) · S(m)`.

use super::basis::{Complex64, DFT_SIZE};

/// Over-subtraction multiplier `β`. Boll 1979 used `β = 1`. β-sweep
/// 2026-04-28 across the 5-vector set: knox_1 (noisy tone) peaks at
/// β=0.1 (+0.638 PESQ); β=0.05 is too gentle and *hurts* knox_1
/// (-0.122 PESQ) by introducing a subtractive bias without removing
/// noise; β≥0.5 over-subtracts speech (mark drops 0.36+ PESQ). 0.1
/// is the empirical sweet spot for the existing silence-detector
/// gating.
const SUBTRACTION_BETA: f64 = 0.1;

/// Suppression floor `α` — the gain `G(m)` is never allowed to drop
/// below `sqrt(α)`. Prevents nulls in the output spectrum (musical
/// noise). Berouti et al. 1979 recommend 0.01–0.05.
const SPECTRAL_FLOOR_ALPHA: f64 = 0.02;

/// EMA forget factor for noise-spectrum updates during silence frames.
/// `λ ∈ [0, 1)`; smaller = faster adaptation. Martin 2001 uses
/// long-window minimum statistics; we approximate with a slow EMA
/// (≈ 1 s effective window at 50 fps).
const NOISE_EMA_LAMBDA: f64 = 0.95;

/// Cold-start noise-spectrum value (per bin, in `|S|²` units).
const NOISE_INIT: f64 = 0.0;

/// Frame-energy / noise-floor ratio gate: a frame must satisfy
/// `frame_energy < STRICT_ENERGY_RATIO · η` to be eligible for a
/// noise-estimate update. Kept as a permissive secondary check —
/// the dominant filter is now spectral stationarity (below).
const STRICT_ENERGY_RATIO: f64 = 4.0;

/// Minimum consecutive eligible frames before any noise-estimate
/// update fires. Two frames is enough to reject single-frame
/// outliers without forcing the §0.8.4 silence-detector hysteresis
/// (which doesn't fire on continuous stationary tones like knox_1).
const STRICT_CONSECUTIVE_FRAMES: u32 = 2;

/// Cosine-similarity threshold between the current frame's
/// `|S(m)|²` spectrum and the running EMA of recent `|S(m)|²`
/// spectra. Above this → spectrum is stationary frame-to-frame →
/// eligible for noise-estimate update. Sweep 2026-04-28 across the
/// 5-vector set: 0.99 leaks into clean/alert; 0.995 leaks into
/// alert (formant of the tonal sweep momentarily stationary);
/// **0.999 hits all five vectors within the 0.05 PESQ goal** while
/// retaining +0.585 PESQ on knox_1.
///
/// The 2026-04-28 strict-gate follow-up showed energy alone can't
/// distinguish stationary tones (high energy, want to update) from
/// speech pauses (high residual energy, do NOT want to update).
/// Spectral stationarity targets what energy can't see: speech
/// changes the spectrum every frame (formant motion); stationary
/// tones / hum / fan noise hold the spectrum constant.
const STATIONARITY_COS_THRESHOLD: f64 = 0.999;

/// EMA forget factor for the magnitude-spectrum reference used by
/// the stationarity gate. Faster than the noise-PSD EMA — the
/// reference needs to track recent spectra so the cosine-similarity
/// test reflects "this frame matches the *recent* spectrum", not
/// "this frame matches the long-run average." 0.7 = ~3-frame
/// effective window.
const STATIONARITY_REF_LAMBDA: f64 = 0.7;

/// Decision-directed a-priori SNR smoothing factor `α_DD` (Ephraim &
/// Malah 1984, Eq. 51). Blends the previous frame's enhanced-magnitude
/// SNR with the current a-posteriori SNR. `α_DD = 0.98` is the paper
/// default — high values smooth out musical noise, low values track
/// fast transients. We use the paper value.
const DD_SNR_SMOOTHING: f64 = 0.98;

/// Floor for the a-priori SNR estimate `ξ`. Prevents log/division blow
/// up on bins where the previous-enhanced-magnitude is zero. -25 dB =
/// ξ_min ≈ 0.00316 (paper recommendation).
const A_PRIORI_SNR_FLOOR: f64 = 0.003_162_277_660;

/// Suppression mode for the noise-removal step.
///
/// 5-vector A/B 2026-04-28 with the spectral-stationarity gate (cos
/// ≥ 0.999, β=0.1, α=0.02):
///
/// | Vector | Boll  | Wiener+floor |
/// |--------|-------|--------------|
/// | clean  | 3.202 | 3.202        |
/// | dam    | 3.304 | 3.304        |
/// | alert  | 2.597 | 2.556        |
/// | mark   | 2.699 | 2.699        |
/// | knox_1 | 1.919 | 1.841        |
///
/// Boll wins overall (+0.111 vs +0.087 average across the set).
/// Wiener has tighter STOI on alert (intelligibility) but PESQ
/// scoring prefers Boll's curve. Boll is the default; the Wiener
/// path stays in-tree as a documented alternative.
const SUPPRESSION_MODE: SuppressionMode = SuppressionMode::Boll;

/// Internal selector for the gain-curve to apply once the gate decides
/// to suppress. Constant across the run; switching is a recompile.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SuppressionMode {
    /// Boll 1979 power subtraction with `α` floor and `β`
    /// over-subtraction.
    Boll,
    /// Wiener gain `G = ξ̂ / (1 + ξ̂)` with decision-directed
    /// a-priori SNR `ξ̂`. Smoother gain curve than Boll; less
    /// musical noise on residual spectra. Kept in-tree as a
    /// documented alternative to the default Boll path (see
    /// `project_wiener_gain_null_vs_boll_2026-04-28` memory entry):
    /// switch by changing the `SUPPRESSION_MODE` const above.
    #[allow(dead_code)]
    Wiener,
}

/// Per-bin noise-PSD estimate covering the same DFT layout as
/// `signal_spectrum` (256-entry, half-spectrum used).
#[derive(Clone, Debug)]
pub struct NoiseSpectrum {
    /// `n_psd[m]` = running estimate of `E[|N(m)|²]` for bin `m`.
    /// Indexed identically to `signal_spectrum`'s output.
    pub n_psd: [f64; DFT_SIZE],
    /// `true` after the first eligible-frame EMA update; before that
    /// `n_psd` is all-zero and `apply_subtraction` is a no-op (no
    /// noise model yet).
    pub primed: bool,
    /// Consecutive eligible-frame counter — saturates at
    /// `STRICT_CONSECUTIVE_FRAMES`. Updates only fire once this
    /// counter reaches the threshold, after which it stays armed
    /// until ineligibility resets it to zero.
    eligible_run: u32,
    /// Running EMA reference of `|S(m)|²` for the spectral-stationarity
    /// gate. Updated every frame (regardless of eligibility) so the
    /// gate has a recent baseline to compare against.
    spectrum_ref: [f64; DFT_SIZE],
    /// `true` after the first frame populates `spectrum_ref`. Before
    /// that the cosine-similarity test is undefined, so the gate
    /// treats it as ineligible.
    spectrum_ref_primed: bool,
    /// Per-bin previous-frame enhanced-magnitude squared `|Â_{t-1}(m)|²`
    /// — feeds the decision-directed a-priori SNR estimate in
    /// `SuppressionMode::Wiener`. Unused under `SuppressionMode::Boll`.
    prev_enhanced_psd: [f64; DFT_SIZE],
}

impl NoiseSpectrum {
    /// Cold-start state — zeros, not primed, no eligible run.
    pub const fn new() -> Self {
        Self {
            n_psd: [NOISE_INIT; DFT_SIZE],
            primed: false,
            eligible_run: 0,
            spectrum_ref: [0.0; DFT_SIZE],
            spectrum_ref_primed: false,
            prev_enhanced_psd: [0.0; DFT_SIZE],
        }
    }

    /// Reset to cold start (mirrors `Vocoder::reset`).
    pub fn reset(&mut self) {
        self.n_psd = [NOISE_INIT; DFT_SIZE];
        self.primed = false;
        self.eligible_run = 0;
        self.spectrum_ref = [0.0; DFT_SIZE];
        self.spectrum_ref_primed = false;
        self.prev_enhanced_psd = [0.0; DFT_SIZE];
    }

    /// Cosine similarity between `|sw|²` and the running spectrum
    /// reference. Returns `1.0` when both vectors are identical (in
    /// direction); near `0` when orthogonal. Returns `0.0` if the
    /// reference isn't yet primed or either vector is all-zero.
    fn stationarity_cos_sim(&self, sw: &[Complex64; DFT_SIZE]) -> f64 {
        if !self.spectrum_ref_primed {
            return 0.0;
        }
        let mut dot = 0.0f64;
        let mut a_norm = 0.0f64;
        let mut b_norm = 0.0f64;
        for m in 0..DFT_SIZE {
            let a = sw[m].norm_sqr();
            let b = self.spectrum_ref[m];
            dot += a * b;
            a_norm += a * a;
            b_norm += b * b;
        }
        let denom = (a_norm * b_norm).sqrt();
        if denom <= 1e-30 {
            0.0
        } else {
            dot / denom
        }
    }

    /// Update the noise estimate from the current frame's
    /// `signal_spectrum`. Three-stage gate:
    ///
    /// 1. **Spectral stationarity** — cosine similarity between
    ///    `|sw|²` and the running EMA reference must be ≥
    ///    `STATIONARITY_COS_THRESHOLD`. This is the dominant filter:
    ///    speech changes formant peaks every frame (low cos-sim);
    ///    stationary tones / hum / fan noise hold (high cos-sim).
    /// 2. **Permissive energy / silence** — `silent OR frame_energy <
    ///    STRICT_ENERGY_RATIO · η`. Either signal-detector path
    ///    suffices; the spectral-stationarity test does most of the
    ///    work.
    /// 3. **Consecutive-frame counter** — at least
    ///    `STRICT_CONSECUTIVE_FRAMES` eligible frames in a row to
    ///    reject single-frame outliers.
    ///
    /// The reference spectrum is updated every frame regardless of
    /// eligibility so the cos-sim test always has a recent baseline.
    pub fn update(&mut self, sw: &[Complex64; DFT_SIZE], silent: bool, frame_energy: f64, noise_floor_eta: f64) {
        // Stage 1: spectral stationarity test (uses prior reference).
        let cos_sim = self.stationarity_cos_sim(sw);
        let stationary = cos_sim >= STATIONARITY_COS_THRESHOLD;

        // Update the spectrum reference EVERY frame. A fast EMA so the
        // reference reflects the recent past, not the long-run mean.
        if !self.spectrum_ref_primed {
            for (m, slot) in self.spectrum_ref.iter_mut().enumerate() {
                *slot = sw[m].norm_sqr();
            }
            self.spectrum_ref_primed = true;
        } else {
            for (m, slot) in self.spectrum_ref.iter_mut().enumerate() {
                let cur = sw[m].norm_sqr();
                *slot = STATIONARITY_REF_LAMBDA * *slot
                    + (1.0 - STATIONARITY_REF_LAMBDA) * cur;
            }
        }

        // Stage 2: secondary energy / silence permissive check.
        let energy_ok = silent || frame_energy < STRICT_ENERGY_RATIO * noise_floor_eta;
        if !stationary || !energy_ok {
            self.eligible_run = 0;
            return;
        }

        // Stage 3: consecutive-eligible-frame counter.
        self.eligible_run = self.eligible_run.saturating_add(1);
        if self.eligible_run < STRICT_CONSECUTIVE_FRAMES {
            return;
        }
        if !self.primed {
            for (m, slot) in self.n_psd.iter_mut().enumerate() {
                *slot = sw[m].norm_sqr();
            }
            self.primed = true;
            return;
        }
        for (m, slot) in self.n_psd.iter_mut().enumerate() {
            let cur = sw[m].norm_sqr();
            *slot = NOISE_EMA_LAMBDA * *slot + (1.0 - NOISE_EMA_LAMBDA) * cur;
        }
    }
}

impl Default for NoiseSpectrum {
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

/// Apply noise suppression to `sw` using the running noise estimate.
/// Dispatches on the compile-time `SUPPRESSION_MODE` constant: Boll
/// 1979 power subtraction or Wiener gain with decision-directed
/// a-priori SNR. No-op when `state.primed == false`.
///
/// Mutates `state.prev_enhanced_psd` under `SuppressionMode::Wiener`
/// so the next frame's decision-directed estimator sees this frame's
/// enhanced magnitude. Boll mode does not touch state.
///
/// Phase is preserved; only the magnitude is scaled.
pub fn apply_subtraction(
    sw: &[Complex64; DFT_SIZE],
    state: &mut NoiseSpectrum,
) -> [Complex64; DFT_SIZE] {
    if !state.primed {
        return *sw;
    }
    let mut out = *sw;
    match SUPPRESSION_MODE {
        SuppressionMode::Boll => {
            for m in 0..DFT_SIZE {
                let s_psd = sw[m].norm_sqr();
                if s_psd <= 1e-30 {
                    continue;
                }
                let suppressed = (s_psd - SUBTRACTION_BETA * state.n_psd[m])
                    .max(SPECTRAL_FLOOR_ALPHA * s_psd);
                let gain = (suppressed / s_psd).sqrt();
                out[m] = Complex64::new(sw[m].re * gain, sw[m].im * gain);
            }
        }
        SuppressionMode::Wiener => {
            for m in 0..DFT_SIZE {
                let s_psd = sw[m].norm_sqr();
                let n_psd = state.n_psd[m].max(1e-30);
                if s_psd <= 1e-30 {
                    state.prev_enhanced_psd[m] = 0.0;
                    continue;
                }
                // a-posteriori SNR γ = |Y|²/N (clamped at 0 to be
                // robust against under-estimated noise floor).
                let gamma = (s_psd / n_psd - 1.0).max(0.0);
                // Decision-directed a-priori SNR ξ̂_t = α·|Â_{t-1}|²/N
                //                              + (1-α)·max(γ - 1, 0).
                let xi = (DD_SNR_SMOOTHING * state.prev_enhanced_psd[m] / n_psd
                        + (1.0 - DD_SNR_SMOOTHING) * gamma)
                        .max(A_PRIORI_SNR_FLOOR);
                // Wiener gain, with the same `sqrt(α)` floor Boll mode
                // uses — prevents the gain from collapsing on bins
                // whose a-priori SNR estimate dips low (which on
                // tonal sweeps tanks alert -0.76 PESQ).
                let raw_gain = xi / (1.0 + xi);
                let gain = raw_gain.max(SPECTRAL_FLOOR_ALPHA.sqrt());
                out[m] = Complex64::new(sw[m].re * gain, sw[m].im * gain);
                // Track |Â|² = G² · |Y|² for next frame's ξ̂.
                state.prev_enhanced_psd[m] = gain * gain * s_psd;
            }
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    fn flat_spectrum(magnitude: f64) -> [Complex64; DFT_SIZE] {
        let mut sw = [Complex64::new(0.0, 0.0); DFT_SIZE];
        for s in &mut sw {
            *s = Complex64::new(magnitude, 0.0);
        }
        sw
    }

    #[test]
    fn unprimed_noise_state_is_passthrough() {
        let sw = flat_spectrum(2.0);
        let mut state = NoiseSpectrum::new();
        let out = apply_subtraction(&sw, &mut state);
        for m in 0..DFT_SIZE {
            assert!((out[m].re - sw[m].re).abs() < 1e-12);
        }
    }

    /// Helper: drive enough eligible frames so the strict gate primes.
    /// Needs `STRICT_CONSECUTIVE_FRAMES + 1` calls because the first
    /// call only seeds the spectrum reference (the cos-sim test fails
    /// on an unprimed reference). All calls pass `silent=true` and
    /// `frame_energy=0` so the secondary gate also passes.
    fn prime_with(state: &mut NoiseSpectrum, sw: &[Complex64; DFT_SIZE]) {
        for _ in 0..(STRICT_CONSECUTIVE_FRAMES + 1) {
            state.update(sw, true, /*frame_energy*/ 0.0, /*η*/ 1.0);
        }
    }

    #[test]
    fn strict_gate_requires_consecutive_stationary_frames() {
        let mut state = NoiseSpectrum::new();
        let noise = flat_spectrum(0.5);
        // First call only seeds the spectrum reference. Subsequent
        // STRICT_CONSECUTIVE_FRAMES - 1 calls don't yet prime the
        // noise estimator.
        for _ in 0..STRICT_CONSECUTIVE_FRAMES {
            state.update(&noise, true, 0.0, 1.0);
        }
        assert!(!state.primed);
        // One more eligible frame trips the gate.
        state.update(&noise, true, 0.0, 1.0);
        assert!(state.primed);
        for m in 0..DFT_SIZE {
            assert!((state.n_psd[m] - 0.25).abs() < 1e-9);
        }
    }

    #[test]
    fn non_stationary_frame_breaks_eligible_run() {
        let mut state = NoiseSpectrum::new();
        let noise = flat_spectrum(0.5);
        // Drive a few stationary frames (seeds reference + builds run).
        for _ in 0..STRICT_CONSECUTIVE_FRAMES {
            state.update(&noise, true, 0.0, 1.0);
        }
        // A spectrum-shifted frame fails the stationarity test.
        let mut shifted = flat_spectrum(0.0);
        // Make spectrum drastically different from the all-0.5 reference:
        // concentrate all energy at one bin, the reference at all bins
        // → cos_sim ≈ 1/sqrt(N), well below the 0.99 threshold.
        shifted[10] = Complex64::new(100.0, 0.0);
        state.update(&shifted, true, 0.0, 1.0);
        // Run resets, primed not advanced.
        assert!(!state.primed);
    }

    #[test]
    fn voiced_frames_hold_noise_estimate() {
        let mut state = NoiseSpectrum::new();
        let noise = flat_spectrum(0.5);
        prime_with(&mut state, &noise);
        let baseline = state.n_psd[0];
        // A spectrum-shifted "voiced" frame fails the stationarity
        // gate AND the silent gate — must NOT change n_psd.
        let mut voice = flat_spectrum(0.0);
        voice[10] = Complex64::new(50.0, 0.0);
        state.update(&voice, false, 100.0, 1.0);
        for m in 0..DFT_SIZE {
            assert!((state.n_psd[m] - baseline).abs() < 1e-9,
                "voiced update leaked: bin {m} = {}", state.n_psd[m]);
        }
    }

    #[test]
    fn subtraction_attenuates_signal_at_or_below_noise_psd() {
        let mut state = NoiseSpectrum::new();
        // Train the noise estimate to |0.5|² = 0.25.
        let train = flat_spectrum(0.5);
        prime_with(&mut state, &train);
        // Apply to a signal at the same level. After β·N subtraction
        // the residual is max((1-β)·s_psd, α·s_psd); gain is
        // sqrt(max(1-β, α)). At β=0.1 → sqrt(0.9) ≈ 0.949;
        // at β=1.5 → sqrt(α) ≈ 0.141. Both regimes must yield gain
        // strictly less than 1 (the signal IS attenuated).
        let probe = flat_spectrum(0.5);
        let out = apply_subtraction(&probe, &mut state);
        let expected_gain = ((1.0 - SUBTRACTION_BETA).max(SPECTRAL_FLOOR_ALPHA)).sqrt();
        let expected_max = expected_gain * 0.5 + 1e-9;
        for m in 0..DFT_SIZE {
            assert!(out[m].re.abs() <= expected_max,
                "bin {m} expected ≤ {expected_max:.4}, got {}", out[m].re);
            assert!(out[m].re.abs() < 0.5,
                "bin {m} not attenuated at all: {}", out[m].re);
        }
    }

    #[test]
    fn subtraction_passes_strong_signal_through() {
        let mut state = NoiseSpectrum::new();
        let noise = flat_spectrum(0.1);
        prime_with(&mut state, &noise);
        // |signal|² = 100; |noise|² = 0.01; SNR ≈ 10000. Both Boll and
        // Wiener should leave the signal almost intact (gain near 1);
        // Wiener with cold prev_enhanced_psd attenuates ~0.5%.
        let signal = flat_spectrum(10.0);
        let out = apply_subtraction(&signal, &mut state);
        for m in 0..DFT_SIZE {
            assert!((out[m].re - 10.0).abs() < 0.5,
                "bin {m} over-attenuated: {}", out[m].re);
            assert!(out[m].re.abs() > 5.0,
                "bin {m} excessively attenuated: {}", out[m].re);
        }
    }
}