blip25-mbe 0.1.0

Research MBE / IMBE / AMBE+2 vocoder family: P25 wire formats, multi-generation codecs, and parametric rate conversion. Educational use; see PATENT_NOTICE.md.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
//! §0.3 — Initial pitch estimation.
//!
//! Reference: vocoder_analysis_encoder_addendum.md §0.3 (BABA-A §5.1.1
//! through §5.1.4, Equations 5–23). Three stages:
//!
//!   (a) E(P) — error function on the half-sample grid, "close to 0 is good".
//!       Uses a squared-envelope autocorrelation r(t) of the LPF'd signal.
//!   (b) Look-back tracker: argmin E(P) on [0.8·P̂_{−1}, 1.2·P̂_{−1}],
//!       then CE_B = E(P̂_B) + E_{−1} + E_{−2}.
//!   (c) Look-ahead tracker: argmin over P_0 of E(P_0) + E_1(P̂_1(P_0))
//!       + E_2(P̂_2(P_0)), with pitch-continuity clamps; then sub-multiple
//!       cascade (Eq. 18/19/20) to correct octave-doubling.
//!   (d) Decision: CE_B ≤ 0.48 ⇒ P̂_B, else CE_B ≤ CE_F ⇒ P̂_B, else P̂_F.

use super::{analysis_window, lpf_tap};
use crate::mbe_params::SAMPLES_PER_FRAME;

/// Half-support of Annex B window `w_I(n)`: `n ∈ [−W_I_HALF, W_I_HALF]`.
pub const W_I_HALF: i32 = 150;

/// Half-support of Annex D LPF `h_LPF(n)`: `n ∈ [−H_LPF_HALF, H_LPF_HALF]`.
pub const H_LPF_HALF: i32 = 10;

/// Lookahead buffer length the encoder maintains. Mirrors
/// `super::LOOKAHEAD_LEN` but kept here as a `pub const` so the shared
/// LPF helper doesn't need a circular import.
pub const LOOKAHEAD_LEN: usize = 3 * SAMPLES_PER_FRAME as usize;

/// Per-frame "tail extension" each pitch slot's LPF reaches past the
/// frame center: from frame center, the slot's LPF output covers
/// `n ∈ [−W_I_HALF, W_I_HALF]`. Frame centers sit at `slot·160 + 80`,
/// so the union of all slot LPF outputs covers buffer indices
/// `[slot0_center − W_I_HALF, slot2_center + W_I_HALF]` =
/// `[80 − 150, 2·160 + 80 + 150]` = `[−70, 550]` (inclusive both
/// ends → 621 entries).
const SHARED_LPF_TAIL: usize = (W_I_HALF as usize) - (SAMPLES_PER_FRAME as usize / 2);

/// Length of the shared LPF output covering all 3 pitch-slot windows
/// at once. Buf range is `[−70, 550]` inclusive both ends, so 621
/// entries: `(slot2_right − slot0_left) + 1` =
/// `(2·SAMPLES_PER_FRAME + 2·W_I_HALF) + 1`.
pub const SHARED_LPF_LEN: usize =
    2 * SAMPLES_PER_FRAME as usize + 2 * W_I_HALF as usize + 1;

/// Half-support of the input buffer that `PitchSearch::new` consumes.
/// The LPF needs `±H_LPF_HALF` samples beyond the `w_I` support, so the
/// input must cover `n ∈ [−(W_I_HALF + H_LPF_HALF), W_I_HALF + H_LPF_HALF]`.
pub const PITCH_INPUT_HALF: i32 = W_I_HALF + H_LPF_HALF;

/// Length of the input buffer passed to [`PitchSearch::new`]: 321 samples
/// covering `n ∈ [−160, 160]`, zero-extended outside.
pub const PITCH_INPUT_LEN: usize = (2 * PITCH_INPUT_HALF + 1) as usize;

/// Minimum of the pitch candidate grid (Eq. 11).
pub const PITCH_GRID_MIN: f64 = 21.0;

/// Maximum of the pitch candidate grid (Eq. 11).
pub const PITCH_GRID_MAX: f64 = 122.0;

/// Spacing between adjacent grid candidates (half-sample resolution).
pub const PITCH_GRID_STEP: f64 = 0.5;

/// Number of candidates on the Eq. 11 grid: `{21, 21.5, …, 121.5, 122}`.
pub const PITCH_GRID_LEN: usize = 203;

/// Pitch-continuity window half-width (Eq. 10, Eq. 14, Eq. 16):
/// `[0.8·P_ref, 1.2·P_ref]`.
const PITCH_CONTINUITY_LO: f64 = 0.8;
const PITCH_CONTINUITY_HI: f64 = 1.2;

/// Energy floor guarding the Eq. 5 denominator on silent frames.
const ENERGY_FLOOR: f64 = 1e-12;

/// Cold-start pitch for `P̂_{−1}` and `P̂_{−2}` per §0.3.7.
pub const PITCH_COLD_START: f64 = 100.0;

/// Eq. 9: `s_LPF(n) = Σ_{j} s(n − j)·h_LPF(j)` for `j ∈ [−10, 10]`.
///
/// `s_input` covers `n ∈ [−PITCH_INPUT_HALF, PITCH_INPUT_HALF]`. The
/// returned buffer covers the `w_I` support `n ∈ [−W_I_HALF, W_I_HALF]`.
/// Samples outside `s_input` are treated as zero (addendum §0.3.1) —
/// but `PITCH_INPUT_HALF = W_I_HALF + H_LPF_HALF`, so for every output
/// `n` and every tap `j` the source index `n - j + PITCH_INPUT_HALF`
/// stays inside `[0, PITCH_INPUT_LEN)`. The bounds check is provable
/// dead and the inner loop runs without one.
pub fn compute_s_lpf(s_input: &[f64; PITCH_INPUT_LEN]) -> [f64; (2 * W_I_HALF + 1) as usize] {
    // Pre-load LPF taps as f64 in a stack array so the inner loop is
    // a tight FIR convolution (21 mul-adds per output sample).
    const TAPS: usize = (2 * H_LPF_HALF + 1) as usize;
    let mut h = [0.0f64; TAPS];
    for j in -H_LPF_HALF..=H_LPF_HALF {
        h[(j + H_LPF_HALF) as usize] = f64::from(lpf_tap(j));
    }

    let mut out = [0.0f64; (2 * W_I_HALF + 1) as usize];
    // Output index `i = n + W_I_HALF`; source index for tap `k` (=
    // `j + H_LPF_HALF`) is `n - j + PITCH_INPUT_HALF`
    //   = (i - W_I_HALF) - (k - H_LPF_HALF) + PITCH_INPUT_HALF
    //   = i + (PITCH_INPUT_HALF - W_I_HALF + H_LPF_HALF) - k
    //   = i + 2·H_LPF_HALF - k          (since PITCH_INPUT_HALF = W_I_HALF + H_LPF_HALF)
    // → starting `src` for tap k=0 at output i is `i + 2·H_LPF_HALF`,
    // and decrements by 1 for each tap. Bounds: i ∈ [0, 2·W_I_HALF],
    // k ∈ [0, 2·H_LPF_HALF], so src ∈ [0, 2·(W_I_HALF + H_LPF_HALF)]
    // = [0, PITCH_INPUT_LEN − 1]. Safe without per-iteration check.
    let two_h = 2 * H_LPF_HALF as usize;
    for (i, slot) in out.iter_mut().enumerate() {
        let src_base = i + two_h;
        let mut acc = 0.0;
        for k in 0..TAPS {
            acc += s_input[src_base - k] * h[k];
        }
        *slot = acc;
    }
    out
}

/// LPF the entire 480-sample lookahead buffer in one pass, producing
/// 621 outputs covering buffer indices `[−70, 550]`. Each pitch slot's
/// 301-entry local `s_LPF` array (centered at `slot·160 + 80`) is then
/// just a 301-entry slice of the shared output:
///   - slot 0 → `[0..301]`     (buf `[−70, 230]`)
///   - slot 1 → `[160..461]`   (buf `[90, 390]`)
///   - slot 2 → `[320..621]`   (buf `[250, 550]`)
///
/// Replaces 3 separate `compute_s_lpf` passes (each 301 outputs × 21
/// taps) with one wider pass (621 × 21) — 30% fewer FIR mults per
/// frame. The arithmetic identity that makes this work: the slot's
/// per-window zero-extension boundaries (slot 0 at slot-local i=80,
/// slot 2 at slot-local i=240+1) coincide with the lookahead buffer's
/// natural `[0, LOOKAHEAD_LEN)` boundary, so a single shared
/// zero-extension produces the same FIR outputs as per-slot windowing.
pub fn compute_s_lpf_shared(lookahead: &[f64; LOOKAHEAD_LEN]) -> [f64; SHARED_LPF_LEN] {
    const TAPS: usize = (2 * H_LPF_HALF + 1) as usize;
    let mut h = [0.0f64; TAPS];
    for j in -H_LPF_HALF..=H_LPF_HALF {
        h[(j + H_LPF_HALF) as usize] = f64::from(lpf_tap(j));
    }

    // Padded input covering buf logical positions `[−80, 560]`
    // (length 641 = SHARED_LPF_LEN + 2·H_LPF_HALF): zero outside
    // `[0, LOOKAHEAD_LEN)`, lookahead inside.  The padding width on
    // each side is `SHARED_LPF_TAIL + H_LPF_HALF = 80`.
    const PAD_LEFT: usize = SHARED_LPF_TAIL + H_LPF_HALF as usize;
    const PAD_LEN: usize = SHARED_LPF_LEN + 2 * H_LPF_HALF as usize;
    let mut padded = [0.0f64; PAD_LEN];
    padded[PAD_LEFT..PAD_LEFT + LOOKAHEAD_LEN].copy_from_slice(lookahead);

    // Output index `i` ∈ [0, SHARED_LPF_LEN) is at buffer logical
    // position `B = i − SHARED_LPF_TAIL`. The FIR reads padded at buf
    // `[B − H_LPF_HALF, B + H_LPF_HALF]`, which in padded indices is
    // `[B + PAD_LEFT − H_LPF_HALF, B + PAD_LEFT + H_LPF_HALF]` =
    // `[i, i + 2·H_LPF_HALF]` since `PAD_LEFT − SHARED_LPF_TAIL =
    // H_LPF_HALF`. So `src_base = i + 2·H_LPF_HALF`, decrement by k.
    let two_h = 2 * H_LPF_HALF as usize;
    let mut out = [0.0f64; SHARED_LPF_LEN];
    for (i, slot) in out.iter_mut().enumerate() {
        let src_base = i + two_h;
        let mut acc = 0.0;
        for k in 0..TAPS {
            acc += padded[src_base - k] * h[k];
        }
        *slot = acc;
    }
    out
}

/// Slice the shared LPF output for the given pitch slot. Returns a
/// 301-entry slice covering that slot's local `s_LPF` array indexed
/// by `n + W_I_HALF` for `n ∈ [−W_I_HALF, W_I_HALF]`.
#[inline]
pub fn slot_s_lpf(shared: &[f64; SHARED_LPF_LEN], slot: usize) -> &[f64] {
    let start = slot * SAMPLES_PER_FRAME as usize;
    &shared[start..start + (2 * W_I_HALF + 1) as usize]
}

/// Quantizer state for one frame of pitch analysis.
///
/// Caches the Eq. 5 energy term and the Eq. 7 autocorrelation `r(t)`
/// for integer `t ∈ [0, 122]`, so `E(P)` can be evaluated cheaply for
/// any grid candidate. The fourth moment of `w_I` (a window-dependent
/// constant) is also precomputed so the denominator is one multiply +
/// one subtract away.
#[derive(Clone, Debug)]
pub struct PitchSearch {
    /// `Σ_{j} s²_LPF(j) · w²_I(j)` — total windowed energy.
    energy: f64,
    /// `Σ w_I⁴(j)` — window-dependent denominator constant (Eq. 5).
    w_i_fourth_moment: f64,
    /// `r(t) = Σ_{j} s_LPF(j)·w²_I(j) · s_LPF(j+t)·w²_I(j+t)` for
    /// `t ∈ [0, R_MAX_LAG]` per addendum §0.3.2 Eq. 7.
    ///
    /// `R_MAX_LAG = W_I_HALF = 150`: Eq. 5's sum reaches lags
    /// `n·P` with `|n| ≤ ⌊150/P⌋`, so `|n·P| ≤ 150` by construction.
    r: [f64; R_MAX_LAG + 1],
    /// `E(P)` precomputed at every half-sample grid point for fast
    /// lookup. Indexed by `i ∈ [0, PITCH_GRID_LEN)` where the grid
    /// pitch is `PITCH_GRID_MIN + i · PITCH_GRID_STEP`. Built once in
    /// `from_lpf`; consumed by [`Self::argmin_in_range`] /
    /// [`Self::e_of_p_grid`] / [`look_ahead`]. Off-grid queries still
    /// recompute via [`Self::e_of_p`].
    e_grid: [f64; PITCH_GRID_LEN],
    /// Sparse-table range-minimum index over `e_grid`. Optional —
    /// only built for `PitchSearch` instances that will see many
    /// `argmin_in_range` calls (typically `next1`/`next2` inside
    /// [`look_ahead`]). When `None`, `argmin_in_range` falls back to
    /// the linear-scan path. Boxed to keep the sparse-less
    /// `PitchSearch` size small for `EncodeTrace::clone()`.
    ///
    /// Layout: `sparse[k][i]` = `(min_e, argmin_idx)` over
    /// `e_grid[i..i+2^k]` with lex-ordering on `(min_e, argmin_idx)`
    /// so that ties resolve to the smaller index — preserving the
    /// linear-scan path's tie-break (first occurrence wins).
    sparse: Option<Box<SparseRmq>>,
}

/// Number of sparse-table levels: `⌈log2(PITCH_GRID_LEN)⌉`. With
/// `PITCH_GRID_LEN = 203`, k ranges 0..=7 (2^7 = 128 ≤ 203, 2^8 = 256 > 203).
const SPARSE_LEVELS: usize = 8;

/// One row of the sparse-table RMQ — `(min_e, argmin_idx)` per grid
/// index `i`, covering `e_grid[i..min(i+2^k, PITCH_GRID_LEN)]`.
type SparseRow = [(f64, u16); PITCH_GRID_LEN];

/// Sparse-table range-min over `e_grid`. `sparse[k][i]` = lex-min over
/// `e_grid[i..min(i+2^k, PITCH_GRID_LEN)]`.
type SparseRmq = [SparseRow; SPARSE_LEVELS];

/// Lex-min on `(value, idx)` pairs — ties on `value` resolve to the
/// smaller `idx`, which mirrors the linear-scan path's "first
/// occurrence wins" tie-break in [`PitchSearch::argmin_in_range`].
#[inline]
fn lex_min(a: (f64, u16), b: (f64, u16)) -> (f64, u16) {
    match a.0.partial_cmp(&b.0) {
        Some(core::cmp::Ordering::Less) => a,
        Some(core::cmp::Ordering::Greater) => b,
        _ => {
            if a.1 <= b.1 {
                a
            } else {
                b
            }
        }
    }
}

const R_MAX_LAG: usize = W_I_HALF as usize;

/// FFT length used for the §0.3.2 Eq. 7 autocorrelation. Must be ≥
/// `sw2.len() + R_MAX_LAG` so circular wraparound doesn't pollute
/// the lags we care about (`t ∈ [0, R_MAX_LAG]`). 512 is the next
/// power of two above 301 + 150 = 451 — keeps rustfft on its
/// fastest radix-2 path.
const AUTOCORR_FFT_LEN: usize = 512;

/// Compute r(t) = Σ_j sw2[j] · sw2[j+t] for t ∈ [0, R_MAX_LAG] via
/// FFT-based autocorrelation. Faster than the direct O(N²) form
/// for the (N=301, R=150) sizes used here — rustfft's radix-2 path
/// at length 512 is heavily SIMD-optimized.
fn autocorr_via_fft(sw2: &[f64; (2 * W_I_HALF + 1) as usize]) -> [f64; R_MAX_LAG + 1] {
    use num_complex::Complex;
    use rustfft::{Fft, FftPlanner};
    use std::sync::OnceLock;

    static FFT_FWD: OnceLock<std::sync::Arc<dyn Fft<f64>>> = OnceLock::new();
    static FFT_INV: OnceLock<std::sync::Arc<dyn Fft<f64>>> = OnceLock::new();
    let fft_fwd = FFT_FWD.get_or_init(|| {
        let mut planner = FftPlanner::<f64>::new();
        planner.plan_fft_forward(AUTOCORR_FFT_LEN)
    });
    let fft_inv = FFT_INV.get_or_init(|| {
        let mut planner = FftPlanner::<f64>::new();
        planner.plan_fft_inverse(AUTOCORR_FFT_LEN)
    });

    let mut buf = [Complex::<f64>::new(0.0, 0.0); AUTOCORR_FFT_LEN];
    for (i, &x) in sw2.iter().enumerate() {
        buf[i].re = x;
    }
    fft_fwd.process(&mut buf);
    // |X(f)|² in-place; for real input X has hermitian symmetry so
    // the result is real, but we keep the complex layout for the IFFT.
    for c in buf.iter_mut() {
        c.re = c.re * c.re + c.im * c.im;
        c.im = 0.0;
    }
    fft_inv.process(&mut buf);

    // rustfft's inverse is unnormalized — divide by N. Lag t is at
    // index t in the IFFT output (positive lags are at the start of
    // the buffer; circular-aliased negative lags fold to the tail and
    // we don't read them).
    let scale = 1.0 / AUTOCORR_FFT_LEN as f64;
    let mut r = [0.0f64; R_MAX_LAG + 1];
    for (t, slot) in r.iter_mut().enumerate() {
        *slot = buf[t].re * scale;
    }
    r
}

impl PitchSearch {
    /// Build the per-frame pitch search from the 321-sample analysis
    /// buffer (see [`PITCH_INPUT_LEN`]).
    pub fn new(s_input: &[f64; PITCH_INPUT_LEN]) -> Self {
        let s_lpf = compute_s_lpf(s_input);
        Self::from_lpf(&s_lpf)
    }

    /// Build from a slice of an already-LPF'd buffer. Used by the
    /// shared-LPF fast path: `compute_s_lpf_shared` produces 621
    /// outputs once per frame, then `slot_s_lpf` returns 301-entry
    /// slices that feed three calls to `from_lpf_slice`. Panics if
    /// the slice isn't exactly `2·W_I_HALF + 1` entries long.
    pub fn from_lpf_slice(s_lpf: &[f64]) -> Self {
        let arr: &[f64; (2 * W_I_HALF + 1) as usize] = s_lpf
            .try_into()
            .expect("from_lpf_slice expects exactly 2·W_I_HALF + 1 entries");
        Self::from_lpf(arr)
    }

    /// Build from an already-LPF'd buffer. Useful for tests that want
    /// to inject a synthetic `s_LPF(n)` without round-tripping through
    /// [`compute_s_lpf`].
    pub fn from_lpf(s_lpf: &[f64; (2 * W_I_HALF + 1) as usize]) -> Self {
        // Energy Σ s²_LPF·w²_I, fourth moment Σ w_I⁴, and the
        // per-frame envelope sw2[j] = s_LPF(j)·w²_I(j) that feeds the
        // Eq. 7 autocorrelation. Addendum §0.3.8: note s_LPF is
        // linear and the window is squared — see
        // analysis/vocoder_analysis_eq7_correction.md.
        let mut sw2 = [0.0f64; (2 * W_I_HALF + 1) as usize];
        let mut energy = 0.0;
        let mut w4 = 0.0;
        for j in -W_I_HALF..=W_I_HALF {
            let idx = (j + W_I_HALF) as usize;
            let s = s_lpf[idx];
            let w = f64::from(analysis_window(j));
            let w2 = w * w;
            sw2[idx] = s * w2;
            energy += s * s * w2;
            w4 += w2 * w2;
        }
        // Eq. 7 via FFT — r(t) = Σ sw2[j]·sw2[j+t] is the
        // autocorrelation of sw2, so r = IFFT(|FFT(zero-padded sw2)|²).
        // We zero-pad to AUTOCORR_FFT_LEN (= 512, the next pow-2 ≥
        // sw2.len() + R_MAX_LAG) so the circular autocorrelation
        // matches the linear one over t ∈ [0, R_MAX_LAG]. FFT is
        // general DSP, not P25 IP — same clean-room justification as
        // the §0.2 signal_spectrum and §1.12.1 unvoiced-synth FFT.
        let r = autocorr_via_fft(&sw2);
        // Precompute E(P) at every half-sample grid point so the
        // hot loops in look_back / look_ahead become slice scans
        // instead of recomputing the Eq. 5 sum each visit.
        let mut e_grid = [0.0f64; PITCH_GRID_LEN];
        let mut tmp = Self {
            energy,
            w_i_fourth_moment: w4,
            r,
            e_grid: [0.0; PITCH_GRID_LEN],
            sparse: None,
        };
        for (i, slot) in e_grid.iter_mut().enumerate() {
            let p = PITCH_GRID_MIN + (i as f64) * PITCH_GRID_STEP;
            *slot = tmp.e_of_p(p);
        }
        tmp.e_grid = e_grid;
        tmp
    }

    /// Build the sparse-table RMQ over `e_grid` so that
    /// [`Self::argmin_in_range`] runs in `O(1)` per call instead of the
    /// default linear scan. Worth ~25 µs/frame on the [`look_ahead`]
    /// 203-iteration outer loop where each step does two
    /// `argmin_in_range` calls on the lookahead frames.
    ///
    /// Idempotent — calling more than once is a no-op (the table is
    /// determined by `e_grid`, which doesn't change after construction).
    pub fn enable_argmin_fast(&mut self) {
        if self.sparse.is_some() {
            return;
        }
        // Box the SparseRmq directly on the heap. The optimizer typically
        // elides the stack copy via NRVO for Box::new of a fixed-size
        // array literal, but core::array::from_fn keeps codegen sane on
        // any Rust version.
        let mut sparse: Box<SparseRmq> = Box::new(core::array::from_fn(|_| {
            [(f64::INFINITY, 0u16); PITCH_GRID_LEN]
        }));

        // Level 0: each cell is the value at its own index.
        for i in 0..PITCH_GRID_LEN {
            sparse[0][i] = (self.e_grid[i], i as u16);
        }
        // Level k > 0: cell i covers e_grid[i..i+2^k] = union of
        // (k−1, i) and (k−1, i + 2^(k−1)). When the right half spills
        // off the grid, just reuse the left (which is already edge-
        // clamped at the previous level).
        for k in 1..SPARSE_LEVELS {
            let half = 1usize << (k - 1);
            for i in 0..PITCH_GRID_LEN {
                let left = sparse[k - 1][i];
                let right = if i + half < PITCH_GRID_LEN {
                    sparse[k - 1][i + half]
                } else {
                    left
                };
                sparse[k][i] = lex_min(left, right);
            }
        }
        self.sparse = Some(sparse);
    }

    /// Eq. 8: linear interpolation of `r(t)` between integer lags.
    fn r_at(&self, t: f64) -> f64 {
        let t_floor = t.floor();
        let frac = t - t_floor;
        let ti = t_floor as usize;
        if ti + 1 >= self.r.len() {
            return *self.r.last().unwrap_or(&0.0);
        }
        (1.0 - frac) * self.r[ti] + frac * self.r[ti + 1]
    }

    /// Eq. 5: error function `E(P)`. Returns `1.0` (the "no-confidence"
    /// value) on silent frames where the denominator would blow up.
    pub fn e_of_p(&self, p: f64) -> f64 {
        if self.energy < ENERGY_FLOOR {
            return 1.0;
        }
        let n_max = (f64::from(W_I_HALF) / p).floor() as i32;
        let mut inner = 0.0;
        for n in -n_max..=n_max {
            inner += self.r_at((f64::from(n) * p).abs());
        }
        let num = self.energy - p * inner;
        let denom = self.energy * (1.0 - p * self.w_i_fourth_moment);
        if denom <= 0.0 {
            return 1.0;
        }
        num / denom
    }

    /// `E(P)` at the `i`-th half-sample grid point — i.e.
    /// `E(PITCH_GRID_MIN + i · PITCH_GRID_STEP)`, looked up from the
    /// cache built in `from_lpf`. Cheap: single array access.
    /// Out-of-range `i` returns `1.0` (matching the off-grid `e_of_p`
    /// silent-frame fallback).
    #[inline]
    pub fn e_of_p_grid(&self, i: usize) -> f64 {
        self.e_grid.get(i).copied().unwrap_or(1.0)
    }

    /// Argmin of `E(P)` over the half-sample grid within `[p_lo, p_hi]`,
    /// both bounds clamped to `[PITCH_GRID_MIN, PITCH_GRID_MAX]`. The
    /// range is inclusive on both ends. Returns `(P̂, E(P̂))`.
    ///
    /// `O(N)` linear scan by default. When [`Self::enable_argmin_fast`]
    /// has been called, falls through to the sparse-table RMQ for
    /// `O(1)` per query — the hot path inside [`look_ahead`].
    pub fn argmin_in_range(&self, p_lo: f64, p_hi: f64) -> (f64, f64) {
        let p_lo = p_lo.max(PITCH_GRID_MIN);
        let p_hi = p_hi.min(PITCH_GRID_MAX);
        // Round up to the next grid point at p_lo; round down at p_hi.
        let i_lo = ((p_lo - PITCH_GRID_MIN) / PITCH_GRID_STEP).ceil() as i32;
        let i_hi = ((p_hi - PITCH_GRID_MIN) / PITCH_GRID_STEP).floor() as i32;
        if i_lo > i_hi {
            // Empty range after clamp — degenerate but possible if
            // p_lo > p_hi or both fall in the same sub-grid bin.
            let p = PITCH_GRID_MIN + f64::from(i_lo.max(0)) * PITCH_GRID_STEP;
            return (p, 1.0);
        }
        let i_lo_u = i_lo as usize;
        let i_hi_u = (i_hi as usize).min(PITCH_GRID_LEN - 1);

        // Fast path: sparse-table RMQ. Cover [i_lo, i_hi] with two
        // pre-aggregated windows of length 2^k where
        // k = ⌊log2(i_hi − i_lo + 1)⌋; lex-min the two cells.
        if let Some(sparse) = self.sparse.as_deref() {
            let len = i_hi_u - i_lo_u + 1;
            // ⌊log2(len)⌋, valid because len ≥ 1 here.
            let k = (len as u32).ilog2() as usize;
            let window = 1usize << k;
            let left = sparse[k][i_lo_u];
            let right = sparse[k][i_hi_u + 1 - window];
            let (best_e, best_i_u16) = lex_min(left, right);
            let best_p = PITCH_GRID_MIN + f64::from(best_i_u16) * PITCH_GRID_STEP;
            return (best_p, best_e);
        }

        // Slow path: linear scan.
        let slice = &self.e_grid[i_lo_u..=i_hi_u];
        let (rel_idx, &best_e) = slice
            .iter()
            .enumerate()
            .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(core::cmp::Ordering::Equal))
            .expect("non-empty slice");
        let best_i = i_lo_u + rel_idx;
        let best_p = PITCH_GRID_MIN + (best_i as f64) * PITCH_GRID_STEP;
        (best_p, best_e)
    }
}

/// Snap `p` to the nearest half-sample grid value. Returns `None` if
/// the snapped value falls outside `[PITCH_GRID_MIN, PITCH_GRID_MAX]`.
#[inline]
pub fn snap_to_pitch_grid(p: f64) -> Option<f64> {
    let snapped = (p / PITCH_GRID_STEP).round() * PITCH_GRID_STEP;
    if (PITCH_GRID_MIN..=PITCH_GRID_MAX).contains(&snapped) {
        Some(snapped)
    } else {
        None
    }
}

/// Past-frame context consumed by [`look_back`] — the selected pitch
/// from the previous frame and the frozen `E_{−1}`, `E_{−2}` scalar
/// values at the past two frames' selected pitches. Cold-start values
/// are encoded in [`LookBackContext::cold_start`] per §0.3.7.
#[derive(Clone, Copy, Debug)]
pub struct LookBackContext {
    /// `P̂_{−1}` — previous frame's selected pitch.
    pub prev_pitch: f64,
    /// `E_{−1}(P̂_{−1})` — previous frame's error at its selected pitch.
    pub prev_err_1: f64,
    /// `E_{−2}(P̂_{−2})` — two-frames-back error at its selected pitch.
    pub prev_err_2: f64,
}

impl LookBackContext {
    /// Cold-start values per addendum §0.3.7.
    pub const fn cold_start() -> Self {
        Self {
            prev_pitch: PITCH_COLD_START,
            prev_err_1: 0.0,
            prev_err_2: 0.0,
        }
    }
}

/// Look-back pitch tracking per Eq. 10–12. Returns `(P̂_B, CE_B)`.
pub fn look_back(current: &PitchSearch, ctx: LookBackContext) -> (f64, f64) {
    let p_lo = PITCH_CONTINUITY_LO * ctx.prev_pitch;
    let p_hi = PITCH_CONTINUITY_HI * ctx.prev_pitch;
    let (p_b, e_b) = current.argmin_in_range(p_lo, p_hi);
    let ce_b = e_b + ctx.prev_err_1 + ctx.prev_err_2;
    (p_b, ce_b)
}

/// Look-ahead pitch tracking per Eq. 13–17, plus the sub-multiple
/// cascade (Eq. 18–20). Returns `(P̂_F, CE_F)`.
///
/// `next1` and `next2` are the pitch-search tables for frames `t+1`
/// and `t+2`; the caller is responsible for the two-frame lookahead.
/// On cold start (before enough lookahead has accumulated) the higher-
/// level pipeline should route through silence/erasure emission per
/// addendum §0.3.7 rather than call this function with placeholder
/// tables — the result would be a pitch derived from zeros.
pub fn look_ahead(
    current: &PitchSearch,
    next1: &PitchSearch,
    next2: &PitchSearch,
) -> (f64, f64) {
    // Scan P_0 over the full half-sample grid. For each P_0, find
    // P̂_1 ∈ [0.8·P_0, 1.2·P_0] minimizing E_1, then P̂_2 ∈
    // [0.8·P̂_1, 1.2·P̂_1] minimizing E_2. CE_F(P_0) is the sum.
    let mut best_p0 = PITCH_GRID_MIN;
    let mut best_ce = f64::INFINITY;
    for i in 0..PITCH_GRID_LEN {
        let p0 = PITCH_GRID_MIN + (i as f64) * PITCH_GRID_STEP;
        let e0 = current.e_of_p_grid(i);
        let (p1, e1) = next1.argmin_in_range(
            PITCH_CONTINUITY_LO * p0,
            PITCH_CONTINUITY_HI * p0,
        );
        let (_p2, e2) = next2.argmin_in_range(
            PITCH_CONTINUITY_LO * p1,
            PITCH_CONTINUITY_HI * p1,
        );
        let ce = e0 + e1 + e2;
        if ce < best_ce {
            best_ce = ce;
            best_p0 = p0;
        }
    }

    // Sub-multiple cascade per BABA-A §5.1.4 (PDF p. 15): check the
    // **smallest** sub-multiple first (largest `n`), then the next-
    // larger, and so on. The first one to satisfy any of Eq. 18/19/20
    // wins. Ratios are computed against `CE_F(P̂_0)`.
    //
    // Iteration: `n` runs from `⌊P̂_0 / 21⌋` down to `2`. Sub-multiples
    // with `P̂_0 / n < 21` are disregarded (PDF prose).
    let ce_p0 = best_ce;
    let n_max = (best_p0 / PITCH_GRID_MIN).floor() as u32;
    if n_max >= 2 {
        for n in (2..=n_max).rev() {
            let raw = best_p0 / f64::from(n);
            if raw < PITCH_GRID_MIN {
                continue;
            }
            let Some(sub) = snap_to_pitch_grid(raw) else {
                continue;
            };
            let e0s = current.e_of_p(sub);
            let (p1s, e1s) = next1.argmin_in_range(
                PITCH_CONTINUITY_LO * sub,
                PITCH_CONTINUITY_HI * sub,
            );
            let (_p2s, e2s) = next2.argmin_in_range(
                PITCH_CONTINUITY_LO * p1s,
                PITCH_CONTINUITY_HI * p1s,
            );
            let ce_sub = e0s + e1s + e2s;
            let ratio = if ce_p0 > 0.0 {
                ce_sub / ce_p0
            } else {
                f64::INFINITY
            };
            let eq18 = ce_sub <= 0.85 && ratio <= 1.7;
            let eq19 = ce_sub <= 0.40 && ratio <= 3.5;
            let eq20 = ce_sub <= 0.05;
            if eq18 || eq19 || eq20 {
                return (sub, ce_sub);
            }
        }
    }
    (best_p0, ce_p0)
}

/// Eq. 21–23: choose `P̂_I ∈ {P̂_B, P̂_F}`.
///
/// Bias toward pitch continuity: when the backward hypothesis is strong
/// in absolute terms (`CE_B ≤ 0.48`) it wins unconditionally; otherwise
/// the two compete head-to-head.
#[inline]
pub fn decide_initial_pitch(p_b: f64, ce_b: f64, p_f: f64, ce_f: f64) -> f64 {
    if ce_b <= 0.48 {
        p_b
    } else if ce_b <= ce_f {
        p_b
    } else {
        p_f
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn snap_to_pitch_grid_rounds_to_half_sample() {
        assert_eq!(snap_to_pitch_grid(50.0), Some(50.0));
        assert_eq!(snap_to_pitch_grid(50.1), Some(50.0));
        assert_eq!(snap_to_pitch_grid(50.25), Some(50.5)); // round-half-to-even on doubles
        assert_eq!(snap_to_pitch_grid(50.3), Some(50.5));
        assert_eq!(snap_to_pitch_grid(50.7), Some(50.5));
        assert_eq!(snap_to_pitch_grid(50.8), Some(51.0));
    }

    #[test]
    fn snap_to_pitch_grid_rejects_out_of_range() {
        // 20.9 rounds up to 21.0 (borderline in-range).
        assert_eq!(snap_to_pitch_grid(20.9), Some(21.0));
        // 20.6 rounds to 20.5, out of range — rejected.
        assert_eq!(snap_to_pitch_grid(20.6), None);
        // Values well below the grid are rejected.
        assert_eq!(snap_to_pitch_grid(10.0), None);
        assert_eq!(snap_to_pitch_grid(200.0), None);
        // Exactly on the boundaries: in-range.
        assert_eq!(snap_to_pitch_grid(21.0), Some(21.0));
        assert_eq!(snap_to_pitch_grid(122.0), Some(122.0));
    }

    /// On a silent frame `E(P) = 1.0` per the Eq. 5 silence guard.
    #[test]
    fn e_of_p_is_one_on_silence() {
        let s = [0.0f64; PITCH_INPUT_LEN];
        let search = PitchSearch::new(&s);
        for p in [21.0, 50.0, 100.0, 122.0] {
            assert_eq!(search.e_of_p(p), 1.0, "P = {p}");
        }
    }

    /// `argmin_in_range` returns a grid value and restricts to the
    /// requested window (clamped to the full grid bounds).
    #[test]
    fn argmin_in_range_clamps_and_returns_grid_value() {
        let s = [0.1f64; PITCH_INPUT_LEN]; // non-silent, unstructured
        let search = PitchSearch::new(&s);
        let (p, _) = search.argmin_in_range(40.0, 60.0);
        assert!((40.0..=60.0).contains(&p), "P = {p} out of [40, 60]");
        // P is on the half-sample grid.
        let doubled = (p * 2.0).round();
        assert!((doubled / 2.0 - p).abs() < 1e-9, "P = {p} not on grid");
    }

    fn periodic_input(period: f64) -> [f64; PITCH_INPUT_LEN] {
        // Sum of a few harmonics so the *squared* envelope has
        // fundamental period equal to `period`, not period/2. A pure
        // cosine would have period/2 in its squared envelope.
        let mut s = [0.0f64; PITCH_INPUT_LEN];
        let omega = 2.0 * core::f64::consts::PI / period;
        for (idx, slot) in s.iter_mut().enumerate() {
            let n = idx as i32 - PITCH_INPUT_HALF;
            let nf = f64::from(n);
            *slot = (omega * nf).cos()
                + 0.5 * (2.0 * omega * nf).cos()
                + 0.25 * (3.0 * omega * nf).cos();
        }
        s
    }

    /// `argmin_in_range` actually finds the minimum `E(P)` in the
    /// requested window — confirmed by brute-force sweep. (This is a
    /// pure correctness check on the search loop, not a claim about
    /// which pitch "should" win for a given signal; that end-to-end
    /// property depends on real PCM against DVSI vectors, since
    /// perfectly periodic synthetic signals can't exercise the
    /// tracking + cascade semantics — every `E(nP)` is too negative
    /// for the Eq. 18/19/20 thresholds to prune spurious
    /// sub-multiples.)
    #[test]
    fn argmin_in_range_matches_brute_force_sweep() {
        let s = periodic_input(50.0);
        let search = PitchSearch::new(&s);
        for (p_lo, p_hi) in [(21.0, 122.0), (40.0, 60.0), (80.0, 100.0), (45.0, 55.0)] {
            let (p_hat, e_hat) = search.argmin_in_range(p_lo, p_hi);
            // Sweep the grid within [p_lo, p_hi] and collect the true minimum.
            let mut brute_min = f64::INFINITY;
            let mut brute_p = p_hat;
            let mut p = PITCH_GRID_MIN;
            while p <= PITCH_GRID_MAX {
                if p >= p_lo && p <= p_hi {
                    let e = search.e_of_p(p);
                    if e < brute_min {
                        brute_min = e;
                        brute_p = p;
                    }
                }
                p += PITCH_GRID_STEP;
            }
            assert!(
                (e_hat - brute_min).abs() < 1e-12 && (p_hat - brute_p).abs() < 1e-9,
                "[{p_lo}, {p_hi}]: argmin_in_range returned ({p_hat}, {e_hat}), \
                 brute = ({brute_p}, {brute_min})"
            );
        }
    }

    /// Diagnostic probe: dump E(P) across the grid for a synthetic
    /// periodic signal. Not a true test — just prints the curve so
    /// failures can be reasoned about. Gated behind nocapture.
    #[test]
    #[ignore]
    fn probe_dump_e_of_p() {
        for true_p in [50.0, 65.0, 80.0, 100.0] {
            let s = periodic_input(true_p);
            let search = PitchSearch::new(&s);
            let (p_hat, e_hat) = search.argmin_in_range(PITCH_GRID_MIN, PITCH_GRID_MAX);
            println!("\n=== true_p = {true_p} ===");
            println!("argmin: P̂ = {p_hat}, E = {e_hat:.4}");
            println!("energy = {:.4e}", search.energy);
            println!("w_i_fourth_moment = {:.4e}", search.w_i_fourth_moment);
            let mut p = PITCH_GRID_MIN;
            while p <= PITCH_GRID_MAX {
                let e = search.e_of_p(p);
                if p == true_p || p == p_hat || (p - PITCH_GRID_MIN) % 10.0 == 0.0 {
                    println!("  E({p:6.1}) = {e:+.4}");
                }
                p += PITCH_GRID_STEP;
            }
        }
    }

    /// `look_back` with cold-start context returns `CE_B = E(P̂_B)`
    /// because `E_{−1} = E_{−2} = 0` per §0.3.7.
    #[test]
    fn look_back_cold_start_has_zero_history_contribution() {
        let s = periodic_input(50.0);
        let search = PitchSearch::new(&s);
        let ctx = LookBackContext::cold_start();
        let (p_b, ce_b) = look_back(&search, ctx);
        // P̂_B falls in [0.8·100, 1.2·100] = [80, 120].
        assert!(
            (80.0..=120.0).contains(&p_b),
            "P̂_B = {p_b} out of [80, 120]"
        );
        // CE_B = E(P̂_B) + 0 + 0.
        let e_b = search.e_of_p(p_b);
        assert!((ce_b - e_b).abs() < 1e-12, "CE_B = {ce_b}, E(P̂_B) = {e_b}");
    }

    /// `look_back` with a small `CE_B` (Eq. 21 path) drives the
    /// decision toward `P̂_B` regardless of `P̂_F`.
    #[test]
    fn decide_initial_pitch_eq21_wins_when_ce_b_is_small() {
        assert_eq!(decide_initial_pitch(50.0, 0.3, 70.0, 0.2), 50.0);
        assert_eq!(decide_initial_pitch(50.0, 0.48, 70.0, 0.1), 50.0);
    }

    /// Eq. 22: when `CE_B > 0.48` but still ≤ `CE_F`, backward wins.
    #[test]
    fn decide_initial_pitch_eq22_wins_when_ce_b_leq_ce_f() {
        assert_eq!(decide_initial_pitch(50.0, 0.6, 70.0, 0.65), 50.0);
        assert_eq!(decide_initial_pitch(50.0, 0.6, 70.0, 0.6), 50.0);
    }

    /// Eq. 23: when `CE_B > CE_F` and `CE_B > 0.48`, forward wins.
    #[test]
    fn decide_initial_pitch_eq23_picks_forward_when_ce_b_is_larger() {
        assert_eq!(decide_initial_pitch(50.0, 0.7, 70.0, 0.5), 70.0);
    }

    /// `look_ahead` returns a pitch on the half-sample grid and
    /// some finite cumulative-error value. End-to-end correctness
    /// (does it pick the "right" pitch on real speech?) requires
    /// DVSI PCM vectors — see the comment on
    /// [`e_of_p_local_minimum_within_continuity_window`].
    #[test]
    fn look_ahead_returns_grid_value_and_finite_ce() {
        let s = periodic_input(50.0);
        let cur = PitchSearch::new(&s);
        let n1 = PitchSearch::new(&s);
        let n2 = PitchSearch::new(&s);
        let (p_f, ce_f) = look_ahead(&cur, &n1, &n2);
        assert!(
            (PITCH_GRID_MIN..=PITCH_GRID_MAX).contains(&p_f),
            "P̂_F = {p_f} out of grid"
        );
        assert!(ce_f.is_finite(), "CE_F = {ce_f} not finite");
        let doubled = (p_f / PITCH_GRID_STEP).round();
        assert!(
            (doubled * PITCH_GRID_STEP - p_f).abs() < 1e-9,
            "P̂_F = {p_f} not on grid"
        );
    }

    /// Boundary conditions: `compute_s_lpf` zero-extends the input.
    /// For an all-zero input it produces an all-zero output.
    #[test]
    fn compute_s_lpf_of_zero_is_zero() {
        let s = [0.0f64; PITCH_INPUT_LEN];
        let s_lpf = compute_s_lpf(&s);
        for (i, &v) in s_lpf.iter().enumerate() {
            assert!(v.abs() < 1e-12, "s_LPF[{i}] = {v}");
        }
    }

    /// `compute_s_lpf` applied to a DC input recovers the filter's DC
    /// gain on the interior (away from zero-padded edges). Annex D
    /// `h_LPF` is a symmetric lowpass, so its DC gain is `Σ h_LPF(j)`.
    #[test]
    fn compute_s_lpf_interior_matches_filter_dc_gain() {
        let s = [1.0f64; PITCH_INPUT_LEN];
        let s_lpf = compute_s_lpf(&s);
        let dc_gain: f64 = (-H_LPF_HALF..=H_LPF_HALF)
            .map(|j| f64::from(lpf_tap(j)))
            .sum();
        // Interior samples (far from the ±150 edges) see the full
        // filter. Sample at n=0.
        let center = s_lpf[W_I_HALF as usize];
        assert!(
            (center - dc_gain).abs() < 1e-9,
            "s_LPF(0) = {center}, expected DC gain {dc_gain}"
        );
    }
}