ruopus 0.1.2

A pure-Rust implementation of the Opus audio codec (RFC 6716). No FFI; unsafe confined to documented SIMD kernels.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
//! Per-frame SILK encode assembly (RFC 6716 §5.2).
//!
//! [`SilkChannelEncoder::encode_frame`] is the end-to-end SILK encode path:
//! it ties the analysis and quantisation kernels into a single coded frame
//! and is validated by round-tripping through the bit-exact decoder. It
//! decides voiced/unvoiced itself (via the pitch analysis) and handles both
//! paths; the remaining work is the higher-level rate control, the resampler
//! front-end, stereo, and the public Opus mode glue.
//!
//! The chain is: pitch analysis (whitening + lag search, deciding voicing) →
//! Burg LPC → NLSF VQ → `nlsf2a` requantise → (voiced) LTP correlation +
//! gain VQ → noise-shaping analysis → gain quantisation → NSQ → index/pulse
//! bitstream. The decoder rebuilds the prediction coefficients and gains
//! from the coded indices, so feeding NSQ those same quantised values makes
//! its reconstruction equal the decoder's output. Cross-frame state (NSQ
//! history, pitch lag, input history, shaping smoothers) is carried in the
//! encoder so consecutive frames stay in sync with the decoder.

extern crate alloc;
use alloc::vec;
use alloc::vec::Vec;

use super::super::gains::{gains_dequant, gains_quant};
use super::super::indices::{
    CondCoding, EcPrevState, MAX_LPC_ORDER, MAX_NB_SUBFR, SideInfoIndices, TYPE_UNVOICED, TYPE_VOICED, encode_indices,
    nlsf_codebook,
};
use super::super::math::smulwb;
use super::super::nlsf::nlsf2a;
use super::super::params::LTP_ORDER;
use super::super::pulses::encode_pulses;
use super::super::tables::LTPSCALES_TABLE_Q14;
use super::control::control_snr;
use super::dsp::{energy, lpc_analysis_filter_flp};
use super::gains::process_gains;
use super::lpc::burg_modified;
use super::ltp::{find_ltp, quant_ltp_gains};
use super::nlsf::{a2nlsf, nlsf_encode, nlsf_vq_weights_laroia};
use super::noise_shape::{NoiseShapeConfig, ShapeState, noise_shape_analysis};
use super::nsq::{NsqConfig, NsqState, nsq};
use super::pitch_analysis::find_pitch_lags;
use super::vad::VadState;
use crate::range::RangeEncoder;

/// One channel's SILK encoder state.
#[derive(Clone)]
pub(crate) struct SilkChannelEncoder {
    pub nsq: NsqState,
    /// Cross-frame noise-shaping smoothing state (`sShape`).
    pub shape: ShapeState,
    /// Gain-quantiser accumulator (`sShape.LastGainIndex`).
    pub last_gain_index: i8,
    /// Cumulative LTP max-gain accumulator (`sum_log_gain_Q7`).
    pub sum_log_gain_q7: i32,
    /// Previous frame's final pitch lag (`prevLag`, 0 if unvoiced).
    pub prev_lag: i32,
    /// Previous frame's signal type (`prevSignalType`).
    pub prev_signal_type: i32,
    /// Normalised long-term correlation carried across frames (`LTPCorr`).
    pub ltp_corr: f32,
    /// The previous `ltp_mem_length` input samples, used as pitch-analysis
    /// history for the next frame.
    pub prev_input: Vec<i16>,
    /// Voice-activity detector (noise-floor estimation) state.
    pub vad: VadState,
    /// Target bitrate (bps), mapped to the coding SNR per frame.
    pub target_rate_bps: i32,
    /// Expected packet-loss percentage 0-100 (`PacketLoss_perc`). When > 0, an
    /// independently-coded voiced frame raises its LTP scaling index to reduce
    /// inter-frame prediction dependency (loss-robust coding).
    pub packet_loss_perc: i32,
    /// Number of SILK frames per Opus packet (`nFramesPerPacket`: 1 for
    /// 10/20 ms, 2 for 40 ms, 3 for 60 ms), used by the LTP scaling selection.
    pub n_frames_per_packet: i32,
    /// Gain increase applied to the LBRR copy's first-subframe gain index
    /// (`LBRR_GainIncreases`), so the redundant FEC copy costs fewer bits. 0
    /// disables the reduced-rate LBRR pass (the copy then reuses the full-rate
    /// pulses).
    pub lbrr_gain_increases: i32,
    /// Cross-frame LBRR gain-index accumulator (`LBRRprevLastGainIndex`), reset
    /// at the first LBRR frame of a packet and carried across conditionally
    /// coded LBRR frames.
    pub lbrr_prev_last_gain_index: i8,
    /// True until the first frame after a reset has been coded; relaxes the
    /// maximum-prediction-gain cap (`first_frame_after_reset`).
    pub first_frame_after_reset: bool,
    /// Entropy-coding history for [`encode_indices`].
    pub ec_prev: EcPrevState,
    pub fs_khz: i32,
    pub nb_subfr: usize,
    /// Encode complexity 0-10; selects the pitch-search depth/threshold/order
    /// (the rest of the analysis is already its low-complexity configuration:
    /// plain NSQ, no warping).
    pub complexity: u8,
    /// Per-frame work buffers, reused across frames to avoid reallocating (and
    /// re-zeroing) them every call: pitch-analysis input/residual, the aligned
    /// residual, the noise-shape input, the LPC-analysis input, and the pulse
    /// output. Fully overwritten each frame.
    scratch_pitch: Vec<f32>,
    scratch_res: Vec<f32>,
    scratch_res_f: Vec<f32>,
    scratch_x_buf: Vec<f32>,
    scratch_lpc_in: Vec<f32>,
    scratch_pulses: Vec<i8>,
    /// Reused rate-control snapshot (NSQ state + entropy history), so the
    /// gain-bisection fallback does not clone the NSQ `Vec`s every frame.
    snap_nsq: NsqState,
}

impl SilkChannelEncoder {
    /// A reset encoder for the given internal rate and subframe count.
    #[must_use]
    pub(crate) fn new(fs_khz: i32, nb_subfr: usize) -> Self {
        SilkChannelEncoder {
            nsq: NsqState::new(),
            shape: ShapeState::default(),
            last_gain_index: 10,
            sum_log_gain_q7: 0,
            prev_lag: 0,
            prev_signal_type: TYPE_UNVOICED,
            ltp_corr: 0.0,
            prev_input: vec![0; 20 * fs_khz as usize],
            vad: VadState::new(),
            target_rate_bps: 30_000,
            packet_loss_perc: 0,
            n_frames_per_packet: 1,
            lbrr_gain_increases: 0,
            lbrr_prev_last_gain_index: 0,
            first_frame_after_reset: true,
            ec_prev: EcPrevState::default(),
            fs_khz,
            nb_subfr,
            complexity: 10,
            scratch_pitch: Vec::new(),
            scratch_res: Vec::new(),
            scratch_res_f: Vec::new(),
            scratch_x_buf: Vec::new(),
            scratch_lpc_in: Vec::new(),
            scratch_pulses: Vec::new(),
            snap_nsq: NsqState::new(),
        }
    }

    /// Sets the target bitrate (bps), which maps to the coding SNR per frame.
    pub(crate) fn set_bitrate(&mut self, bps: i32) {
        self.target_rate_bps = bps;
    }

    /// Sets the expected packet-loss percentage 0-100 (`PacketLoss_perc`),
    /// clamped; > 0 enables loss-robust LTP scaling on independently coded
    /// voiced frames.
    pub(crate) fn set_packet_loss_perc(&mut self, perc: i32) {
        self.packet_loss_perc = perc.clamp(0, 100);
    }

    /// Sets the number of SILK frames per packet (`nFramesPerPacket`), used by
    /// the LTP scaling selection.
    pub(crate) fn set_n_frames_per_packet(&mut self, n: i32) {
        self.n_frames_per_packet = n.max(1);
    }

    /// Sets the LBRR first-subframe gain increase (`LBRR_GainIncreases`); 0
    /// disables the reduced-rate LBRR pass (the copy reuses the full-rate
    /// pulses).
    pub(crate) fn set_lbrr_gain_increases(&mut self, g: i32) {
        self.lbrr_gain_increases = g.max(0);
    }

    /// Sets the encode complexity 0-10 (clamped), selecting the pitch search.
    pub(crate) fn set_complexity(&mut self, complexity: u8) {
        self.complexity = complexity.min(10);
    }

    /// Resets the prediction memory for the first coded side frame after a
    /// mid-only stretch, mirroring the decoder's `reset_side_prediction`.
    pub(crate) fn reset_side_prediction(&mut self) {
        self.nsq = NsqState::new();
        self.nsq.lag_prev = 100;
        self.shape = ShapeState::default();
        self.last_gain_index = 10;
        self.sum_log_gain_q7 = 0;
        self.prev_lag = 0;
        self.prev_signal_type = TYPE_UNVOICED;
        self.ltp_corr = 0.0;
        self.first_frame_after_reset = true;
        for v in &mut self.prev_input {
            *v = 0;
        }
    }

    /// Encodes one frame of `input` (i16 PCM at the internal rate,
    /// `frame_length` samples), deciding voiced/unvoiced and (when voiced)
    /// the pitch lags itself via the pitch analysis. Returns the coded
    /// `SideInfoIndices`.
    pub(crate) fn encode_frame(
        &mut self,
        enc: &mut RangeEncoder,
        input: &[i16],
        cond_coding: CondCoding,
        max_bits: Option<i32>,
    ) -> SideInfoIndices {
        self.encode_frame_inner(enc, input, cond_coding, max_bits, false).0
    }

    /// Codes the regular frame into `enc` honouring the cumulative `max_bits`
    /// cap (so hybrid packets reserve the CELT high band its room) and captures
    /// the chosen `indices`/`pulses`, advancing the cross-frame analysis state
    /// (NSQ history, pitch/voicing, input history, gain accumulator, entropy
    /// history) and `ec_prev` exactly as a regular [`encode_frame`] would. This
    /// is the in-band FEC capture path: the captured frame is stored to re-emit
    /// as the *next* packet's LBRR copy via [`emit_frame`](Self::emit_frame).
    ///
    /// Returns `(indices, pulses)` for the regular frame and, when
    /// `lbrr_gain_increases > 0`, a reduced-rate LBRR copy `(indices, pulses)`
    /// produced by a second NSQ pass with a raised first-subframe gain index
    /// (`silk_LBRR_encode`). When the gain increase is 0 the LBRR copy is
    /// `None` and the caller reuses the regular frame as the LBRR copy.
    #[allow(clippy::type_complexity)]
    pub(crate) fn encode_frame_capture(
        &mut self,
        enc: &mut RangeEncoder,
        input: &[i16],
        cond_coding: CondCoding,
        max_bits: Option<i32>,
    ) -> ((SideInfoIndices, Vec<i8>), Option<(SideInfoIndices, Vec<i8>)>) {
        let want_lbrr = self.lbrr_gain_increases > 0;
        let (indices, pulses, lbrr) = self.encode_frame_inner(enc, input, cond_coding, max_bits, want_lbrr);
        ((indices, pulses), lbrr)
    }

    /// Emits a previously [`encode_frame_capture`](Self::encode_frame_capture)d
    /// frame into `enc` with the given conditional-coding mode, as the LBRR copy
    /// when
    /// `lbrr` is true (which forces the VAD-present type coding). Advances the
    /// entropy history (`ec_prev`) only - no analysis state.
    pub(crate) fn emit_frame(
        &mut self,
        enc: &mut RangeEncoder,
        indices: &SideInfoIndices,
        pulses: &[i8],
        cond_coding: CondCoding,
        lbrr: bool,
    ) {
        let frame_length = self.nb_subfr * 5 * self.fs_khz as usize;
        encode_indices(
            enc,
            indices,
            self.fs_khz,
            self.nb_subfr,
            lbrr,
            true,
            cond_coding,
            &mut self.ec_prev,
        );
        encode_pulses(
            enc,
            i32::from(indices.signal_type),
            i32::from(indices.quant_offset_type),
            pulses,
            frame_length,
        );
    }

    #[allow(clippy::type_complexity)]
    fn encode_frame_inner(
        &mut self,
        enc: &mut RangeEncoder,
        input: &[i16],
        cond_coding: CondCoding,
        max_bits: Option<i32>,
        want_lbrr: bool,
    ) -> (SideInfoIndices, Vec<i8>, Option<(SideInfoIndices, Vec<i8>)>) {
        let order = if self.fs_khz == 16 { 16 } else { 10 };
        let subfr_length = 5 * self.fs_khz as usize;
        let frame_length = self.nb_subfr * subfr_length;
        let ltp_mem_length = 20 * self.fs_khz as usize;
        let la_pitch = 2 * self.fs_khz as usize;
        debug_assert_eq!(input.len(), frame_length);

        // Voice-activity analysis: speech-activity, spectral tilt and per-band
        // input quality, which tune the pitch threshold, noise shaping and gains.
        let vad = { self.vad.get_sa_q8(input, frame_length, self.fs_khz) };

        // Pitch analysis: whiten and search for the lag. `pitch_x_buf` holds
        // `ltp_mem_length` of history, the frame, then `la_pitch` lookahead;
        // an isolated frame zero-pads the history and lookahead.
        // Pitch-search settings from the complexity: the
        // estimation complexity (0/1/2), the voicing threshold, and the
        // whitening LPC order (capped at the prediction order).
        let (pe_complexity, search_thres, pe_lpc_order): (usize, f32, usize) = match self.complexity {
            0 => (0, 0.8, 6),
            1 => (1, 0.76, 8),
            2 => (0, 0.8, 6),
            3 => (1, 0.76, 8),
            4 | 5 => (1, 0.74, 10),
            6 | 7 => (1, 0.72, 12),
            _ => (2, 0.7, 16),
        };
        let pe_order = pe_lpc_order.min(order);
        let buf_len = la_pitch + frame_length + ltp_mem_length;
        // Reuse the work buffers (no per-frame allocation). `pitch_x_buf` is
        // history + frame, with the trailing `la_pitch` lookahead zeroed; the
        // zipped writes carry no bounds checks.
        let mut pitch_x_buf = core::mem::take(&mut self.scratch_pitch);
        pitch_x_buf.resize(buf_len, 0.0);
        for (dst, &v) in pitch_x_buf[..ltp_mem_length].iter_mut().zip(self.prev_input.iter()) {
            *dst = f32::from(v);
        }
        for (dst, &v) in pitch_x_buf[ltp_mem_length..ltp_mem_length + frame_length]
            .iter_mut()
            .zip(input.iter())
        {
            *dst = f32::from(v);
        }
        pitch_x_buf[ltp_mem_length + frame_length..].fill(0.0);
        let mut res = core::mem::take(&mut self.scratch_res);
        res.clear();
        res.resize(buf_len, 0.0);
        let pl = find_pitch_lags(
            &pitch_x_buf,
            &mut res,
            self.fs_khz,
            self.nb_subfr,
            pe_order,
            pe_complexity,
            search_thres,
            self.prev_lag,
            self.prev_signal_type,
            vad.speech_activity_q8,
            vad.input_tilt_q15,
            &mut self.ltp_corr,
        );
        let is_voiced = pl.voicing == 0;
        let signal_type = if is_voiced { TYPE_VOICED } else { TYPE_UNVOICED };
        let pitch_l = pl.pitch_l;
        // The whitened residual aligned to the frame (`res_pitch_frame`).
        let mut res_f = core::mem::take(&mut self.scratch_res_f);
        res_f.clear();
        res_f.extend_from_slice(&res[ltp_mem_length..ltp_mem_length + frame_length]);

        // Noise-shaping analysis (complexity-0 configuration: no warping, so
        // the plain NSQ sees ordinary shaping coefficients). The reference runs
        // this before `find_pred_coefs` because it produces the per-subframe
        // gains the prediction analysis normalises by.
        let la_shape = 3 * self.fs_khz as usize;
        let shaping_lpc_order = 12.min(order);
        let snr_db_q7 = control_snr(self.fs_khz, self.nb_subfr, self.target_rate_bps);
        let mut x_buf = core::mem::take(&mut self.scratch_x_buf);
        x_buf.clear();
        x_buf.resize(frame_length + 2 * la_shape, 0.0);
        for (dst, &v) in x_buf[la_shape..la_shape + frame_length].iter_mut().zip(input.iter()) {
            *dst = f32::from(v);
        }
        let shape_cfg = NoiseShapeConfig {
            fs_khz: self.fs_khz,
            nb_subfr: self.nb_subfr,
            subfr_length,
            la_shape,
            shape_win_length: subfr_length + 2 * la_shape,
            shaping_lpc_order,
            warping_q16: 0,
            signal_type,
            snr_db_q7,
            speech_activity_q8: vad.speech_activity_q8,
            input_quality_bands_q15: [vad.input_quality_bands_q15[0], vad.input_quality_bands_q15[1]],
            use_cbr: true,
            ltp_corr: self.ltp_corr,
            pred_gain: pl.pred_gain,
            pitch_l,
        };
        let shp = { noise_shape_analysis(&mut self.shape, &shape_cfg, &res_f, &x_buf) };

        // `silk_find_pred_coefs_FLP`: short- and long-term prediction analysis
        // on the gain-normalised input. `LPC_in_pre` holds, per subframe,
        // `order` history samples plus the subframe scaled by the inverse gain
        // (and, when voiced, whitened by the LTP first); the short-term LPC is
        // estimated from that signal rather than the raw frame.
        let mut inv_gains = [0.0f32; MAX_NB_SUBFR];
        for (k, ig) in inv_gains.iter_mut().enumerate().take(self.nb_subfr) {
            *ig = 1.0 / shp.gains[k];
        }
        let pre = order; // preceding samples per subframe (predictLPCOrder)
        let shift = subfr_length + pre;
        let mut lpc_in_pre = core::mem::take(&mut self.scratch_lpc_in);
        lpc_in_pre.clear();
        lpc_in_pre.resize(self.nb_subfr * shift, 0.0);

        let mut ltp_coef = [0i16; LTP_ORDER * MAX_NB_SUBFR];
        let mut ltp_index = [0i8; MAX_NB_SUBFR];
        let mut per_index = 0i8;
        let mut pred_gain_db = 0.0f32;
        if is_voiced {
            // LTP correlation + gain VQ on the whitened residual.
            let mut xx = vec![0.0f32; self.nb_subfr * LTP_ORDER * LTP_ORDER];
            let mut x_x = vec![0.0f32; self.nb_subfr * LTP_ORDER];
            find_ltp(
                &res,
                ltp_mem_length,
                &pitch_l,
                subfr_length,
                self.nb_subfr,
                &mut xx,
                &mut x_x,
            );
            let g = { quant_ltp_gains(&xx, &x_x, subfr_length as i32, self.nb_subfr, &mut self.sum_log_gain_q7) };
            ltp_coef = g.b_q14;
            ltp_index[..self.nb_subfr].copy_from_slice(&g.cbk_index[..self.nb_subfr]);
            per_index = g.periodicity_index;
            pred_gain_db = g.pred_gain_db;

            // LTP analysis filter: subtract the long-term prediction (lagged by
            // `pitch_l[k]`, taps centred at `LTP_ORDER/2`) and scale by the
            // inverse gain. `pitch_x_buf` holds the raw input with history.
            for k in 0..self.nb_subfr {
                let x_ptr = ltp_mem_length + k * subfr_length - pre;
                let inv = inv_gains[k];
                let btmp = &ltp_coef[k * LTP_ORDER..k * LTP_ORDER + LTP_ORDER];
                for i in 0..shift {
                    let xi = x_ptr + i;
                    let lag = xi - pitch_l[k] as usize;
                    let mut v = pitch_x_buf[xi];
                    for (j, &b) in btmp.iter().enumerate() {
                        v -= (f32::from(b) / 16384.0) * pitch_x_buf[lag + LTP_ORDER / 2 - j];
                    }
                    lpc_in_pre[k * shift + i] = v * inv;
                }
            }
        } else {
            // Unvoiced: gain-normalised input with `pre` history samples.
            for k in 0..self.nb_subfr {
                let x_ptr = ltp_mem_length + k * subfr_length - pre;
                let inv = inv_gains[k];
                for i in 0..shift {
                    lpc_in_pre[k * shift + i] = pitch_x_buf[x_ptr + i] * inv;
                }
            }
        }

        // Maximum prediction-gain cap (`minInvGain`): looser right after a
        // reset, otherwise scaled by the LTP coding gain and the coding quality
        // so a strong long-term predictor permits a sharper LPC.
        let min_inv_gain = if self.first_frame_after_reset {
            1.0 / 100.0 // 1 / MAX_PREDICTION_POWER_GAIN_AFTER_RESET
        } else {
            let g = 2.0f32.powf(pred_gain_db / 3.0) / 1e4; // / MAX_PREDICTION_POWER_GAIN
            g / (0.25 + 0.75 * shp.coding_quality)
        };

        // Short-term analysis: Burg LPC over the per-subframe `LPC_in_pre`
        // blocks → NLSF → VQ-quantised indices (requantised NLSF written back),
        // then the Q12 LPC the decoder rebuilds.
        let mut lpc = [0.0f32; MAX_LPC_ORDER];
        {
            burg_modified(
                &mut lpc[..order],
                &lpc_in_pre,
                min_inv_gain,
                shift,
                self.nb_subfr,
                order,
            );
        }

        let cb = nlsf_codebook(self.fs_khz);
        let mut nlsf_q15: Vec<i16> = a2nlsf(&lpc[..order]);
        let mut w_q2 = [0i16; MAX_LPC_ORDER];
        nlsf_vq_weights_laroia(&mut w_q2[..order], &nlsf_q15, order);
        let (nlsf_indices, _) = nlsf_encode(&mut nlsf_q15, cb, &w_q2[..order], 1 << 14, 4, signal_type as usize);

        let mut pred_coef = [0i16; 2 * MAX_LPC_ORDER];
        let mut a_q12 = [0i16; MAX_LPC_ORDER];
        nlsf2a(&mut a_q12[..order], &nlsf_q15[..order]);
        pred_coef[..order].copy_from_slice(&a_q12[..order]);
        pred_coef[MAX_LPC_ORDER..MAX_LPC_ORDER + order].copy_from_slice(&a_q12[..order]);

        // Residual energy on `LPC_in_pre` with the quantised LPC
        // (`silk_residual_energy_FLP`): ResNrg[k] = Gains[k]^2 · energy(residual).
        let a_f: Vec<f32> = a_q12[..order].iter().map(|&c| f32::from(c) / 4096.0).collect();
        let mut gains = shp.gains;
        let mut res_nrg = [0.0f32; MAX_NB_SUBFR];
        let mut lpc_res = vec![0.0f32; shift];
        for k in 0..self.nb_subfr {
            let sub = &lpc_in_pre[k * shift..k * shift + shift];
            lpc_analysis_filter_flp(&mut lpc_res, &a_f, sub, shift, order);
            let nrg = energy(&lpc_res[order..shift]);
            res_nrg[k] = (f64::from(gains[k]) * f64::from(gains[k]) * nrg) as f32;
        }

        // `lastGainIndexPrev`: the gain-quantiser accumulator before this
        // frame, restored each iteration of the `max_bits` rate-control loop.
        let last_gain_prev = self.last_gain_index;
        let gres = process_gains(
            &mut gains,
            &res_nrg,
            signal_type,
            shp.quant_offset_type,
            self.nb_subfr,
            subfr_length,
            snr_db_q7,
            pred_gain_db,
            vad.input_tilt_q15,
            1,
            vad.speech_activity_q8,
            shp.input_quality,
            shp.coding_quality,
            &mut self.last_gain_index,
            cond_coding,
        );

        // LTP scaling (`silk_LTP_scale_ctrl`): the first (independently coded)
        // frame of a packet raises the LTP scaling index when packet loss is
        // expected, reducing inter-frame prediction dependency so a lost frame
        // damages fewer following frames. Conditionally coded frames always use
        // index 0 (minimum scaling). With `packet_loss_perc == 0` (the default)
        // we keep index 0 to stay byte-identical to the prior behaviour; the
        // reference's `round_loss = perc + nFramesPerPacket` term would raise it
        // even at zero loss, but that divergence is gated off here.
        let ltp_scale_index = if self.packet_loss_perc > 0 && is_voiced && cond_coding == CondCoding::Independently {
            // round_loss = PacketLoss_perc + nFramesPerPacket;
            // LTP_scaleIndex = LIMIT(round_loss * LTPredCodGain * 0.1, 0, 2)
            // (truncated toward zero on assignment to int8). `pred_gain_db`
            // is the LTP prediction coding gain (`LTPredCodGain`).
            let round_loss = self.packet_loss_perc + self.n_frames_per_packet;
            let v = round_loss as f32 * pred_gain_db * 0.1;
            v.clamp(0.0, 2.0) as i8
        } else {
            0i8
        };
        let ltp_scale_q14 = if is_voiced {
            i32::from(LTPSCALES_TABLE_Q14[ltp_scale_index as usize])
        } else {
            0
        };
        let seed = 0i32;

        let cfg = NsqConfig {
            frame_length,
            subfr_length,
            nb_subfr: self.nb_subfr,
            ltp_mem_length,
            predict_lpc_order: order,
            shaping_lpc_order,
        };
        // Lambda (RD trade-off) and the quantiser offset can be adjusted by the
        // rate-control loop when a frame is stuck above the cap.
        let mut lambda_q10 = (gres.lambda * 1024.0) as i32;
        let mut quant_offset_type = gres.quant_offset_type;

        // Assemble the side info; `gains_indices` and `quant_offset_type` are
        // refreshed per rate-control iteration below.
        let mut indices = SideInfoIndices {
            signal_type: signal_type as i8,
            quant_offset_type: quant_offset_type as i8,
            nlsf_interp_coef_q2: 4,
            seed: seed as i8,
            ..SideInfoIndices::default()
        };
        indices.nlsf_indices[..=order].copy_from_slice(&nlsf_indices[..=order]);
        if is_voiced {
            indices.lag_index = pl.lag_index;
            indices.contour_index = pl.contour_index;
            indices.per_index = per_index;
            indices.ltp_index = ltp_index;
            indices.ltp_scale_index = ltp_scale_index;
        }
        indices.gains_indices[..self.nb_subfr].copy_from_slice(&gres.gains_indices[..self.nb_subfr]);

        // LBRR (FEC redundant copy), `silk_LBRR_encode`: encode the excitation
        // at a lower bitrate by raising the first-subframe gain index by
        // `lbrr_gain_increases` (first LBRR frame of the packet only), then
        // re-running NSQ on the *pre-frame* NSQ state with the re-dequantised,
        // coarser gains. This runs before the regular NSQ loop so it sees the
        // same pre-frame NSQ history the reference does. The LBRR copy is only
        // decoded on packet loss (`decode_fec`); the regular frame below is
        // unaffected.
        let lbrr_out = if want_lbrr {
            let mut lbrr_indices = indices.clone();
            // First LBRR frame in the packet (independent coding) resets the
            // accumulator and applies the gain bump; conditionally coded LBRR
            // frames carry the accumulator and skip the bump.
            if cond_coding == CondCoding::Independently {
                self.lbrr_prev_last_gain_index = self.last_gain_index;
                let bumped = i32::from(lbrr_indices.gains_indices[0]) + self.lbrr_gain_increases;
                lbrr_indices.gains_indices[0] = bumped.min(63) as i8; // N_LEVELS_QGAIN - 1
            }
            // Re-dequantise the (bumped) gain indices to keep the encoder's
            // gains in sync with what `decode_fec` will reconstruct.
            let lbrr_gains_q16 = gains_dequant(
                &lbrr_indices.gains_indices,
                &mut self.lbrr_prev_last_gain_index,
                cond_coding == CondCoding::Conditionally,
                self.nb_subfr,
            );
            // Second NSQ pass on a clone of the pre-frame NSQ state, so the
            // regular NSQ loop below still starts from the untouched state.
            let mut lbrr_nsq = self.nsq.clone();
            let mut lbrr_pulses = vec![0i8; frame_length];
            nsq(
                &mut lbrr_nsq,
                &cfg,
                signal_type,
                quant_offset_type,
                4,
                seed,
                input,
                &mut lbrr_pulses,
                &pred_coef,
                &ltp_coef,
                &shp.ar_q13,
                &shp.harm_shape_gain_q14,
                &shp.tilt_q14,
                &shp.lf_shp_q14,
                &lbrr_gains_q16,
                &pitch_l,
                lambda_q10,
                ltp_scale_q14,
            );
            Some((lbrr_indices, lbrr_pulses))
        } else {
            None
        };

        // Carry forward the pitch/voicing state and input history.
        self.prev_lag = if is_voiced { pitch_l[self.nb_subfr - 1] } else { 0 };
        self.prev_signal_type = signal_type;
        self.first_frame_after_reset = false;
        if frame_length >= ltp_mem_length {
            self.prev_input.copy_from_slice(&input[frame_length - ltp_mem_length..]);
        } else {
            self.prev_input.copy_within(frame_length.., 0);
            self.prev_input[ltp_mem_length - frame_length..].copy_from_slice(input);
        }

        // NSQ + entropy coding. With `max_bits` set (hybrid), this is the
        // per-frame rate-control loop: the first
        // attempt uses the gains from `process_gains`; if it busts the cap, the
        // unquantised gains are scaled coarser by a multiplier - geometrically
        // until the over/under budget is bracketed, then bisection-interpolated
        // toward the cap - re-running NSQ and re-coding from a snapshot each
        // time. The best fitting attempt is restored at the end.
        //
        // When a frame stays stuck above the cap the loop also raises the RD
        // lambda + drops the dither and locks each subframe to its sparsest
        // multiplier, both from the reference. Capped at the reference's 1024
        // (4×) ceiling over 6 iterations; a rare loud transient that still busts
        // at 4× is left over budget and the caller (encode_auto) falls back to
        // CELT, which codes it far better than extreme-gain SILK. The reference's
        // zero-pulse damage control (which would desync our decoder without a
        // synthesis resync) is the one remaining refinement.
        let mut pulses = core::mem::take(&mut self.scratch_pulses);
        pulses.clear();
        pulses.resize(frame_length, 0);
        let mut gains_q16 = gres.gains_q16;
        let mut gains_indices = gres.gains_indices;
        // Snapshot for the gain-bisection fallback. The bulky NSQ state is
        // copied into the reused `snap_nsq` (no per-frame allocation); only the
        // small range-coder state is cloned.
        let snap_enc = max_bits.map(|_| {
            self.snap_nsq.clone_from(&self.nsq);
            (enc.clone(), self.ec_prev)
        });
        let bits_margin = max_bits.map_or(0, |m| m / 4); // VBR: within 25% is close enough
        let mut best_fit: Option<(RangeEncoder, NsqState, EcPrevState, i8, [i8; MAX_NB_SUBFR])> = None;
        let mut gain_mult_q8 = 256i32;
        let (mut found_lower, mut found_upper) = (false, false);
        let (mut gm_lower, mut gm_upper) = (0i32, 0i32);
        let (mut nb_lower, mut nb_upper) = (0i32, 0i32);
        let mut gain_lock = [false; MAX_NB_SUBFR];
        let mut best_gain_mult = [256i32; MAX_NB_SUBFR];
        let mut best_sum = [0i32; MAX_NB_SUBFR];
        let mut iter = 0;
        loop {
            if iter > 0 {
                // Restore and re-quantise the gains scaled by the multiplier.
                let (enc0, ec0) = snap_enc.as_ref().expect("snapshot present when capping");
                enc.clone_from(enc0);
                self.nsq.clone_from(&self.snap_nsq);
                self.ec_prev = *ec0;
                self.last_gain_index = last_gain_prev;
                let mut pg = [0i32; MAX_NB_SUBFR];
                for (k, pg_k) in pg.iter_mut().enumerate().take(self.nb_subfr) {
                    // pGains_Q16 = LSHIFT_SAT32(SMULWB(GainsUnq_Q16, gainMult_Q8), 8),
                    // using each subframe's locked multiplier when it has one.
                    let gm = if gain_lock[k] { best_gain_mult[k] } else { gain_mult_q8 };
                    let v = i64::from(smulwb(gres.gains_unq_q16[k], gm)) << 8;
                    *pg_k = v.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32;
                }
                let mut gi = [0i8; MAX_NB_SUBFR];
                gains_quant(
                    &mut gi,
                    &mut pg,
                    &mut self.last_gain_index,
                    cond_coding == CondCoding::Conditionally,
                    self.nb_subfr,
                );
                gains_q16 = pg;
                gains_indices = gi;
            }

            nsq(
                &mut self.nsq,
                &cfg,
                signal_type,
                quant_offset_type,
                4,
                seed,
                input,
                &mut pulses,
                &pred_coef,
                &ltp_coef,
                &shp.ar_q13,
                &shp.harm_shape_gain_q14,
                &shp.tilt_q14,
                &shp.lf_shp_q14,
                &gains_q16,
                &pitch_l,
                lambda_q10,
                ltp_scale_q14,
            );
            indices.gains_indices[..self.nb_subfr].copy_from_slice(&gains_indices[..self.nb_subfr]);
            indices.quant_offset_type = quant_offset_type as i8;
            encode_indices(
                enc,
                &indices,
                self.fs_khz,
                self.nb_subfr,
                false,
                true,
                cond_coding,
                &mut self.ec_prev,
            );
            encode_pulses(enc, signal_type, quant_offset_type, &pulses, frame_length);

            let Some(max_bits) = max_bits else { break };
            let n_bits = enc.tell() as i32;
            // VBR: the first attempt at the target gains is accepted if it fits.
            if iter == 0 && n_bits <= max_bits {
                break;
            }
            if n_bits > max_bits {
                if !found_lower && iter >= 2 {
                    // Stuck above the cap: trade more distortion for rate (raise
                    // lambda) and drop the dither (quantOffsetType = 0), then
                    // discard the stale upper bracket.
                    lambda_q10 = (lambda_q10 * 3 / 2).max(1536);
                    quant_offset_type = 0;
                    found_upper = false;
                } else {
                    found_upper = true;
                    nb_upper = n_bits;
                    gm_upper = gain_mult_q8;
                }
            } else if n_bits < max_bits - bits_margin {
                best_fit = Some((
                    enc.clone(),
                    self.nsq.clone(),
                    self.ec_prev,
                    self.last_gain_index,
                    indices.gains_indices,
                ));
                found_lower = true;
                nb_lower = n_bits;
                gm_lower = gain_mult_q8;
            } else {
                // Close enough to the cap - accept this attempt.
                break;
            }

            // Per-subframe gain lock: while still over budget without a lower
            // bracket, remember the multiplier that gave each subframe its
            // sparsest pulses, and lock subframes that stop improving so the
            // global multiplier does not over-coarsen them.
            if !found_lower && n_bits > max_bits {
                for k in 0..self.nb_subfr {
                    let sum: i32 = pulses[k * subfr_length..(k + 1) * subfr_length]
                        .iter()
                        .map(|&p| i32::from(p).abs())
                        .sum();
                    if iter == 0 || (sum < best_sum[k] && !gain_lock[k]) {
                        best_sum[k] = sum;
                        best_gain_mult[k] = gain_mult_q8;
                    } else {
                        gain_lock[k] = true;
                    }
                }
            }

            if iter >= 6 {
                break;
            }
            iter += 1;
            gain_mult_q8 = if found_lower && found_upper {
                // Interpolate to the cap, then clamp to the middle half of the
                // bracket (gm_upper < gm_lower since more gain → fewer bits).
                let interp = gm_lower + (gm_upper - gm_lower) * (max_bits - nb_lower) / (nb_upper - nb_lower);
                let span = gm_upper - gm_lower;
                interp.clamp(gm_upper - span / 4, gm_lower + span / 4)
            } else if n_bits > max_bits {
                (gain_mult_q8 * 3 / 2).min(1024)
            } else {
                (gain_mult_q8 * 4 / 5).max(64)
            };
        }

        // If the final attempt still busts but an earlier one fit, restore it.
        if let (Some(max_bits), Some(bf)) = (max_bits, &best_fit) {
            if enc.tell() as i32 > max_bits {
                let (enc0, nsq0, ec0, lg0, gi0) = bf;
                enc.clone_from(enc0);
                self.nsq.clone_from(nsq0);
                self.ec_prev = *ec0;
                self.last_gain_index = *lg0;
                indices.gains_indices = *gi0;
            }
        }
        // `indices`/`pulses`/`quant_offset_type` now hold the chosen attempt and
        // `self.ec_prev` reflects exactly one regular emission. The caller may
        // re-emit these captured indices/pulses as an LBRR copy (FEC).

        // Capture the chosen pulses for the caller (the analysis/emission split
        // and the LBRR copy both need them after the work buffer is returned).
        let pulses_out = pulses.clone();

        // Return the work buffers for the next frame to reuse.
        self.scratch_pitch = pitch_x_buf;
        self.scratch_res = res;
        self.scratch_res_f = res_f;
        self.scratch_x_buf = x_buf;
        self.scratch_lpc_in = lpc_in_pre;
        self.scratch_pulses = pulses;
        (indices, pulses_out, lbrr_out)
    }
}

#[cfg(test)]
mod tests {
    use super::super::super::decoder::SilkChannelDecoder;
    use super::*;
    use crate::range::RangeDecoder;

    /// Encodes `input` on `e`, captures the encoder's NSQ reconstruction, and
    /// decodes the coded frame on the persistent decoder `d` (so cross-frame
    /// state stays in sync). Returns (decoder xq, encoder xq, coded signal
    /// type, input-correlation).
    fn round_trip(
        e: &mut SilkChannelEncoder,
        d: &mut SilkChannelDecoder,
        input: &[i16],
    ) -> (Vec<i16>, Vec<i16>, i32, f64) {
        let frame_length = e.nb_subfr * 5 * e.fs_khz as usize;
        let ltp_mem = 20 * e.fs_khz as usize;

        let mut enc = RangeEncoder::new(512);
        let ind = e.encode_frame(&mut enc, input, CondCoding::Independently, None);
        let signal_type = i32::from(ind.signal_type);
        let xq_enc: Vec<i16> = e.nsq.xq[ltp_mem..ltp_mem + frame_length].to_vec();
        let bytes = enc.finalize().expect("frame fits");
        assert!(!bytes.is_empty());

        let mut dec = RangeDecoder::new(&bytes);
        let mut xq = vec![0i16; frame_length];
        d.decode_frame(&mut dec, &mut xq, true, false, CondCoding::Independently);

        let (mut sig, mut dot, mut e_out) = (0.0f64, 0.0f64, 0.0f64);
        for i in 0..frame_length {
            let a = f64::from(input[i]);
            let b = f64::from(xq[i]);
            sig += a * a;
            dot += a * b;
            e_out += b * b;
        }
        let corr = dot / (sig.sqrt() * e_out.sqrt()).max(1.0);
        (xq, xq_enc, signal_type, corr)
    }

    /// A noise frame is detected as unvoiced and decodes to the encoder's own
    /// NSQ reconstruction.
    #[test]
    fn unvoiced_frame_round_trips_through_the_decoder() {
        let (fs_khz, nb_subfr) = (16i32, 4usize);
        let frame_length = nb_subfr * 5 * fs_khz as usize;
        let mut seed = 0x9e37_u32;
        let input: Vec<i16> = (0..frame_length)
            .map(|_| {
                seed = seed.wrapping_mul(1_103_515_245).wrapping_add(12_345);
                ((seed >> 16) as i32 - 32768) as i16 / 12
            })
            .collect();

        let mut e = SilkChannelEncoder::new(fs_khz, nb_subfr);
        let mut d = SilkChannelDecoder::new(fs_khz, nb_subfr);
        let (xq, xq_enc, signal_type, _corr) = round_trip(&mut e, &mut d, &input);
        assert_eq!(signal_type, TYPE_UNVOICED, "noise should be detected unvoiced");
        assert_eq!(xq, xq_enc, "decoder disagrees with the encoder's NSQ reconstruction");
    }

    /// A strongly periodic frame is detected as voiced and round-trips through
    /// the decoder with the long-term predictor engaged.
    #[test]
    fn voiced_frame_round_trips_through_the_decoder() {
        let (fs_khz, nb_subfr) = (16i32, 4usize);
        let frame_length = nb_subfr * 5 * fs_khz as usize;

        // A continuous, strongly periodic tone (period 100 samples) spanning
        // two frames, so the second frame's pitch history is phase-continuous.
        let full: Vec<i16> = (0..2 * frame_length)
            .map(|i| {
                let mut s = 2500.0 * (core::f32::consts::TAU * i as f32 / 100.0).sin();
                s += 900.0 * (core::f32::consts::TAU * i as f32 / 50.0).sin();
                s += ((i as i32 * 1733 + 3) % 173 - 86) as f32 * 1.2;
                s.clamp(-30000.0, 30000.0) as i16
            })
            .collect();

        let mut e = SilkChannelEncoder::new(fs_khz, nb_subfr);
        let mut d = SilkChannelDecoder::new(fs_khz, nb_subfr);
        // First frame primes the pitch-analysis history; the second's history
        // is then the (phase-continuous) first frame. The decoder is shared so
        // its synthesis history matches the encoder's going into frame two.
        round_trip(&mut e, &mut d, &full[..frame_length]);
        let (xq, xq_enc, signal_type, corr) = round_trip(&mut e, &mut d, &full[frame_length..]);
        assert_eq!(signal_type, TYPE_VOICED, "periodic signal should be detected voiced");
        assert_eq!(
            xq, xq_enc,
            "decoder disagrees with the encoder's NSQ reconstruction (voiced)"
        );
        assert!(corr > 0.5, "voiced reconstruction correlation {corr:.3} too low");
    }
}