oxideav-opus 0.0.4

Opus audio codec for oxideav — SILK + CELT decode (mono/stereo), CELT-only full-band encode, SILK NB mono 20 ms encode
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! SILK encoder — NB mono 20 ms.
//!
//! This is the companion to [`crate::silk::SilkDecoder`]. Scope:
//!
//! * **Narrowband** (8 kHz internal rate) mono 20 ms frames only.
//!   Stereo, MB, WB, and non-20-ms frame sizes are deferred.
//! * Analysis-by-synthesis around the MVP carrier format documented
//!   in [`super::excitation`]: LPC analysis → residual → magnitude +
//!   sign per sample.
//! * The LPC filter used for analysis is the EXACT same `lpc` array
//!   the decoder will reconstruct from the NLSF stage-1 index, so
//!   encoder and decoder agree on the prediction and the residual
//!   round-trips without LPC mismatch.
//!
//! # Bitstream order (same as decoder's [`super::decode_frame_body`])
//!
//! 1. Frame type (active ICDF, symbol 2 → always `signal_type = 1`, unvoiced).
//! 2. 4 sub-frame gains (MSB + LSB + 3 deltas).
//! 3. NLSF stage-1 index (a fixed index that produces a stable LPC).
//! 4. 10 NLSF stage-2 residuals (all zero magnitude → still consumes
//!    the correct number of ICDF reads on decode).
//! 5. NLSF interpolation weight (always symbol 3 = "no interpolation" in the
//!    4-symbol ICDF `{192, 128, 64, 0}` used by this MVP).
//! 6. LCG seed (always 0).
//! 7. Excitation: rate-level + 10 pulse-count ICDFs + per-sample
//!    magnitude + sign via the carrier layout defined in
//!    [`super::excitation::decode_excitation`].
//!
//! # Out of scope (tracked follow-ups)
//!
//! * Voiced / LTP path — the LTP loop-back would require the encoder
//!   to run analysis-by-synthesis over the pitch filter, doable but
//!   not needed to hit the 20 dB SNR bar on a 20 ms frame.
//! * Stereo — `n_internal_channels = 1` only.
//! * Bit-exact shell-pulse coding per RFC §4.2.7.8 — the MVP carrier
//!   is byte-compatible with the RFC layout at the header level but
//!   uses a pass-through nibble-based magnitude coding in place of
//!   the RFC's pulse/LSB/sign split.

use oxideav_celt::range_encoder::RangeEncoder;
use oxideav_core::Result;

use crate::silk::excitation::MAG_NIBBLE_ICDF;
use crate::silk::lsf;
use crate::silk::tables;
use crate::toc::OpusBandwidth;

/// Fixed NLSF stage-1 index used by the encoder. It is both fed to
/// [`synthesize_nlsf_like_decoder`] to build the analysis LPC and written
/// to the bitstream as the stage-1 symbol. The original note describes it
/// as a moderately-tilted cosine template in the decoder's
/// `synthesize_nlsf`; the actual value is incidental — the encoder and
/// decoder only need to agree.
const NLSF_STAGE1_IDX: usize = 0;

/// Gain index used for every sub-frame (Q16 log-gain, see
/// [`super::gain_index_to_q16`]). The 3 MSBs and 3 LSBs of this value are
/// what the encoder writes as the gain MSB / LSB symbols.
/// Per the original note, this smallest value yields
/// `gain_q16 ≈ 1.09 × 65536` — big enough to keep the residual magnitudes
/// well within the 9-bit carrier (TODO confirm against the gain table).
const GAIN_INDEX_UNVOICED: i32 = 0;

/// Upper bound applied to the quantised residual magnitude before it is
/// written to the carrier (the magnitude is clamped to
/// `[0, CARRIER_FULL_SCALE]` in `encode_frame_body`). The carrier codes the
/// magnitude as two 4-bit nibbles, so this conservative ceiling stays well
/// below the 255 those nibbles could represent. Per the original note, the
/// decoder's output already clamps to [-1, 1] and the extra headroom helps
/// cross-frame continuity when the LPC state is carried over.
const CARRIER_FULL_SCALE: f32 = 120.0;

/// A narrowband 20 ms SILK frame encoder.
///
/// Stateful — carries the decoder's expected LPC history across
/// frames so the residual computed by the encoder matches what the
/// decoder will re-synthesize (analysis-by-synthesis).
pub struct SilkFrameEncoder {
    /// Audio bandwidth; handed to `lsf::nlsf_to_lpc` when the analysis
    /// LPC coefficients are derived.
    bandwidth: OpusBandwidth,
    /// Number of LPC taps used for prediction; also the number of
    /// history samples kept in `prev_synth`.
    lpc_order: usize,
    /// Samples per sub-frame at the internal rate.
    subframe_len: usize,
    /// Sub-frames per frame; `subframe_len * n_subframes` is the frame
    /// length, and `n_subframes - 1` gain deltas are written per frame.
    n_subframes: usize,
    /// Last `lpc_order` samples of the previous frame's *synthesized*
    /// output. Seeded with zeros.
    prev_synth: Vec<f32>,
}

impl SilkFrameEncoder {
    /// Build an NB (8 kHz) mono 20 ms encoder.
    ///
    /// Uses LPC order 10 and four 40-sample sub-frames, and seeds the
    /// synthesis history with `lpc_order` zeros.
    pub fn new_nb_20ms() -> Self {
        let lpc_order = 10;
        Self {
            bandwidth: OpusBandwidth::Narrowband,
            lpc_order,
            subframe_len: 40, // 5 ms @ 8 kHz
            n_subframes: 4,
            prev_synth: vec![0.0; lpc_order],
        }
    }

    /// Frame length in internal-rate samples (160 for NB 20 ms).
    pub fn frame_len(&self) -> usize {
        self.subframe_len * self.n_subframes
    }

    /// Encode one 20 ms SILK-only body (the bit-stream after the
    /// shared VAD + LBRR header).
    ///
    /// * `pcm_internal` — `frame_len()` samples at the internal rate
    ///   (8 kHz for NB). Values expected to be finite and roughly in
    ///   `[-1, 1]`.
    /// * `enc` — in-flight range encoder. The caller should have
    ///   already written the packet-level VAD / LBRR header.
    ///
    /// On return, the encoder's synthesis history holds the last
    /// `lpc_order` reconstructed samples of this frame, ready for the
    /// next call.
    ///
    /// # Errors
    ///
    /// The current implementation has no fallible step and always
    /// returns `Ok(())`.
    ///
    /// # Panics
    ///
    /// Panics if `pcm_internal` holds fewer than `frame_len()` samples
    /// (checked before anything is written to `enc`). In debug builds a
    /// length different from `frame_len()` trips a `debug_assert`; in
    /// release builds extra trailing samples are ignored.
    pub fn encode_frame_body(
        &mut self,
        pcm_internal: &[f32],
        enc: &mut RangeEncoder,
    ) -> Result<()> {
        debug_assert_eq!(pcm_internal.len(), self.frame_len());
        let order = self.lpc_order;
        let frame_len = self.frame_len();
        // Slice up front: a too-short buffer then fails here, before any
        // symbol has been pushed into the caller's range encoder.
        let pcm = &pcm_internal[..frame_len];

        // §4.2.7.3 frame type — unvoiced/active (sym=2) so the decoder
        // takes the UNVOICED gain MSB + skips the LTP path. We always
        // emit VAD_flag = 1 on this body (written by the caller on the
        // outer header). This fixes signal_type = 1 (unvoiced) and
        // quant_offset_type = 0 for the rest of the frame.
        enc.encode_icdf(2, &tables::FRAME_TYPE_ACTIVE_ICDF, 8);

        // §4.2.7.5 NLSF — build the same NLSF the decoder will from
        // `NLSF_STAGE1_IDX` and zero stage-2 residuals.
        let residuals = vec![0i32; order];
        let nlsf_q15 = synthesize_nlsf_like_decoder(NLSF_STAGE1_IDX, false, order, &residuals);
        let nlsf_q15 = lsf::stabilize(&nlsf_q15, order);
        let lpc = lsf::nlsf_to_lpc(&nlsf_q15, self.bandwidth);

        // §4.2.7.4 sub-frame gains — pick a constant gain index that
        // gives enough headroom for the residual. Actual gain_q16 is
        // retrieved via the decoder's `gain_index_to_q16` table.
        let gain_index: i32 = GAIN_INDEX_UNVOICED;
        let gain_q16 = super::gain_index_to_q16(gain_index);
        // `.max(1)` keeps the gain strictly positive so `scale` below
        // never divides by zero.
        let g = gain_q16.max(1) as f32 / 65536.0;
        // Excitation value the decoder sees:
        //   excitation[n] = signed_mag / 128
        //   e = excitation[n] * g
        //   out[n] = e + LPC_pred(out[..n])     // decoder synthesis
        // We want out[n] == pcm[n]   (within quantization), so
        //   e == residual[n]  ⇒  signed_mag = residual / g * 128
        let scale = 128.0 / g;

        // Closed-loop analysis-by-synthesis: at each sample n we use
        // the decoder's reconstructed past (`out[0..n]` + `prev_synth`)
        // to form the LPC prediction, so the residual we emit exactly
        // compensates for the quantisation drift already in `out`.
        // The history is only read here, so a borrow is enough.
        let synth_hist: &[f32] = &self.prev_synth; // length = order
        let mut out = vec![0f32; frame_len];
        let mut signed_mags = vec![0i32; frame_len];
        for n in 0..frame_len {
            let mut pred = 0f32;
            for k in 1..=order {
                // Negative indices reach back into the previous frame.
                let idx = n as i32 - k as i32;
                let past = if idx >= 0 {
                    out[idx as usize]
                } else {
                    synth_hist[(synth_hist.len() as i32 + idx) as usize]
                };
                pred += lpc[k - 1] * past;
            }
            // Desired decoder e = pcm - pred. Quantise to a sign plus a
            // magnitude capped at CARRIER_FULL_SCALE.
            let e_desired = pcm[n] - pred;
            let signed_mag_f = (e_desired * scale).round();
            let mag_i = signed_mag_f.abs().clamp(0.0, CARRIER_FULL_SCALE) as i32;
            let signed = if signed_mag_f < 0.0 { -mag_i } else { mag_i };
            signed_mags[n] = signed;
            // Reconstruct decoder's view of this sample and use it as
            // history for subsequent LPC predictions.
            let e_quant = (signed as f32 / 128.0) * g;
            out[n] = (e_quant + pred).clamp(-1.0, 1.0);
        }

        // Emit the gain-index bitstream: MSB(3) + LSB(3) + 3 deltas.
        // The frame is always unvoiced, so the UNVOICED MSB table applies.
        let msb = ((gain_index >> 3) & 0x7) as usize;
        let lsb = (gain_index & 0x7) as usize;
        enc.encode_icdf(msb, &tables::GAIN_MSB_UNVOICED_ICDF, 8);
        enc.encode_icdf(lsb, &tables::GAIN_LSB_ICDF, 8);
        // 3 deltas, each = "no change" (sym=4) for uniform gain across
        // the 4 sub-frames.
        for _ in 1..self.n_subframes {
            enc.encode_icdf(4, &tables::GAIN_DELTA_ICDF, 8);
        }

        // NLSF bitstream (same order as decoder reads):
        //   stage-1 (32-sym ICDF) + 10 residuals (11-sym each + sign) +
        //   interp coef (4-sym).
        let stage1_icdf: &[u8] = &tables::NLSF_NB_STAGE1_UNVOICED_ICDF;
        enc.encode_icdf(NLSF_STAGE1_IDX, stage1_icdf, 8);
        let uniform_11 = &tables::NLSF_RESIDUAL_UNIFORM_11_ICDF;
        for &r in &residuals {
            // Residual r maps to symbol r + 4; all residuals are zero
            // here, so every symbol is 4 and no sign bit follows.
            // NOTE(review): non-zero residuals would also need the sign
            // handling the decoder expects — not implemented.
            let sym = (r + 4).clamp(0, 10) as usize;
            enc.encode_icdf(sym, uniform_11, 8);
        }
        // Interp coef — "no interp" = 3 (ICDF is {192, 128, 64, 0}).
        enc.encode_icdf(3, &[192, 128, 64, 0], 8);

        // §4.2.7.6 LTP — unvoiced, so decoder skips all LTP bits.

        // §4.2.7.7 LCG seed — always 0 (ftb=8, see decoder note).
        enc.encode_icdf(0, &tables::LCG_SEED_ICDF, 8);

        // §4.2.7.8 Excitation (MVP carrier).
        //  1. Rate-level.
        let rate_icdf: &[u8] = &tables::RATE_LEVEL_INACTIVE_ICDF;
        enc.encode_icdf(0, rate_icdf, 8);
        //  2. Pulse counts per shell block (16 samples each) — pick an
        //     arbitrary valid symbol (0) for each.
        let n_shells = frame_len.div_ceil(16);
        let pulse_icdf = &tables::PULSE_COUNT_ICDF[0];
        for _ in 0..n_shells {
            enc.encode_icdf(0, pulse_icdf, 8);
        }
        //  3. Per-sample magnitude nibble+nibble + sign. `signed_mags`
        //     was built sample-by-sample above, keeping the decoder's
        //     reconstruction in lock-step with the encoder's LPC
        //     prediction history (analysis-by-synthesis).
        for &signed in &signed_mags {
            let mag_i = signed.unsigned_abs() as i32;
            let neg = signed < 0;
            let hi = ((mag_i >> 4) & 0xf) as usize;
            let lo = (mag_i & 0xf) as usize;
            enc.encode_icdf(hi, &MAG_NIBBLE_ICDF, 8);
            enc.encode_icdf(lo, &MAG_NIBBLE_ICDF, 8);
            // A sign bit is only written for non-zero magnitudes.
            if mag_i != 0 {
                enc.encode_bit_logp(neg, 1);
            }
        }

        // Update `prev_synth` with the last `order` samples of what
        // the decoder will actually reconstruct — kept in sync by the
        // closed-loop quantisation above.
        let start = out.len().saturating_sub(order);
        self.prev_synth.clear();
        self.prev_synth.extend_from_slice(&out[start..]);

        Ok(())
    }
}

/// Encoder-side mirror of the decoder's `synthesize_nlsf` helper, kept
/// here because the decoder's copy is private to `silk/lsf.rs`. The two
/// must stay in step so the encoder analyses with the same NLSF template
/// the decoder reconstructs.
///
/// For each coefficient `k` in `0..order` the template value is
/// `((k + 1) / (order + 1)) ^ (1 + tilt)` scaled to Q15, where
/// `tilt = stage1 / 32 * 0.25`, plus `0.15` when `voiced` is false.
/// The matching entry of `residuals`, limited to `[-7, 7]`, is added in
/// steps of 128 and the result is clamped to `[1, 32767]`.
///
/// Only the first `order` entries of `residuals` are read.
///
/// # Panics
///
/// Panics if `residuals` has fewer than `order` entries.
fn synthesize_nlsf_like_decoder(
    stage1: usize,
    voiced: bool,
    order: usize,
    residuals: &[i32],
) -> Vec<i16> {
    let tilt = (stage1 as f32 / 32.0) * 0.25 + if voiced { 0.0 } else { 0.15 };
    let exponent = 1.0 + tilt;
    let denom = order as f32 + 1.0;
    residuals[..order]
        .iter()
        .enumerate()
        .map(|(k, &res)| {
            let warped = ((k as f32 + 1.0) / denom).powf(exponent);
            let q15 = (warped * 32768.0) as i32 + res.clamp(-7, 7) * 128;
            q15.clamp(1, 32767) as i16
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guards the encoder's NLSF mirror against drift from the documented
    /// formula by comparing it to an independent hand expansion.
    #[test]
    fn nlsf_template_mirrors_decoder() {
        // With stage1 = 0, voiced = false, all residuals 0:
        //   tilt = 0.15
        //   nlsf[k] = clamp(((k+1)/(order+1))^1.15 * 32768, 1, 32767)
        let order = 10usize;
        let nlsf = synthesize_nlsf_like_decoder(0, false, order, &[0; 10]);
        assert_eq!(nlsf.len(), order);

        // Hand expansion in f64. The helper computes in f32 and truncates,
        // so allow one LSB of slack for rounding differences.
        for (k, &got) in nlsf.iter().enumerate() {
            let base = (k as f64 + 1.0) / (order as f64 + 1.0);
            let expected = ((base.powf(1.15) * 32768.0) as i32).clamp(1, 32767);
            assert!(
                (i32::from(got) - expected).abs() <= 1,
                "nlsf[{k}] = {got}, hand expansion gives {expected}"
            );
        }

        // The raw template is already strictly increasing.
        for w in nlsf.windows(2) {
            assert!(
                w[1] > w[0],
                "raw NLSF template should be increasing ({} → {})",
                w[0],
                w[1]
            );
        }

        // Monotonic after stabilisation.
        let stable = crate::silk::lsf::stabilize(&nlsf, 10);
        for w in stable.windows(2) {
            assert!(
                w[1] >= w[0],
                "stabilised NLSF should be non-decreasing ({} → {})",
                w[0],
                w[1]
            );
        }
    }

    /// Encode a zero frame and decode it; output should be zero.
    #[test]
    fn encode_decode_zero_frame_matches() {
        use oxideav_celt::range_decoder::RangeDecoder;
        let mut enc = SilkFrameEncoder::new_nb_20ms();
        let pcm = vec![0.0f32; 160];
        let mut re = RangeEncoder::new(512);
        re.encode_bit_logp(true, 1); // VAD
        re.encode_bit_logp(false, 1); // LBRR
        enc.encode_frame_body(&pcm, &mut re).unwrap();
        let buf = re.done().unwrap();

        let mut rc = RangeDecoder::new(&buf);
        // Consume the VAD + LBRR header before the frame body.
        let _vad = rc.decode_bit_logp(1);
        let _lbrr = rc.decode_bit_logp(1);
        let mut s = crate::silk::SilkChannelState::new();
        let decoded = decode_one_nb_mono_frame(&mut rc, &mut s).unwrap();

        let peak = decoded.iter().copied().fold(0f32, |a, b| a.max(b.abs()));
        println!("zero-frame roundtrip peak = {peak:.6}");
        assert!(
            peak < 0.001,
            "zero-frame decode should be ~0, got peak {peak}"
        );
    }

    /// Encode a zero frame and inspect the encoded buffer only. Despite
    /// the name, nothing is decoded here: the test checks that encoding
    /// succeeds and that the buffer has the expected size.
    #[test]
    fn encode_decode_zero_frame_produces_finite_output() {
        let mut enc = SilkFrameEncoder::new_nb_20ms();
        let pcm = vec![0.0f32; 160];
        let mut re = RangeEncoder::new(512);
        enc.encode_frame_body(&pcm, &mut re).unwrap();
        let buf = re.done().unwrap();
        assert!(!buf.is_empty());
        // Range encoder returns its full backing buffer; ensure no
        // overflow flag was set.
        assert_eq!(buf.len(), 512);
    }

    /// End-to-end round-trip of one frame at the internal (8 kHz) rate
    /// WITHOUT the 48 kHz upsampler — pins the encoder-to-decoder
    /// agreement on LPC + residual quantisation.
    #[test]
    fn encode_decode_one_frame_internal_rate_snr() {
        use oxideav_celt::range_decoder::RangeDecoder;

        let mut enc = SilkFrameEncoder::new_nb_20ms();
        // 300 Hz sine at amplitude 0.3, one 20 ms frame at 8 kHz.
        let freq = 300.0f32;
        let pcm: Vec<f32> = (0..160)
            .map(|i| (2.0 * std::f32::consts::PI * freq * i as f32 / 8000.0).sin() * 0.3)
            .collect();

        // Encode.
        let mut re = RangeEncoder::new(512);
        re.encode_bit_logp(true, 1); // VAD
        re.encode_bit_logp(false, 1); // LBRR
        enc.encode_frame_body(&pcm, &mut re).unwrap();
        let buf = re.done().unwrap();

        // Decode at the internal rate via the thin helper below, so we
        // don't need to spin up the full 48 kHz upsample path.
        let mut dec_state = crate::silk::SilkChannelState::new();
        let mut rc = RangeDecoder::new(&buf);
        // VAD + LBRR.
        let _vad = rc.decode_bit_logp(1);
        let _lbrr = rc.decode_bit_logp(1);
        let decoded = decode_one_nb_mono_frame(&mut rc, &mut dec_state).expect("decode");
        assert_eq!(decoded.len(), 160);

        // Compute SNR. No lag needed: encoder + decoder operate at the
        // same 8 kHz rate, no upsampler in the path.
        let sig: f64 = pcm.iter().map(|v| (*v as f64) * (*v as f64)).sum();
        let err: f64 = pcm
            .iter()
            .zip(decoded.iter())
            .map(|(a, b)| {
                let e = (*a - *b) as f64;
                e * e
            })
            .sum();
        // Floor the error energy so a perfect round-trip doesn't divide
        // by zero.
        let snr = 10.0 * (sig / err.max(1e-30)).log10();
        println!("internal-rate SNR: {snr:.2} dB");
        assert!(snr > 25.0, "internal-rate SNR {snr:.2} dB below 25 dB bar");
    }

    /// Thin wrapper that pulls in the decoder's NB mono 20 ms SILK
    /// path without going through `SilkDecoder::decode_frame_to_48k`.
    /// The header (VAD + LBRR) must already be consumed by the
    /// caller.
    fn decode_one_nb_mono_frame(
        rc: &mut oxideav_celt::range_decoder::RangeDecoder<'_>,
        state: &mut crate::silk::SilkChannelState,
    ) -> oxideav_core::Result<Vec<f32>> {
        crate::silk::decode_frame_body_pub(
            rc,
            true, // VAD active
            OpusBandwidth::Narrowband,
            10, // LPC order
            40, // sub-frame length in samples
            4,  // sub-frames per frame
            state,
        )
    }
}