Skip to main content

lvqr_codec/
aac.rs

1//! AAC `AudioSpecificConfig` (ASC) parser.
2//!
3//! The existing `lvqr-ingest::remux::fmp4::esds` writer assumes a 2-byte
4//! ASC and hand-rolls MPEG-4 descriptor lengths with a single-byte prefix.
5//! That works for AAC-LC but breaks on HE-AAC, HE-AAC v2, xHE-AAC, and
6//! any config that uses the `sampling_frequency_index == 15` explicit
7//! frequency escape. This module is the hardened parser that future
8//! `lvqr-codec`-backed muxers will call to produce correct sample entries.
9//!
10//! Reference: ISO/IEC 14496-3 §1.6.2 (`AudioSpecificConfig`).
11//!
12//! Scope:
13//!
14//! * Object type decoding with the 5-bit base + 6-bit escape (object
15//!   types 32..=95).
16//! * Explicit sampling frequency when `samplingFrequencyIndex == 15`.
17//! * Channel configuration decoding.
18//! * Extension object type signaling for HE-AAC (SBR) and HE-AAC v2 (PS).
19//!
20//! Out of scope:
21//!
22//! * The full `GASpecificConfig` decoder. We only need to know enough to
23//!   build an fMP4 sample entry; the ASC bytes themselves are written
24//!   verbatim into the `esds` box. Scalable, CELP, HVXC, TwinVQ, and
25//!   structured-audio payloads are parsed only up to the object-type and
26//!   sample-rate fields.
27
28use crate::bit_reader::BitReader;
29use crate::error::CodecError;
30
/// Sampling frequencies in Hz, indexed by the 4-bit `samplingFrequencyIndex`
/// (ISO/IEC 14496-3 Table 1.16).
///
/// Only indices `0..=12` have an entry. Index 15 is deliberately absent: it
/// is the escape meaning "frequency follows explicitly as a 24-bit value".
/// Indices 13 and 14 have no entry either, so a checked lookup
/// (`AAC_SAMPLE_FREQUENCIES.get(index)`) returns `None` for all three.
pub const AAC_SAMPLE_FREQUENCIES: [u32; 13] = [
    96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000, 7350,
];
37
/// Decoded AAC `AudioSpecificConfig` header fields.
///
/// Only the fields needed to describe an `mp4a` sample entry are kept; the
/// raw configuration bytes are not stored here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AudioSpecificConfig {
    /// MPEG-4 audio object type (AOT). 2 = AAC-LC, 5 = HE-AAC (SBR),
    /// 29 = HE-AAC v2 (PS), 42 = xHE-AAC.
    pub object_type: u8,
    /// Sampling rate in Hz.
    pub sample_rate: u32,
    /// Channel configuration as the raw 4-bit field value
    /// (0 = PCE follows; 1..=7 = mapped layout).
    pub channel_config: u8,
    /// True if the config explicitly signals SBR (HE-AAC).
    pub sbr_present: bool,
    /// True if the config explicitly signals PS (HE-AAC v2).
    pub ps_present: bool,
}

impl AudioSpecificConfig {
    /// RFC 6381 codec string for the `mp4a` sample entry.
    ///
    /// The format is `mp4a.40.<object_type>` with the object type written in
    /// decimal, e.g. `mp4a.40.2` for AAC-LC or `mp4a.40.29` for HE-AAC v2.
    pub fn codec_string(&self) -> String {
        let aot = self.object_type;
        format!("mp4a.40.{aot}")
    }
}
61
62/// Parse an ASC from raw bytes. Returns a structured error on any
63/// malformed input; never panics.
64pub fn parse_asc(bytes: &[u8]) -> Result<AudioSpecificConfig, CodecError> {
65    if bytes.is_empty() {
66        return Err(CodecError::EndOfStream {
67            needed: 1,
68            remaining: 0,
69        });
70    }
71    let mut r = BitReader::new(bytes);
72    let object_type = read_object_type(&mut r)?;
73    let sample_rate = read_sample_rate(&mut r)?;
74    let channel_config = r.read_bits(4)? as u8;
75
76    // Detect SBR/PS extension. Two forms:
77    //
78    // 1. Explicit hierarchical signalling: object_type == 5 (SBR) or 29
79    //    (PS) means the config describes an extension over an AAC-LC
80    //    payload. In both cases the extensionSamplingFrequencyIndex
81    //    follows immediately and then the actual audioObjectType of the
82    //    downstream config is read.
83    // 2. Implicit (legacy): any object type may be followed by trailing
84    //    bits that signal SBR; we do not try to detect that here because
85    //    it requires scanning the GASpecificConfig payload.
86    let (sbr_present, ps_present, base_object_type) = match object_type {
87        5 | 29 => {
88            // extensionSamplingFrequencyIndex u(4) [ + explicit u(24) ]
89            let ext_sfi = r.read_bits(4)? as u8;
90            if ext_sfi == 15 {
91                // extensionSamplingFrequency u(24)
92                let _ = r.read_bits(24)?;
93            }
94            // The downstream audioObjectType (typically 2 = AAC-LC)
95            let downstream = read_object_type(&mut r)?;
96            let ps = object_type == 29;
97            (true, ps, downstream)
98        }
99        _ => (false, false, object_type),
100    };
101
102    Ok(AudioSpecificConfig {
103        object_type: if sbr_present { object_type } else { base_object_type },
104        sample_rate,
105        channel_config,
106        sbr_present,
107        ps_present,
108    })
109}
110
111/// Read a 5-bit audio object type with the 6-bit escape.
112///
113/// The wire encoding is:
114///
115/// ```text
116///   audioObjectType u(5)
117///   if audioObjectType == 31:
118///       audioObjectType = 32 + audioObjectTypeExt u(6)
119/// ```
120fn read_object_type(r: &mut BitReader<'_>) -> Result<u8, CodecError> {
121    let base = r.read_bits(5)? as u8;
122    if base == 31 {
123        let ext = r.read_bits(6)? as u8;
124        Ok(32 + ext)
125    } else {
126        Ok(base)
127    }
128}
129
130/// Read the sampling frequency: a 4-bit index into the standard table,
131/// or index 15 which means "explicit 24-bit frequency follows".
132fn read_sample_rate(r: &mut BitReader<'_>) -> Result<u32, CodecError> {
133    let sfi = r.read_bits(4)? as u8;
134    if sfi == 15 {
135        let freq = r.read_bits(24)?;
136        // Reject implausibly low explicit rates. The standard table
137        // bottoms out at 7350 Hz (`AAC_SAMPLE_FREQUENCIES[12]`), and
138        // no real-world AAC encoder produces anything below that;
139        // accepting rate=1 Hz just because the 24-bit field happened
140        // to decode that way lets attacker-shaped input through the
141        // codec parser and produces nonsense downstream (init
142        // segment timescale, LL-HLS partial duration reporting).
143        const MIN_PLAUSIBLE_HZ: u32 = 7350;
144        if freq < MIN_PLAUSIBLE_HZ {
145            return Err(CodecError::MalformedAsc("explicit sample rate below 7350 Hz"));
146        }
147        Ok(freq)
148    } else {
149        AAC_SAMPLE_FREQUENCIES
150            .get(sfi as usize)
151            .copied()
152            .ok_or(CodecError::MalformedAsc("sampling_frequency_index out of range"))
153    }
154}
155
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_aac_lc_stereo_48khz() {
        // AOT=2 (5 bits = 00010), sfi=3 (4 bits = 0011), channel=2 (4 bits = 0010), pad 3 bits
        // concatenated bitstream: 00010 0011 0010 000
        // as bytes: 00010001 10010000 = 0x11 0x90
        let asc = parse_asc(&[0x11, 0x90]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 48000);
        assert_eq!(asc.channel_config, 2);
        assert!(!asc.sbr_present);
        assert!(!asc.ps_present);
        assert_eq!(asc.codec_string(), "mp4a.40.2");
    }

    #[test]
    fn parse_legacy_lvqr_aac_lc_stereo_44k() {
        // The 2-byte ASC that lvqr-ingest's existing esds writer hard-codes
        // (see HANDOFF.md session 3 notes): [0x12, 0x10]. Decode:
        //   0001 0010 0001 0000
        //   AOT  = 00010 = 2     (AAC-LC)
        //   sfi  = 0100  = 4     -> 44100 Hz
        //   chan = 0010  = 2     (stereo)
        //   pad  = 000
        // This test pins the interpretation of that magic pair so future
        // refactors of lvqr-ingest cannot silently drift from it.
        let asc = parse_asc(&[0x12, 0x10]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 44100);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_he_aac_signals_sbr() {
        // Explicit hierarchical signalling, fields in the order `parse_asc`
        // reads them:
        //   AOT(5)      = 00101 = 5   (SBR)
        //   sfi(4)      = 0110  = 6   -> 24000 Hz core rate
        //   channel(4)  = 0010  = 2
        //   ext sfi(4)  = 0011  = 3   -> 48000 Hz extension rate
        //   core AOT(5) = 00010 = 2   (AAC-LC)
        //   pad(2)      = 00
        // Regrouped into 8-bit bytes:
        //   0010 1011 = 0x2B
        //   0001 0001 = 0x11
        //   1000 1000 = 0x88
        let asc = parse_asc(&[0x2B, 0x11, 0x88]).unwrap();
        assert!(asc.sbr_present);
        assert!(!asc.ps_present);
        assert_eq!(asc.object_type, 5);
        assert_eq!(asc.sample_rate, 24000);
        assert_eq!(asc.channel_config, 2);
        assert_eq!(asc.codec_string(), "mp4a.40.5");
    }

    #[test]
    fn parse_he_aac_v2_signals_sbr_and_ps() {
        // Same layout as the HE-AAC case but with AOT=29 (PS) and a mono
        // core:
        //   AOT(5)      = 11101 = 29  (PS)
        //   sfi(4)      = 0110  = 6   -> 24000 Hz core rate
        //   channel(4)  = 0001  = 1
        //   ext sfi(4)  = 0011  = 3   -> 48000 Hz extension rate
        //   core AOT(5) = 00010 = 2   (AAC-LC)
        //   pad(2)      = 00
        // Regrouped into 8-bit bytes:
        //   1110 1011 = 0xEB
        //   0000 1001 = 0x09
        //   1000 1000 = 0x88
        let asc = parse_asc(&[0xEB, 0x09, 0x88]).unwrap();
        assert!(asc.sbr_present);
        assert!(asc.ps_present);
        assert_eq!(asc.object_type, 29);
        assert_eq!(asc.sample_rate, 24000);
        assert_eq!(asc.channel_config, 1);
        assert_eq!(asc.codec_string(), "mp4a.40.29");
    }

    #[test]
    fn parse_rejects_empty_bytes() {
        assert!(matches!(parse_asc(&[]), Err(CodecError::EndOfStream { .. })));
    }

    #[test]
    fn parse_escape_object_type() {
        // AOT = 42 (xHE-AAC USAC) -> base = 31, ext = 10
        // bits: 11111 001010 (object type) 0011 (sfi=48k) 0010 (channel=2) pad
        // = 11111 001010 0011 0010 0
        // = 1111 1001 0100 0110 0100 (need 19 bits, pad to 24)
        // = 1111 1001 0100 0110 0100 0000 = 0xF9 0x46 0x40
        let asc = parse_asc(&[0xF9, 0x46, 0x40]).unwrap();
        assert_eq!(asc.object_type, 42);
        assert_eq!(asc.sample_rate, 48000);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_explicit_frequency() {
        // AOT=2, sfi=15 (escape), explicit freq=96000 (0x017700),
        // channel=2. Layout:
        //   AOT(5)     = 00010
        //   sfi(4)     = 1111
        //   freq(24)   = 000000010111011100000000  (0x017700)
        //   channel(4) = 0010
        //   pad(3)     = 000
        // Full 40-bit stream concatenated:
        //   00010 1111 00000001 01110111 00000000 0010 000
        //
        // Regrouped into 8-bit bytes:
        //   0001 0111 = 0x17
        //   1000 0000 = 0x80
        //   1011 1011 = 0xBB
        //   1000 0000 = 0x80
        //   0001 0000 = 0x10
        let asc = parse_asc(&[0x17, 0x80, 0xBB, 0x80, 0x10]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 96000);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_rejects_sampling_frequency_index_without_table_entry() {
        // AOT=2 (00010), sfi=13 (1101), channel=2 (0010), pad 000
        //   0001 0110 = 0x16
        //   1001 0000 = 0x90
        // Index 13 is neither in the 13-entry table nor the explicit
        // escape, so the lookup must fail.
        match parse_asc(&[0x16, 0x90]) {
            Err(CodecError::MalformedAsc(msg)) => {
                assert!(msg.contains("out of range"), "expected the index range error, got: {msg}");
            }
            other => panic!("expected MalformedAsc error, got {other:?}"),
        }
    }

    #[test]
    fn parse_explicit_sample_rate_floor_is_inclusive() {
        // AOT=2, sfi=15, explicit freq=7350 (0x001CB6), channel=2:
        //   00010 1111 000000000001110010110110 0010 000
        //   = 0x17 0x80 0x0E 0x5B 0x10
        let asc = parse_asc(&[0x17, 0x80, 0x0E, 0x5B, 0x10]).unwrap();
        assert_eq!(asc.sample_rate, 7350);
        assert_eq!(asc.channel_config, 2);

        // Same layout with freq=7349 (0x001CB5), one below the floor:
        //   00010 1111 000000000001110010110101 0010 000
        //   = 0x17 0x80 0x0E 0x5A 0x90
        assert!(matches!(
            parse_asc(&[0x17, 0x80, 0x0E, 0x5A, 0x90]),
            Err(CodecError::MalformedAsc(_))
        ));
    }

    #[test]
    fn parse_rejects_explicit_sample_rate_below_7350_hz() {
        // Regression pin for the session-27 fix to `read_sample_rate`.
        // Before the fix, sfi=15 followed by a 24-bit explicit frequency
        // of 1 decoded to `sample_rate = 1` and passed through.
        // `proptest_aac::successful_parse_has_plausible_sample_rate`
        // discovered the class via the seed `[87,128,0,0,128]`, whose
        // bits read MSB-first are AOT=10, sfi=15 and an explicit rate of
        // 1 Hz. The second input is the same shape with AOT=2 and
        // channel=2. Both must return `CodecError::MalformedAsc` so the
        // nonsense rate never reaches downstream timescale math.
        let proptest_seed: &[u8] = &[87, 128, 0, 0, 128];
        let aac_lc_shape: &[u8] = &[0x17, 0x80, 0x00, 0x00, 0x90];
        for input in [proptest_seed, aac_lc_shape] {
            match parse_asc(input) {
                Err(CodecError::MalformedAsc(msg)) => {
                    assert!(msg.contains("7350"), "expected the 7350 Hz floor error, got: {msg}");
                }
                other => panic!("expected MalformedAsc error, got {other:?}"),
            }
        }
    }
}