lvqr_codec/aac.rs
1//! AAC `AudioSpecificConfig` (ASC) parser.
2//!
3//! The existing `lvqr-ingest::remux::fmp4::esds` writer assumes a 2-byte
4//! ASC and hand-rolls MPEG-4 descriptor lengths with a single-byte prefix.
5//! That works for AAC-LC but breaks on HE-AAC, HE-AAC v2, xHE-AAC, and
6//! any config that uses the `sampling_frequency_index == 15` explicit
7//! frequency escape. This module is the hardened parser that future
8//! `lvqr-codec`-backed muxers will call to produce correct sample entries.
9//!
//! Reference: ISO/IEC 14496-3 §1.6.2 (`AudioSpecificConfig`).
11//!
12//! Scope:
13//!
//! * Object type decoding with the 5-bit base + 6-bit escape (escaped
//!   object types are `32 + ext`, i.e. 32..=95).
16//! * Explicit sampling frequency when `samplingFrequencyIndex == 15`.
17//! * Channel configuration decoding.
18//! * Extension object type signaling for HE-AAC (SBR) and HE-AAC v2 (PS).
19//!
20//! Out of scope:
21//!
22//! * The full `GASpecificConfig` decoder. We only need to know enough to
23//! build an fMP4 sample entry; the ASC bytes themselves are written
24//! verbatim into the `esds` box. Scalable, CELP, HVXC, TwinVQ, and
25//! structured-audio payloads are parsed only up to the object-type and
26//! sample-rate fields.
27
28use crate::bit_reader::BitReader;
29use crate::error::CodecError;
30
/// Standard AAC sampling frequencies in Hz, looked up by the 4-bit
/// `samplingFrequencyIndex` (ISO/IEC 14496-3 Table 1.16).
///
/// Only indices 0..=12 have an entry. Index 15 is deliberately not in
/// the table: it is the sentinel meaning "frequency follows explicitly
/// as a 24-bit value".
pub const AAC_SAMPLE_FREQUENCIES: [u32; 13] = [
    96_000, // index 0
    88_200, // index 1
    64_000, // index 2
    48_000, // index 3
    44_100, // index 4
    32_000, // index 5
    24_000, // index 6
    22_050, // index 7
    16_000, // index 8
    12_000, // index 9
    11_025, // index 10
    8_000,  // index 11
    7_350,  // index 12
];
37
/// The fields of an AAC `AudioSpecificConfig` that this crate decodes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AudioSpecificConfig {
    /// MPEG-4 audio object type (AOT): 2 = AAC-LC, 5 = HE-AAC (SBR),
    /// 29 = HE-AAC v2 (PS), 42 = xHE-AAC.
    pub object_type: u8,
    /// Sampling rate, expressed in Hz.
    pub sample_rate: u32,
    /// Channel configuration: 0 means a PCE follows, 1..=7 select a
    /// mapped layout.
    pub channel_config: u8,
    /// Set when the config explicitly signals SBR (HE-AAC).
    pub sbr_present: bool,
    /// Set when the config explicitly signals PS (HE-AAC v2).
    pub ps_present: bool,
}

impl AudioSpecificConfig {
    /// Builds the RFC 6381 codec string for the `mp4a` sample entry.
    ///
    /// The result has the shape `mp4a.40.<object_type>`, with the object
    /// type rendered in decimal.
    pub fn codec_string(&self) -> String {
        let aot = self.object_type;
        format!("mp4a.40.{aot}")
    }
}
61
62/// Parse an ASC from raw bytes. Returns a structured error on any
63/// malformed input; never panics.
64pub fn parse_asc(bytes: &[u8]) -> Result<AudioSpecificConfig, CodecError> {
65 if bytes.is_empty() {
66 return Err(CodecError::EndOfStream {
67 needed: 1,
68 remaining: 0,
69 });
70 }
71 let mut r = BitReader::new(bytes);
72 let object_type = read_object_type(&mut r)?;
73 let sample_rate = read_sample_rate(&mut r)?;
74 let channel_config = r.read_bits(4)? as u8;
75
76 // Detect SBR/PS extension. Two forms:
77 //
78 // 1. Explicit hierarchical signalling: object_type == 5 (SBR) or 29
79 // (PS) means the config describes an extension over an AAC-LC
80 // payload. In both cases the extensionSamplingFrequencyIndex
81 // follows immediately and then the actual audioObjectType of the
82 // downstream config is read.
83 // 2. Implicit (legacy): any object type may be followed by trailing
84 // bits that signal SBR; we do not try to detect that here because
85 // it requires scanning the GASpecificConfig payload.
86 let (sbr_present, ps_present, base_object_type) = match object_type {
87 5 | 29 => {
88 // extensionSamplingFrequencyIndex u(4) [ + explicit u(24) ]
89 let ext_sfi = r.read_bits(4)? as u8;
90 if ext_sfi == 15 {
91 // extensionSamplingFrequency u(24)
92 let _ = r.read_bits(24)?;
93 }
94 // The downstream audioObjectType (typically 2 = AAC-LC)
95 let downstream = read_object_type(&mut r)?;
96 let ps = object_type == 29;
97 (true, ps, downstream)
98 }
99 _ => (false, false, object_type),
100 };
101
102 Ok(AudioSpecificConfig {
103 object_type: if sbr_present { object_type } else { base_object_type },
104 sample_rate,
105 channel_config,
106 sbr_present,
107 ps_present,
108 })
109}
110
111/// Read a 5-bit audio object type with the 6-bit escape.
112///
113/// The wire encoding is:
114///
115/// ```text
116/// audioObjectType u(5)
117/// if audioObjectType == 31:
118/// audioObjectType = 32 + audioObjectTypeExt u(6)
119/// ```
120fn read_object_type(r: &mut BitReader<'_>) -> Result<u8, CodecError> {
121 let base = r.read_bits(5)? as u8;
122 if base == 31 {
123 let ext = r.read_bits(6)? as u8;
124 Ok(32 + ext)
125 } else {
126 Ok(base)
127 }
128}
129
130/// Read the sampling frequency: a 4-bit index into the standard table,
131/// or index 15 which means "explicit 24-bit frequency follows".
132fn read_sample_rate(r: &mut BitReader<'_>) -> Result<u32, CodecError> {
133 let sfi = r.read_bits(4)? as u8;
134 if sfi == 15 {
135 let freq = r.read_bits(24)?;
136 // Reject implausibly low explicit rates. The standard table
137 // bottoms out at 7350 Hz (`AAC_SAMPLE_FREQUENCIES[12]`), and
138 // no real-world AAC encoder produces anything below that;
139 // accepting rate=1 Hz just because the 24-bit field happened
140 // to decode that way lets attacker-shaped input through the
141 // codec parser and produces nonsense downstream (init
142 // segment timescale, LL-HLS partial duration reporting).
143 const MIN_PLAUSIBLE_HZ: u32 = 7350;
144 if freq < MIN_PLAUSIBLE_HZ {
145 return Err(CodecError::MalformedAsc("explicit sample rate below 7350 Hz"));
146 }
147 Ok(freq)
148 } else {
149 AAC_SAMPLE_FREQUENCIES
150 .get(sfi as usize)
151 .copied()
152 .ok_or(CodecError::MalformedAsc("sampling_frequency_index out of range"))
153 }
154}
155
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_aac_lc_stereo_48khz() {
        // AOT=2 (5 bits = 00010), sfi=3 (4 bits = 0011), channel=2 (4 bits = 0010), pad 3 bits
        // concatenated bitstream: 00010 0011 0010 000
        // as bytes: 00010001 10010000 = 0x11 0x90
        let asc = parse_asc(&[0x11, 0x90]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 48000);
        assert_eq!(asc.channel_config, 2);
        assert!(!asc.sbr_present);
        assert!(!asc.ps_present);
        assert_eq!(asc.codec_string(), "mp4a.40.2");
    }

    #[test]
    fn parse_legacy_lvqr_aac_lc_stereo_44k() {
        // The 2-byte ASC that lvqr-ingest's existing esds writer hard-codes
        // (see HANDOFF.md session 3 notes): [0x12, 0x10]. Decode:
        //   0001 0010 0001 0000
        //   AOT  = 00010 = 2 (AAC-LC)
        //   sfi  = 0100  = 4 -> 44100 Hz
        //   chan = 0010  = 2 (stereo)
        //   pad  = 000
        // This test pins the interpretation of that magic pair so future
        // refactors of lvqr-ingest cannot silently drift from it.
        let asc = parse_asc(&[0x12, 0x10]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 44100);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_he_aac_signals_sbr() {
        // Explicit hierarchical SBR signalling, in the order `parse_asc`
        // reads the fields:
        //   AOT(5)            = 00101 = 5 (SBR)
        //   sfi(4)            = 0110  = 6 -> 24000 Hz (core rate)
        //   channel(4)        = 0010  = 2
        //   ext sfi(4)        = 0011  = 3 (48 kHz extension rate)
        //   downstream AOT(5) = 00010 = 2 (AAC-LC)
        //   pad(2)            = 00
        //
        //   00101 0110 0010 0011 00010 00
        // = 0010 1011 0001 0001 1000 1000
        // = 0x2B 0x11 0x88
        let asc = parse_asc(&[0x2B, 0x11, 0x88]).unwrap();
        assert!(asc.sbr_present);
        assert!(!asc.ps_present);
        assert_eq!(asc.object_type, 5);
        assert_eq!(asc.sample_rate, 24000);
        assert_eq!(asc.channel_config, 2);
        assert_eq!(asc.codec_string(), "mp4a.40.5");
    }

    #[test]
    fn parse_he_aac_v2_signals_sbr_and_ps() {
        // Same layout as the SBR case, with AOT=29 (PS) and mono core:
        //   AOT(5)            = 11101 = 29 (PS)
        //   sfi(4)            = 0110  = 6 -> 24000 Hz
        //   channel(4)        = 0001  = 1
        //   ext sfi(4)        = 0011  = 3
        //   downstream AOT(5) = 00010 = 2 (AAC-LC)
        //   pad(2)            = 00
        //
        //   11101 0110 0001 0011 00010 00
        // = 1110 1011 0000 1001 1000 1000
        // = 0xEB 0x09 0x88
        let asc = parse_asc(&[0xEB, 0x09, 0x88]).unwrap();
        assert!(asc.sbr_present);
        assert!(asc.ps_present);
        assert_eq!(asc.object_type, 29);
        assert_eq!(asc.sample_rate, 24000);
        assert_eq!(asc.channel_config, 1);
        assert_eq!(asc.codec_string(), "mp4a.40.29");
    }

    #[test]
    fn parse_rejects_empty_bytes() {
        assert!(matches!(parse_asc(&[]), Err(CodecError::EndOfStream { .. })));
    }

    #[test]
    fn parse_rejects_reserved_sampling_frequency_index() {
        // AOT=2 (00010), sfi=13 (1101, no table entry), channel=2 (0010), pad 000
        //   00010 1101 0010 000
        // = 0001 0110 1001 0000 = 0x16 0x90
        match parse_asc(&[0x16, 0x90]) {
            Err(CodecError::MalformedAsc(msg)) => {
                assert!(msg.contains("out of range"), "unexpected message: {msg}");
            }
            other => panic!("expected MalformedAsc error, got {other:?}"),
        }
    }

    #[test]
    fn parse_escape_object_type() {
        // AOT = 42 (xHE-AAC USAC) -> base = 31, ext = 10
        // bits: 11111 001010 (object type) 0011 (sfi=48k) 0010 (channel=2) pad
        //   = 11111 001010 0011 0010 0
        //   = 1111 1001 0100 0110 0100 (need 19 bits, pad to 24)
        //   = 1111 1001 0100 0110 0100 0000 = 0xF9 0x46 0x40
        let asc = parse_asc(&[0xF9, 0x46, 0x40]).unwrap();
        assert_eq!(asc.object_type, 42);
        assert_eq!(asc.sample_rate, 48000);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_explicit_frequency() {
        // AOT=2, sfi=15 (escape), explicit freq=96000 (0x017700),
        // channel=2. Layout:
        //   AOT(5)     = 00010
        //   sfi(4)     = 1111
        //   freq(24)   = 000000010111011100000000 (0x017700)
        //   channel(4) = 0010
        //   pad(3)     = 000
        // Full 40-bit stream concatenated:
        //   00010 1111 00000001 01110111 00000000 0010 000
        //
        // Regrouped into 8-bit bytes:
        //   0001 0111 = 0x17
        //   1000 0000 = 0x80
        //   1011 1011 = 0xBB
        //   1000 0000 = 0x80
        //   0001 0000 = 0x10
        let asc = parse_asc(&[0x17, 0x80, 0xBB, 0x80, 0x10]).unwrap();
        assert_eq!(asc.object_type, 2);
        assert_eq!(asc.sample_rate, 96000);
        assert_eq!(asc.channel_config, 2);
    }

    #[test]
    fn parse_rejects_explicit_sample_rate_below_7350_hz() {
        // Regression pin for the session-27 fix to `read_sample_rate`.
        // Before the fix, sfi=15 followed by a tiny 24-bit explicit
        // frequency decoded to a nonsense `sample_rate` and passed
        // through. `proptest_aac::successful_parse_has_plausible_sample_rate`
        // discovered the class via the seed `[87,128,0,0,128]`, which
        // decodes as:
        //   0101 0111 1000 0000 0000 0000 0000 0000 1000 0000
        //   AOT(5)   = 01010 = 10
        //   sfi(4)   = 1111  = 15 (explicit frequency escape)
        //   freq(24) = 000000000000000000000001 = 1 Hz
        // After the fix this returns `CodecError::MalformedAsc`, so the
        // nonsense rate never reaches downstream timescale math.
        let explicit_seed: &[u8] = &[87, 128, 0, 0, 128];
        match parse_asc(explicit_seed) {
            Err(CodecError::MalformedAsc(msg)) => {
                assert!(msg.contains("7350"), "expected the 7350 Hz floor error, got: {msg}");
            }
            other => panic!("expected MalformedAsc error, got {other:?}"),
        }
    }
}
265}