container/mp4_sanitize.rs
1//! Lenient pre-pass for the strict `mp4` crate.
2//!
3//! ISOBMFF box headers carry a `size` field that COULD be wrong —
4//! malformed encoders (older Apple QuickTime, some prosumer cameras,
5//! anything that round-trips through a buggy mux) can emit child
6//! boxes whose advertised size exceeds the parent's remaining
7//! payload. The `mp4 0.14` crate (and most strict ISOBMFF parsers)
8//! bail with `"<parent> box contains a box with a larger size than
9//! it"` and the whole demux fails.
10//!
11//! `sanitize_isobmff_box_sizes` walks the box tree from the root,
12//! and any time a child's advertised size would exceed the
13//! parent's remaining payload, rewrites the child's `size` field
14//! to fit. The output bytes are byte-compatible with strict
15//! parsers and (in the common case where the child's size was a
16//! benign over-report by 1-N bytes) preserve everything that the
17//! parser actually reads.
18//!
19//! The function is a no-op on every well-formed file — every box
20//! header is left untouched, so a clean MP4 hashes identically
21//! through this function. Only malformed files mutate.
22//!
23//! What this DOES handle:
24//! - Top-level container boxes: ftyp, moov, mdat, etc.
25//! - Recursive containers: moov > trak > mdia > minf > stbl >
26//! stsd > {mp4a, av01, hvc1, ...}.
27//! - 64-bit `largesize` extended-size form.
28//!
29//! What this does NOT handle:
30//! - `size = 0` "extends to end of file" — left untouched (strict
31//! parsers handle this correctly).
32//! - Box trees with byte-level corruption inside a leaf box's
33//! payload (e.g. a malformed `esds` descriptor). The sanitizer
34//! only touches the box header bytes; leaf payload is opaque.
35
/// Four-CCs of boxes whose payload is itself a sequence of boxes.
///
/// The walker recurses into these so it can reach nested headers; a
/// four-CC that is absent from this table is handled as a leaf and its
/// payload is copied through unexamined. To make the sanitizer descend
/// into an additional box type, add its four-CC here. The sample-entry
/// types below also appear in `sample_entry_fixed_fields_len`, which
/// supplies the length of the fixed fields preceding their children.
const CONTAINER_FOURCCS: &[&[u8; 4]] = &[
    // Movie / track / fragment / metadata structure.
    b"moov",
    b"trak",
    b"mdia",
    b"minf",
    b"stbl",
    b"stsd",
    b"edts",
    b"udta",
    b"meta",
    b"dinf",
    b"mvex",
    b"moof",
    b"traf",
    b"mfra",
    // Audio sample entries; their children include esds / dOps / wave.
    b"mp4a",
    b"Opus",
    b"ac-3",
    b"ec-3",
    b"enca",
    // Visual sample entries; their children include colr / mdcv / clli /
    // av1C.
    b"av01",
    b"avc1",
    b"avc3",
    b"hvc1",
    b"hev1",
    b"hvc2",
    b"hev2",
    b"dvh1",
    b"dvhe",
    b"vp08",
    b"vp09",
    b"apco",
    b"apcs",
    b"apcn",
    b"apch",
    b"ap4h",
    b"ap4x",
    // QuickTime-era sub-container found inside audio sample entries;
    // legacy Apple tools wrap esds in it.
    b"wave",
];
57
58#[inline]
59fn is_container(fourcc: &[u8; 4]) -> bool {
60 CONTAINER_FOURCCS.contains(&fourcc)
61}
62
/// Length of the fixed fields that sit between a sample entry's 8-byte
/// box header and its first child box.
///
/// Returns `Some(78)` for the visual sample-entry four-CCs, `Some(28)`
/// for the audio ones, and `None` for every other four-CC.
///
/// Field breakdown per ISO 14496-12 §8.5.2 (box header excluded):
/// - VisualSampleEntry: 6 (reserved) + 2 (data_reference_index) +
///   2 (pre_defined) + 2 (reserved) + 12 (pre_defined[3]) + 2 (width) +
///   2 (height) + 8 (resolution) + 4 (reserved) + 2 (frame_count) +
///   32 (compressorname) + 2 (depth) + 2 (pre_defined) = 78 bytes
///   (86 including the box header).
/// - AudioSampleEntry: 6 (reserved) + 2 (data_reference_index) +
///   8 (reserved) + 2 (channels) + 2 (sample_size) + 4 (reserved) +
///   4 (sample_rate) = 28 bytes (36 including the box header).
///
/// NOTE(review): QuickTime version 1 / version 2 sound sample
/// descriptions presumably carry extra fixed fields beyond these 28
/// bytes. This function keys on the four-CC alone and cannot tell them
/// apart — confirm against the QuickTime File Format specification.
fn sample_entry_fixed_fields_len(fourcc: &[u8; 4]) -> Option<usize> {
    const VISUAL_FIXED_FIELDS: usize = 78;
    const AUDIO_FIXED_FIELDS: usize = 28;

    match fourcc {
        b"av01" | b"avc1" | b"avc3" | b"hvc1" | b"hev1" | b"hvc2" | b"hev2" | b"dvh1"
        | b"dvhe" | b"vp08" | b"vp09" | b"apco" | b"apcs" | b"apcn" | b"apch" | b"ap4h"
        | b"ap4x" => Some(VISUAL_FIXED_FIELDS),
        b"mp4a" | b"Opus" | b"ac-3" | b"ec-3" | b"enca" => Some(AUDIO_FIXED_FIELDS),
        _ => None,
    }
}
109
110pub fn sanitize_isobmff_box_sizes(data: &[u8]) -> Vec<u8> {
111 let mut out = Vec::with_capacity(data.len());
112 // Top-level walk has no parent — `parent` = `*` is fine since
113 // top-level fourccs (ftyp, moov, mdat, ...) never need
114 // sample-entry prefix handling.
115 walk_and_sanitize(data, 0, data.len(), b"****", &mut out);
116 out
117}
118
119/// Walks a parent's payload (data[parent_payload_start..parent_payload_end])
120/// emitting box headers, recursing into containers, copying leaves
121/// verbatim, and clamping any child whose advertised size exceeds
122/// the parent's remaining payload.
123///
124/// `parent` is the parent's four-CC. Used to decide whether a
125/// sample-entry-shaped child (mp4a, av01, etc.) actually IS a
126/// sample entry (parent == stsd) or a plain box used inside a
127/// QuickTime extension (e.g. the inner `mp4a` inside `wave` for
128/// iPhone-recorded MOVs). Only sample-entry-context children get
129/// the 28/78-byte fixed-prefix skip; plain-context children with
130/// the same fourcc walk like any other container.
131fn walk_and_sanitize(data: &[u8], start: usize, end: usize, parent: &[u8; 4], out: &mut Vec<u8>) {
132 let mut cursor = start;
133 while cursor < end {
134 // Box header is 8 bytes minimum (4 size + 4 fourcc).
135 if cursor + 8 > end {
136 // Trailing junk — copy through; the strict parser will
137 // surface the issue more clearly than we can here.
138 out.extend_from_slice(&data[cursor..end]);
139 return;
140 }
141
142 let raw_size = u32::from_be_bytes([
143 data[cursor],
144 data[cursor + 1],
145 data[cursor + 2],
146 data[cursor + 3],
147 ]) as u64;
148 let fourcc: &[u8; 4] = data[cursor + 4..cursor + 8].try_into().unwrap();
149
150 // size=0 means "extends to end of file (or parent)" per spec.
151 // Leave alone — strict parsers handle this correctly.
152 if raw_size == 0 {
153 out.extend_from_slice(&data[cursor..end]);
154 return;
155 }
156
157 let mut header_len = 8usize;
158 let mut box_size = raw_size;
159
160 // Extended (64-bit) size: header is 16 bytes total.
161 if raw_size == 1 {
162 if cursor + 16 > end {
163 out.extend_from_slice(&data[cursor..end]);
164 return;
165 }
166 box_size = u64::from_be_bytes([
167 data[cursor + 8],
168 data[cursor + 9],
169 data[cursor + 10],
170 data[cursor + 11],
171 data[cursor + 12],
172 data[cursor + 13],
173 data[cursor + 14],
174 data[cursor + 15],
175 ]);
176 header_len = 16;
177 }
178
179 // Clamp: child's size must fit inside parent's remaining
180 // payload. If the file claims a size that runs past the
181 // parent boundary, rewrite the size field to land at the
182 // parent end. Handles the "mp4a box contains a box with a
183 // larger size than it" failure mode directly.
184 let remaining = (end - cursor) as u64;
185 let clamped = if box_size > remaining {
186 remaining
187 } else {
188 box_size
189 };
190
191 // Emit the (possibly rewritten) header. We always emit the
192 // 32-bit form when clamping — that's what every ISOBMFF
193 // parser expects for sizes that fit in u32. If the clamped
194 // value exceeds u32::MAX we fall back to writing the
195 // largesize form unchanged from the source (this is rare —
196 // happens only for >4 GiB boxes, where clamping is a no-op
197 // because the file is already huge enough to fit).
198 if clamped <= u32::MAX as u64 && header_len == 8 {
199 out.extend_from_slice(&(clamped as u32).to_be_bytes());
200 out.extend_from_slice(fourcc);
201 } else {
202 // Either largesize form OR clamped value too big for
203 // u32 — emit the original header bytes verbatim.
204 out.extend_from_slice(&data[cursor..cursor + header_len]);
205 }
206
207 let payload_start = cursor + header_len;
208 let payload_end = (cursor as u64 + clamped) as usize;
209 let payload_end = payload_end.min(end);
210
211 if payload_start >= payload_end {
212 // Zero-length or malformed payload after header. Keep
213 // walking from the parent's next box.
214 cursor = payload_end.max(cursor + header_len);
215 continue;
216 }
217
218 if is_container(fourcc) {
219 // Sample-entry boxes (mp4a/Opus/ac-3/ec-3/av01/avc1/...)
220 // carry a fixed-field block before their children. They
221 // are sample entries ONLY when their parent is `stsd`.
222 // Anywhere else (e.g. the inner `mp4a` inside `wave` in
223 // QuickTime / iPhone MOVs), the same fourcc is a plain
224 // container with no fixed prefix — applying the prefix
225 // skip there would mis-align the child walk and corrupt
226 // the recursion. `stsd` itself has its own 8-byte
227 // (FullBox header + entry_count) preamble.
228 let prefix_len = if fourcc == b"stsd" {
229 8
230 } else if parent == b"stsd" {
231 sample_entry_fixed_fields_len(fourcc).unwrap_or(0)
232 } else {
233 0
234 };
235 let copy_end = (payload_start + prefix_len).min(payload_end);
236 out.extend_from_slice(&data[payload_start..copy_end]);
237 walk_and_sanitize(data, copy_end, payload_end, fourcc, out);
238 } else {
239 // Leaf box — copy payload verbatim.
240 out.extend_from_slice(&data[payload_start..payload_end]);
241 }
242
243 cursor = payload_end;
244 }
245}
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a well-formed box: 32-bit size covering the 8-byte header
    /// plus `payload`, then the four-CC, then the payload bytes.
    fn make_box(fourcc: &[u8; 4], payload: &[u8]) -> Vec<u8> {
        let size = (8 + payload.len()) as u32;
        make_sized_box(fourcc, size, payload)
    }

    /// Builds a box whose on-the-wire size is `reported_size` regardless
    /// of how many payload bytes actually follow. Used to fabricate
    /// malformed boxes where `reported_size != 8 + payload.len()`.
    fn make_sized_box(fourcc: &[u8; 4], reported_size: u32, payload: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(8 + payload.len());
        out.extend_from_slice(&reported_size.to_be_bytes());
        out.extend_from_slice(fourcc);
        out.extend_from_slice(payload);
        out
    }

    /// Wraps a single sample entry in an `stsd` box: version + flags
    /// (4 bytes), entry_count = 1 (4 bytes), then the entry itself.
    fn make_stsd(entry: &[u8]) -> Vec<u8> {
        let mut payload = vec![0u8; 4]; // version + flags
        payload.extend_from_slice(&1u32.to_be_bytes()); // entry_count = 1
        payload.extend_from_slice(entry);
        make_box(b"stsd", &payload)
    }

    /// Reads a big-endian u32 starting at `offset`.
    fn read_u32_be(bytes: &[u8], offset: usize) -> u32 {
        u32::from_be_bytes([
            bytes[offset],
            bytes[offset + 1],
            bytes[offset + 2],
            bytes[offset + 3],
        ])
    }

    #[test]
    fn well_formed_file_passes_through_byte_identical() {
        let esds = make_box(b"esds", &[0x00; 32]);
        let mut mp4a_payload = vec![0u8; 28]; // fixed audio sample entry fields
        mp4a_payload.extend_from_slice(&esds);
        let mp4a = make_box(b"mp4a", &mp4a_payload);

        let stsd = make_stsd(&mp4a);
        let stbl = make_box(b"stbl", &stsd);
        let minf = make_box(b"minf", &stbl);
        let mdia = make_box(b"mdia", &minf);
        let trak = make_box(b"trak", &mdia);
        let moov = make_box(b"moov", &trak);

        let sanitized = sanitize_isobmff_box_sizes(&moov);
        assert_eq!(
            sanitized, moov,
            "well-formed input must round-trip byte-identical"
        );
    }

    #[test]
    fn well_formed_largesize_box_passes_through_byte_identical() {
        // size == 1 selects the 16-byte header with a 64-bit largesize.
        let mut mdat = Vec::new();
        mdat.extend_from_slice(&1u32.to_be_bytes());
        mdat.extend_from_slice(b"mdat");
        mdat.extend_from_slice(&24u64.to_be_bytes()); // 16 header + 8 payload
        mdat.extend_from_slice(&[0x5A; 8]);

        let sanitized = sanitize_isobmff_box_sizes(&mdat);
        assert_eq!(
            sanitized, mdat,
            "well-formed largesize box must round-trip byte-identical"
        );
    }

    #[test]
    fn over_reported_child_inside_mp4a_gets_clamped() {
        // An esds child whose reported size (100) exceeds the parent
        // mp4a's remaining payload; only 16 bytes (8 header + 8 payload)
        // are actually present.
        let bad_esds = make_sized_box(b"esds", 100, &[0xAB; 8]);

        let mut mp4a_payload = vec![0u8; 28]; // fixed audio fields
        mp4a_payload.extend_from_slice(&bad_esds);
        let mp4a = make_box(b"mp4a", &mp4a_payload);

        // mp4a is only treated as a sample entry (with the 28-byte
        // prefix) when its parent is `stsd`. Wrap properly.
        let stsd = make_stsd(&mp4a);

        let sanitized = sanitize_isobmff_box_sizes(&stsd);
        assert_eq!(
            sanitized.len(),
            stsd.len(),
            "clamping rewrites size fields in place; length must not change"
        );

        // Locate the mp4a header inside the sanitized output:
        // stsd header (8) + version+flags (4) + entry_count (4) = 16
        let mp4a_header_offset = 16;
        assert_eq!(
            &sanitized[mp4a_header_offset + 4..mp4a_header_offset + 8],
            b"mp4a"
        );

        // esds header sits 8 (mp4a header) + 28 (fixed audio fields)
        // bytes past the mp4a header, with exactly 16 bytes remaining
        // in the mp4a payload — so the clamped size must be 16.
        let esds_size_offset = mp4a_header_offset + 8 + 28;
        assert_eq!(
            read_u32_be(&sanitized, esds_size_offset),
            16,
            "esds size should be clamped to the parent's remaining payload"
        );
        assert_eq!(
            &sanitized[esds_size_offset + 4..esds_size_offset + 8],
            b"esds"
        );
        assert_eq!(
            &sanitized[esds_size_offset + 8..],
            &[0xABu8; 8],
            "esds payload must be copied through untouched"
        );
    }

    #[test]
    fn inner_mp4a_inside_wave_is_not_treated_as_sample_entry() {
        // QuickTime-style layout: the OUTER mp4a is a sample entry
        // (28-byte prefix), but the INNER mp4a inside `wave` is a plain
        // box — applying the 28-byte prefix there would mis-align the
        // child walk. Every box here is well-formed, so the output must
        // be byte-identical to the input.
        let inner_mp4a = make_box(b"mp4a", &[0u8; 24]); // QuickTime audio config blob
        let frma = make_box(b"frma", b"mp4a");
        let esds = make_box(b"esds", &[0u8; 32]);

        let wave_payload = {
            let mut p = Vec::new();
            p.extend_from_slice(&frma);
            p.extend_from_slice(&inner_mp4a);
            p.extend_from_slice(&esds);
            p
        };
        let wave = make_box(b"wave", &wave_payload);

        // Outer mp4a: 28 fixed audio fields + the wave atom.
        let mut outer_mp4a_payload = vec![0u8; 28];
        outer_mp4a_payload.extend_from_slice(&wave);
        let outer_mp4a = make_box(b"mp4a", &outer_mp4a_payload);

        let stsd = make_stsd(&outer_mp4a);

        let sanitized = sanitize_isobmff_box_sizes(&stsd);
        assert_eq!(
            sanitized, stsd,
            "well-formed iPhone-shaped MP4 must pass through unchanged"
        );
    }

    #[test]
    fn sanitizer_is_idempotent() {
        // The malformed esds must sit under stsd > mp4a so the walker
        // actually reaches (and clamps) it; a bare top-level mp4a is
        // not treated as a sample entry and would make this test
        // vacuous.
        let bad_esds = make_sized_box(b"esds", 100, &[0u8; 8]);
        let mut mp4a_payload = vec![0u8; 28];
        mp4a_payload.extend_from_slice(&bad_esds);
        let mp4a = make_box(b"mp4a", &mp4a_payload);
        let stsd = make_stsd(&mp4a);

        let once = sanitize_isobmff_box_sizes(&stsd);
        assert_ne!(once, stsd, "first pass must rewrite the bad esds size");

        let twice = sanitize_isobmff_box_sizes(&once);
        assert_eq!(once, twice, "sanitizer must be idempotent");
    }

    #[test]
    fn truncated_input_is_handled_without_panic() {
        // Header claims size=100 but the whole buffer is only 12 bytes
        // (8 header + 4 payload).
        let mut bad = vec![];
        bad.extend_from_slice(&100u32.to_be_bytes());
        bad.extend_from_slice(b"moov");
        bad.extend_from_slice(&[0u8; 4]); // 4 bytes of "payload"

        let sanitized = sanitize_isobmff_box_sizes(&bad); // must not panic
        assert_eq!(sanitized.len(), bad.len());
        assert_eq!(
            read_u32_be(&sanitized, 0),
            12,
            "top-level box must be clamped to the bytes actually present"
        );
        assert_eq!(&sanitized[4..], &bad[4..]);
    }
}