Skip to main content

container/
mp4_sanitize.rs

//! Lenient pre-pass for the strict `mp4` crate.
//!
//! ISOBMFF box headers carry a `size` field that can be wrong:
//! malformed encoders (older Apple QuickTime, some prosumer cameras,
//! anything that round-trips through a buggy mux) can emit child
//! boxes whose advertised size exceeds the parent's remaining
//! payload. The `mp4 0.14` crate (and most strict ISOBMFF parsers)
//! bail with `"<parent> box contains a box with a larger size than
//! it"` and the whole demux fails.
//!
//! `sanitize_isobmff_box_sizes` walks the box tree from the root,
//! and any time a child's advertised size would exceed the
//! parent's remaining payload, rewrites the child's `size` field
//! to fit. The output bytes are byte-compatible with strict
//! parsers and (in the common case where the child's size was a
//! benign over-report by 1-N bytes) preserve everything that the
//! parser actually reads.
//!
//! The function is a no-op on every well-formed file: every box
//! header is left untouched, so a clean MP4 hashes identically
//! through this function. Only malformed files mutate, and even
//! then only header bytes change; the output is always exactly
//! as long as the input.
//!
//! What this DOES handle:
//!   - Top-level boxes: ftyp, moov, mdat, etc.
//!   - Recursive containers: moov > trak > mdia > minf > stbl >
//!     stsd > {mp4a, av01, hvc1, ...}.
//!   - 64-bit `largesize` extended-size headers, which are parsed
//!     so the walk stays aligned across them.
//!
//! What this does NOT handle:
//!   - `size = 0` "extends to end of file": the box is left
//!     untouched (strict parsers handle this correctly), and the
//!     rest of the enclosing parent is copied through verbatim
//!     without further inspection.
//!   - Box trees with byte-level corruption inside a leaf box's
//!     payload (e.g. a malformed `esds` descriptor). The sanitizer
//!     only touches the box header bytes; leaf payload is opaque.
35
/// Four-CCs of every box the sanitizer descends into.
///
/// A box whose type appears here has its payload walked as a sequence
/// of child boxes; any other type is a leaf whose payload is copied
/// through untouched. The table mirrors the boxes the strict parser
/// recurses into, so extending the sanitizer's reach (for instance
/// when a future crate version recurses further) means adding the new
/// four-CC here.
const CONTAINER_FOURCCS: &[&[u8; 4]] = &[
    // Movie / track structure.
    b"moov", b"trak", b"mdia", b"minf", b"stbl", b"stsd", b"edts", b"udta", b"meta", b"dinf",
    // Fragmented-MP4 structure.
    b"mvex", b"moof", b"traf", b"mfra",
    // Audio sample entries; these carry child boxes such as
    // esds/dOps/wave, and the strict parser walks them as containers.
    b"mp4a", b"Opus", b"ac-3", b"ec-3", b"enca",
    // Visual sample entries; these carry child boxes such as
    // colr/mdcv/clli/av1C.
    b"av01", b"avc1", b"avc3", b"hvc1", b"hev1", b"hvc2", b"hev2", b"dvh1", b"dvhe", b"vp08",
    b"vp09", b"apco", b"apcs", b"apcn", b"apch", b"ap4h", b"ap4x",
    // QuickTime-era sub-container inside audio sample entries; legacy
    // Apple tools wrap esds in it.
    b"wave",
];
57
58#[inline]
59fn is_container(fourcc: &[u8; 4]) -> bool {
60    CONTAINER_FOURCCS.contains(&fourcc)
61}
62
/// Returns how many bytes of fixed fields sit between a sample
/// entry's 8-byte box header and its first child box, or `None` when
/// `fourcc` is not a sample-entry type known to this module.
///
/// The walker skips this many bytes so that it lands on the first
/// child header. Layouts per ISO 14496-12 §8.5.2:
///
///   - VisualSampleEntry (78 bytes after the header): 6 (reserved) +
///     2 (data_reference_index) + 2 (pre_defined) + 2 (reserved) +
///     12 (pre_defined[3]) + 2 (width) + 2 (height) + 8 (resolution) +
///     4 (reserved) + 2 (frame_count) + 32 (compressorname) +
///     2 (depth) + 2 (pre_defined). With the box header that is
///     86 bytes in total.
///   - AudioSampleEntry (28 bytes after the header): 6 (reserved) +
///     2 (data_reference_index) + 8 (reserved) + 2 (channels) +
///     2 (sample_size) + 4 (reserved) + 4 (sample_rate). With the box
///     header that is 36 bytes in total.
///
/// NOTE(review): QuickTime-style sound sample descriptions (version
/// 1 / 2) carry additional fixed fields beyond these 28 bytes. This
/// function keys on the fourcc alone and cannot see the version, so
/// confirm whether such files need separate handling.
fn sample_entry_fixed_fields_len(fourcc: &[u8; 4]) -> Option<usize> {
    const VISUAL_FIXED_FIELDS: usize = 78;
    const AUDIO_FIXED_FIELDS: usize = 28;

    match fourcc {
        b"av01" | b"avc1" | b"avc3" | b"hvc1" | b"hev1" | b"hvc2" | b"hev2" | b"dvh1"
        | b"dvhe" | b"vp08" | b"vp09" | b"apco" | b"apcs" | b"apcn" | b"apch" | b"ap4h"
        | b"ap4x" => Some(VISUAL_FIXED_FIELDS),
        b"mp4a" | b"Opus" | b"ac-3" | b"ec-3" | b"enca" => Some(AUDIO_FIXED_FIELDS),
        _ => None,
    }
}
109
110pub fn sanitize_isobmff_box_sizes(data: &[u8]) -> Vec<u8> {
111    let mut out = Vec::with_capacity(data.len());
112    // Top-level walk has no parent — `parent` = `*` is fine since
113    // top-level fourccs (ftyp, moov, mdat, ...) never need
114    // sample-entry prefix handling.
115    walk_and_sanitize(data, 0, data.len(), b"****", &mut out);
116    out
117}
118
119/// Walks a parent's payload (data[parent_payload_start..parent_payload_end])
120/// emitting box headers, recursing into containers, copying leaves
121/// verbatim, and clamping any child whose advertised size exceeds
122/// the parent's remaining payload.
123///
124/// `parent` is the parent's four-CC. Used to decide whether a
125/// sample-entry-shaped child (mp4a, av01, etc.) actually IS a
126/// sample entry (parent == stsd) or a plain box used inside a
127/// QuickTime extension (e.g. the inner `mp4a` inside `wave` for
128/// iPhone-recorded MOVs). Only sample-entry-context children get
129/// the 28/78-byte fixed-prefix skip; plain-context children with
130/// the same fourcc walk like any other container.
131fn walk_and_sanitize(data: &[u8], start: usize, end: usize, parent: &[u8; 4], out: &mut Vec<u8>) {
132    let mut cursor = start;
133    while cursor < end {
134        // Box header is 8 bytes minimum (4 size + 4 fourcc).
135        if cursor + 8 > end {
136            // Trailing junk — copy through; the strict parser will
137            // surface the issue more clearly than we can here.
138            out.extend_from_slice(&data[cursor..end]);
139            return;
140        }
141
142        let raw_size = u32::from_be_bytes([
143            data[cursor],
144            data[cursor + 1],
145            data[cursor + 2],
146            data[cursor + 3],
147        ]) as u64;
148        let fourcc: &[u8; 4] = data[cursor + 4..cursor + 8].try_into().unwrap();
149
150        // size=0 means "extends to end of file (or parent)" per spec.
151        // Leave alone — strict parsers handle this correctly.
152        if raw_size == 0 {
153            out.extend_from_slice(&data[cursor..end]);
154            return;
155        }
156
157        let mut header_len = 8usize;
158        let mut box_size = raw_size;
159
160        // Extended (64-bit) size: header is 16 bytes total.
161        if raw_size == 1 {
162            if cursor + 16 > end {
163                out.extend_from_slice(&data[cursor..end]);
164                return;
165            }
166            box_size = u64::from_be_bytes([
167                data[cursor + 8],
168                data[cursor + 9],
169                data[cursor + 10],
170                data[cursor + 11],
171                data[cursor + 12],
172                data[cursor + 13],
173                data[cursor + 14],
174                data[cursor + 15],
175            ]);
176            header_len = 16;
177        }
178
179        // Clamp: child's size must fit inside parent's remaining
180        // payload. If the file claims a size that runs past the
181        // parent boundary, rewrite the size field to land at the
182        // parent end. Handles the "mp4a box contains a box with a
183        // larger size than it" failure mode directly.
184        let remaining = (end - cursor) as u64;
185        let clamped = if box_size > remaining {
186            remaining
187        } else {
188            box_size
189        };
190
191        // Emit the (possibly rewritten) header. We always emit the
192        // 32-bit form when clamping — that's what every ISOBMFF
193        // parser expects for sizes that fit in u32. If the clamped
194        // value exceeds u32::MAX we fall back to writing the
195        // largesize form unchanged from the source (this is rare —
196        // happens only for >4 GiB boxes, where clamping is a no-op
197        // because the file is already huge enough to fit).
198        if clamped <= u32::MAX as u64 && header_len == 8 {
199            out.extend_from_slice(&(clamped as u32).to_be_bytes());
200            out.extend_from_slice(fourcc);
201        } else {
202            // Either largesize form OR clamped value too big for
203            // u32 — emit the original header bytes verbatim.
204            out.extend_from_slice(&data[cursor..cursor + header_len]);
205        }
206
207        let payload_start = cursor + header_len;
208        let payload_end = (cursor as u64 + clamped) as usize;
209        let payload_end = payload_end.min(end);
210
211        if payload_start >= payload_end {
212            // Zero-length or malformed payload after header. Keep
213            // walking from the parent's next box.
214            cursor = payload_end.max(cursor + header_len);
215            continue;
216        }
217
218        if is_container(fourcc) {
219            // Sample-entry boxes (mp4a/Opus/ac-3/ec-3/av01/avc1/...)
220            // carry a fixed-field block before their children. They
221            // are sample entries ONLY when their parent is `stsd`.
222            // Anywhere else (e.g. the inner `mp4a` inside `wave` in
223            // QuickTime / iPhone MOVs), the same fourcc is a plain
224            // container with no fixed prefix — applying the prefix
225            // skip there would mis-align the child walk and corrupt
226            // the recursion. `stsd` itself has its own 8-byte
227            // (FullBox header + entry_count) preamble.
228            let prefix_len = if fourcc == b"stsd" {
229                8
230            } else if parent == b"stsd" {
231                sample_entry_fixed_fields_len(fourcc).unwrap_or(0)
232            } else {
233                0
234            };
235            let copy_end = (payload_start + prefix_len).min(payload_end);
236            out.extend_from_slice(&data[payload_start..copy_end]);
237            walk_and_sanitize(data, copy_end, payload_end, fourcc, out);
238        } else {
239            // Leaf box — copy payload verbatim.
240            out.extend_from_slice(&data[payload_start..payload_end]);
241        }
242
243        cursor = payload_end;
244    }
245}
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Offset of the `esds` size field in the fixtures built by
    /// `wrap_in_stsd(&audio_entry_with_child(..))`:
    /// stsd header (8) + version/flags + entry_count (8) +
    /// mp4a header (8) + fixed audio fields (28).
    const ESDS_SIZE_OFFSET: usize = 8 + 8 + 8 + 28;

    /// Builds a well-formed box whose 32-bit size covers header + payload.
    fn make_box(fourcc: &[u8; 4], payload: &[u8]) -> Vec<u8> {
        let size = (8 + payload.len()) as u32;
        make_sized_box(fourcc, size, payload)
    }

    /// Builds a box whose on-the-wire size is `reported_size` regardless
    /// of how many payload bytes actually follow. Used to fabricate
    /// malformed boxes where reported_size != header_len + payload.len().
    fn make_sized_box(fourcc: &[u8; 4], reported_size: u32, payload: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(8 + payload.len());
        out.extend_from_slice(&reported_size.to_be_bytes());
        out.extend_from_slice(fourcc);
        out.extend_from_slice(payload);
        out
    }

    /// Builds a well-formed box in the 64-bit `largesize` header form.
    fn make_largesize_box(fourcc: &[u8; 4], payload: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(16 + payload.len());
        out.extend_from_slice(&1u32.to_be_bytes());
        out.extend_from_slice(fourcc);
        out.extend_from_slice(&((16 + payload.len()) as u64).to_be_bytes());
        out.extend_from_slice(payload);
        out
    }

    /// Wraps a single sample entry in an `stsd` box (version/flags = 0,
    /// entry_count = 1). Sample entries only get their fixed-prefix
    /// skip when their parent is `stsd`.
    fn wrap_in_stsd(entry: &[u8]) -> Vec<u8> {
        let mut payload = vec![0u8; 4]; // version + flags
        payload.extend_from_slice(&1u32.to_be_bytes()); // entry_count = 1
        payload.extend_from_slice(entry);
        make_box(b"stsd", &payload)
    }

    /// Builds an `mp4a` sample entry: 28 zeroed fixed fields + `child`.
    fn audio_entry_with_child(child: &[u8]) -> Vec<u8> {
        let mut payload = vec![0u8; 28];
        payload.extend_from_slice(child);
        make_box(b"mp4a", &payload)
    }

    fn read_u32(buf: &[u8], offset: usize) -> u32 {
        u32::from_be_bytes([
            buf[offset],
            buf[offset + 1],
            buf[offset + 2],
            buf[offset + 3],
        ])
    }

    #[test]
    fn well_formed_file_passes_through_byte_identical() {
        let esds = make_box(b"esds", &[0x00; 32]);
        let stsd = wrap_in_stsd(&audio_entry_with_child(&esds));
        let stbl = make_box(b"stbl", &stsd);
        let minf = make_box(b"minf", &stbl);
        let mdia = make_box(b"mdia", &minf);
        let trak = make_box(b"trak", &mdia);
        let moov = make_box(b"moov", &trak);

        let sanitized = sanitize_isobmff_box_sizes(&moov);
        assert_eq!(
            sanitized, moov,
            "well-formed input must round-trip byte-identical"
        );
    }

    #[test]
    fn over_reported_child_inside_mp4a_gets_clamped() {
        // The bug from the user's screenshot: an esds child whose
        // reported size (100) exceeds the parent mp4a's remaining
        // payload (16 bytes, header included).
        let bad_esds = make_sized_box(b"esds", 100, &[0xAB; 8]);
        let stsd = wrap_in_stsd(&audio_entry_with_child(&bad_esds));

        let sanitized = sanitize_isobmff_box_sizes(&stsd);

        assert_eq!(sanitized.len(), stsd.len());
        assert_eq!(&sanitized[20..24], b"mp4a");
        assert_eq!(
            read_u32(&sanitized, ESDS_SIZE_OFFSET),
            16,
            "esds size should be clamped to exactly the bytes left in mp4a"
        );
        // Nothing but the esds size field may change.
        assert_eq!(&sanitized[..ESDS_SIZE_OFFSET], &stsd[..ESDS_SIZE_OFFSET]);
        assert_eq!(
            &sanitized[ESDS_SIZE_OFFSET + 4..],
            &stsd[ESDS_SIZE_OFFSET + 4..]
        );
    }

    #[test]
    fn inner_mp4a_inside_wave_is_not_treated_as_sample_entry() {
        // iPhone MOV layout: the OUTER mp4a is a sample entry
        // (28-byte prefix), but the INNER mp4a inside `wave` is a
        // plain container box. Applying the 28-byte prefix there
        // would mis-align the child walk and lose the esds sibling.
        let inner_mp4a = make_box(b"mp4a", &[0u8; 24]); // QuickTime audio config blob
        let frma = make_box(b"frma", b"mp4a");
        let esds = make_box(b"esds", &[0u8; 32]);

        let mut wave_payload = Vec::new();
        wave_payload.extend_from_slice(&frma);
        wave_payload.extend_from_slice(&inner_mp4a);
        wave_payload.extend_from_slice(&esds);
        let wave = make_box(b"wave", &wave_payload);

        let stsd = wrap_in_stsd(&audio_entry_with_child(&wave));

        let sanitized = sanitize_isobmff_box_sizes(&stsd);
        // Every box's reported size already fits its parent, so no
        // clamping is needed and the bytes must round-trip unchanged.
        assert_eq!(
            sanitized, stsd,
            "well-formed iPhone-shaped MP4 must pass through unchanged"
        );
    }

    #[test]
    fn sanitizer_is_idempotent() {
        // The entry must sit under `stsd`. A bare top-level mp4a gets
        // no prefix skip, its zeroed fixed fields read as `size == 0`,
        // and the walker would never reach the bad esds.
        let bad_esds = make_sized_box(b"esds", 100, &[0u8; 8]);
        let stsd = wrap_in_stsd(&audio_entry_with_child(&bad_esds));

        let once = sanitize_isobmff_box_sizes(&stsd);
        assert_ne!(once, stsd, "first pass must actually rewrite the esds size");
        assert_eq!(read_u32(&once, ESDS_SIZE_OFFSET), 16);

        let twice = sanitize_isobmff_box_sizes(&once);
        assert_eq!(once, twice, "sanitizer must be idempotent");
    }

    #[test]
    fn truncated_input_is_handled_without_panic() {
        // Box header says size=100 but only 12 bytes exist in total.
        let bad = make_sized_box(b"moov", 100, &[0u8; 4]);

        let sanitized = sanitize_isobmff_box_sizes(&bad);

        assert_eq!(sanitized.len(), bad.len());
        assert_eq!(
            read_u32(&sanitized, 0),
            12,
            "top-level size should be clamped to the bytes actually present"
        );
        assert_eq!(&sanitized[4..], &bad[4..]);
    }

    #[test]
    fn well_formed_largesize_box_passes_through() {
        // A largesize box followed by an ordinary one: the walker must
        // step over the 16-byte header and stay aligned.
        let mut input = make_largesize_box(b"mdat", &[0x5A; 8]);
        input.extend_from_slice(&make_box(b"free", &[1, 2, 3]));

        assert_eq!(sanitize_isobmff_box_sizes(&input), input);
    }

    #[test]
    fn size_zero_box_is_left_untouched() {
        let mut input = make_box(b"ftyp", b"isom");
        input.extend_from_slice(&make_sized_box(b"mdat", 0, &[9u8; 5]));

        assert_eq!(sanitize_isobmff_box_sizes(&input), input);
    }
}