container/mux/mod.rs
1use anyhow::{Context, Result};
2use bytes::Bytes;
3use codec::encode::EncodedPacket;
4use codec::frame::{ColorMetadata, VideoCodec};
5
6use crate::nal_mux::{NalMuxCodec, NalSampleWriter};
7use std::fs::File;
8use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
9use std::path::Path;
10use tempfile::NamedTempFile;
11
12use crate::AudioInfo;
13
14mod boxes;
15mod video_track;
16mod audio_track;
17mod sample_table;
18mod mdat;
19#[cfg(test)]
20mod tests;
21
// Re-exports for callers elsewhere in the crate that import from
// `container::mux::*`; only the dac3/dec3 body helpers are fully `pub`.
23pub(crate) use boxes::{BoxBuilder, write_unity_matrix, extract_sequence_header};
24pub(crate) use video_track::{build_av01, build_avc1, build_hvc1, build_avcc, build_hvcc};
25pub(crate) use audio_track::build_audio_stsd;
26pub use audio_track::{dac3_body_from_sync, dec3_body_from_sync};
27
28// Internal imports used by impl Av1Mp4Muxer below.
29use boxes::{build_ftyp, build_moov_any};
30use sample_table::{AudioBuildPlan, chunk_count_of, plan_interleaved_layout};
31
32/// Streams mdat payload bytes to a tempfile while keeping only small
33/// per-packet metadata vectors in RAM. At 15 min 1080p60 and ~500 kB/sample
34/// average the metadata Vecs are ~700 KB total; the packet payload (~500 MB
35/// per variant at AV1 CQ 32) stays on disk.
36///
37/// Faststart is preserved: `finalize_to_file` writes ftyp + moov first,
38/// then streams the tempfile's mdat bytes into the final output.
39///
40/// API:
41/// - `new(w, h, fps)` — constructs a spooled muxer, creating the tempfile
42/// immediately. Fails if tempdir is unwritable.
43/// - `add_packet(packet)` — appends packet payload to the tempfile and
44/// records size/sync metadata.
45/// - `with_audio(info)` — registers an optional audio track. Codec dispatch
46/// happens here on `info.codec` (`"aac"` / `"opus"` / `"ac3"` / `"eac3"`).
47/// Must be called before `add_audio_sample`. Bails on unsupported codecs
48/// or channel counts — no silent degradation.
49/// - `add_audio_sample(sample, pts_ticks, duration_ticks)` — appends one
50/// audio access unit plus per-sample metadata. Requires `with_audio`
51/// first.
52/// - `finalize_to_file(&Path)` — writes ftyp + moov + mdat payload to the
53/// target path. Consumes self.
54/// - `finalize()` — backward-compat shim that reads the finalized file into
55/// a `Bytes`. Useful for small tests; callers hitting the RAM ceiling
56/// should use `finalize_to_file` + `ObjectStore::upload_file`.
57pub struct Av1Mp4Muxer {
58 width: u32,
59 height: u32,
60 frame_rate: f64,
61 mdat_tmp: NamedTempFile,
62 mdat_writer: BufWriter<File>,
63 sample_sizes: Vec<u32>,
64 keyframe_indices: Vec<u32>,
65 first_packet_header: Option<Vec<u8>>,
66 packet_count: u32,
67 mdat_payload_bytes: u64,
68 audio: Option<AudioTrackState>,
69 /// Color metadata copied from the source `StreamInfo` so the visual
70 /// sample entry can carry an Apple-compliant `colr nclx` box. Defaults
71 /// to BT.709 SDR limited-range — Apple silently assumes that when
72 /// `colr` is absent, so the default is correct for SDR sources but
73 /// breaks BT.2020 / HDR clips. Real values arrive via `with_color`.
74 color_metadata: ColorMetadata,
75 /// Test-only override forcing the muxer to emit the 64-bit `largesize`
76 /// mdat header even when the payload would fit in the 32-bit `size`
77 /// field. Pre-existing payload size computation otherwise leaves the
78 /// largesize branch untestable without producing a 4 GiB tempfile.
79 /// Production callers leave this `false`; tests flip it on to assert
80 /// the bit-layout of the largesize header is correct.
81 ///
82 /// Must be a regular field (not `#[cfg(test)]`-gated) so integration
83 /// tests in `tests/` — which compile against the release library
84 /// without `cfg(test)` — can flip it via `force_largesize_mdat_for_test`.
85 #[doc(hidden)]
86 force_largesize_mdat: bool,
87 /// Output video codec. Drives the sample-entry fourcc + config box at
88 /// finalize (`av01`/`av1C`, `avc1`/`avcC`, or `hvc1`/`hvcC`).
89 codec: VideoCodec,
90 /// For H.264 / H.265: repackages the encoder's Annex-B frames into
91 /// length-prefixed mdat samples and collects the SPS/PPS(/VPS) for the
92 /// config box. `None` for AV1 (which stores OBUs verbatim).
93 nal_writer: Option<NalSampleWriter>,
94 /// Inline-parameter-set mode (H.264/H.265 multi-GPU stitch): keep SPS/PPS
95 /// inline per access unit + emit the `avc3`/`hev1` sample entry instead of
96 /// `avc1`/`hvc1`, so chunks from independent encoders self-describe.
97 inline_param_sets: bool,
98}
99
100/// Per-muxer audio track state: info + spooling tempfile + per-sample
101/// metadata. Kept internal; populated via `with_audio` + `add_audio_sample`.
102struct AudioTrackState {
103 info: AudioInfo,
104 audio_tmp: NamedTempFile,
105 audio_writer: BufWriter<File>,
106 sample_sizes: Vec<u32>,
107 durations: Vec<u32>,
108 total_duration_ticks: u64,
109 mdat_payload_bytes: u64,
110}
111
112/// Internal discriminator chosen at `with_audio` time. Saves us re-parsing
113/// the codec string at every builder call site (build_audio_stsd, etc.) and
114/// keeps the AAC / Opus / AC-3 / E-AC-3 dispatch in one place.
115#[derive(Debug, Clone, Copy, PartialEq, Eq)]
116pub(super) enum AudioCodecKind {
117 Aac,
118 Opus,
119 Ac3,
120 Eac3,
121}
122
123impl AudioCodecKind {
124 pub(super) fn from_codec_tag(codec: &str) -> Option<Self> {
125 if codec.eq_ignore_ascii_case("aac") {
126 Some(Self::Aac)
127 } else if codec.eq_ignore_ascii_case("opus") {
128 Some(Self::Opus)
129 } else if codec.eq_ignore_ascii_case("ac3") || codec.eq_ignore_ascii_case("ac-3") {
130 Some(Self::Ac3)
131 } else if codec.eq_ignore_ascii_case("eac3") || codec.eq_ignore_ascii_case("e-ac-3") {
132 Some(Self::Eac3)
133 } else {
134 None
135 }
136 }
137}
138
139impl Av1Mp4Muxer {
140 /// AV1 muxer (the default + back-compatible constructor).
141 pub fn new(width: u32, height: u32, frame_rate: f64) -> Result<Self> {
142 Self::new_with_codec(width, height, frame_rate, VideoCodec::Av1)
143 }
144
145 /// Muxer for the given output `codec` — `Av1` (`av01`/`av1C`), `H264`
146 /// (`avc1`/`avcC`), or `H265` (`hvc1`/`hvcC`). H.264/H.265 callers feed the
147 /// encoder's **Annex-B** packets; the muxer repackages them to
148 /// length-prefixed samples + collects the parameter sets.
149 pub fn new_with_codec(
150 width: u32,
151 height: u32,
152 frame_rate: f64,
153 codec: VideoCodec,
154 ) -> Result<Self> {
155 Self::new_with_codec_opts(width, height, frame_rate, codec, false)
156 }
157
158 /// Like [`new_with_codec`] but with **inline parameter sets** for H.264/H.265
159 /// (the multi-GPU stitch). Each access unit keeps its own SPS/PPS(/VPS) and
160 /// the sample entry is `avc3`/`hev1`, so chunks from independent encoders
161 /// (possibly different vendors) decode with their own parameter sets.
162 pub fn new_with_codec_inline(
163 width: u32,
164 height: u32,
165 frame_rate: f64,
166 codec: VideoCodec,
167 ) -> Result<Self> {
168 Self::new_with_codec_opts(width, height, frame_rate, codec, true)
169 }
170
171 fn new_with_codec_opts(
172 width: u32,
173 height: u32,
174 frame_rate: f64,
175 codec: VideoCodec,
176 inline_param_sets: bool,
177 ) -> Result<Self> {
178 let mdat_tmp = NamedTempFile::new().context("creating mdat tempfile")?;
179 let handle = mdat_tmp
180 .reopen()
181 .context("reopening mdat tempfile for write")?;
182 let mdat_writer = BufWriter::new(handle);
183 let make = |c: NalMuxCodec| {
184 if inline_param_sets {
185 NalSampleWriter::new_inline(c)
186 } else {
187 NalSampleWriter::new(c)
188 }
189 };
190 let nal_writer = match codec {
191 VideoCodec::Av1 => None,
192 VideoCodec::H264 => Some(make(NalMuxCodec::H264)),
193 VideoCodec::H265 => Some(make(NalMuxCodec::H265)),
194 };
195 Ok(Self {
196 width,
197 height,
198 frame_rate,
199 mdat_tmp,
200 mdat_writer,
201 sample_sizes: Vec::new(),
202 keyframe_indices: Vec::new(),
203 first_packet_header: None,
204 packet_count: 0,
205 mdat_payload_bytes: 0,
206 audio: None,
207 color_metadata: ColorMetadata::default(),
208 force_largesize_mdat: false,
209 codec,
210 nal_writer,
211 inline_param_sets,
212 })
213 }
214
215 /// Test-only knob to exercise the 64-bit mdat largesize header without
216 /// crafting a multi-GiB payload. Production callers do not touch this —
217 /// the natural threshold (`mdat_payload + 8 > u32::MAX`) selects
218 /// largesize when the file genuinely needs it.
219 #[doc(hidden)]
220 pub fn force_largesize_mdat_for_test(&mut self) -> &mut Self {
221 self.force_largesize_mdat = true;
222 self
223 }
224
225 /// Carry the source's color metadata into the visual sample entry's
226 /// `colr nclx` box. Apple QuickTime / iOS Safari silently assume
227 /// BT.709 limited-range when `colr` is missing, which corrupts
228 /// BT.2020 HDR / wide-gamut clips. Pipeline calls this once after
229 /// demux but before any `add_packet` — though calling order is
230 /// not load-bearing because the metadata is only consumed by the
231 /// finalize-time `build_av01` builder.
232 pub fn set_color_metadata(&mut self, color_metadata: ColorMetadata) -> &mut Self {
233 self.color_metadata = color_metadata;
234 self
235 }
236
237 pub fn add_packet(&mut self, packet: EncodedPacket) -> Result<()> {
238 // AV1: store the OBU stream verbatim (the first packet carries the
239 // sequence header we embed in av1C). H.264/H.265: repackage the
240 // Annex-B frame into a length-prefixed mdat sample, capturing the
241 // parameter sets for the avcC/hvcC config box.
242 match &mut self.nal_writer {
243 None => {
244 // AV1: one OBU sample per packet.
245 if self.first_packet_header.is_none() {
246 self.first_packet_header = Some(packet.data.to_vec());
247 }
248 self.write_sample(&packet.data.clone(), packet.is_keyframe)?;
249 }
250 Some(_) => {
251 // H.264/H.265: a packet may carry several access units; split it
252 // into one length-prefixed sample per frame (per-AU keyframe).
253 let writer = self.nal_writer.as_mut().unwrap();
254 let samples = writer.push_packet(&packet.data);
255 for au in samples {
256 self.write_sample(&au.data, au.is_keyframe)?;
257 }
258 }
259 }
260 Ok(())
261 }
262
263 /// Append one finished sample to the mdat tempfile + update the per-sample
264 /// tables (size, keyframe index, payload total).
265 fn write_sample(&mut self, sample: &[u8], is_keyframe: bool) -> Result<()> {
266 let size = sample.len() as u32;
267 self.mdat_writer
268 .write_all(sample)
269 .context("writing sample to mdat tempfile")?;
270 self.sample_sizes.push(size);
271 self.packet_count = self
272 .packet_count
273 .checked_add(1)
274 .context("packet count overflow")?;
275 if is_keyframe {
276 self.keyframe_indices.push(self.packet_count);
277 }
278 self.mdat_payload_bytes = self
279 .mdat_payload_bytes
280 .checked_add(size as u64)
281 .context("mdat payload overflow")?;
282 Ok(())
283 }
284
285 /// before `add_audio_sample`. Validates codec ∈ {AAC family, Opus,
286 /// AC-3, E-AC-3} with codec-appropriate channel-count gates —
287 /// anything outside the supported envelope must fail loudly (no
288 /// silent degradation, no stubs).
289 ///
290 /// AAC family path (Squad-18 + Squad-25): emits `mp4a` sample entry +
291 /// `esds` descriptor tree carrying the AudioSpecificConfig verbatim,
292 /// plus an Apple `chan` (Channel Layout) box for ≥3-channel streams
293 /// so iOS Safari / QuickTime / AVFoundation render the correct layout
294 /// instead of defaulting to L+R. Accepts:
295 /// - AAC-LC (AOT=2), mono / stereo / 5.1 / 7.1
296 /// - HE-AAC v1 (explicit-signaled SBR; ASC starts AOT=5)
297 /// - HE-AAC v2 (explicit-signaled PS; ASC starts AOT=29)
298 ///
299 /// Implicit-signaled HE-AAC (AOT=2 leading byte at low core rate ≤24 kHz)
300 /// is rejected — the caller (`pipeline::transcode::route_audio`) is
301 /// responsible for upgrading the ASC via
302 /// `aac_asc::upgrade_to_explicit_signaling` before reaching the mux.
303 ///
304 /// Opus path (Squad-23 + Squad-28, RFC 7845): emits `Opus` sample entry
305 /// + `dOps` (Opus-Specific Box) carrying the OpusHead body verbatim.
306 /// Mono / stereo via ChannelMappingFamily=0 (Squad-23) or 3..=8
307 /// channels via ChannelMappingFamily=1 surround layouts (Squad-28).
308 /// Requires `info.codec_private` populated with the appropriate-form
309 /// OpusHead body. The mdhd timescale is pinned to 48000 per RFC 7845
310 /// §3 — the `info.timescale` is validated equal.
311 ///
312 /// AC-3 path (Squad-26, ETSI TS 102 366 §F.2): emits `ac-3` sample
313 /// entry + `dac3` config box carrying the 3-byte body verbatim. Up
314 /// to 5.1 channels. Sample rates 32 / 44.1 / 48 kHz only.
315 ///
316 /// E-AC-3 path (Squad-26, ETSI TS 102 366 §F.5): emits `ec-3` sample
317 /// entry + `dec3` config box. Up to 5.1 channels in v1 scope (single
318 /// independent substream). Sample rates 16 / 22.05 / 24 / 32 / 44.1 /
319 /// 48 kHz.
320 ///
321 /// Returns `&mut Self` for builder-style chaining. The audio tempfile
322 /// is created eagerly so tempdir failures surface here rather than at
323 /// `add_audio_sample` time.
324 pub fn with_audio(&mut self, info: AudioInfo) -> Result<&mut Self> {
325 // Codec dispatch: AAC, Opus, AC-3, E-AC-3 are the supported
326 // families. Other codec tags (mp3, vorbis, ...) are intentionally
327 // rejected here so the pipeline fall-back path in `transcode.rs` can
328 // surface a clean warn and emit video-only.
329 let codec_kind = AudioCodecKind::from_codec_tag(&info.codec).ok_or_else(|| {
330 anyhow::anyhow!(
331 "audio mux: only AAC-LC, Opus, AC-3, E-AC-3 are supported; got codec '{}'",
332 info.codec
333 )
334 })?;
335 // Per-codec channel-count gates.
336 // - AAC: standard MPEG channelConfiguration values 1 (mono) /
337 // 2 (stereo) / 6 (5.1) / 7 (7.1). Multichannel adds an Apple
338 // `chan` box (Squad-25) for QuickTime / AVFoundation rendering.
339 // - Opus: 1..=8. Mono/stereo via ChannelMappingFamily=0 (Squad-23);
340 // 3..=8 ride the dOps family-1 surround trailer per RFC 7845
341 // §5.1.1.2 (Squad-28 multistream).
342 // - AC-3 / E-AC-3: up to 6 channels (5.1). The real layout lives
343 // in `acmod`+`lfeon` inside the dac3/dec3 body; the
344 // AudioSampleEntry channelcount is informational. v1 scope keeps
345 // things tight at 5.1.
346 match codec_kind {
347 AudioCodecKind::Aac => {
348 if !matches!(info.channels, 1 | 2 | 6 | 7) {
349 anyhow::bail!(
350 "audio mux: AAC supports mono/stereo/5.1(channels=6)/7.1(channels=7) layouts; \
351 got {} channels — extended Atmos / object layouts are not supported",
352 info.channels
353 );
354 }
355 }
356 AudioCodecKind::Opus => {
357 if info.channels < 1 || info.channels > 8 {
358 anyhow::bail!(
359 "audio mux: Opus supports 1..=8 channels; got {}",
360 info.channels
361 );
362 }
363 }
364 AudioCodecKind::Ac3 | AudioCodecKind::Eac3 => {
365 if !(1..=6).contains(&info.channels) {
366 anyhow::bail!(
367 "audio mux: AC-3 / E-AC-3 channel count must be 1..=6 (mono..5.1); got {}",
368 info.channels
369 );
370 }
371 }
372 }
373 if info.sample_rate == 0 {
374 anyhow::bail!("audio mux: sample_rate must be > 0");
375 }
376 if info.timescale == 0 {
377 anyhow::bail!("audio mux: timescale must be > 0");
378 }
379 match codec_kind {
380 AudioCodecKind::Aac => {
381 if info.asc_bytes.is_empty() {
382 anyhow::bail!("audio mux: AudioSpecificConfig bytes missing");
383 }
384 // Parse the ASC's leading AOT (with the 5-bit raw + 6-bit
385 // extension escape per ISO 14496-3 §1.6.2.1) so HE-AAC
386 // explicit signaling isn't rejected by a naive `>>3 & 0x1F`
387 // peek. Squad-25 lifts the prior AAC-LC-only gate.
388 let parsed = crate::aac_asc::parse_aac_asc(&info.asc_bytes)
389 .with_context(|| "audio mux: failed to parse AudioSpecificConfig")?;
390 use crate::aac_asc::AscSignaling;
391 match parsed.signaling {
392 AscSignaling::ImplicitMaybe => {
393 anyhow::bail!(
394 "audio mux: ASC uses implicit HE-AAC signaling (AOT=2 core at \
395 {} Hz with no SBR/PS layer in the ASC). Apple players silently \
396 downgrade to mono 22.05 kHz core. Caller must upgrade with \
397 aac_asc::upgrade_to_explicit_signaling before muxing.",
398 parsed.sample_rate
399 );
400 }
401 AscSignaling::NoExtension
402 | AscSignaling::ExplicitSbr
403 | AscSignaling::ExplicitPs => {
404 // AOT=2 (LC), AOT=5 (SBR-wrapped LC), AOT=29 (PS-wrapped LC),
405 // and AOT=42 (xHE-AAC USAC) are all accepted at the mux
406 // level. The `esds` writer emits the ASC verbatim so the
407 // decoder receives whatever signaling the ASC carries.
408 let core_aot = parsed.aot;
409 if !matches!(core_aot, 2 | 42) {
410 anyhow::bail!(
411 "audio mux: only AAC-LC (AOT=2) and xHE-AAC USAC (AOT=42) \
412 cores are supported; ASC core AOT={}",
413 core_aot
414 );
415 }
416 }
417 }
418 }
419 AudioCodecKind::Opus => {
420 // OpusHead body without the 8-byte 'OpusHead' magic is 11
421 // bytes minimum for ChannelMappingFamily=0 (RFC 7845 §5.1).
422 // Reject anything shorter — the dOps writer can't synthesize
423 // a missing field and producing an empty box would silently
424 // break every player.
425 if info.codec_private.len() < 11 {
426 anyhow::bail!(
427 "audio mux: Opus codec_private must be ≥11 bytes (RFC 7845 §5.1 \
428 minimum body for ChannelMappingFamily=0); got {} bytes",
429 info.codec_private.len()
430 );
431 }
432 // RFC 7845 §3: the audio mdhd timescale MUST be 48000 for
433 // Opus. The CALLER pins this in `AudioInfo::opus(...)`; if
434 // they hand-built an `AudioInfo` with a different timescale
435 // we reject loudly so a downstream stts mismatch can't
436 // silently shift PTS by a small fraction.
437 if info.timescale != 48_000 {
438 anyhow::bail!(
439 "audio mux: Opus mdhd timescale must be 48000 (RFC 7845 §3); \
440 got timescale={}",
441 info.timescale
442 );
443 }
444 // ChannelMappingFamily byte (offset 10 in the OpusHead body
445 // we emit into dOps). Family 0 is mono/stereo (1..=2
446 // channels). Family 1 (Squad-28) is surround for 1..=8
447 // channels; requires a 2 + N byte trailer
448 // (StreamCount + CoupledCount + ChannelMapping[N]) per
449 // RFC 7845 §5.1.1. Family 255 (arbitrary mappings) and
450 // any other unknown family are rejected.
451 let cmf = info.codec_private[10];
452 match cmf {
453 0 => {
454 // RFC 7845 §5.1.1: family 0 is defined for
455 // 1..=2 channels only.
456 if info.channels > 2 {
457 anyhow::bail!(
458 "audio mux: Opus ChannelMappingFamily=0 only supports 1..=2 channels; got {}",
459 info.channels
460 );
461 }
462 }
463 1 => {
464 // Family 1 needs StreamCount + CoupledCount +
465 // ChannelMapping[channels] after the 11-byte
466 // preamble. Total dOps body = 11 + 2 + N.
467 let n = info.channels as usize;
468 let needed = 11 + 2 + n;
469 if info.codec_private.len() < needed {
470 anyhow::bail!(
471 "audio mux: Opus family=1 codec_private must be ≥{needed} bytes \
472 (11 preamble + 2 stream/coupled + {n} mapping); got {}",
473 info.codec_private.len()
474 );
475 }
476 let stream_count = info.codec_private[11];
477 let coupled_count = info.codec_private[12];
478 // libopus invariants (RFC 7845 §5.1.1):
479 // - StreamCount >= 1
480 // - CoupledCount <= StreamCount
481 // - StreamCount + CoupledCount <= 255 (always
482 // true at our scale)
483 // - StreamCount + CoupledCount <= channels
484 // (every encoder stream covers >=1 channel)
485 if stream_count < 1 {
486 anyhow::bail!(
487 "audio mux: Opus family=1 StreamCount must be >= 1; got {stream_count}"
488 );
489 }
490 if coupled_count > stream_count {
491 anyhow::bail!(
492 "audio mux: Opus family=1 CoupledCount ({coupled_count}) > StreamCount ({stream_count})"
493 );
494 }
495 if (stream_count as u16) + (coupled_count as u16) > info.channels {
496 anyhow::bail!(
497 "audio mux: Opus family=1 StreamCount ({stream_count}) + CoupledCount ({coupled_count}) > channels ({})",
498 info.channels
499 );
500 }
501 // ChannelMapping[i] must be < streams +
502 // coupled (i.e. a valid encoder-stream index).
503 let mapping_max = stream_count + coupled_count;
504 for i in 0..n {
505 let m = info.codec_private[13 + i];
506 if m >= mapping_max {
507 anyhow::bail!(
508 "audio mux: Opus family=1 ChannelMapping[{i}]={m} \
509 exceeds streams+coupled ({mapping_max})"
510 );
511 }
512 }
513 }
514 other => {
515 anyhow::bail!(
516 "audio mux: only Opus ChannelMappingFamily 0 (mono/stereo) and 1 (surround 1..=8) supported; \
517 got family={other}"
518 );
519 }
520 }
521 }
522 AudioCodecKind::Ac3 => {
523 // dac3 body is exactly 3 bytes per ETSI TS 102 366 §F.4
524 // (fscod 2b | bsid 5b | bsmod 3b | acmod 3b | lfeon 1b |
525 // bit_rate_code 5b | reserved 5b => 24 bits total).
526 if info.codec_private.len() != 3 {
527 anyhow::bail!(
528 "audio mux: AC-3 codec_private (dac3 body) must be exactly 3 bytes \
529 per ETSI TS 102 366 §F.4; got {} bytes",
530 info.codec_private.len()
531 );
532 }
533 // Sample rate sanity per ETSI TS 102 366 Table F.5.
534 match info.sample_rate {
535 32_000 | 44_100 | 48_000 => {}
536 other => anyhow::bail!(
537 "audio mux: AC-3 sample_rate must be 32000 / 44100 / 48000; got {}",
538 other
539 ),
540 }
541 }
542 AudioCodecKind::Eac3 => {
543 // dec3 body is variable-size; minimum body is 5 bytes for a
544 // single independent substream with no dependent substreams
545 // (data_rate 13b + num_ind_sub 3b = 2B + per-indep-substream
546 // fscod/bsid/asvc/bsmod/acmod/lfeon/num_dep_sub fields
547 // packed into the next 3 bytes). Reject anything shorter.
548 if info.codec_private.len() < 5 {
549 anyhow::bail!(
550 "audio mux: E-AC-3 codec_private (dec3 body) must be ≥5 bytes \
551 per ETSI TS 102 366 §F.6; got {} bytes",
552 info.codec_private.len()
553 );
554 }
555 // E-AC-3 sample rates: 32 / 44.1 / 48 kHz at "full" rate
556 // plus 16 / 22.05 / 24 kHz "reduced rate" (fscod==3 path).
557 match info.sample_rate {
558 16_000 | 22_050 | 24_000 | 32_000 | 44_100 | 48_000 => {}
559 other => anyhow::bail!(
560 "audio mux: E-AC-3 sample_rate must be 16000 / 22050 / 24000 / 32000 / \
561 44100 / 48000; got {}",
562 other
563 ),
564 }
565 }
566 }
567 if self.audio.is_some() {
568 anyhow::bail!("audio mux: with_audio called twice");
569 }
570 let audio_tmp = NamedTempFile::new().context("creating audio mdat tempfile")?;
571 let handle = audio_tmp
572 .reopen()
573 .context("reopening audio tempfile for write")?;
574 let audio_writer = BufWriter::new(handle);
575 self.audio = Some(AudioTrackState {
576 info,
577 audio_tmp,
578 audio_writer,
579 sample_sizes: Vec::new(),
580 durations: Vec::new(),
581 total_duration_ticks: 0,
582 mdat_payload_bytes: 0,
583 });
584 Ok(self)
585 }
586
587 /// Append one audio access unit (AAC AU / Opus packet / AC-3 syncframe /
588 /// E-AC-3 syncframe). `pts_ticks` is currently informational only —
589 /// ISOBMFF doesn't store per-sample PTS directly; stts durations imply
590 /// a running clock starting at 0. We accept it in the API to keep the
591 /// signature extensible (edit-lists / ctts for offset signalling can
592 /// land here later).
593 pub fn add_audio_sample(
594 &mut self,
595 sample: &[u8],
596 _pts_ticks: u64,
597 duration_ticks: u32,
598 ) -> Result<()> {
599 let audio = self
600 .audio
601 .as_mut()
602 .context("audio mux: add_audio_sample called before with_audio")?;
603 if sample.is_empty() {
604 anyhow::bail!("audio mux: refusing to add empty audio access unit");
605 }
606 audio
607 .audio_writer
608 .write_all(sample)
609 .context("writing audio sample to tempfile")?;
610 audio.sample_sizes.push(sample.len() as u32);
611 let dur = if duration_ticks == 0 {
612 // Codec-aware default frame duration. AAC: 1024 samples (the
613 // natural transform length); Opus: 960 ticks @ 48 kHz = 20 ms
614 // (the standard libopus encoder frame size); AC-3: 1536 samples
615 // per syncframe (6 blocks × 256 samples per ETSI TS 102 366);
616 // E-AC-3: 1536 samples for the dominant numblkscod=3 / 6-block
617 // case (other numblkscod values would be 256/512/768 — caller
618 // should override). Most common defaults; callers can override
619 // with an explicit non-zero `duration_ticks`.
620 match AudioCodecKind::from_codec_tag(&audio.info.codec) {
621 Some(AudioCodecKind::Aac) => 1024,
622 Some(AudioCodecKind::Opus) => 960,
623 Some(AudioCodecKind::Ac3) | Some(AudioCodecKind::Eac3) => 1536,
624 None => 1024, // unreachable: with_audio gates the codec tag
625 }
626 } else {
627 duration_ticks
628 };
629 audio.durations.push(dur);
630 audio.total_duration_ticks = audio
631 .total_duration_ticks
632 .checked_add(dur as u64)
633 .context("audio total duration overflow")?;
634 audio.mdat_payload_bytes = audio
635 .mdat_payload_bytes
636 .checked_add(sample.len() as u64)
637 .context("audio mdat payload overflow")?;
638 Ok(())
639 }
640
641 /// Write ftyp + moov + mdat into `output_path`. Faststart preserved.
642 ///
643 /// When audio is present (via `with_audio`), writes an interleaved mdat
644 /// with chunk-alternation: one ~1s video chunk then one ~1s audio chunk,
645 /// repeated until both tracks are drained. stco/co64 entries in each
646 /// trak's stbl point at the first sample of that trak's chunk inside
647 /// the shared mdat.
648 pub fn finalize_to_file(mut self, output_path: &Path) -> Result<()> {
649 if self.packet_count == 0 {
650 anyhow::bail!("cannot finalize MP4 with zero packets");
651 }
652 self.mdat_writer.flush().context("flushing mdat tempfile")?;
653 if let Some(ref mut audio) = self.audio {
654 audio
655 .audio_writer
656 .flush()
657 .context("flushing audio mdat tempfile")?;
658 if audio.sample_sizes.is_empty() {
659 // Caller called with_audio but never pushed a sample. Safer
660 // to drop the audio track than emit an empty audio trak
661 // that confuses players.
662 tracing::warn!(
663 "audio mux: with_audio called but no samples pushed; dropping audio"
664 );
665 self.audio = None;
666 }
667 }
668
669 // 90 kHz matches ffmpeg/x264/x265 and divides evenly for 23.976 /
670 // 29.97 / 59.94 fps.
671 let video_timescale: u32 = 90_000;
672 let frame_duration: u32 = ((video_timescale as f64) / self.frame_rate)
673 .round()
674 .max(1.0) as u32;
675 let total_video_duration: u64 = frame_duration as u64 * self.packet_count as u64;
676
677 // Build the visual sample entry up front (codec-dispatched). For AV1
678 // it embeds the sequence-header OBU in av1C; for H.264/H.265 it embeds
679 // the parameter sets captured during add_packet in avcC/hvcC.
680 let video_sample_entry = match self.codec {
681 VideoCodec::Av1 => {
682 let first_packet = self
683 .first_packet_header
684 .as_ref()
685 .context("first packet header missing; add_packet never called?")?;
686 let av1_obus = extract_sequence_header(first_packet)
687 .context("extracting AV1 sequence header OBU from first packet")?;
688 build_av01(self.width, self.height, &av1_obus, &self.color_metadata)
689 }
690 VideoCodec::H264 => {
691 let w = self.nal_writer.as_ref().context("H.264 nal writer missing")?;
692 if !w.has_param_sets() {
693 anyhow::bail!("H.264 mux: no SPS/PPS captured from the encoder bitstream");
694 }
695 let avcc = build_avcc(&w.sps, &w.pps);
696 // `avc3` signals in-band parameter sets (inline-stitch mode);
697 // `avc1` requires them out-of-band only.
698 let fourcc = if self.inline_param_sets { b"avc3" } else { b"avc1" };
699 build_avc1(self.width, self.height, &avcc, &self.color_metadata, fourcc)
700 }
701 VideoCodec::H265 => {
702 let w = self.nal_writer.as_ref().context("H.265 nal writer missing")?;
703 if !w.has_param_sets() {
704 anyhow::bail!("H.265 mux: no VPS/SPS/PPS captured from the encoder bitstream");
705 }
706 let hvcc = build_hvcc(&w.vps, &w.sps, &w.pps);
707 // `hev1` signals in-band parameter sets; `hvc1` is out-of-band.
708 let fourcc = if self.inline_param_sets { b"hev1" } else { b"hvc1" };
709 build_hvc1(self.width, self.height, &hvcc, &self.color_metadata, fourcc)
710 }
711 };
712
713 let ftyp = build_ftyp(self.codec);
714
715 // Chunking policy: one second per chunk, capped at 120 for video
716 // and 200 for audio. Matching ~1 s per chunk on both sides keeps
717 // seek granularity consistent and bounds stsc/stco table sizes.
718 let video_spc: u32 = (self.frame_rate.round() as u32).max(1).min(120);
719
720 // Pre-compute audio chunking + per-track totals so the movie header
721 // can report `max(video_duration, audio_duration)` in movie timescale.
722 // Choose movie timescale = max(video, audio) timescales so both
723 // durations convert integer-cleanly (we use video's 90 kHz which is
724 // already a multiple of all common audio rates' divisors in the
725 // chosen target — but we do the conversion explicitly either way
726 // since 48000 ∤ 90000; we round-to-nearest which is what ISOBMFF
727 // players expect for track duration display).
728 let movie_timescale: u32 = video_timescale;
729
730 let audio_plan: Option<AudioBuildPlan> = self.audio.as_ref().map(|a| {
731 // Chunking policy: aim for ~1 second of audio per chunk.
732 // Frame size differs by codec — AAC = 1024 samples / frame,
733 // Opus = 960 samples / frame at 48 kHz (the standard encoder
734 // frame size; callers using 2.5 / 5 / 10 / 40 / 60 ms frames
735 // would diverge but the chunk-size cap and the 1-second
736 // target both still apply, so the worst case is a slightly
737 // suboptimal chunk granularity rather than a structurally
738 // broken file). The mdhd timescale is `a.info.timescale`
739 // (sample_rate for AAC, 48000 for Opus).
740 let frames_per_sec = match AudioCodecKind::from_codec_tag(&a.info.codec) {
741 Some(AudioCodecKind::Opus) => (a.info.timescale as f64) / 960.0,
742 // AC-3 / E-AC-3: 1536 samples per syncframe (6 blocks × 256).
743 Some(AudioCodecKind::Ac3) | Some(AudioCodecKind::Eac3) => {
744 (a.info.timescale as f64) / 1536.0
745 }
746 Some(AudioCodecKind::Aac) | None => (a.info.timescale as f64) / 1024.0,
747 };
748 let audio_spc = (frames_per_sec.round() as u32).max(1).min(200);
749 let audio_duration_movie: u64 =
750 ((a.total_duration_ticks as u128) * movie_timescale as u128
751 / a.info.timescale.max(1) as u128) as u64;
752 AudioBuildPlan {
753 info: a.info.clone(),
754 sample_sizes: a.sample_sizes.clone(),
755 durations: a.durations.clone(),
756 total_duration_in_own_ts: a.total_duration_ticks,
757 total_duration_in_movie_ts: audio_duration_movie,
758 samples_per_chunk: audio_spc,
759 }
760 });
761
762 let video_duration_movie: u64 = total_video_duration; // video uses 90 kHz == movie
763 let movie_duration: u64 = match audio_plan.as_ref() {
764 Some(p) => video_duration_movie.max(p.total_duration_in_movie_ts),
765 None => video_duration_movie,
766 };
767
768 // Video-side mdat byte total stays in self; audio side is in plan.
769 let video_payload_bytes = self.mdat_payload_bytes;
770 let audio_payload_bytes = audio_plan
771 .as_ref()
772 .map(|p| p.sample_sizes.iter().map(|&s| s as u64).sum::<u64>())
773 .unwrap_or(0);
774 let mdat_payload_total = video_payload_bytes
775 .checked_add(audio_payload_bytes)
776 .context("combined mdat payload overflow")?;
777
778 // mdat box-size policy. The 32-bit `size` field maxes at
779 // u32::MAX; the box header is 8 bytes (size + type). When the box
780 // body alone would push the total past u32::MAX - 8, we switch to
781 // the ISOBMFF 14496-12 §4.2 largesize form: `size = 1` (32 bits),
782 // `type = 'mdat'`, then a 64-bit `largesize` field carrying the
783 // total box length (header + payload). Header grows from 8 → 16
784 // bytes which means stco/co64 offsets must reflect the post-header
785 // start.
786 let mdat_payload_plus_short_header = 8u64
787 .checked_add(mdat_payload_total)
788 .context("mdat short-header size overflow")?;
789 // Production: pick largesize iff the payload + short header
790 // exceeds u32. Tests can force largesize on to exercise the
791 // bit-layout without crafting a 4 GiB tempfile.
792 let use_largesize_mdat =
793 mdat_payload_plus_short_header > u32::MAX as u64 || self.force_largesize_mdat;
794 let mdat_header_len: u64 = if use_largesize_mdat { 16 } else { 8 };
795 let mdat_box_size: u64 = mdat_header_len
796 .checked_add(mdat_payload_total)
797 .context("mdat box size overflow")?;
798
799 // Two-pass moov construction. On pass 1 we need placeholder offsets
800 // of consistent widths to size the moov; on pass 2 we use the real
801 // offsets computed against the planned mdat layout.
802 let video_chunk_count = chunk_count_of(self.sample_sizes.len(), video_spc);
803 let audio_chunk_count = audio_plan
804 .as_ref()
805 .map(|p| chunk_count_of(p.sample_sizes.len(), p.samples_per_chunk))
806 .unwrap_or(0);
807 let video_zero_offsets: Vec<u64> = vec![0; video_chunk_count];
808 let audio_zero_offsets: Vec<u64> = vec![0; audio_chunk_count];
809
810 let moov_co64_size = build_moov_any(
811 self.width,
812 self.height,
813 video_timescale,
814 movie_timescale,
815 movie_duration,
816 total_video_duration,
817 frame_duration,
818 &self.sample_sizes,
819 &self.keyframe_indices,
820 &video_sample_entry,
821 &video_zero_offsets,
822 video_spc,
823 audio_plan.as_ref(),
824 &audio_zero_offsets,
825 true,
826 &self.color_metadata,
827 )
828 .len() as u64;
829
830 let upper_bound: u64 = (ftyp.len() as u64)
831 .checked_add(moov_co64_size)
832 .context("moov size overflow")?
833 .checked_add(mdat_header_len)
834 .context("mdat header overflow")?
835 .checked_add(mdat_payload_total)
836 .context("mdat payload overflow")?;
837 let use_co64 = upper_bound > u32::MAX as u64;
838
839 let moov_without_offsets = build_moov_any(
840 self.width,
841 self.height,
842 video_timescale,
843 movie_timescale,
844 movie_duration,
845 total_video_duration,
846 frame_duration,
847 &self.sample_sizes,
848 &self.keyframe_indices,
849 &video_sample_entry,
850 &video_zero_offsets,
851 video_spc,
852 audio_plan.as_ref(),
853 &audio_zero_offsets,
854 use_co64,
855 &self.color_metadata,
856 );
857
858 let mdat_offset_in_file = (ftyp.len() + moov_without_offsets.len()) as u64;
859 let first_sample_file_offset = mdat_offset_in_file + mdat_header_len;
860 if !use_co64 && first_sample_file_offset > u32::MAX as u64 {
861 anyhow::bail!(
862 "internal: chose stco but first_sample_file_offset {} exceeds u32",
863 first_sample_file_offset
864 );
865 }
866
867 // Compute interleaved chunk offsets. No audio → contiguous video
868 // chunks (unchanged behaviour). Audio present → alternating video,
869 // audio, video, audio, ..., tail is whichever side has samples left.
870 let (video_chunk_offsets, audio_chunk_offsets, interleave_plan) = plan_interleaved_layout(
871 first_sample_file_offset,
872 &self.sample_sizes,
873 video_spc,
874 audio_plan.as_ref(),
875 );
876 debug_assert_eq!(video_chunk_offsets.len(), video_chunk_count);
877 debug_assert_eq!(audio_chunk_offsets.len(), audio_chunk_count);
878
879 let moov = build_moov_any(
880 self.width,
881 self.height,
882 video_timescale,
883 movie_timescale,
884 movie_duration,
885 total_video_duration,
886 frame_duration,
887 &self.sample_sizes,
888 &self.keyframe_indices,
889 &video_sample_entry,
890 &video_chunk_offsets,
891 video_spc,
892 audio_plan.as_ref(),
893 &audio_chunk_offsets,
894 use_co64,
895 &self.color_metadata,
896 );
897
898 assert_eq!(
899 moov.len(),
900 moov_without_offsets.len(),
901 "moov size must be stable across rebuild"
902 );
903
904 // Stream final layout: ftyp + moov + mdat-header + mdat-payload.
905 let out_file = File::create(output_path)
906 .with_context(|| format!("creating output file {}", output_path.display()))?;
907 let mut out = BufWriter::new(out_file);
908 out.write_all(&ftyp).context("writing ftyp")?;
909 out.write_all(&moov).context("writing moov")?;
910 if use_largesize_mdat {
911 // size=1 sentinel, then 'mdat', then 64-bit largesize.
912 out.write_all(&1u32.to_be_bytes())
913 .context("writing mdat largesize sentinel")?;
914 out.write_all(b"mdat").context("writing mdat type")?;
915 out.write_all(&mdat_box_size.to_be_bytes())
916 .context("writing mdat largesize")?;
917 } else {
918 let mdat_size_u32 = mdat_box_size as u32;
919 out.write_all(&mdat_size_u32.to_be_bytes())
920 .context("writing mdat size")?;
921 out.write_all(b"mdat").context("writing mdat type")?;
922 }
923
924 // Stream mdat bytes per the interleave plan. Each InterleaveStep
925 // records which track and how many bytes to copy from that track's
926 // tempfile. We reopen both tempfiles once and copy by range so we
927 // never buffer the full payload.
928 let video_payload_handle = self
929 .mdat_tmp
930 .reopen()
931 .context("reopening mdat tempfile for read")?;
932 let mut video_payload = BufReader::new(video_payload_handle);
933 video_payload
934 .seek(SeekFrom::Start(0))
935 .context("rewinding mdat tempfile")?;
936
937 let mut audio_payload: Option<BufReader<File>> = match self.audio.as_ref() {
938 Some(a) => {
939 let h = a
940 .audio_tmp
941 .reopen()
942 .context("reopening audio mdat tempfile for read")?;
943 let mut r = BufReader::new(h);
944 r.seek(SeekFrom::Start(0))
945 .context("rewinding audio mdat tempfile")?;
946 Some(r)
947 }
948 None => None,
949 };
950
951 let mut video_copied: u64 = 0;
952 let mut audio_copied: u64 = 0;
953 for step in &interleave_plan {
954 match step.track {
955 sample_table::InterleaveTrack::Video => {
956 let copied =
957 std::io::copy(&mut (&mut video_payload).take(step.bytes), &mut out)
958 .context("copying video chunk into mdat")?;
959 if copied != step.bytes {
960 anyhow::bail!(
961 "video chunk short read: wanted {}, got {}",
962 step.bytes,
963 copied
964 );
965 }
966 video_copied += copied;
967 }
968 sample_table::InterleaveTrack::Audio => {
969 let audio_r = audio_payload.as_mut().context(
970 "internal: interleave plan has audio step but no audio tempfile",
971 )?;
972 let copied = std::io::copy(&mut audio_r.take(step.bytes), &mut out)
973 .context("copying audio chunk into mdat")?;
974 if copied != step.bytes {
975 anyhow::bail!(
976 "audio chunk short read: wanted {}, got {}",
977 step.bytes,
978 copied
979 );
980 }
981 audio_copied += copied;
982 }
983 }
984 }
985 if video_copied != video_payload_bytes {
986 anyhow::bail!(
987 "video mdat payload length mismatch: expected {}, copied {}",
988 video_payload_bytes,
989 video_copied
990 );
991 }
992 if audio_copied != audio_payload_bytes {
993 anyhow::bail!(
994 "audio mdat payload length mismatch: expected {}, copied {}",
995 audio_payload_bytes,
996 audio_copied
997 );
998 }
999 out.flush().context("flushing output")?;
1000
1001 Ok(())
1002 }
1003
1004 /// Back-compat: finalize into memory. Writes to a second tempfile then
1005 /// reads it back. Callers hitting the 4 GB ceiling should use
1006 /// `finalize_to_file` instead.
1007 pub fn finalize(self) -> Result<Bytes> {
1008 let tmp = NamedTempFile::new().context("creating finalize buffer tempfile")?;
1009 let path = tmp.path().to_path_buf();
1010 self.finalize_to_file(&path)?;
1011 let mut f = File::open(&path).context("reopening finalize buffer tempfile")?;
1012 let mut buf = Vec::new();
1013 f.read_to_end(&mut buf).context("reading finalize buffer")?;
1014 Ok(Bytes::from(buf))
1015 }
1016}