Skip to main content

container/mux/
mod.rs

1use anyhow::{Context, Result};
2use bytes::Bytes;
3use codec::encode::EncodedPacket;
4use codec::frame::{ColorMetadata, VideoCodec};
5
6use crate::nal_mux::{NalMuxCodec, NalSampleWriter};
7use std::fs::File;
8use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
9use std::path::Path;
10use tempfile::NamedTempFile;
11
12use crate::AudioInfo;
13
14mod boxes;
15mod video_track;
16mod audio_track;
17mod sample_table;
18mod mdat;
19#[cfg(test)]
20mod tests;
21
22// Re-exports for external crate callers that import from `container::mux::*`.
23pub(crate) use boxes::{BoxBuilder, write_unity_matrix, extract_sequence_header};
24pub(crate) use video_track::{build_av01, build_avc1, build_hvc1, build_avcc, build_hvcc};
25pub(crate) use audio_track::build_audio_stsd;
26pub use audio_track::{dac3_body_from_sync, dec3_body_from_sync};
27
28// Internal imports used by impl Av1Mp4Muxer below.
29use boxes::{build_ftyp, build_moov_any};
30use sample_table::{AudioBuildPlan, chunk_count_of, plan_interleaved_layout};
31
32/// Streams mdat payload bytes to a tempfile while keeping only small
33/// per-packet metadata vectors in RAM. At 15 min 1080p60 and ~500 kB/sample
34/// average the metadata Vecs are ~700 KB total; the packet payload (~500 MB
35/// per variant at AV1 CQ 32) stays on disk.
36///
37/// Faststart is preserved: `finalize_to_file` writes ftyp + moov first,
38/// then streams the tempfile's mdat bytes into the final output.
39///
40/// API:
41/// - `new(w, h, fps)` — constructs a spooled muxer, creating the tempfile
42///   immediately. Fails if tempdir is unwritable.
43/// - `add_packet(packet)` — appends packet payload to the tempfile and
44///   records size/sync metadata.
45/// - `with_audio(info)` — registers an optional audio track. Codec dispatch
46///   happens here on `info.codec` (`"aac"` / `"opus"` / `"ac3"` / `"eac3"`).
47///   Must be called before `add_audio_sample`. Bails on unsupported codecs
48///   or channel counts — no silent degradation.
49/// - `add_audio_sample(sample, pts_ticks, duration_ticks)` — appends one
50///   audio access unit plus per-sample metadata. Requires `with_audio`
51///   first.
52/// - `finalize_to_file(&Path)` — writes ftyp + moov + mdat payload to the
53///   target path. Consumes self.
54/// - `finalize()` — backward-compat shim that reads the finalized file into
55///   a `Bytes`. Useful for small tests; callers hitting the RAM ceiling
56///   should use `finalize_to_file` + `ObjectStore::upload_file`.
57pub struct Av1Mp4Muxer {
58    width: u32,
59    height: u32,
60    frame_rate: f64,
61    mdat_tmp: NamedTempFile,
62    mdat_writer: BufWriter<File>,
63    sample_sizes: Vec<u32>,
64    keyframe_indices: Vec<u32>,
65    first_packet_header: Option<Vec<u8>>,
66    packet_count: u32,
67    mdat_payload_bytes: u64,
68    audio: Option<AudioTrackState>,
69    /// Color metadata copied from the source `StreamInfo` so the visual
70    /// sample entry can carry an Apple-compliant `colr nclx` box. Defaults
71    /// to BT.709 SDR limited-range — Apple silently assumes that when
72    /// `colr` is absent, so the default is correct for SDR sources but
73    /// breaks BT.2020 / HDR clips. Real values arrive via `with_color`.
74    color_metadata: ColorMetadata,
75    /// Test-only override forcing the muxer to emit the 64-bit `largesize`
76    /// mdat header even when the payload would fit in the 32-bit `size`
77    /// field. Pre-existing payload size computation otherwise leaves the
78    /// largesize branch untestable without producing a 4 GiB tempfile.
79    /// Production callers leave this `false`; tests flip it on to assert
80    /// the bit-layout of the largesize header is correct.
81    ///
82    /// Must be a regular field (not `#[cfg(test)]`-gated) so integration
83    /// tests in `tests/` — which compile against the release library
84    /// without `cfg(test)` — can flip it via `force_largesize_mdat_for_test`.
85    #[doc(hidden)]
86    force_largesize_mdat: bool,
87    /// Output video codec. Drives the sample-entry fourcc + config box at
88    /// finalize (`av01`/`av1C`, `avc1`/`avcC`, or `hvc1`/`hvcC`).
89    codec: VideoCodec,
90    /// For H.264 / H.265: repackages the encoder's Annex-B frames into
91    /// length-prefixed mdat samples and collects the SPS/PPS(/VPS) for the
92    /// config box. `None` for AV1 (which stores OBUs verbatim).
93    nal_writer: Option<NalSampleWriter>,
94    /// Inline-parameter-set mode (H.264/H.265 multi-GPU stitch): keep SPS/PPS
95    /// inline per access unit + emit the `avc3`/`hev1` sample entry instead of
96    /// `avc1`/`hvc1`, so chunks from independent encoders self-describe.
97    inline_param_sets: bool,
98}
99
100/// Per-muxer audio track state: info + spooling tempfile + per-sample
101/// metadata. Kept internal; populated via `with_audio` + `add_audio_sample`.
102struct AudioTrackState {
103    info: AudioInfo,
104    audio_tmp: NamedTempFile,
105    audio_writer: BufWriter<File>,
106    sample_sizes: Vec<u32>,
107    durations: Vec<u32>,
108    total_duration_ticks: u64,
109    mdat_payload_bytes: u64,
110}
111
112/// Internal discriminator chosen at `with_audio` time. Saves us re-parsing
113/// the codec string at every builder call site (build_audio_stsd, etc.) and
114/// keeps the AAC / Opus / AC-3 / E-AC-3 dispatch in one place.
115#[derive(Debug, Clone, Copy, PartialEq, Eq)]
116pub(super) enum AudioCodecKind {
117    Aac,
118    Opus,
119    Ac3,
120    Eac3,
121}
122
123impl AudioCodecKind {
124    pub(super) fn from_codec_tag(codec: &str) -> Option<Self> {
125        if codec.eq_ignore_ascii_case("aac") {
126            Some(Self::Aac)
127        } else if codec.eq_ignore_ascii_case("opus") {
128            Some(Self::Opus)
129        } else if codec.eq_ignore_ascii_case("ac3") || codec.eq_ignore_ascii_case("ac-3") {
130            Some(Self::Ac3)
131        } else if codec.eq_ignore_ascii_case("eac3") || codec.eq_ignore_ascii_case("e-ac-3") {
132            Some(Self::Eac3)
133        } else {
134            None
135        }
136    }
137}
138
139impl Av1Mp4Muxer {
140    /// AV1 muxer (the default + back-compatible constructor).
141    pub fn new(width: u32, height: u32, frame_rate: f64) -> Result<Self> {
142        Self::new_with_codec(width, height, frame_rate, VideoCodec::Av1)
143    }
144
145    /// Muxer for the given output `codec` — `Av1` (`av01`/`av1C`), `H264`
146    /// (`avc1`/`avcC`), or `H265` (`hvc1`/`hvcC`). H.264/H.265 callers feed the
147    /// encoder's **Annex-B** packets; the muxer repackages them to
148    /// length-prefixed samples + collects the parameter sets.
149    pub fn new_with_codec(
150        width: u32,
151        height: u32,
152        frame_rate: f64,
153        codec: VideoCodec,
154    ) -> Result<Self> {
155        Self::new_with_codec_opts(width, height, frame_rate, codec, false)
156    }
157
158    /// Like [`new_with_codec`] but with **inline parameter sets** for H.264/H.265
159    /// (the multi-GPU stitch). Each access unit keeps its own SPS/PPS(/VPS) and
160    /// the sample entry is `avc3`/`hev1`, so chunks from independent encoders
161    /// (possibly different vendors) decode with their own parameter sets.
162    pub fn new_with_codec_inline(
163        width: u32,
164        height: u32,
165        frame_rate: f64,
166        codec: VideoCodec,
167    ) -> Result<Self> {
168        Self::new_with_codec_opts(width, height, frame_rate, codec, true)
169    }
170
171    fn new_with_codec_opts(
172        width: u32,
173        height: u32,
174        frame_rate: f64,
175        codec: VideoCodec,
176        inline_param_sets: bool,
177    ) -> Result<Self> {
178        let mdat_tmp = NamedTempFile::new().context("creating mdat tempfile")?;
179        let handle = mdat_tmp
180            .reopen()
181            .context("reopening mdat tempfile for write")?;
182        let mdat_writer = BufWriter::new(handle);
183        let make = |c: NalMuxCodec| {
184            if inline_param_sets {
185                NalSampleWriter::new_inline(c)
186            } else {
187                NalSampleWriter::new(c)
188            }
189        };
190        let nal_writer = match codec {
191            VideoCodec::Av1 => None,
192            VideoCodec::H264 => Some(make(NalMuxCodec::H264)),
193            VideoCodec::H265 => Some(make(NalMuxCodec::H265)),
194        };
195        Ok(Self {
196            width,
197            height,
198            frame_rate,
199            mdat_tmp,
200            mdat_writer,
201            sample_sizes: Vec::new(),
202            keyframe_indices: Vec::new(),
203            first_packet_header: None,
204            packet_count: 0,
205            mdat_payload_bytes: 0,
206            audio: None,
207            color_metadata: ColorMetadata::default(),
208            force_largesize_mdat: false,
209            codec,
210            nal_writer,
211            inline_param_sets,
212        })
213    }
214
215    /// Test-only knob to exercise the 64-bit mdat largesize header without
216    /// crafting a multi-GiB payload. Production callers do not touch this —
217    /// the natural threshold (`mdat_payload + 8 > u32::MAX`) selects
218    /// largesize when the file genuinely needs it.
219    #[doc(hidden)]
220    pub fn force_largesize_mdat_for_test(&mut self) -> &mut Self {
221        self.force_largesize_mdat = true;
222        self
223    }
224
225    /// Carry the source's color metadata into the visual sample entry's
226    /// `colr nclx` box. Apple QuickTime / iOS Safari silently assume
227    /// BT.709 limited-range when `colr` is missing, which corrupts
228    /// BT.2020 HDR / wide-gamut clips. Pipeline calls this once after
229    /// demux but before any `add_packet` — though calling order is
230    /// not load-bearing because the metadata is only consumed by the
231    /// finalize-time `build_av01` builder.
232    pub fn set_color_metadata(&mut self, color_metadata: ColorMetadata) -> &mut Self {
233        self.color_metadata = color_metadata;
234        self
235    }
236
237    pub fn add_packet(&mut self, packet: EncodedPacket) -> Result<()> {
238        // AV1: store the OBU stream verbatim (the first packet carries the
239        // sequence header we embed in av1C). H.264/H.265: repackage the
240        // Annex-B frame into a length-prefixed mdat sample, capturing the
241        // parameter sets for the avcC/hvcC config box.
242        match &mut self.nal_writer {
243            None => {
244                // AV1: one OBU sample per packet.
245                if self.first_packet_header.is_none() {
246                    self.first_packet_header = Some(packet.data.to_vec());
247                }
248                self.write_sample(&packet.data.clone(), packet.is_keyframe)?;
249            }
250            Some(_) => {
251                // H.264/H.265: a packet may carry several access units; split it
252                // into one length-prefixed sample per frame (per-AU keyframe).
253                let writer = self.nal_writer.as_mut().unwrap();
254                let samples = writer.push_packet(&packet.data);
255                for au in samples {
256                    self.write_sample(&au.data, au.is_keyframe)?;
257                }
258            }
259        }
260        Ok(())
261    }
262
263    /// Append one finished sample to the mdat tempfile + update the per-sample
264    /// tables (size, keyframe index, payload total).
265    fn write_sample(&mut self, sample: &[u8], is_keyframe: bool) -> Result<()> {
266        let size = sample.len() as u32;
267        self.mdat_writer
268            .write_all(sample)
269            .context("writing sample to mdat tempfile")?;
270        self.sample_sizes.push(size);
271        self.packet_count = self
272            .packet_count
273            .checked_add(1)
274            .context("packet count overflow")?;
275        if is_keyframe {
276            self.keyframe_indices.push(self.packet_count);
277        }
278        self.mdat_payload_bytes = self
279            .mdat_payload_bytes
280            .checked_add(size as u64)
281            .context("mdat payload overflow")?;
282        Ok(())
283    }
284
285    /// before `add_audio_sample`. Validates codec ∈ {AAC family, Opus,
286    /// AC-3, E-AC-3} with codec-appropriate channel-count gates —
287    /// anything outside the supported envelope must fail loudly (no
288    /// silent degradation, no stubs).
289    ///
290    /// AAC family path (Squad-18 + Squad-25): emits `mp4a` sample entry +
291    /// `esds` descriptor tree carrying the AudioSpecificConfig verbatim,
292    /// plus an Apple `chan` (Channel Layout) box for ≥3-channel streams
293    /// so iOS Safari / QuickTime / AVFoundation render the correct layout
294    /// instead of defaulting to L+R. Accepts:
295    ///   - AAC-LC (AOT=2), mono / stereo / 5.1 / 7.1
296    ///   - HE-AAC v1 (explicit-signaled SBR; ASC starts AOT=5)
297    ///   - HE-AAC v2 (explicit-signaled PS; ASC starts AOT=29)
298    ///
299    /// Implicit-signaled HE-AAC (AOT=2 leading byte at low core rate ≤24 kHz)
300    /// is rejected — the caller (`pipeline::transcode::route_audio`) is
301    /// responsible for upgrading the ASC via
302    /// `aac_asc::upgrade_to_explicit_signaling` before reaching the mux.
303    ///
304    /// Opus path (Squad-23 + Squad-28, RFC 7845): emits `Opus` sample entry
305    /// + `dOps` (Opus-Specific Box) carrying the OpusHead body verbatim.
306    /// Mono / stereo via ChannelMappingFamily=0 (Squad-23) or 3..=8
307    /// channels via ChannelMappingFamily=1 surround layouts (Squad-28).
308    /// Requires `info.codec_private` populated with the appropriate-form
309    /// OpusHead body. The mdhd timescale is pinned to 48000 per RFC 7845
310    /// §3 — the `info.timescale` is validated equal.
311    ///
312    /// AC-3 path (Squad-26, ETSI TS 102 366 §F.2): emits `ac-3` sample
313    /// entry + `dac3` config box carrying the 3-byte body verbatim. Up
314    /// to 5.1 channels. Sample rates 32 / 44.1 / 48 kHz only.
315    ///
316    /// E-AC-3 path (Squad-26, ETSI TS 102 366 §F.5): emits `ec-3` sample
317    /// entry + `dec3` config box. Up to 5.1 channels in v1 scope (single
318    /// independent substream). Sample rates 16 / 22.05 / 24 / 32 / 44.1 /
319    /// 48 kHz.
320    ///
321    /// Returns `&mut Self` for builder-style chaining. The audio tempfile
322    /// is created eagerly so tempdir failures surface here rather than at
323    /// `add_audio_sample` time.
324    pub fn with_audio(&mut self, info: AudioInfo) -> Result<&mut Self> {
325        // Codec dispatch: AAC, Opus, AC-3, E-AC-3 are the supported
326        // families. Other codec tags (mp3, vorbis, ...) are intentionally
327        // rejected here so the pipeline fall-back path in `transcode.rs` can
328        // surface a clean warn and emit video-only.
329        let codec_kind = AudioCodecKind::from_codec_tag(&info.codec).ok_or_else(|| {
330            anyhow::anyhow!(
331                "audio mux: only AAC-LC, Opus, AC-3, E-AC-3 are supported; got codec '{}'",
332                info.codec
333            )
334        })?;
335        // Per-codec channel-count gates.
336        // - AAC: standard MPEG channelConfiguration values 1 (mono) /
337        //   2 (stereo) / 6 (5.1) / 7 (7.1). Multichannel adds an Apple
338        //   `chan` box (Squad-25) for QuickTime / AVFoundation rendering.
339        // - Opus: 1..=8. Mono/stereo via ChannelMappingFamily=0 (Squad-23);
340        //   3..=8 ride the dOps family-1 surround trailer per RFC 7845
341        //   §5.1.1.2 (Squad-28 multistream).
342        // - AC-3 / E-AC-3: up to 6 channels (5.1). The real layout lives
343        //   in `acmod`+`lfeon` inside the dac3/dec3 body; the
344        //   AudioSampleEntry channelcount is informational. v1 scope keeps
345        //   things tight at 5.1.
346        match codec_kind {
347            AudioCodecKind::Aac => {
348                if !matches!(info.channels, 1 | 2 | 6 | 7) {
349                    anyhow::bail!(
350                        "audio mux: AAC supports mono/stereo/5.1(channels=6)/7.1(channels=7) layouts; \
351                         got {} channels — extended Atmos / object layouts are not supported",
352                        info.channels
353                    );
354                }
355            }
356            AudioCodecKind::Opus => {
357                if info.channels < 1 || info.channels > 8 {
358                    anyhow::bail!(
359                        "audio mux: Opus supports 1..=8 channels; got {}",
360                        info.channels
361                    );
362                }
363            }
364            AudioCodecKind::Ac3 | AudioCodecKind::Eac3 => {
365                if !(1..=6).contains(&info.channels) {
366                    anyhow::bail!(
367                        "audio mux: AC-3 / E-AC-3 channel count must be 1..=6 (mono..5.1); got {}",
368                        info.channels
369                    );
370                }
371            }
372        }
373        if info.sample_rate == 0 {
374            anyhow::bail!("audio mux: sample_rate must be > 0");
375        }
376        if info.timescale == 0 {
377            anyhow::bail!("audio mux: timescale must be > 0");
378        }
379        match codec_kind {
380            AudioCodecKind::Aac => {
381                if info.asc_bytes.is_empty() {
382                    anyhow::bail!("audio mux: AudioSpecificConfig bytes missing");
383                }
384                // Parse the ASC's leading AOT (with the 5-bit raw + 6-bit
385                // extension escape per ISO 14496-3 §1.6.2.1) so HE-AAC
386                // explicit signaling isn't rejected by a naive `>>3 & 0x1F`
387                // peek. Squad-25 lifts the prior AAC-LC-only gate.
388                let parsed = crate::aac_asc::parse_aac_asc(&info.asc_bytes)
389                    .with_context(|| "audio mux: failed to parse AudioSpecificConfig")?;
390                use crate::aac_asc::AscSignaling;
391                match parsed.signaling {
392                    AscSignaling::ImplicitMaybe => {
393                        anyhow::bail!(
394                            "audio mux: ASC uses implicit HE-AAC signaling (AOT=2 core at \
395                             {} Hz with no SBR/PS layer in the ASC). Apple players silently \
396                             downgrade to mono 22.05 kHz core. Caller must upgrade with \
397                             aac_asc::upgrade_to_explicit_signaling before muxing.",
398                            parsed.sample_rate
399                        );
400                    }
401                    AscSignaling::NoExtension
402                    | AscSignaling::ExplicitSbr
403                    | AscSignaling::ExplicitPs => {
404                        // AOT=2 (LC), AOT=5 (SBR-wrapped LC), AOT=29 (PS-wrapped LC),
405                        // and AOT=42 (xHE-AAC USAC) are all accepted at the mux
406                        // level. The `esds` writer emits the ASC verbatim so the
407                        // decoder receives whatever signaling the ASC carries.
408                        let core_aot = parsed.aot;
409                        if !matches!(core_aot, 2 | 42) {
410                            anyhow::bail!(
411                                "audio mux: only AAC-LC (AOT=2) and xHE-AAC USAC (AOT=42) \
412                                 cores are supported; ASC core AOT={}",
413                                core_aot
414                            );
415                        }
416                    }
417                }
418            }
419            AudioCodecKind::Opus => {
420                // OpusHead body without the 8-byte 'OpusHead' magic is 11
421                // bytes minimum for ChannelMappingFamily=0 (RFC 7845 §5.1).
422                // Reject anything shorter — the dOps writer can't synthesize
423                // a missing field and producing an empty box would silently
424                // break every player.
425                if info.codec_private.len() < 11 {
426                    anyhow::bail!(
427                        "audio mux: Opus codec_private must be ≥11 bytes (RFC 7845 §5.1 \
428                         minimum body for ChannelMappingFamily=0); got {} bytes",
429                        info.codec_private.len()
430                    );
431                }
432                // RFC 7845 §3: the audio mdhd timescale MUST be 48000 for
433                // Opus. The CALLER pins this in `AudioInfo::opus(...)`; if
434                // they hand-built an `AudioInfo` with a different timescale
435                // we reject loudly so a downstream stts mismatch can't
436                // silently shift PTS by a small fraction.
437                if info.timescale != 48_000 {
438                    anyhow::bail!(
439                        "audio mux: Opus mdhd timescale must be 48000 (RFC 7845 §3); \
440                         got timescale={}",
441                        info.timescale
442                    );
443                }
444                // ChannelMappingFamily byte (offset 10 in the OpusHead body
445                // we emit into dOps). Family 0 is mono/stereo (1..=2
446                // channels). Family 1 (Squad-28) is surround for 1..=8
447                // channels; requires a 2 + N byte trailer
448                // (StreamCount + CoupledCount + ChannelMapping[N]) per
449                // RFC 7845 §5.1.1. Family 255 (arbitrary mappings) and
450                // any other unknown family are rejected.
451                let cmf = info.codec_private[10];
452                match cmf {
453                    0 => {
454                        // RFC 7845 §5.1.1: family 0 is defined for
455                        // 1..=2 channels only.
456                        if info.channels > 2 {
457                            anyhow::bail!(
458                                "audio mux: Opus ChannelMappingFamily=0 only supports 1..=2 channels; got {}",
459                                info.channels
460                            );
461                        }
462                    }
463                    1 => {
464                        // Family 1 needs StreamCount + CoupledCount +
465                        // ChannelMapping[channels] after the 11-byte
466                        // preamble. Total dOps body = 11 + 2 + N.
467                        let n = info.channels as usize;
468                        let needed = 11 + 2 + n;
469                        if info.codec_private.len() < needed {
470                            anyhow::bail!(
471                                "audio mux: Opus family=1 codec_private must be ≥{needed} bytes \
472                                 (11 preamble + 2 stream/coupled + {n} mapping); got {}",
473                                info.codec_private.len()
474                            );
475                        }
476                        let stream_count = info.codec_private[11];
477                        let coupled_count = info.codec_private[12];
478                        // libopus invariants (RFC 7845 §5.1.1):
479                        //   - StreamCount >= 1
480                        //   - CoupledCount <= StreamCount
481                        //   - StreamCount + CoupledCount <= 255 (always
482                        //     true at our scale)
483                        //   - StreamCount + CoupledCount <= channels
484                        //     (every encoder stream covers >=1 channel)
485                        if stream_count < 1 {
486                            anyhow::bail!(
487                                "audio mux: Opus family=1 StreamCount must be >= 1; got {stream_count}"
488                            );
489                        }
490                        if coupled_count > stream_count {
491                            anyhow::bail!(
492                                "audio mux: Opus family=1 CoupledCount ({coupled_count}) > StreamCount ({stream_count})"
493                            );
494                        }
495                        if (stream_count as u16) + (coupled_count as u16) > info.channels {
496                            anyhow::bail!(
497                                "audio mux: Opus family=1 StreamCount ({stream_count}) + CoupledCount ({coupled_count}) > channels ({})",
498                                info.channels
499                            );
500                        }
501                        // ChannelMapping[i] must be < streams +
502                        // coupled (i.e. a valid encoder-stream index).
503                        let mapping_max = stream_count + coupled_count;
504                        for i in 0..n {
505                            let m = info.codec_private[13 + i];
506                            if m >= mapping_max {
507                                anyhow::bail!(
508                                    "audio mux: Opus family=1 ChannelMapping[{i}]={m} \
509                                     exceeds streams+coupled ({mapping_max})"
510                                );
511                            }
512                        }
513                    }
514                    other => {
515                        anyhow::bail!(
516                            "audio mux: only Opus ChannelMappingFamily 0 (mono/stereo) and 1 (surround 1..=8) supported; \
517                             got family={other}"
518                        );
519                    }
520                }
521            }
522            AudioCodecKind::Ac3 => {
523                // dac3 body is exactly 3 bytes per ETSI TS 102 366 §F.4
524                // (fscod 2b | bsid 5b | bsmod 3b | acmod 3b | lfeon 1b |
525                //  bit_rate_code 5b | reserved 5b => 24 bits total).
526                if info.codec_private.len() != 3 {
527                    anyhow::bail!(
528                        "audio mux: AC-3 codec_private (dac3 body) must be exactly 3 bytes \
529                         per ETSI TS 102 366 §F.4; got {} bytes",
530                        info.codec_private.len()
531                    );
532                }
533                // Sample rate sanity per ETSI TS 102 366 Table F.5.
534                match info.sample_rate {
535                    32_000 | 44_100 | 48_000 => {}
536                    other => anyhow::bail!(
537                        "audio mux: AC-3 sample_rate must be 32000 / 44100 / 48000; got {}",
538                        other
539                    ),
540                }
541            }
542            AudioCodecKind::Eac3 => {
543                // dec3 body is variable-size; minimum body is 5 bytes for a
544                // single independent substream with no dependent substreams
545                // (data_rate 13b + num_ind_sub 3b = 2B + per-indep-substream
546                //  fscod/bsid/asvc/bsmod/acmod/lfeon/num_dep_sub fields
547                //  packed into the next 3 bytes). Reject anything shorter.
548                if info.codec_private.len() < 5 {
549                    anyhow::bail!(
550                        "audio mux: E-AC-3 codec_private (dec3 body) must be ≥5 bytes \
551                         per ETSI TS 102 366 §F.6; got {} bytes",
552                        info.codec_private.len()
553                    );
554                }
555                // E-AC-3 sample rates: 32 / 44.1 / 48 kHz at "full" rate
556                // plus 16 / 22.05 / 24 kHz "reduced rate" (fscod==3 path).
557                match info.sample_rate {
558                    16_000 | 22_050 | 24_000 | 32_000 | 44_100 | 48_000 => {}
559                    other => anyhow::bail!(
560                        "audio mux: E-AC-3 sample_rate must be 16000 / 22050 / 24000 / 32000 / \
561                         44100 / 48000; got {}",
562                        other
563                    ),
564                }
565            }
566        }
567        if self.audio.is_some() {
568            anyhow::bail!("audio mux: with_audio called twice");
569        }
570        let audio_tmp = NamedTempFile::new().context("creating audio mdat tempfile")?;
571        let handle = audio_tmp
572            .reopen()
573            .context("reopening audio tempfile for write")?;
574        let audio_writer = BufWriter::new(handle);
575        self.audio = Some(AudioTrackState {
576            info,
577            audio_tmp,
578            audio_writer,
579            sample_sizes: Vec::new(),
580            durations: Vec::new(),
581            total_duration_ticks: 0,
582            mdat_payload_bytes: 0,
583        });
584        Ok(self)
585    }
586
587    /// Append one audio access unit (AAC AU / Opus packet / AC-3 syncframe /
588    /// E-AC-3 syncframe). `pts_ticks` is currently informational only —
589    /// ISOBMFF doesn't store per-sample PTS directly; stts durations imply
590    /// a running clock starting at 0. We accept it in the API to keep the
591    /// signature extensible (edit-lists / ctts for offset signalling can
592    /// land here later).
593    pub fn add_audio_sample(
594        &mut self,
595        sample: &[u8],
596        _pts_ticks: u64,
597        duration_ticks: u32,
598    ) -> Result<()> {
599        let audio = self
600            .audio
601            .as_mut()
602            .context("audio mux: add_audio_sample called before with_audio")?;
603        if sample.is_empty() {
604            anyhow::bail!("audio mux: refusing to add empty audio access unit");
605        }
606        audio
607            .audio_writer
608            .write_all(sample)
609            .context("writing audio sample to tempfile")?;
610        audio.sample_sizes.push(sample.len() as u32);
611        let dur = if duration_ticks == 0 {
612            // Codec-aware default frame duration. AAC: 1024 samples (the
613            // natural transform length); Opus: 960 ticks @ 48 kHz = 20 ms
614            // (the standard libopus encoder frame size); AC-3: 1536 samples
615            // per syncframe (6 blocks × 256 samples per ETSI TS 102 366);
616            // E-AC-3: 1536 samples for the dominant numblkscod=3 / 6-block
617            // case (other numblkscod values would be 256/512/768 — caller
618            // should override). Most common defaults; callers can override
619            // with an explicit non-zero `duration_ticks`.
620            match AudioCodecKind::from_codec_tag(&audio.info.codec) {
621                Some(AudioCodecKind::Aac) => 1024,
622                Some(AudioCodecKind::Opus) => 960,
623                Some(AudioCodecKind::Ac3) | Some(AudioCodecKind::Eac3) => 1536,
624                None => 1024, // unreachable: with_audio gates the codec tag
625            }
626        } else {
627            duration_ticks
628        };
629        audio.durations.push(dur);
630        audio.total_duration_ticks = audio
631            .total_duration_ticks
632            .checked_add(dur as u64)
633            .context("audio total duration overflow")?;
634        audio.mdat_payload_bytes = audio
635            .mdat_payload_bytes
636            .checked_add(sample.len() as u64)
637            .context("audio mdat payload overflow")?;
638        Ok(())
639    }
640
641    /// Write ftyp + moov + mdat into `output_path`. Faststart preserved.
642    ///
643    /// When audio is present (via `with_audio`), writes an interleaved mdat
644    /// with chunk-alternation: one ~1s video chunk then one ~1s audio chunk,
645    /// repeated until both tracks are drained. stco/co64 entries in each
646    /// trak's stbl point at the first sample of that trak's chunk inside
647    /// the shared mdat.
648    pub fn finalize_to_file(mut self, output_path: &Path) -> Result<()> {
649        if self.packet_count == 0 {
650            anyhow::bail!("cannot finalize MP4 with zero packets");
651        }
652        self.mdat_writer.flush().context("flushing mdat tempfile")?;
653        if let Some(ref mut audio) = self.audio {
654            audio
655                .audio_writer
656                .flush()
657                .context("flushing audio mdat tempfile")?;
658            if audio.sample_sizes.is_empty() {
659                // Caller called with_audio but never pushed a sample. Safer
660                // to drop the audio track than emit an empty audio trak
661                // that confuses players.
662                tracing::warn!(
663                    "audio mux: with_audio called but no samples pushed; dropping audio"
664                );
665                self.audio = None;
666            }
667        }
668
669        // 90 kHz matches ffmpeg/x264/x265 and divides evenly for 23.976 /
670        // 29.97 / 59.94 fps.
671        let video_timescale: u32 = 90_000;
672        let frame_duration: u32 = ((video_timescale as f64) / self.frame_rate)
673            .round()
674            .max(1.0) as u32;
675        let total_video_duration: u64 = frame_duration as u64 * self.packet_count as u64;
676
677        // Build the visual sample entry up front (codec-dispatched). For AV1
678        // it embeds the sequence-header OBU in av1C; for H.264/H.265 it embeds
679        // the parameter sets captured during add_packet in avcC/hvcC.
680        let video_sample_entry = match self.codec {
681            VideoCodec::Av1 => {
682                let first_packet = self
683                    .first_packet_header
684                    .as_ref()
685                    .context("first packet header missing; add_packet never called?")?;
686                let av1_obus = extract_sequence_header(first_packet)
687                    .context("extracting AV1 sequence header OBU from first packet")?;
688                build_av01(self.width, self.height, &av1_obus, &self.color_metadata)
689            }
690            VideoCodec::H264 => {
691                let w = self.nal_writer.as_ref().context("H.264 nal writer missing")?;
692                if !w.has_param_sets() {
693                    anyhow::bail!("H.264 mux: no SPS/PPS captured from the encoder bitstream");
694                }
695                let avcc = build_avcc(&w.sps, &w.pps);
696                // `avc3` signals in-band parameter sets (inline-stitch mode);
697                // `avc1` requires them out-of-band only.
698                let fourcc = if self.inline_param_sets { b"avc3" } else { b"avc1" };
699                build_avc1(self.width, self.height, &avcc, &self.color_metadata, fourcc)
700            }
701            VideoCodec::H265 => {
702                let w = self.nal_writer.as_ref().context("H.265 nal writer missing")?;
703                if !w.has_param_sets() {
704                    anyhow::bail!("H.265 mux: no VPS/SPS/PPS captured from the encoder bitstream");
705                }
706                let hvcc = build_hvcc(&w.vps, &w.sps, &w.pps);
707                // `hev1` signals in-band parameter sets; `hvc1` is out-of-band.
708                let fourcc = if self.inline_param_sets { b"hev1" } else { b"hvc1" };
709                build_hvc1(self.width, self.height, &hvcc, &self.color_metadata, fourcc)
710            }
711        };
712
713        let ftyp = build_ftyp(self.codec);
714
715        // Chunking policy: one second per chunk, capped at 120 for video
716        // and 200 for audio. Matching ~1 s per chunk on both sides keeps
717        // seek granularity consistent and bounds stsc/stco table sizes.
718        let video_spc: u32 = (self.frame_rate.round() as u32).max(1).min(120);
719
720        // Pre-compute audio chunking + per-track totals so the movie header
721        // can report `max(video_duration, audio_duration)` in movie timescale.
722        // Choose movie timescale = max(video, audio) timescales so both
723        // durations convert integer-cleanly (we use video's 90 kHz which is
724        // already a multiple of all common audio rates' divisors in the
725        // chosen target — but we do the conversion explicitly either way
726        // since 48000 ∤ 90000; we round-to-nearest which is what ISOBMFF
727        // players expect for track duration display).
728        let movie_timescale: u32 = video_timescale;
729
730        let audio_plan: Option<AudioBuildPlan> = self.audio.as_ref().map(|a| {
731            // Chunking policy: aim for ~1 second of audio per chunk.
732            // Frame size differs by codec — AAC = 1024 samples / frame,
733            // Opus = 960 samples / frame at 48 kHz (the standard encoder
734            // frame size; callers using 2.5 / 5 / 10 / 40 / 60 ms frames
735            // would diverge but the chunk-size cap and the 1-second
736            // target both still apply, so the worst case is a slightly
737            // suboptimal chunk granularity rather than a structurally
738            // broken file). The mdhd timescale is `a.info.timescale`
739            // (sample_rate for AAC, 48000 for Opus).
740            let frames_per_sec = match AudioCodecKind::from_codec_tag(&a.info.codec) {
741                Some(AudioCodecKind::Opus) => (a.info.timescale as f64) / 960.0,
742                // AC-3 / E-AC-3: 1536 samples per syncframe (6 blocks × 256).
743                Some(AudioCodecKind::Ac3) | Some(AudioCodecKind::Eac3) => {
744                    (a.info.timescale as f64) / 1536.0
745                }
746                Some(AudioCodecKind::Aac) | None => (a.info.timescale as f64) / 1024.0,
747            };
748            let audio_spc = (frames_per_sec.round() as u32).max(1).min(200);
749            let audio_duration_movie: u64 =
750                ((a.total_duration_ticks as u128) * movie_timescale as u128
751                    / a.info.timescale.max(1) as u128) as u64;
752            AudioBuildPlan {
753                info: a.info.clone(),
754                sample_sizes: a.sample_sizes.clone(),
755                durations: a.durations.clone(),
756                total_duration_in_own_ts: a.total_duration_ticks,
757                total_duration_in_movie_ts: audio_duration_movie,
758                samples_per_chunk: audio_spc,
759            }
760        });
761
762        let video_duration_movie: u64 = total_video_duration; // video uses 90 kHz == movie
763        let movie_duration: u64 = match audio_plan.as_ref() {
764            Some(p) => video_duration_movie.max(p.total_duration_in_movie_ts),
765            None => video_duration_movie,
766        };
767
768        // Video-side mdat byte total stays in self; audio side is in plan.
769        let video_payload_bytes = self.mdat_payload_bytes;
770        let audio_payload_bytes = audio_plan
771            .as_ref()
772            .map(|p| p.sample_sizes.iter().map(|&s| s as u64).sum::<u64>())
773            .unwrap_or(0);
774        let mdat_payload_total = video_payload_bytes
775            .checked_add(audio_payload_bytes)
776            .context("combined mdat payload overflow")?;
777
778        // mdat box-size policy. The 32-bit `size` field maxes at
779        // u32::MAX; the box header is 8 bytes (size + type). When the box
780        // body alone would push the total past u32::MAX - 8, we switch to
781        // the ISOBMFF 14496-12 §4.2 largesize form: `size = 1` (32 bits),
782        // `type = 'mdat'`, then a 64-bit `largesize` field carrying the
783        // total box length (header + payload). Header grows from 8 → 16
784        // bytes which means stco/co64 offsets must reflect the post-header
785        // start.
786        let mdat_payload_plus_short_header = 8u64
787            .checked_add(mdat_payload_total)
788            .context("mdat short-header size overflow")?;
789        // Production: pick largesize iff the payload + short header
790        // exceeds u32. Tests can force largesize on to exercise the
791        // bit-layout without crafting a 4 GiB tempfile.
792        let use_largesize_mdat =
793            mdat_payload_plus_short_header > u32::MAX as u64 || self.force_largesize_mdat;
794        let mdat_header_len: u64 = if use_largesize_mdat { 16 } else { 8 };
795        let mdat_box_size: u64 = mdat_header_len
796            .checked_add(mdat_payload_total)
797            .context("mdat box size overflow")?;
798
799        // Two-pass moov construction. On pass 1 we need placeholder offsets
800        // of consistent widths to size the moov; on pass 2 we use the real
801        // offsets computed against the planned mdat layout.
802        let video_chunk_count = chunk_count_of(self.sample_sizes.len(), video_spc);
803        let audio_chunk_count = audio_plan
804            .as_ref()
805            .map(|p| chunk_count_of(p.sample_sizes.len(), p.samples_per_chunk))
806            .unwrap_or(0);
807        let video_zero_offsets: Vec<u64> = vec![0; video_chunk_count];
808        let audio_zero_offsets: Vec<u64> = vec![0; audio_chunk_count];
809
810        let moov_co64_size = build_moov_any(
811            self.width,
812            self.height,
813            video_timescale,
814            movie_timescale,
815            movie_duration,
816            total_video_duration,
817            frame_duration,
818            &self.sample_sizes,
819            &self.keyframe_indices,
820            &video_sample_entry,
821            &video_zero_offsets,
822            video_spc,
823            audio_plan.as_ref(),
824            &audio_zero_offsets,
825            true,
826            &self.color_metadata,
827        )
828        .len() as u64;
829
830        let upper_bound: u64 = (ftyp.len() as u64)
831            .checked_add(moov_co64_size)
832            .context("moov size overflow")?
833            .checked_add(mdat_header_len)
834            .context("mdat header overflow")?
835            .checked_add(mdat_payload_total)
836            .context("mdat payload overflow")?;
837        let use_co64 = upper_bound > u32::MAX as u64;
838
839        let moov_without_offsets = build_moov_any(
840            self.width,
841            self.height,
842            video_timescale,
843            movie_timescale,
844            movie_duration,
845            total_video_duration,
846            frame_duration,
847            &self.sample_sizes,
848            &self.keyframe_indices,
849            &video_sample_entry,
850            &video_zero_offsets,
851            video_spc,
852            audio_plan.as_ref(),
853            &audio_zero_offsets,
854            use_co64,
855            &self.color_metadata,
856        );
857
858        let mdat_offset_in_file = (ftyp.len() + moov_without_offsets.len()) as u64;
859        let first_sample_file_offset = mdat_offset_in_file + mdat_header_len;
860        if !use_co64 && first_sample_file_offset > u32::MAX as u64 {
861            anyhow::bail!(
862                "internal: chose stco but first_sample_file_offset {} exceeds u32",
863                first_sample_file_offset
864            );
865        }
866
867        // Compute interleaved chunk offsets. No audio → contiguous video
868        // chunks (unchanged behaviour). Audio present → alternating video,
869        // audio, video, audio, ..., tail is whichever side has samples left.
870        let (video_chunk_offsets, audio_chunk_offsets, interleave_plan) = plan_interleaved_layout(
871            first_sample_file_offset,
872            &self.sample_sizes,
873            video_spc,
874            audio_plan.as_ref(),
875        );
876        debug_assert_eq!(video_chunk_offsets.len(), video_chunk_count);
877        debug_assert_eq!(audio_chunk_offsets.len(), audio_chunk_count);
878
879        let moov = build_moov_any(
880            self.width,
881            self.height,
882            video_timescale,
883            movie_timescale,
884            movie_duration,
885            total_video_duration,
886            frame_duration,
887            &self.sample_sizes,
888            &self.keyframe_indices,
889            &video_sample_entry,
890            &video_chunk_offsets,
891            video_spc,
892            audio_plan.as_ref(),
893            &audio_chunk_offsets,
894            use_co64,
895            &self.color_metadata,
896        );
897
898        assert_eq!(
899            moov.len(),
900            moov_without_offsets.len(),
901            "moov size must be stable across rebuild"
902        );
903
904        // Stream final layout: ftyp + moov + mdat-header + mdat-payload.
905        let out_file = File::create(output_path)
906            .with_context(|| format!("creating output file {}", output_path.display()))?;
907        let mut out = BufWriter::new(out_file);
908        out.write_all(&ftyp).context("writing ftyp")?;
909        out.write_all(&moov).context("writing moov")?;
910        if use_largesize_mdat {
911            // size=1 sentinel, then 'mdat', then 64-bit largesize.
912            out.write_all(&1u32.to_be_bytes())
913                .context("writing mdat largesize sentinel")?;
914            out.write_all(b"mdat").context("writing mdat type")?;
915            out.write_all(&mdat_box_size.to_be_bytes())
916                .context("writing mdat largesize")?;
917        } else {
918            let mdat_size_u32 = mdat_box_size as u32;
919            out.write_all(&mdat_size_u32.to_be_bytes())
920                .context("writing mdat size")?;
921            out.write_all(b"mdat").context("writing mdat type")?;
922        }
923
924        // Stream mdat bytes per the interleave plan. Each InterleaveStep
925        // records which track and how many bytes to copy from that track's
926        // tempfile. We reopen both tempfiles once and copy by range so we
927        // never buffer the full payload.
928        let video_payload_handle = self
929            .mdat_tmp
930            .reopen()
931            .context("reopening mdat tempfile for read")?;
932        let mut video_payload = BufReader::new(video_payload_handle);
933        video_payload
934            .seek(SeekFrom::Start(0))
935            .context("rewinding mdat tempfile")?;
936
937        let mut audio_payload: Option<BufReader<File>> = match self.audio.as_ref() {
938            Some(a) => {
939                let h = a
940                    .audio_tmp
941                    .reopen()
942                    .context("reopening audio mdat tempfile for read")?;
943                let mut r = BufReader::new(h);
944                r.seek(SeekFrom::Start(0))
945                    .context("rewinding audio mdat tempfile")?;
946                Some(r)
947            }
948            None => None,
949        };
950
951        let mut video_copied: u64 = 0;
952        let mut audio_copied: u64 = 0;
953        for step in &interleave_plan {
954            match step.track {
955                sample_table::InterleaveTrack::Video => {
956                    let copied =
957                        std::io::copy(&mut (&mut video_payload).take(step.bytes), &mut out)
958                            .context("copying video chunk into mdat")?;
959                    if copied != step.bytes {
960                        anyhow::bail!(
961                            "video chunk short read: wanted {}, got {}",
962                            step.bytes,
963                            copied
964                        );
965                    }
966                    video_copied += copied;
967                }
968                sample_table::InterleaveTrack::Audio => {
969                    let audio_r = audio_payload.as_mut().context(
970                        "internal: interleave plan has audio step but no audio tempfile",
971                    )?;
972                    let copied = std::io::copy(&mut audio_r.take(step.bytes), &mut out)
973                        .context("copying audio chunk into mdat")?;
974                    if copied != step.bytes {
975                        anyhow::bail!(
976                            "audio chunk short read: wanted {}, got {}",
977                            step.bytes,
978                            copied
979                        );
980                    }
981                    audio_copied += copied;
982                }
983            }
984        }
985        if video_copied != video_payload_bytes {
986            anyhow::bail!(
987                "video mdat payload length mismatch: expected {}, copied {}",
988                video_payload_bytes,
989                video_copied
990            );
991        }
992        if audio_copied != audio_payload_bytes {
993            anyhow::bail!(
994                "audio mdat payload length mismatch: expected {}, copied {}",
995                audio_payload_bytes,
996                audio_copied
997            );
998        }
999        out.flush().context("flushing output")?;
1000
1001        Ok(())
1002    }
1003
1004    /// Back-compat: finalize into memory. Writes to a second tempfile then
1005    /// reads it back. Callers hitting the 4 GB ceiling should use
1006    /// `finalize_to_file` instead.
1007    pub fn finalize(self) -> Result<Bytes> {
1008        let tmp = NamedTempFile::new().context("creating finalize buffer tempfile")?;
1009        let path = tmp.path().to_path_buf();
1010        self.finalize_to_file(&path)?;
1011        let mut f = File::open(&path).context("reopening finalize buffer tempfile")?;
1012        let mut buf = Vec::new();
1013        f.read_to_end(&mut buf).context("reading finalize buffer")?;
1014        Ok(Bytes::from(buf))
1015    }
1016}