Skip to main content

container/cmaf/
fragment.rs

//! Fragment-level box writers: `mfhd`, `tfhd`, `tfdt`, `trun`, `traf`, `moof`.
//!
//! Every function here maps to one ISO 14496-12 §8.8 box.  The public ones
//! (`build_mfhd`, `build_tfhd`, `build_tfdt`, `build_moof_video`,
//! `build_moof_audio`) are re-exported from the parent module.  The private
//! helpers (`build_trun_video`, `build_trun_audio`, `build_traf`) are used
//! only by the two `build_moof_*` compositors and stay private to this module.
8
9use crate::mux::BoxBuilder;
10
11use super::{CmafSample, SampleFlags};
12
13/// `mfhd` — Movie Fragment Header (14496-12 §8.8.5).
14///
15/// Carries the per-fragment sequence number. CMAF requires
16/// `sequence_number` to be monotonic and start at 1 for the first
17/// fragment of each track.
18///
19/// Wire layout (16 bytes total):
20/// ```text
21///   size:u32          = 16
22///   type:'mfhd'
23///   version:u8        = 0
24///   flags:u24         = 0
25///   sequence_number:u32
26/// ```
27pub fn build_mfhd(sequence_number: u32) -> Vec<u8> {
28    let mut b = BoxBuilder::new(b"mfhd");
29    b.u8(0); // version
30    b.extend(&[0, 0, 0]); // flags
31    b.u32(sequence_number);
32    b.finish()
33}
34
35/// `tfhd` — Track Fragment Header (14496-12 §8.8.7).
36///
37/// We always set the `default-base-is-moof` flag (`0x020000`) — required
38/// by CMAF §7.3.2.1. With this flag, sample data offsets in `trun`
39/// become relative to the start of the enclosing `moof`, which is
40/// exactly what HLS-CMAF expects. We avoid emitting `base_data_offset`
41/// (an absolute file offset that breaks segment portability).
42///
43/// Optional fields are emitted based on the bitwise combination of
44/// `tf_flags`:
45///   0x000001 base_data_offset            (NOT emitted; we use default-base-is-moof)
46///   0x000002 sample_description_index    (only if non-default needed)
47///   0x000008 default_sample_duration     (emitted when `default_duration.is_some()`)
48///   0x000010 default_sample_size         (emitted when `default_size.is_some()`)
49///   0x000020 default_sample_flags        (emitted when `default_flags.is_some()`)
50///   0x010000 duration-is-empty
51///   0x020000 default-base-is-moof        (always emitted)
52pub fn build_tfhd(
53    track_id: u32,
54    default_duration: Option<u32>,
55    default_size: Option<u32>,
56    default_flags: Option<u32>,
57) -> Vec<u8> {
58    let mut tf_flags: u32 = 0x020000; // default-base-is-moof
59    if default_duration.is_some() {
60        tf_flags |= 0x000008;
61    }
62    if default_size.is_some() {
63        tf_flags |= 0x000010;
64    }
65    if default_flags.is_some() {
66        tf_flags |= 0x000020;
67    }
68
69    let mut b = BoxBuilder::new(b"tfhd");
70    b.u8(0); // version
71    let flag_bytes = tf_flags.to_be_bytes();
72    b.extend(&flag_bytes[1..]); // 24-bit flags (drop high byte)
73    b.u32(track_id);
74    if let Some(d) = default_duration {
75        b.u32(d);
76    }
77    if let Some(s) = default_size {
78        b.u32(s);
79    }
80    if let Some(f) = default_flags {
81        b.u32(f);
82    }
83    b.finish()
84}
85
86/// `tfdt` — Track Fragment Decode Time (14496-12 §8.8.12).
87///
88/// Carries the absolute decode time of the first sample in this
89/// fragment, in track timescale ticks, accumulated from the start of
90/// the track (NOT from the start of the fragment). Required by CMAF
91/// §7.3.2.1.
92///
93/// We always emit version 1 (u64 decode time). Version 0's u32 wraps
94/// at ~24h for a 48 kHz audio track; version 1 covers >12 million
95/// years at the same rate. The 4 extra bytes are immaterial.
96///
97/// Wire layout (20 bytes total):
98/// ```text
99///   size:u32          = 20
100///   type:'tfdt'
101///   version:u8        = 1
102///   flags:u24         = 0
103///   base_media_decode_time:u64
104/// ```
105pub fn build_tfdt(base_media_decode_time: u64) -> Vec<u8> {
106    let mut b = BoxBuilder::new(b"tfdt");
107    b.u8(1); // version 1
108    b.extend(&[0, 0, 0]); // flags
109    b.u64(base_media_decode_time);
110    b.finish()
111}
112
113/// `trun` — Track Run (14496-12 §8.8.8) for a video fragment.
114///
115/// Encodes the per-sample table for the fragment's run of samples.
116/// CMAF allows multiple `trun`s per `traf` but we always emit exactly
117/// one (cleaner manifest, no functional difference).
118///
119/// Flag bits we always set:
120///   0x000001 data-offset-present       (offset from moof start to mdat data)
121///   0x000004 first-sample-flags-present (override of default for sample 0)
122///   0x000100 sample-duration-present
123///   0x000200 sample-size-present
124///
125/// We don't emit per-sample-flags (0x000400) because all non-first
126/// samples in a video fragment share the default (P-frame), and we
127/// don't emit sample-composition-time-offsets (0x000800) because
128/// AV1 has no B-frame reordering in our pipeline (PTS == DTS).
129///
130/// `data_offset` is the byte offset from the START of the enclosing
131/// `moof` to the first byte of the fragment's `mdat` payload. It
132/// CANNOT be filled in until the full `moof` size is known, so this
133/// builder leaves it as 0 and returns the byte position to be patched.
134/// See [`MoofData::patch_data_offset`].
135fn build_trun_video(samples: &[CmafSample]) -> (Vec<u8>, usize) {
136    let mut b = BoxBuilder::new(b"trun");
137    b.u8(0); // version
138    // Flags: data-offset (1) | first-sample-flags (4) | duration (0x100) | size (0x200)
139    let flags: u32 = 0x000001 | 0x000004 | 0x000100 | 0x000200;
140    let flag_bytes = flags.to_be_bytes();
141    b.extend(&flag_bytes[1..]);
142    b.u32(samples.len() as u32);
143    // data_offset placeholder — final value patched in once moof size is
144    // known. We track its absolute position WITHIN this trun box (header
145    // 8 + version 1 + flags 3 + sample_count 4 = 16) so the caller can
146    // translate to a position-within-moof later.
147    let data_offset_pos_within_trun = b.current_len();
148    b.u32(0); // placeholder
149
150    // first_sample_flags: the spec's standard pattern is to mark sample
151    // 0 explicitly (almost always a sync sample for the first fragment;
152    // for subsequent fragments the first sample is whatever the GOP
153    // boundary produced — typically also sync since CMAF segments must
154    // start with a sync sample per §7.3.2.1).
155    if let Some(first) = samples.first() {
156        b.u32(first.flags.pack());
157    } else {
158        b.u32(0);
159    }
160
161    for s in samples {
162        b.u32(s.duration);
163        b.u32(s.size);
164    }
165
166    let bytes = b.finish();
167    (bytes, data_offset_pos_within_trun)
168}
169
170/// `trun` for an audio fragment. Same shape as video but no sync-flags
171/// distinction (every audio sample is independently decodable in
172/// AAC-LC / Opus / AC-3 / E-AC-3), so we don't emit first-sample-flags
173/// — the default in `trex` / `tfhd` covers them all.
174fn build_trun_audio(samples: &[CmafSample]) -> (Vec<u8>, usize) {
175    let mut b = BoxBuilder::new(b"trun");
176    b.u8(0); // version
177    // Flags: data-offset (1) | duration (0x100) | size (0x200)
178    let flags: u32 = 0x000001 | 0x000100 | 0x000200;
179    let flag_bytes = flags.to_be_bytes();
180    b.extend(&flag_bytes[1..]);
181    b.u32(samples.len() as u32);
182    let data_offset_pos_within_trun = b.current_len();
183    b.u32(0); // placeholder
184
185    for s in samples {
186        b.u32(s.duration);
187        b.u32(s.size);
188    }
189
190    let bytes = b.finish();
191    (bytes, data_offset_pos_within_trun)
192}
193
194/// `traf` — Track Fragment (14496-12 §8.8.6).
195///
196/// Wraps `tfhd` + `tfdt` + `trun` for one track inside one `moof`.
197/// CMAF mandates exactly one `traf` per `moof` (§7.3.2.1: "Each CMAF
198/// Fragment SHALL contain exactly one Track Fragment Box.").
199fn build_traf(tfhd: &[u8], tfdt: &[u8], trun: &[u8]) -> Vec<u8> {
200    let mut b = BoxBuilder::new(b"traf");
201    b.extend(tfhd);
202    b.extend(tfdt);
203    b.extend(trun);
204    b.finish()
205}
206
/// A serialized `moof` box together with the location of its
/// `trun.data_offset` field.
///
/// Returned by [`build_moof_video`] and [`build_moof_audio`]. Those
/// builders leave `data_offset` as a zero placeholder, so the value MUST
/// be patched before the fragment is written out:
///
/// * [`Self::patch_default_no_gap`] for the standard CMAF layout, where
///   the `mdat` box follows the `moof` immediately. The offset is then
///   `bytes.len() + 8` (full moof size plus the 8-byte mdat header).
/// * [`Self::patch_data_offset`] with a caller-computed value when other
///   bytes sit between the `moof` and the `mdat` payload.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MoofData {
    /// The complete `moof` box, header included.
    pub bytes: Vec<u8>,
    /// Byte position WITHIN `bytes` of the 4-byte big-endian
    /// `data_offset` field inside `trun`. Use [`Self::patch_data_offset`]
    /// to overwrite it.
    pub data_offset_pos: usize,
}

impl MoofData {
    /// Size of a compact `mdat` box header (`size:u32` + four-byte type).
    const MDAT_HEADER_LEN: usize = 8;

    /// Overwrites the `trun.data_offset` field in place.
    ///
    /// `data_offset` is the distance in bytes from the START of the moof
    /// to the START of the mdat payload (moof size + 8 for a no-gap
    /// layout). It is stored big-endian at `data_offset_pos`.
    ///
    /// NOTE(review): ISO 14496-12 declares `data_offset` as a signed
    /// 32-bit field, so values above `i32::MAX` read back as negative.
    ///
    /// # Panics
    ///
    /// Panics if `data_offset_pos..data_offset_pos + 4` does not lie
    /// inside `bytes`. Both fields are public, so this can only happen
    /// when a caller has modified them inconsistently.
    pub fn patch_data_offset(&mut self, data_offset: u32) {
        let start = self.data_offset_pos;
        // Two-step checked slicing avoids computing `start + 4`, which
        // could itself overflow for a corrupted position.
        let field = self
            .bytes
            .get_mut(start..)
            .and_then(|tail| tail.get_mut(..4))
            .expect("MoofData::data_offset_pos must address a 4-byte field inside bytes");
        field.copy_from_slice(&data_offset.to_be_bytes());
    }

    /// Patches `data_offset` for the common contiguous layout, where the
    /// `mdat` box is written immediately after this `moof`.
    ///
    /// The stored value is `bytes.len() + 8`: the payload begins right
    /// after the 8-byte `mdat` header.
    ///
    /// # Panics
    ///
    /// Panics if that sum does not fit in 32 bits (previously the value
    /// was silently truncated), or under the same condition as
    /// [`Self::patch_data_offset`].
    pub fn patch_default_no_gap(&mut self) {
        let offset = self
            .bytes
            .len()
            .checked_add(Self::MDAT_HEADER_LEN)
            // Fully qualified so the call resolves in every Rust edition.
            .and_then(|total| <u32 as std::convert::TryFrom<usize>>::try_from(total).ok())
            .expect("moof size plus mdat header must fit the 32-bit trun.data_offset field");
        self.patch_data_offset(offset);
    }
}
243
244/// Build a video `moof` for one CMAF fragment.
245///
246/// Composes `mfhd` + `traf{tfhd, tfdt, trun}` and tracks the byte
247/// position of `trun.data_offset` so the caller can patch it once
248/// the moof's final size is known (or accept the default no-gap
249/// layout via [`MoofData::patch_default_no_gap`]).
250pub fn build_moof_video(
251    sequence_number: u32,
252    track_id: u32,
253    base_media_decode_time: u64,
254    samples: &[CmafSample],
255) -> MoofData {
256    let mfhd = build_mfhd(sequence_number);
257    // Default duration/size omitted — they'll vary per-sample, so
258    // emitting them as defaults would be wrong. Default flags set to
259    // delta-frame so per-sample flags are needed only on the first
260    // (sync) sample, which we override via first_sample_flags in trun.
261    let tfhd = build_tfhd(
262        track_id,
263        None,
264        None,
265        Some(SampleFlags::delta_frame().pack()),
266    );
267    let tfdt = build_tfdt(base_media_decode_time);
268    let (trun, data_offset_pos_within_trun) = build_trun_video(samples);
269
270    // Compute where `data_offset` lives within the eventual moof.
271    // moof_header(8) + mfhd(16) + traf_header(8) + tfhd_len + tfdt(20) +
272    //   data_offset_pos_within_trun.
273    let moof_header = 8usize;
274    let traf_header = 8usize;
275    let pos_in_moof = moof_header
276        + mfhd.len()
277        + traf_header
278        + tfhd.len()
279        + tfdt.len()
280        + data_offset_pos_within_trun;
281
282    let traf = build_traf(&tfhd, &tfdt, &trun);
283    let mut b = BoxBuilder::new(b"moof");
284    b.extend(&mfhd);
285    b.extend(&traf);
286    let bytes = b.finish();
287
288    MoofData {
289        bytes,
290        data_offset_pos: pos_in_moof,
291    }
292}
293
294/// Build an audio `moof`. Same composition as video but without
295/// first-sample-flags differentiation in `trun` (every audio sample
296/// is independently decodable).
297pub fn build_moof_audio(
298    sequence_number: u32,
299    track_id: u32,
300    base_media_decode_time: u64,
301    samples: &[CmafSample],
302) -> MoofData {
303    let mfhd = build_mfhd(sequence_number);
304    // Audio default-flags: every sample is independently decodable,
305    // so default to sync.
306    let tfhd = build_tfhd(track_id, None, None, Some(SampleFlags::keyframe().pack()));
307    let tfdt = build_tfdt(base_media_decode_time);
308    let (trun, data_offset_pos_within_trun) = build_trun_audio(samples);
309
310    let moof_header = 8usize;
311    let traf_header = 8usize;
312    let pos_in_moof = moof_header
313        + mfhd.len()
314        + traf_header
315        + tfhd.len()
316        + tfdt.len()
317        + data_offset_pos_within_trun;
318
319    let traf = build_traf(&tfhd, &tfdt, &trun);
320    let mut b = BoxBuilder::new(b"moof");
321    b.extend(&mfhd);
322    b.extend(&traf);
323    let bytes = b.finish();
324
325    MoofData {
326        bytes,
327        data_offset_pos: pos_in_moof,
328    }
329}