container/cmaf/fragment.rs
1//! Fragment-level box writers: `mfhd`, `tfhd`, `tfdt`, `trun`, `traf`, `moof`.
2//!
3//! Every function here maps to one ISO 14496-12 §8.8 box. The public ones
4//! (`build_mfhd`, `build_tfhd`, `build_tfdt`, `build_moof_video`,
5//! `build_moof_audio`) are re-exported from the parent module. The private
//! helpers (`build_trun_video`, `build_trun_audio`, `build_traf`) are used
//! only by the two `build_moof_*` compositors and stay private to this module.
8
9use crate::mux::BoxBuilder;
10
11use super::{CmafSample, SampleFlags};
12
13/// `mfhd` — Movie Fragment Header (14496-12 §8.8.5).
14///
15/// Carries the per-fragment sequence number. CMAF requires
16/// `sequence_number` to be monotonic and start at 1 for the first
17/// fragment of each track.
18///
19/// Wire layout (16 bytes total):
20/// ```text
21/// size:u32 = 16
22/// type:'mfhd'
23/// version:u8 = 0
24/// flags:u24 = 0
25/// sequence_number:u32
26/// ```
27pub fn build_mfhd(sequence_number: u32) -> Vec<u8> {
28 let mut b = BoxBuilder::new(b"mfhd");
29 b.u8(0); // version
30 b.extend(&[0, 0, 0]); // flags
31 b.u32(sequence_number);
32 b.finish()
33}
34
35/// `tfhd` — Track Fragment Header (14496-12 §8.8.7).
36///
37/// We always set the `default-base-is-moof` flag (`0x020000`) — required
38/// by CMAF §7.3.2.1. With this flag, sample data offsets in `trun`
39/// become relative to the start of the enclosing `moof`, which is
40/// exactly what HLS-CMAF expects. We avoid emitting `base_data_offset`
41/// (an absolute file offset that breaks segment portability).
42///
43/// Optional fields are emitted based on the bitwise combination of
44/// `tf_flags`:
45/// 0x000001 base_data_offset (NOT emitted; we use default-base-is-moof)
46/// 0x000002 sample_description_index (only if non-default needed)
47/// 0x000008 default_sample_duration (emitted when `default_duration.is_some()`)
48/// 0x000010 default_sample_size (emitted when `default_size.is_some()`)
49/// 0x000020 default_sample_flags (emitted when `default_flags.is_some()`)
50/// 0x010000 duration-is-empty
51/// 0x020000 default-base-is-moof (always emitted)
52pub fn build_tfhd(
53 track_id: u32,
54 default_duration: Option<u32>,
55 default_size: Option<u32>,
56 default_flags: Option<u32>,
57) -> Vec<u8> {
58 let mut tf_flags: u32 = 0x020000; // default-base-is-moof
59 if default_duration.is_some() {
60 tf_flags |= 0x000008;
61 }
62 if default_size.is_some() {
63 tf_flags |= 0x000010;
64 }
65 if default_flags.is_some() {
66 tf_flags |= 0x000020;
67 }
68
69 let mut b = BoxBuilder::new(b"tfhd");
70 b.u8(0); // version
71 let flag_bytes = tf_flags.to_be_bytes();
72 b.extend(&flag_bytes[1..]); // 24-bit flags (drop high byte)
73 b.u32(track_id);
74 if let Some(d) = default_duration {
75 b.u32(d);
76 }
77 if let Some(s) = default_size {
78 b.u32(s);
79 }
80 if let Some(f) = default_flags {
81 b.u32(f);
82 }
83 b.finish()
84}
85
86/// `tfdt` — Track Fragment Decode Time (14496-12 §8.8.12).
87///
88/// Carries the absolute decode time of the first sample in this
89/// fragment, in track timescale ticks, accumulated from the start of
90/// the track (NOT from the start of the fragment). Required by CMAF
91/// §7.3.2.1.
92///
93/// We always emit version 1 (u64 decode time). Version 0's u32 wraps
94/// at ~24h for a 48 kHz audio track; version 1 covers >12 million
95/// years at the same rate. The 4 extra bytes are immaterial.
96///
97/// Wire layout (20 bytes total):
98/// ```text
99/// size:u32 = 20
100/// type:'tfdt'
101/// version:u8 = 1
102/// flags:u24 = 0
103/// base_media_decode_time:u64
104/// ```
105pub fn build_tfdt(base_media_decode_time: u64) -> Vec<u8> {
106 let mut b = BoxBuilder::new(b"tfdt");
107 b.u8(1); // version 1
108 b.extend(&[0, 0, 0]); // flags
109 b.u64(base_media_decode_time);
110 b.finish()
111}
112
113/// `trun` — Track Run (14496-12 §8.8.8) for a video fragment.
114///
115/// Encodes the per-sample table for the fragment's run of samples.
116/// CMAF allows multiple `trun`s per `traf` but we always emit exactly
117/// one (cleaner manifest, no functional difference).
118///
119/// Flag bits we always set:
120/// 0x000001 data-offset-present (offset from moof start to mdat data)
121/// 0x000004 first-sample-flags-present (override of default for sample 0)
122/// 0x000100 sample-duration-present
123/// 0x000200 sample-size-present
124///
125/// We don't emit per-sample-flags (0x000400) because all non-first
126/// samples in a video fragment share the default (P-frame), and we
127/// don't emit sample-composition-time-offsets (0x000800) because
128/// AV1 has no B-frame reordering in our pipeline (PTS == DTS).
129///
130/// `data_offset` is the byte offset from the START of the enclosing
131/// `moof` to the first byte of the fragment's `mdat` payload. It
132/// CANNOT be filled in until the full `moof` size is known, so this
133/// builder leaves it as 0 and returns the byte position to be patched.
134/// See [`MoofData::patch_data_offset`].
135fn build_trun_video(samples: &[CmafSample]) -> (Vec<u8>, usize) {
136 let mut b = BoxBuilder::new(b"trun");
137 b.u8(0); // version
138 // Flags: data-offset (1) | first-sample-flags (4) | duration (0x100) | size (0x200)
139 let flags: u32 = 0x000001 | 0x000004 | 0x000100 | 0x000200;
140 let flag_bytes = flags.to_be_bytes();
141 b.extend(&flag_bytes[1..]);
142 b.u32(samples.len() as u32);
143 // data_offset placeholder — final value patched in once moof size is
144 // known. We track its absolute position WITHIN this trun box (header
145 // 8 + version 1 + flags 3 + sample_count 4 = 16) so the caller can
146 // translate to a position-within-moof later.
147 let data_offset_pos_within_trun = b.current_len();
148 b.u32(0); // placeholder
149
150 // first_sample_flags: the spec's standard pattern is to mark sample
151 // 0 explicitly (almost always a sync sample for the first fragment;
152 // for subsequent fragments the first sample is whatever the GOP
153 // boundary produced — typically also sync since CMAF segments must
154 // start with a sync sample per §7.3.2.1).
155 if let Some(first) = samples.first() {
156 b.u32(first.flags.pack());
157 } else {
158 b.u32(0);
159 }
160
161 for s in samples {
162 b.u32(s.duration);
163 b.u32(s.size);
164 }
165
166 let bytes = b.finish();
167 (bytes, data_offset_pos_within_trun)
168}
169
170/// `trun` for an audio fragment. Same shape as video but no sync-flags
171/// distinction (every audio sample is independently decodable in
172/// AAC-LC / Opus / AC-3 / E-AC-3), so we don't emit first-sample-flags
173/// — the default in `trex` / `tfhd` covers them all.
174fn build_trun_audio(samples: &[CmafSample]) -> (Vec<u8>, usize) {
175 let mut b = BoxBuilder::new(b"trun");
176 b.u8(0); // version
177 // Flags: data-offset (1) | duration (0x100) | size (0x200)
178 let flags: u32 = 0x000001 | 0x000100 | 0x000200;
179 let flag_bytes = flags.to_be_bytes();
180 b.extend(&flag_bytes[1..]);
181 b.u32(samples.len() as u32);
182 let data_offset_pos_within_trun = b.current_len();
183 b.u32(0); // placeholder
184
185 for s in samples {
186 b.u32(s.duration);
187 b.u32(s.size);
188 }
189
190 let bytes = b.finish();
191 (bytes, data_offset_pos_within_trun)
192}
193
194/// `traf` — Track Fragment (14496-12 §8.8.6).
195///
196/// Wraps `tfhd` + `tfdt` + `trun` for one track inside one `moof`.
197/// CMAF mandates exactly one `traf` per `moof` (§7.3.2.1: "Each CMAF
198/// Fragment SHALL contain exactly one Track Fragment Box.").
199fn build_traf(tfhd: &[u8], tfdt: &[u8], trun: &[u8]) -> Vec<u8> {
200 let mut b = BoxBuilder::new(b"traf");
201 b.extend(tfhd);
202 b.extend(tfdt);
203 b.extend(trun);
204 b.finish()
205}
206
/// A serialized `moof` box plus the location of its `trun.data_offset`
/// field.
///
/// Returned by [`build_moof_video`] and [`build_moof_audio`]. Those
/// builders write `data_offset` as a **zero placeholder**; the value is
/// NOT valid until the caller patches it with one of the methods below:
///
/// * [`Self::patch_default_no_gap`] for the standard CMAF layout where
///   the `mdat` box directly follows this `moof` (offset = moof size +
///   8-byte `mdat` header), or
/// * [`Self::patch_data_offset`] with a caller-computed offset when other
///   bytes sit between the `moof` and the `mdat` payload.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MoofData {
    /// The complete serialized `moof` box.
    pub bytes: Vec<u8>,
    /// Byte position WITHIN `bytes` of the 4-byte big-endian
    /// `data_offset` field inside `trun`. Use [`Self::patch_data_offset`]
    /// to overwrite it.
    pub data_offset_pos: usize,
}

impl MoofData {
    /// Overwrite the `trun.data_offset` field in place.
    ///
    /// `data_offset` is the byte distance from the START of the moof to
    /// the START of the mdat payload (i.e. moof size + 8 for a no-gap
    /// layout). It is stored big-endian. The field is a signed 32-bit
    /// integer in ISO 14496-12, so values above `i32::MAX` will be read
    /// back as negative offsets by parsers.
    ///
    /// # Panics
    ///
    /// Panics if `data_offset_pos + 4` exceeds `bytes.len()`. That can
    /// only happen if the public fields were modified inconsistently.
    pub fn patch_data_offset(&mut self, data_offset: u32) {
        let field = self.data_offset_pos..self.data_offset_pos + 4;
        self.bytes[field].copy_from_slice(&data_offset.to_be_bytes());
    }

    /// Patch `data_offset` for the common contiguous layout: this moof
    /// immediately followed by an `mdat` box with an 8-byte header.
    ///
    /// # Panics
    ///
    /// Panics if `bytes.len() + 8` does not fit the signed 32-bit
    /// `data_offset` field (previously the value was silently truncated,
    /// producing a corrupt offset), or under the same condition as
    /// [`Self::patch_data_offset`].
    pub fn patch_default_no_gap(&mut self) {
        // Size of a compact `mdat` header: size:u32 + type:4cc.
        const MDAT_HEADER_LEN: usize = 8;

        let offset = self.bytes.len() + MDAT_HEADER_LEN;
        assert!(
            offset <= i32::MAX as usize,
            "moof too large: data_offset {} does not fit the signed 32-bit trun field",
            offset
        );
        // Lossless: the assertion above bounds `offset` by i32::MAX.
        self.patch_data_offset(offset as u32);
    }
}
243
244/// Build a video `moof` for one CMAF fragment.
245///
246/// Composes `mfhd` + `traf{tfhd, tfdt, trun}` and tracks the byte
247/// position of `trun.data_offset` so the caller can patch it once
248/// the moof's final size is known (or accept the default no-gap
249/// layout via [`MoofData::patch_default_no_gap`]).
250pub fn build_moof_video(
251 sequence_number: u32,
252 track_id: u32,
253 base_media_decode_time: u64,
254 samples: &[CmafSample],
255) -> MoofData {
256 let mfhd = build_mfhd(sequence_number);
257 // Default duration/size omitted — they'll vary per-sample, so
258 // emitting them as defaults would be wrong. Default flags set to
259 // delta-frame so per-sample flags are needed only on the first
260 // (sync) sample, which we override via first_sample_flags in trun.
261 let tfhd = build_tfhd(
262 track_id,
263 None,
264 None,
265 Some(SampleFlags::delta_frame().pack()),
266 );
267 let tfdt = build_tfdt(base_media_decode_time);
268 let (trun, data_offset_pos_within_trun) = build_trun_video(samples);
269
270 // Compute where `data_offset` lives within the eventual moof.
271 // moof_header(8) + mfhd(16) + traf_header(8) + tfhd_len + tfdt(20) +
272 // data_offset_pos_within_trun.
273 let moof_header = 8usize;
274 let traf_header = 8usize;
275 let pos_in_moof = moof_header
276 + mfhd.len()
277 + traf_header
278 + tfhd.len()
279 + tfdt.len()
280 + data_offset_pos_within_trun;
281
282 let traf = build_traf(&tfhd, &tfdt, &trun);
283 let mut b = BoxBuilder::new(b"moof");
284 b.extend(&mfhd);
285 b.extend(&traf);
286 let bytes = b.finish();
287
288 MoofData {
289 bytes,
290 data_offset_pos: pos_in_moof,
291 }
292}
293
294/// Build an audio `moof`. Same composition as video but without
295/// first-sample-flags differentiation in `trun` (every audio sample
296/// is independently decodable).
297pub fn build_moof_audio(
298 sequence_number: u32,
299 track_id: u32,
300 base_media_decode_time: u64,
301 samples: &[CmafSample],
302) -> MoofData {
303 let mfhd = build_mfhd(sequence_number);
304 // Audio default-flags: every sample is independently decodable,
305 // so default to sync.
306 let tfhd = build_tfhd(track_id, None, None, Some(SampleFlags::keyframe().pack()));
307 let tfdt = build_tfdt(base_media_decode_time);
308 let (trun, data_offset_pos_within_trun) = build_trun_audio(samples);
309
310 let moof_header = 8usize;
311 let traf_header = 8usize;
312 let pos_in_moof = moof_header
313 + mfhd.len()
314 + traf_header
315 + tfhd.len()
316 + tfdt.len()
317 + data_offset_pos_within_trun;
318
319 let traf = build_traf(&tfhd, &tfdt, &trun);
320 let mut b = BoxBuilder::new(b"moof");
321 b.extend(&mfhd);
322 b.extend(&traf);
323 let bytes = b.finish();
324
325 MoofData {
326 bytes,
327 data_offset_pos: pos_in_moof,
328 }
329}