Skip to main content

moq_mux/container/fmp4/
mod.rs

1//! Fragmented MP4 (fMP4 / CMAF).
2//!
3//! A widely supported file format that's also a viable wire format.
4//! Each moq frame carries one moof+mdat fragment, optionally with
5//! several samples packed inside. [`Wire`] is the wire-level
6//! container; [`Import`] parses external fMP4 streams and [`Export`]
7//! produces them.
8
9mod export;
10mod import;
11
12pub use export::*;
13pub use import::*;
14
15#[cfg(test)]
16mod export_test;
17#[cfg(test)]
18mod import_test;
19
20use std::task::Poll;
21
22use bytes::Bytes;
23use hang::catalog::{AudioCodec, AudioConfig, VideoCodec, VideoConfig};
24use mp4_atom::Atom;
25
26use crate::container::{Container, Frame, Timestamp};
27
28#[derive(Debug, thiserror::Error)]
29#[non_exhaustive]
30pub enum Error {
31	#[error("mp4: {0}")]
32	Mp4(#[from] mp4_atom::Error),
33
34	#[error("moq: {0}")]
35	Moq(#[from] moq_net::Error),
36
37	#[error("timestamp overflow")]
38	TimestampOverflow(#[from] moq_net::TimeOverflow),
39
40	#[error("no traf in moof")]
41	NoTraf,
42
43	#[error("no tfdt in traf")]
44	NoTfdt,
45
46	#[error("PTS overflow")]
47	PtsOverflow,
48
49	#[error("no moof found in CMAF frame data")]
50	NoMoof,
51
52	#[error("no mdat found in CMAF frame data")]
53	NoMdat,
54
55	#[error("no moov found in init data")]
56	NoMoov,
57
58	#[error("no tracks in moov")]
59	NoTracks,
60
61	#[error("multiple tracks in moov, use Trak instead")]
62	MultipleTracks,
63
64	#[error("can't synthesize CMAF init for {0}")]
65	UnsupportedSynthesis(String),
66}
67
68/// CMAF container: encodes/decodes a single track's moof+mdat fragments.
69///
70/// Build from a CMAF init segment with [`Wire::from_init`], or wrap a
71/// pre-extracted [`mp4_atom::Trak`] directly with [`Wire::new`].
72///
73/// The [`mp4_atom::Trak`] is heap-allocated so that embedding `Wire`
74/// in other enums (e.g. [`catalog::hang::Container`](crate::catalog::hang::Container))
75/// doesn't bloat unrelated variants.
76pub struct Wire {
77	trak: Box<mp4_atom::Trak>,
78}
79
80impl Wire {
81	/// Wrap an already-parsed track.
82	pub fn new(trak: mp4_atom::Trak) -> Self {
83		Self { trak: Box::new(trak) }
84	}
85
86	/// Parse a CMAF init segment (ftyp+moov), extracting the single track.
87	pub fn from_init(init_data: &[u8]) -> Result<Self, Error> {
88		use mp4_atom::DecodeMaybe;
89
90		let mut cursor = std::io::Cursor::new(init_data);
91		while let Some(atom) = mp4_atom::Any::decode_maybe(&mut cursor)? {
92			if let mp4_atom::Any::Moov(mut moov) = atom {
93				return match moov.trak.len() {
94					1 => Ok(Self::new(moov.trak.remove(0))),
95					0 => Err(Error::NoTracks),
96					_ => Err(Error::MultipleTracks),
97				};
98			}
99		}
100		Err(Error::NoMoov)
101	}
102
103	pub fn trak(&self) -> &mp4_atom::Trak {
104		&self.trak
105	}
106}
107
108impl Container for Wire {
109	type Error = Error;
110
111	fn write(&self, group: &mut moq_net::GroupProducer, frames: &[Frame]) -> Result<(), Self::Error> {
112		let timescale = self.trak.mdia.mdhd.timescale as u64;
113		let track_id = self.trak.tkhd.track_id;
114		encode(group, frames, timescale, track_id)
115	}
116
117	fn poll_read(
118		&self,
119		group: &mut moq_net::GroupConsumer,
120		waiter: &conducer::Waiter,
121	) -> Poll<Result<Option<Vec<Frame>>, Self::Error>> {
122		use std::task::ready;
123
124		let Some(data) = ready!(group.poll_read_frame(waiter)?) else {
125			return Poll::Ready(Ok(None));
126		};
127
128		let timescale = self.trak.mdia.mdhd.timescale as u64;
129		Poll::Ready(Ok(Some(decode(data, timescale)?)))
130	}
131}
132
133pub(crate) fn decode(data: Bytes, timescale: u64) -> Result<Vec<Frame>, Error> {
134	use mp4_atom::DecodeMaybe;
135
136	let mut cursor = std::io::Cursor::new(&data);
137	let mut moof = None;
138	let mut mdat_data = None;
139
140	while let Some(atom) = mp4_atom::Any::decode_maybe(&mut cursor)? {
141		match atom {
142			mp4_atom::Any::Moof(m) => moof = Some(m),
143			mp4_atom::Any::Mdat(m) => mdat_data = Some(m.data),
144			_ => {}
145		}
146	}
147
148	let moof = moof.ok_or(Error::NoMoof)?;
149	let mdat_data = mdat_data.ok_or(Error::NoMdat)?;
150	let traf = moof.traf.first().ok_or(Error::NoTraf)?;
151	let tfdt = traf.tfdt.as_ref().ok_or(Error::NoTfdt)?;
152	let base_dts = tfdt.base_media_decode_time;
153
154	let default_size = traf.tfhd.default_sample_size;
155	let default_duration = traf.tfhd.default_sample_duration;
156
157	let mut frames = Vec::new();
158	let mut offset = 0usize;
159	let mut dts = base_dts;
160
161	for trun in &traf.trun {
162		for entry in &trun.entries {
163			let size = entry.size.or(default_size).unwrap_or(0) as usize;
164			let end = offset + size;
165
166			if end > mdat_data.len() {
167				return Ok(frames);
168			}
169
170			let cts = entry.cts.unwrap_or_default() as i64;
171			let pts = dts.checked_add_signed(cts).ok_or(Error::PtsOverflow)?;
172			let timestamp = Timestamp::from_scale(pts, timescale)?;
173			let payload = Bytes::copy_from_slice(&mdat_data[offset..end]);
174			let flags = entry.flags.unwrap_or(0);
175			// depends_on_no_other (bits 24-25 == 0x2) means keyframe
176			let keyframe = (flags >> 24) & 0x3 == 0x2;
177
178			frames.push(Frame {
179				timestamp,
180				payload,
181				keyframe,
182			});
183
184			offset = end;
185			dts += entry.duration.or(default_duration).unwrap_or(0) as u64;
186		}
187	}
188
189	Ok(frames)
190}
191
192pub(crate) fn encode(
193	group: &mut moq_net::GroupProducer,
194	frames: &[Frame],
195	timescale: u64,
196	track_id: u32,
197) -> Result<(), Error> {
198	if frames.is_empty() {
199		return Ok(());
200	}
201
202	let sequence_number = group.frame_count() as u32;
203	let bytes = encode_fragment(track_id, timescale, sequence_number, frames)?;
204	let mut writer = group.create_frame(bytes.len().into())?;
205	writer.write(bytes)?;
206	writer.finish()?;
207
208	Ok(())
209}
210
211/// Encode a single-traf moof+mdat fragment from a sequence of frames.
212///
213/// Performs the two-pass encoding required by ISO/IEC 14496-12: encode once
214/// to learn the moof size, then again with `trun.data_offset` pointing past
215/// the moof and mdat header. The DTS of the first frame is computed at the
216/// caller-supplied `timescale`.
217///
218/// Returns an empty `Bytes` when `frames` is empty.
219pub(crate) fn encode_fragment(
220	track_id: u32,
221	timescale: u64,
222	sequence_number: u32,
223	frames: &[Frame],
224) -> Result<Bytes, Error> {
225	use mp4_atom::Encode;
226
227	if frames.is_empty() {
228		return Ok(Bytes::new());
229	}
230
231	let dts = (frames[0].timestamp.as_micros() * timescale as u128 / 1_000_000) as u64;
232
233	let entries: Vec<_> = frames
234		.iter()
235		.map(|f| {
236			let flags = if f.keyframe { 0x0200_0000 } else { 0x0001_0000 };
237			mp4_atom::TrunEntry {
238				size: Some(f.payload.len() as u32),
239				flags: Some(flags),
240				..Default::default()
241			}
242		})
243		.collect();
244
245	let mdat_data: Vec<u8> = frames.iter().flat_map(|f| f.payload.iter().copied()).collect();
246
247	let build_moof = |data_offset| mp4_atom::Moof {
248		mfhd: mp4_atom::Mfhd { sequence_number },
249		traf: vec![mp4_atom::Traf {
250			tfhd: mp4_atom::Tfhd {
251				track_id,
252				..Default::default()
253			},
254			tfdt: Some(mp4_atom::Tfdt {
255				base_media_decode_time: dts,
256			}),
257			trun: vec![mp4_atom::Trun {
258				data_offset: Some(data_offset),
259				entries: entries.clone(),
260			}],
261			..Default::default()
262		}],
263	};
264
265	// First pass to learn the moof size.
266	let mut buf = Vec::new();
267	build_moof(0).encode(&mut buf)?;
268	let moof_size = buf.len();
269
270	// Second pass with data_offset = moof_size + 8 (mdat header).
271	buf.clear();
272	build_moof((moof_size + 8) as i32).encode(&mut buf)?;
273
274	let mdat = mp4_atom::Mdat { data: mdat_data };
275	mdat.encode(&mut buf)?;
276
277	Ok(Bytes::from(buf))
278}
279
280/// Synthesize a CMAF `Trak` for a video rendition that has no init segment.
281///
282/// Used by the fMP4 exporter when its source is a `Container::Legacy` track
283/// (Avc3/Hev1/etc. importers that publish raw codec bitstreams). The codec's
284/// out-of-band configuration record (`description`) must be available, e.g.
285/// because the Avc1 / Hvc1 transform has finished building it from inline
286/// parameter sets.
287pub(crate) fn synthesize_video_trak(
288	track_id: u32,
289	timescale: u64,
290	config: &VideoConfig,
291	description: &[u8],
292) -> Result<mp4_atom::Trak, Error> {
293	let width = config.coded_width.unwrap_or(0) as u16;
294	let height = config.coded_height.unwrap_or(0) as u16;
295	let visual = mp4_atom::Visual {
296		data_reference_index: 1,
297		width,
298		height,
299		..Default::default()
300	};
301
302	let sample_entry = match &config.codec {
303		VideoCodec::H264(_) => {
304			let mut cursor = std::io::Cursor::new(description);
305			let avcc = mp4_atom::Avcc::decode_body(&mut cursor).map_err(Error::Mp4)?;
306			mp4_atom::Codec::from(mp4_atom::Avc1 {
307				visual,
308				avcc,
309				..Default::default()
310			})
311		}
312		VideoCodec::H265(h265) => {
313			let mut cursor = std::io::Cursor::new(description);
314			let hvcc = mp4_atom::Hvcc::decode_body(&mut cursor).map_err(Error::Mp4)?;
315			// `in_band` (catalog) ↔ hev1 sample entry; otherwise hvc1.
316			if h265.in_band {
317				mp4_atom::Codec::from(mp4_atom::Hev1 {
318					visual,
319					hvcc,
320					..Default::default()
321				})
322			} else {
323				mp4_atom::Codec::from(mp4_atom::Hvc1 {
324					visual,
325					hvcc,
326					..Default::default()
327				})
328			}
329		}
330		other => return Err(Error::UnsupportedSynthesis(format!("video codec {:?}", other))),
331	};
332
333	Ok(build_video_trak(track_id, timescale, sample_entry, width, height))
334}
335
336/// Synthesize a CMAF `Trak` for an audio rendition that has no init segment.
337pub(crate) fn synthesize_audio_trak(
338	track_id: u32,
339	timescale: u64,
340	config: &AudioConfig,
341) -> Result<mp4_atom::Trak, Error> {
342	let audio = mp4_atom::Audio {
343		data_reference_index: 1,
344		channel_count: config.channel_count as u16,
345		sample_size: 16,
346		sample_rate: mp4_atom::FixedPoint::from(config.sample_rate as u16),
347	};
348
349	let sample_entry = match &config.codec {
350		AudioCodec::Opus => mp4_atom::Codec::from(mp4_atom::Opus {
351			audio,
352			dops: mp4_atom::Dops {
353				output_channel_count: config.channel_count as u8,
354				pre_skip: 0,
355				input_sample_rate: config.sample_rate,
356				output_gain: 0,
357			},
358			btrt: None,
359		}),
360		other => return Err(Error::UnsupportedSynthesis(format!("audio codec {:?}", other))),
361	};
362
363	Ok(build_audio_trak(track_id, timescale, sample_entry))
364}
365
366fn build_video_trak(
367	track_id: u32,
368	timescale: u64,
369	sample_entry: mp4_atom::Codec,
370	width: u16,
371	height: u16,
372) -> mp4_atom::Trak {
373	mp4_atom::Trak {
374		tkhd: mp4_atom::Tkhd {
375			track_id,
376			enabled: true,
377			width: mp4_atom::FixedPoint::from(width),
378			height: mp4_atom::FixedPoint::from(height),
379			..Default::default()
380		},
381		mdia: build_mdia(timescale, b"vide", true, sample_entry),
382		..Default::default()
383	}
384}
385
386fn build_audio_trak(track_id: u32, timescale: u64, sample_entry: mp4_atom::Codec) -> mp4_atom::Trak {
387	mp4_atom::Trak {
388		tkhd: mp4_atom::Tkhd {
389			track_id,
390			enabled: true,
391			..Default::default()
392		},
393		mdia: build_mdia(timescale, b"soun", false, sample_entry),
394		..Default::default()
395	}
396}
397
398fn build_mdia(timescale: u64, handler: &[u8; 4], is_video: bool, sample_entry: mp4_atom::Codec) -> mp4_atom::Mdia {
399	mp4_atom::Mdia {
400		mdhd: mp4_atom::Mdhd {
401			timescale: timescale as u32,
402			..Default::default()
403		},
404		hdlr: mp4_atom::Hdlr {
405			handler: mp4_atom::FourCC::new(handler),
406			name: String::new(),
407		},
408		minf: mp4_atom::Minf {
409			vmhd: is_video.then(mp4_atom::Vmhd::default),
410			smhd: (!is_video).then(mp4_atom::Smhd::default),
411			dinf: mp4_atom::Dinf {
412				dref: mp4_atom::Dref {
413					urls: vec![mp4_atom::Url::default()],
414				},
415			},
416			stbl: mp4_atom::Stbl {
417				stsd: mp4_atom::Stsd {
418					codecs: vec![sample_entry],
419				},
420				..Default::default()
421			},
422			..Default::default()
423		},
424	}
425}
426
427/// Default video timescale when the catalog doesn't supply one.
428///
429/// Used by the fMP4 exporter when synthesizing an init segment for a
430/// Legacy or LOC source: prefer `framerate * 1000` (so each frame has an
431/// integer duration), falling back to 90 kHz (the MPEG-TS convention).
432pub(crate) fn default_video_timescale(config: &VideoConfig) -> u64 {
433	if let Some(fps) = config.framerate {
434		(fps * 1000.0) as u64
435	} else {
436		90000
437	}
438}