//! video_sys/mp4.rs — H.264 sample extraction from MP4 containers.

use std::{
    fs::File,
    io::BufReader,
    path::{Path, PathBuf},
    str::FromStr,
};

use anyhow::{anyhow, bail, Context, Result};
use mp4::{FourCC, Mp4Reader, TrackType};

use crate::h264::H264Config;
12
/// One encoded H.264 sample (access unit) pulled from the MP4 track.
#[derive(Debug, Clone)]
pub struct EncodedSample {
    /// Sample payload in AVCC framing (length-prefixed NAL units), exactly as
    /// stored in the container — not converted to Annex B.
    pub data_avcc: Vec<u8>,
    /// Decode timestamp (monotonic), in microseconds.
    pub dts_us: i64,
    /// Presentation timestamp (may reorder vs DTS when B-frames are present), in microseconds.
    pub pts_us: i64,
    /// Sample duration, in microseconds.
    pub dur_us: i64,
}
22
/// First sample read eagerly during `open`, cached so `next_sample` can hand
/// it out without re-reading it from the file.
#[derive(Debug)]
struct Prefetched {
    /// Decode start time, in track-timescale ticks.
    start_time: u64,
    /// Sample duration, in track-timescale ticks.
    duration: u32,
    /// Composition (ctts) offset in ticks; may be negative.
    rendering_offset: i32,
    /// Raw sample bytes (AVCC length-prefixed NAL units).
    bytes: Vec<u8>,
}
30
/// Pull-based reader over the first H.264 (avc1/avc3) video track of an MP4
/// file. Samples are returned in decode order via `next_sample`.
pub struct Mp4H264Source {
    path: PathBuf,
    reader: Mp4Reader<BufReader<File>>,
    /// Id of the selected video track (mp4 track ids are 1-based).
    track_id: u32,
    /// Track timescale, in ticks per second.
    timescale: u32,
    /// Total number of samples in the selected track.
    sample_count: u32,
    /// Id of the next sample to hand out; mp4 sample ids are 1-based.
    next_sample_id: u32,
    /// Sample #1, cached by `open` (also used there to sniff the NAL length
    /// field size); consumed by the first `next_sample` call.
    prefetched: Option<Prefetched>,

    /// Parsed H.264 configuration derived from the track's SPS/PPS.
    pub config: H264Config,
}
42
43impl Mp4H264Source {
44    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
45        let path = path.as_ref().to_path_buf();
46
47        let f = File::open(&path).with_context(|| format!("open mp4: {}", path.display()))?;
48        let size = f
49            .metadata()
50            .with_context(|| format!("stat mp4: {}", path.display()))?
51            .len();
52
53        let reader = BufReader::new(f);
54        let mut mp4 = Mp4Reader::read_header(reader, size).context("mp4::read_header")?;
55
56        let (track_id, timescale, sample_count, width, height, sps, pps) =
57            select_h264_video_track(&mp4).context("select H.264 track")?;
58
59        // Prefetch the first sample to:
60        // 1) validate we can read samples;
61        // 2) infer NAL length field size (usually 4, but not guaranteed).
62        let prefetched = mp4
63            .read_sample(track_id, 1)
64            .context("read first sample")?
65            .map(|s| Prefetched {
66                start_time: s.start_time,
67                duration: s.duration,
68                rendering_offset: s.rendering_offset,
69                bytes: s.bytes.to_vec(),
70            });
71
72        let nal_len_size = prefetched
73            .as_ref()
74            .map(|p| detect_nal_length_size(&p.bytes))
75            .unwrap_or(4);
76
77        let avcc = build_avcc_record(&sps, &pps, nal_len_size)?;
78        let config = H264Config::parse_from_avcc(width, height, &avcc)
79            .context("parse avcC from SPS/PPS")?;
80
81        Ok(Self {
82            path,
83            reader: mp4,
84            track_id,
85            timescale,
86            sample_count,
87            next_sample_id: 1,
88            prefetched,
89            config,
90        })
91    }
92
93    pub fn next_sample(&mut self) -> Result<Option<EncodedSample>> {
94        if self.next_sample_id == 0 {
95            bail!("internal error: sample ids are 1-based");
96        }
97
98        if self.next_sample_id > self.sample_count {
99            return Ok(None);
100        }
101
102        let (start_time, duration, rendering_offset, bytes) = if self.next_sample_id == 1 {
103            if let Some(p) = self.prefetched.take() {
104                (p.start_time, p.duration, p.rendering_offset, p.bytes)
105            } else {
106                let s = self
107                    .reader
108                    .read_sample(self.track_id, 1)
109                    .context("read sample #1")?
110                    .ok_or_else(|| anyhow!("sample #1 missing"))?;
111                (s.start_time, s.duration, s.rendering_offset, s.bytes.to_vec())
112            }
113        } else {
114            let s = self
115                .reader
116                .read_sample(self.track_id, self.next_sample_id)
117                .with_context(|| format!("read sample #{}", self.next_sample_id))?
118                .ok_or_else(|| anyhow!("sample #{} missing", self.next_sample_id))?;
119            (s.start_time, s.duration, s.rendering_offset, s.bytes.to_vec())
120        };
121
122        self.next_sample_id += 1;
123
124        // mp4 crate provides:
125        // - start_time: decode time in track timescale ticks
126        // - rendering_offset: composition offset ticks (ctts)
127        let dts_ticks = start_time as i128;
128        let pts_ticks = dts_ticks + (rendering_offset as i128);
129
130        let dts_us = ticks_to_us(dts_ticks, self.timescale);
131        let pts_us = ticks_to_us(pts_ticks, self.timescale);
132        let dur_us = ticks_to_us(duration as i128, self.timescale);
133
134        Ok(Some(EncodedSample {
135            data_avcc: bytes,
136            dts_us,
137            pts_us,
138            dur_us,
139        }))
140    }
141
142    pub fn path(&self) -> &Path {
143        &self.path
144    }
145}
146
147fn ticks_to_us(ticks: i128, timescale: u32) -> i64 {
148    if timescale == 0 {
149        return 0;
150    }
151    // microseconds = ticks * 1_000_000 / timescale
152    let us = ticks.saturating_mul(1_000_000i128) / (timescale as i128);
153    if us > (i64::MAX as i128) {
154        i64::MAX
155    } else if us < (i64::MIN as i128) {
156        i64::MIN
157    } else {
158        us as i64
159    }
160}
161
162fn select_h264_video_track(
163    mp4: &Mp4Reader<BufReader<File>>,
164) -> Result<(u32, u32, u32, u32, u32, Vec<u8>, Vec<u8>)> {
165    let avc1 = FourCC::from_str("avc1").unwrap();
166    let avc3 = FourCC::from_str("avc3").unwrap();
167
168    for (track_id, track) in mp4.tracks().iter() {
169        let tt = track.track_type().context("track_type")?;
170        if tt != TrackType::Video {
171            continue;
172        }
173
174        let bt = track.box_type().context("box_type")?;
175        if bt != avc1 && bt != avc3 {
176            continue;
177        }
178
179        let timescale = track.timescale();
180        let sample_count = track.sample_count();
181
182        let width = track.width() as u32;
183        let height = track.height() as u32;
184
185        let sps = track
186            .sequence_parameter_set()
187            .context("sequence_parameter_set")?
188            .to_vec();
189        let pps = track
190            .picture_parameter_set()
191            .context("picture_parameter_set")?
192            .to_vec();
193
194        return Ok((*track_id, timescale, sample_count, width, height, sps, pps));
195    }
196
197    bail!("no H.264 (avc1/avc3) video track found")
198}
199
/// Infer the NAL length-field size of an AVCC-framed sample.
///
/// AVCC samples are a sequence of `[length][NAL bytes]` records with a
/// big-endian length field that, per ISO/IEC 14496-15 (lengthSizeMinusOne),
/// may only be 1, 2, or 4 bytes wide. We try each candidate width — the
/// common 4-byte form first — and accept the first one under which the whole
/// sample splits exactly into complete records. Falls back to 4 (best
/// effort) when the sample is too short to sniff or nothing parses.
fn detect_nal_length_size(avcc_sample: &[u8]) -> usize {
    // Too short to sniff reliably; assume the typical 4 bytes.
    if avcc_sample.len() < 8 {
        return 4;
    }
    for candidate in [4usize, 2, 1] {
        if parses_as_avcc(avcc_sample, candidate) {
            return candidate;
        }
    }
    4
}

/// True when `data` splits exactly into `[len_size-byte big-endian length]
/// [payload]` records with no trailing bytes left over.
fn parses_as_avcc(data: &[u8], len_size: usize) -> bool {
    let mut pos = 0usize;
    while pos < data.len() {
        let field = match data.get(pos..pos + len_size) {
            Some(f) => f,
            None => return false, // truncated length field
        };
        let nal_len = field.iter().fold(0usize, |acc, &b| (acc << 8) | b as usize);
        pos += len_size;
        match pos.checked_add(nal_len) {
            Some(end) if end <= data.len() => pos = end,
            _ => return false, // payload would run past the sample
        }
    }
    true
}
208
209fn build_avcc_record(sps: &[u8], pps: &[u8], nal_len_size: usize) -> Result<Vec<u8>> {
210    if !(1..=4).contains(&nal_len_size) {
211        bail!("invalid nal length size: {nal_len_size}");
212    }
213
214    // Minimal avcC record to feed H264Config.
215    // Layout: https://developer.apple.com/documentation/quicktime-file-format/avcdecoderconfigurationrecord
216    let mut out = Vec::new();
217    out.push(1); // configurationVersion
218    out.push(*sps.get(1).unwrap_or(&0)); // AVCProfileIndication
219    out.push(*sps.get(2).unwrap_or(&0)); // profile_compatibility
220    out.push(*sps.get(3).unwrap_or(&0)); // AVCLevelIndication
221
222    // lengthSizeMinusOne in low 2 bits
223    out.push(0xFC | ((nal_len_size as u8 - 1) & 0x03));
224
225    // numOfSequenceParameterSets in low 5 bits
226    out.push(0xE0 | 1);
227    out.extend_from_slice(&(sps.len() as u16).to_be_bytes());
228    out.extend_from_slice(sps);
229
230    // numOfPictureParameterSets
231    out.push(1);
232    out.extend_from_slice(&(pps.len() as u16).to_be_bytes());
233    out.extend_from_slice(pps);
234
235    Ok(out)
236}