1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
/*
REFERENCES
----------
1. https://github.com/shaka-project/shaka-player/blob/a4e926772e1b754fe968ee6f97490f08a40fe535/lib/text/mp4_vtt_parser.js
*/
//! WebVTT subtitle parser for MP4 container streams.
use crate::{
Error, Reader, Result, bail,
boxes::{MdhdBox, TfdtBox, TfhdBox, TrunBox, TrunSample},
data,
parser::{self, Mp4Parser},
sub::builder::{Cue, Subtitles},
};
use std::{cell::RefCell, rc::Rc};
/// A parser for extracting WebVTT (VTT) subtitles from MP4 files.
///
/// Construct it from an initialization segment with [`WvttSubsParser::from_init`], then feed
/// media segments to [`WvttSubsParser::parse`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WvttSubsParser {
    /// Timescale taken from the `mdhd` box of the initialization segment. Sample times read
    /// from media segments are divided by this value when cue times are computed.
    pub timescale: u32,
}
impl WvttSubsParser {
/// Creates a new `Mp4VttParser` from the given initialization segment.
pub fn from_init(data: &[u8]) -> Result<Self> {
let saw_wvtt = data!(false);
let timescale = data!();
let saw_wvtt_c = saw_wvtt.clone();
let timescale_c = timescale.clone();
Mp4Parser::new()
.base_box("moov", parser::children)
.base_box("trak", parser::children)
.base_box("mdia", parser::children)
.full_box("mdhd", move |mut box_| {
let box_version = box_.version.unwrap();
if box_version != 0 && box_version != 1 {
bail!("MDHD box version can only be 0 or 1.");
}
let mdhd_box = MdhdBox::new(&mut box_)?;
*timescale_c.borrow_mut() = Some(mdhd_box.timescale);
Ok(())
})
.base_box("minf", parser::children)
.base_box("stbl", parser::children)
.full_box("stsd", parser::sample_description)
.base_box("wvtt", move |_| {
// A valid vtt init segment, though we have no actual subtitles yet.
*saw_wvtt_c.borrow_mut() = true;
Ok(())
})
.parse(data, false, false)?;
if !saw_wvtt.take() {
bail!("WVTT box not found.");
}
if let Some(timescale) = timescale.take() {
Ok(Self { timescale })
} else {
Err(Error::Other(
"Missing timescale (should exist inside MDHD box).".to_owned(),
))
}
}
/// Parses the given media segment data to extract subtitles.
///
/// Handlers are registered for the `moof`, `traf`, `tfdt`, `tfhd`, `trun` and `mdat` boxes:
/// - `tfdt` supplies the base media decode time (0 when absent),
/// - `tfhd` supplies the default sample duration,
/// - `trun` supplies the per-sample entries,
/// - `mdat` payloads are decoded into cues using whatever state has been collected so far.
///
/// `period_start` is added to every cue's start and end time; `None` is treated as `0.0`.
///
/// # Errors
///
/// Returns an error when:
/// - the `tfdt` box has no version or a version other than 0 or 1,
/// - the `tfhd` box has no flags, or the `trun` box has no version or flags,
/// - an `mdat` box is reached while neither `tfdt` nor `trun` has been seen,
/// - reading any box or decoding the `mdat` payload fails.
pub fn parse(&self, data: &[u8], period_start: Option<f32>) -> Result<Subtitles> {
    let period_start = period_start.unwrap_or_default();
    let timescale = self.timescale;

    // Shared cells: the `tfdt`/`tfhd`/`trun` handlers write them, the `mdat` handler reads them.
    let base_time = data!(0_u64);
    let default_duration = data!();
    let presentations = data!(Vec::new());
    let saw_tfdt = data!(false);
    let saw_trun = data!(false);
    let subtitles = data!(Subtitles::new());

    let base_time_c = base_time.clone();
    let default_duration_c = default_duration.clone();
    let presentations_c = presentations.clone();
    let saw_tfdt_c = saw_tfdt.clone();
    let saw_trun_c = saw_trun.clone();
    let subtitles_c = subtitles.clone();

    Mp4Parser::new()
        .base_box("moof", parser::children)
        .base_box("traf", parser::children)
        .full_box("tfdt", move |mut box_| {
            *saw_tfdt_c.borrow_mut() = true;

            // A missing version is rejected like an unsupported one instead of panicking.
            if !matches!(box_.version, Some(0 | 1)) {
                bail!("TFDT box version can only be 0 or 1.");
            }

            let tfdt_box = TfdtBox::new(&mut box_)?;
            *base_time_c.borrow_mut() = tfdt_box.base_media_decode_time;
            Ok(())
        })
        .full_box("tfhd", move |mut box_| {
            if box_.flags.is_none() {
                bail!("TFHD box should have a valid flags value.");
            }

            let tfhd_box = TfhdBox::new(&mut box_)?;
            *default_duration_c.borrow_mut() = tfhd_box.default_sample_duration;
            Ok(())
        })
        .full_box("trun", move |mut box_| {
            *saw_trun_c.borrow_mut() = true;

            if box_.version.is_none() {
                bail!("TRUN box should have a valid version value.");
            }
            if box_.flags.is_none() {
                bail!("TRUN box should have a valid flags value.");
            }

            let trun_box = TrunBox::new(&mut box_)?;
            *presentations_c.borrow_mut() = trun_box.sample_data;
            Ok(())
        })
        .base_box(
            "mdat",
            parser::alldata(move |data| {
                // NOTE(review): this only fails when BOTH boxes are missing, although the
                // message reads as if either one missing were fatal - confirm which is
                // intended before tightening the condition.
                if !*saw_tfdt.borrow() && !*saw_trun.borrow() {
                    bail!("Some required boxes (either TFDT or TRUN) are missing.");
                }

                let cues = Self::parse_mdat(
                    *base_time.borrow(),
                    *default_duration.borrow(),
                    period_start,
                    &presentations.borrow(),
                    &data,
                    timescale,
                )?;
                subtitles_c.borrow_mut().extend_cues(cues);
                Ok(())
            }),
        )
        .parse(data, false, false)?;

    Ok(subtitles.take())
}
fn parse_mdat(
base_time: u64,
default_duration: Option<u32>,
period_start: f32,
presentations: &[TrunSample],
raw_payload: &[u8],
timescale: u32,
) -> Result<Vec<Cue>> {
let mut cues = Vec::new();
let mut current_time = base_time;
let mut reader = Reader::new_big_endian(raw_payload);
for presentation in presentations {
// If one presentation corresponds to multiple payloads, it is assumed
// that all of those payloads have the same start time and duration.
let duration = presentation.sample_duration.or(default_duration);
let start_time = if let Some(sample_composition_time_offset) =
presentation.sample_composition_time_offset
{
base_time + sample_composition_time_offset as u64
} else {
current_time
};
current_time = start_time + duration.unwrap_or_default() as u64;
// Read samples until it adds up to the given size.
let mut total_size = 0;
loop {
// Read the payload size.
let payload_size = reader.read_u32()? as i32;
total_size += payload_size;
// Skip the type.
let payload_type = reader.read_u32()?;
let payload_name = parser::type_to_string(payload_type as usize)?;
// Read the data payload.
let mut payload = None;
match payload_name.as_str() {
"vttc" => {
if payload_size > 8 {
payload = Some(reader.read_bytes_u8((payload_size - 8) as usize)?);
}
}
"vtte" => {
// It's a vtte, which is a vtt cue that is empty. Ignore any data that
// does exist.
reader.skip((payload_size - 8) as u64)?;
}
_ => {
// println!("Unknown box {} ! Skipping!", payload_name);
reader.skip((payload_size - 8) as u64)?;
}
}
if duration.is_some() {
if let Some(payload) = payload {
let cue = Self::parse_vttc(
&payload,
period_start + start_time as f32 / timescale as f32,
period_start + current_time as f32 / timescale as f32,
)?;
cues.push(cue);
}
} else {
bail!("WVTT sample duration unknown, and no default found.");
}
if !(presentation.sample_size.is_none()
|| total_size <= presentation.sample_size.unwrap_or_default() as i32)
{
bail!(
"The samples do not fit evenly into the sample sizes given in the TRUN box."
);
};
// If no sampleSize was specified, it's assumed that this presentation
// corresponds to only a single cue.
if !(presentation.sample_size.is_some()
&& (total_size < presentation.sample_size.unwrap_or_default() as i32))
{
break;
}
}
}
if reader.has_more_data() {
bail!("MDAT which contain VTT cues and non-VTT data are not currently supported.");
};
Ok(cues.into_iter().flatten().collect())
}
/// Builds a cue from the contents of a single `vttc` box.
///
/// The UTF-8 text of the `payl` child becomes the cue payload and the UTF-8 text of the
/// `sttg` child becomes the cue settings. The `iden` box is not handled here.
///
/// Returns `Ok(None)` when the payload text ends up empty, and an error when walking the box
/// contents fails or a child is not valid UTF-8.
fn parse_vttc(data: &[u8], start_time: f32, end_time: f32) -> Result<Option<Cue>> {
    let text = Rc::new(RefCell::new(String::new()));
    let cue_settings = Rc::new(RefCell::new(String::new()));

    // Handles moved into the box callbacks; the originals are read back afterwards.
    let text_sink = Rc::clone(&text);
    let settings_sink = Rc::clone(&cue_settings);

    Mp4Parser::new()
        .base_box(
            "payl",
            parser::alldata(move |bytes| {
                let decoded = String::from_utf8(bytes)?;
                *text_sink.borrow_mut() = decoded;
                Ok(())
            }),
        )
        .base_box(
            "sttg",
            parser::alldata(move |bytes| {
                let decoded = String::from_utf8(bytes)?;
                *settings_sink.borrow_mut() = decoded;
                Ok(())
            }),
        )
        .parse(data, false, false)?;

    let text = text.take();
    if text.is_empty() {
        return Ok(None);
    }

    Ok(Some(Cue {
        payload: text,
        settings: cue_settings.take(),
        start_time,
        end_time,
    }))
}
}