re_video/decode/mod.rs
1//! Video frame decoding.
2//! =========================
3//!
4//! Whirlwind tour of how to interpret picture data (from a Video perspective)
5//! ---------------------------------------------------------------------------------
6//!
7//! Extracted from the [av1 codec wiki](https://wiki.x266.mov/docs/colorimetry/intro) and other sources.
8//! Follows the trail of information we get from our AV1 decoder.
9//!
10//! ### How to get from YUV to RGB?
11//!
12//! Things to know about the incoming yuv data:
13//! * `picture.bit_depth()`
14//! * is either 8 or 16
15//! * that's how the decoder stores it for us, but per component we have either 8, 10, or 12 bits -> see `picture.bits_per_component()`
16//! * `picture.pixel_layout()`
17//! * `4:0:0` grayscale
18//! * `4:2:0` half horizontal and half vertical resolution for chroma
19//! * `4:2:2` half horizontal resolution for chroma
20//! * `4:4:4` full resolution for chroma
21//! * note that the AV1 decoder gives us always (!) planar data
22//! * `picture.color_range()`
23//! * yuv data range may be either `limited` or `full`
24//! * `full` is what you'd naively expect: just fully use the entire 8/10/12 bits!
25//! * `limited` means that only a certain range of values is valid
26//! * weirdly enough, DO NOT CLAMP! a lot of software may say it's limited but then use the so-called foot and head space anyways to go outside the regular colors
27//! * reportedly (read this on some forums ;-)) some players _do_ clamp, so let's not get too concerned about this
28//! * it's a remnant of the analog age, but it's still very common!
29//!
30//! ### Given a normalized YUV triplet, how do we get color?
31//!
32//! * `picture.matrix_coefficients()` (see <https://wiki.x266.mov/docs/colorimetry/matrix>)
33//! * this tells us what to multiply the incoming YUV data with to get SOME RGB data
34//! * there's various standards of how to do this, but the most common is BT.709
35//! * here's a fun special one: `identity` means it's not actually YUV, but GBR!
36//! * `picture.primaries()`
37//! * now we have RGB but we kinda have no idea what that means!
38//! * the color primaries tell us which space we're in
39//! * ...meaning that if the primaries are anything else we'd have to do some conversion BUT
40//! it also means that we have no chance of displaying the picture perfectly on a screen taking in sRGB (or any other not-matching color space)
41//! * [Wikipedia says](https://en.wikipedia.org/wiki/Rec._709#Relationship_to_sRGB) sRGB uses the same primaries as BT.709
42//! * but I also found other sources (e.g. [this forum post](https://forum.doom9.org/showthread.php?p=1640342#post1640342))
43//! claiming that they're just close enough to be considered the same for practical purposes
44//! * `picture.transfer_characteristics()`
45//! * until this point everything is "gamma compressed", or more accurately, went through Opto Electric Transfer Function (OETF)
46//! * i.e. measure of light in, electronic signal out
47//! * we have to keep in mind the EOTF that our screen at the other end will use which for today's renderpipeline is always sRGB
48//! (meaning it's a 2.2 gamma curve with a small linear part)
49//! * Similar to the primaries, BT.709 uses a _similar_ transfer function as sRGB, but not exactly the same
50//! <https://www.image-engineering.de/library/technotes/714-color-spaces-rec-709-vs-srgb>
51//! * There's reason to believe players just ignore this:
52//! * From a [VLC issue](https://code.videolan.org/videolan/vlc/-/issues/26999):
53//! > We do not support transfers or primaries anyway, so it does not matter
54//! > (we do support HDR transfer functions PQ and HLG, not SDR ones and we support BT.2020 primaries, but not SMPTE C (which is what BT.601 NTSC is))."
55//! * …I'm sure I found a report of other video players ignoring this and most of everything except `matrix_coefficients` but I can't find it anymore :(
56//!
57//! All of the above are completely optional for a video to specify and there's sometimes some interplay of relationships with those.
58//! (a standard would often specify several things at once, there's typical and less typical combinations)
59//! So naturally, people will use terms sloppily and interchangeably.
60//! If anything is lacking a video player has to make a guess.
61//! … and as discussed above, even if it's there, often video players tend to ignore some settings!
62//!
63//! With all this out of the way…
64//!
65//! ### What's the state of us making use of all these things?
66//!
67//! * ❌ `picture.bit_depth()`
68//! * TODO(#7594): ignored, we just pretend everything is 8 bits
69//! * ✅ `picture.pixel_layout()`
70//! * ✅ `picture.color_range()`
71//! * 🟧 `picture.matrix_coefficients()`
72//! * we try to figure out whether to use `BT.709` or `BT.601` coefficients, using other characteristics for guessing if nothing else is available.
73//! * ❌ `picture.primaries()`
74//! * ❌ `picture.transfer_characteristics()`
75//!
76//! We'll very likely be good with this until either we get specific feature requests and/or we'll start
77//! supporting HDR content at which point more properties will be important!
78//!
79
80#[cfg(with_dav1d)]
81mod async_decoder_wrapper;
82#[cfg(with_dav1d)]
83mod av1;
84
85#[cfg(with_ffmpeg)]
86mod ffmpeg_h264;
87
88#[cfg(with_ffmpeg)]
89pub use ffmpeg_h264::{
90 Error as FFmpegError, FFmpegVersion, FFmpegVersionParseError, ffmpeg_download_url,
91};
92
93#[cfg(target_arch = "wasm32")]
94mod webcodecs;
95
96mod gop_detection;
97
98pub use gop_detection::{DetectGopStartError, GopStartDetection, detect_gop_start};
99
100use crate::{SampleIndex, Time, VideoDataDescription};
101
102#[derive(thiserror::Error, Debug, Clone)]
103pub enum DecodeError {
104 #[error("Unsupported codec: {0}")]
105 UnsupportedCodec(String),
106
107 #[cfg(with_dav1d)]
108 #[error("dav1d: {0}")]
109 Dav1d(#[from] dav1d::Error),
110
111 #[error("To enabled native AV1 decoding, compile Rerun with the `nasm` feature enabled.")]
112 Dav1dWithoutNasm,
113
114 #[error(
115 "Rerun does not yet support native AV1 decoding on Linux ARM64. See https://github.com/rerun-io/rerun/issues/7755"
116 )]
117 NoDav1dOnLinuxArm64,
118
119 #[cfg(target_arch = "wasm32")]
120 #[error(transparent)]
121 WebDecoder(#[from] webcodecs::WebError),
122
123 #[cfg(with_ffmpeg)]
124 #[error(transparent)]
125 Ffmpeg(std::sync::Arc<FFmpegError>),
126
127 #[error("Unsupported bits per component: {0}")]
128 BadBitsPerComponent(usize),
129}
130
impl DecodeError {
    /// Whether it is worth feeding the decoder more chunks after hitting this error.
    ///
    /// Returns `false` only for permanent failures (codec unsupported, decoder
    /// missing, unsupported pixel format); transient decode errors return `true`.
    pub fn should_request_more_frames(&self) -> bool {
        // Decoders often (not always!) recover from errors and will succeed eventually.
        // Gotta keep trying!
        match self {
            // Unsupported codec / decoder not available:
            Self::UnsupportedCodec(_) | Self::Dav1dWithoutNasm | Self::NoDav1dOnLinuxArm64 => false,

            // Issue with AV1 decoding.
            #[cfg(with_dav1d)]
            Self::Dav1d(_) => true,

            // Issue with WebCodecs decoding.
            #[cfg(target_arch = "wasm32")]
            Self::WebDecoder(_) => true,

            // Issue with FFmpeg decoding — defer to the wrapped error's own judgement.
            #[cfg(with_ffmpeg)]
            Self::Ffmpeg(err) => err.should_request_more_frames(),

            // Unsupported format.
            Self::BadBitsPerComponent(_) => false,
        }
    }
}
156
/// Convenience result type used throughout this module; defaults to [`DecodeError`].
pub type Result<T = (), E = DecodeError> = std::result::Result<T, E>;

/// Callback for decoding a single frame, called by decoders upon decoding a frame or hitting an error.
#[allow(dead_code)] // May be unused in some configurations where we don't have any decoder.
pub type OutputCallback = dyn Fn(Result<Frame>) + Send + Sync;
162
/// Interface for an asynchronous video decoder.
///
/// Output callback is passed in on creation of a concrete type.
pub trait AsyncDecoder: Send + Sync {
    /// Submits a chunk for decoding in the background.
    ///
    /// Chunks are expected to come in the order of their decoding timestamp.
    fn submit_chunk(&mut self, chunk: Chunk) -> Result<()>;

    /// Called after submitting the last chunk.
    ///
    /// Should flush all pending frames.
    /// If you plan on sending more chunks after calling `end_of_video`,
    /// you MUST call [`Self::reset`] FIRST.
    ///
    /// The default implementation is a no-op, for decoders that need no explicit flush.
    ///
    /// Implementation note:
    /// As of writing there's two decoders that have requirements on what happens for new frames after `end_of_video`
    /// * WebCodec: The next submitted chunk has to be a key frame.
    /// * FFmpeg-executable: We've shut down stdin, thus we need to restart the process. Doing this without the full context of `reset` is not possible right now.
    fn end_of_video(&mut self) -> Result<()> {
        Ok(())
    }

    /// Resets the decoder.
    ///
    /// Expected to be called for backward seeking and major jumps forward in the video.
    /// Newly created decoder can assume to get reset at least once before any chunks are submitted.
    ///
    /// This does not block, all chunks sent to `decode` before this point will be discarded.
    /// Previously missing [`VideoDataDescription::encoding_details`] may be present now.
    fn reset(&mut self, video_descr: &VideoDataDescription) -> Result<()>;

    /// Minimum number of samples the decoder requests to stay ahead of the currently requested sample.
    ///
    /// I.e. if sample N is requested, then the encoder would like to see at least all the samples from
    /// [start of N's GOP] until [N + `min_num_samples_to_enqueue_ahead`].
    /// Codec specific constraints regarding what samples can be decoded (samples may depend on other samples in their GOP)
    /// still apply independently of this.
    ///
    /// This can be used as a workaround for decoders that are known to need additional samples to produce outputs.
    /// Defaults to 0, i.e. no extra lookahead requested.
    fn min_num_samples_to_enqueue_ahead(&self) -> usize {
        0
    }
}
207
/// Creates a new async decoder for the given `video` data.
///
/// Which decoder is picked depends on the target platform and compile-time features:
/// web builds always hand off to the browser's WebCodecs API; native builds pick
/// a decoder based on [`crate::VideoDataDescription::codec`].
/// `on_output` is invoked by the decoder upon decoding a frame or hitting an error.
pub fn new_decoder(
    debug_name: &str,
    video: &crate::VideoDataDescription,
    decode_settings: &DecodeSettings,
    on_output: impl Fn(Result<Frame>) + Send + Sync + 'static,
) -> Result<Box<dyn AsyncDecoder>> {
    // Depending on the feature-flag combination, some arguments and code paths are unused.
    #![allow(unused_variables, clippy::needless_return)] // With some feature flags

    re_tracing::profile_function!();

    re_log::trace!(
        "Looking for decoder for {}",
        video.human_readable_codec_string()
    );

    // On the web we always use WebCodecs, regardless of codec.
    #[cfg(target_arch = "wasm32")]
    return Ok(Box::new(webcodecs::WebVideoDecoder::new(
        video,
        decode_settings.hw_acceleration,
        on_output,
    )?));

    #[cfg(not(target_arch = "wasm32"))]
    match video.codec {
        #[cfg(feature = "av1")]
        crate::VideoCodec::AV1 => {
            // dav1d is not supported on Linux ARM64, see `DecodeError::NoDav1dOnLinuxArm64`.
            #[cfg(linux_arm64)]
            {
                return Err(DecodeError::NoDav1dOnLinuxArm64);
            }

            #[cfg(with_dav1d)]
            {
                re_log::trace!("Decoding AV1…");
                // The dav1d decoder is synchronous, so wrap it to run asynchronously.
                return Ok(Box::new(async_decoder_wrapper::AsyncDecoderWrapper::new(
                    debug_name.to_owned(),
                    Box::new(av1::SyncDav1dDecoder::new(debug_name.to_owned())?),
                    on_output,
                )));
            }
        }

        #[cfg(with_ffmpeg)]
        crate::VideoCodec::H264 => {
            re_log::trace!("Decoding H.264…");
            Ok(Box::new(ffmpeg_h264::FFmpegCliH264Decoder::new(
                debug_name.to_owned(),
                &video.encoding_details,
                on_output,
                decode_settings.ffmpeg_path.clone(),
            )?))
        }

        // Either an unknown codec, or the matching decoder feature isn't enabled.
        _ => Err(DecodeError::UnsupportedCodec(
            video.human_readable_codec_string(),
        )),
    }
}
267
/// One chunk of encoded video data, representing a single [`crate::SampleMetadata`].
///
/// For details on how to interpret the data, see [`crate::SampleMetadata`].
///
/// In MP4, one sample is one frame.
pub struct Chunk {
    /// The start of a new [`crate::demux::GroupOfPictures`]?
    ///
    /// This probably means this is a _keyframe_, and that an entire frame
    /// can be decoded from only this one sample (though I'm not 100% sure).
    pub is_sync: bool,

    /// The encoded video data for this sample.
    pub data: Vec<u8>,

    /// Which sample (frame) did this chunk come from?
    ///
    /// This is the order in which the samples appear in the container,
    /// which is usually ordered by [`Self::decode_timestamp`].
    pub sample_idx: usize,

    /// Which frame does this chunk belong to?
    ///
    /// This is on the assumption that each sample produces a single frame,
    /// which is true for MP4.
    ///
    /// This is the index of samples ordered by [`Self::presentation_timestamp`].
    ///
    /// Do *not* use this to index into the video data description!
    /// Use [`Self::sample_idx`] instead.
    pub frame_nr: u32,

    /// Decode timestamp of this sample.
    /// Chunks are expected to be submitted in the order of decode timestamp.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub decode_timestamp: Time,

    /// Time at which this sample appears in the frame stream, in time units.
    ///
    /// The frame should be shown at this time.
    /// Often synonymous with `composition_timestamp`.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub presentation_timestamp: Time,

    /// Duration of the sample.
    ///
    /// Typically the time difference in presentation timestamp to the next sample.
    /// May be unknown if this is the last sample in an ongoing video stream.
    pub duration: Option<Time>,
}
319
320/// Data for a decoded frame on native targets.
/// Data for a decoded frame on native targets.
#[cfg(not(target_arch = "wasm32"))]
pub struct FrameContent {
    /// Raw decoded pixel data, to be interpreted according to [`Self::format`].
    pub data: Vec<u8>,
    /// Width of the frame in pixels.
    pub width: u32,
    /// Height of the frame in pixels.
    pub height: u32,
    /// Pixel format/layout of [`Self::data`].
    pub format: PixelFormat,
}
328
329#[cfg(not(target_arch = "wasm32"))]
330impl re_byte_size::SizeBytes for FrameContent {
331 fn heap_size_bytes(&self) -> u64 {
332 let Self {
333 data,
334 width: _,
335 height: _,
336 format: _,
337 } = self;
338 data.heap_size_bytes()
339 }
340}
341
#[cfg(not(target_arch = "wasm32"))]
impl FrameContent {
    /// Width of the frame in pixels.
    ///
    /// Accessor mirroring the web `FrameContent` API, so callers can stay platform-agnostic.
    pub fn width(&self) -> u32 {
        self.width
    }

    /// Height of the frame in pixels.
    ///
    /// Accessor mirroring the web `FrameContent` API, so callers can stay platform-agnostic.
    pub fn height(&self) -> u32 {
        self.height
    }
}
352
/// Data for a decoded frame on the web.
///
/// On the web the browser hands us a video frame object instead of raw pixel data.
#[cfg(target_arch = "wasm32")]
pub type FrameContent = webcodecs::WebVideoFrame;

#[cfg(target_arch = "wasm32")]
impl FrameContent {
    /// Width of the frame in pixels, as reported by the browser's display width.
    pub fn width(&self) -> u32 {
        self.display_width()
    }

    /// Height of the frame in pixels, as reported by the browser's display height.
    pub fn height(&self) -> u32 {
        self.display_height()
    }
}
367
/// Meta information about a decoded video frame, as reported by the decoder.
#[derive(Debug, Clone)]
pub struct FrameInfo {
    /// The start of a new [`crate::demux::GroupOfPictures`]?
    ///
    /// This probably means this is a _keyframe_, and that an entire frame
    /// can be decoded from only this one sample (though I'm not 100% sure).
    ///
    /// None = unknown.
    pub is_sync: Option<bool>,

    /// Which sample in the video is this from?
    ///
    /// We always assume one sample leads one frame
    /// (but may provide arbitrary additional information which may be needed for other frames in the GOP).
    ///
    /// This is the order in which the samples appear in the container,
    /// which is ordered by [`Self::latest_decode_timestamp`].
    /// I.e. this is NOT ordered by [`Self::presentation_timestamp`].
    ///
    /// None = unknown.
    pub sample_idx: Option<SampleIndex>,

    /// Which frame is this?
    ///
    /// This is on the assumption that each sample produces a single frame,
    /// which is true for MP4.
    ///
    /// This is the index of frames ordered by [`Self::presentation_timestamp`].
    ///
    /// None = unknown.
    pub frame_nr: Option<u32>,

    /// Time at which this frame appears in the frame stream, in time units.
    ///
    /// The frame should be shown at this time.
    /// We expect this timestamp to be identical with the presentation timestamp of the [`crate::Chunk`]
    /// which is associated with this frame.
    /// Often synonymous with `composition_timestamp`.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub presentation_timestamp: Time,

    /// Duration of the frame.
    ///
    /// Typically the time difference in presentation timestamp to the next frame.
    /// May be unknown if this is the last frame in an ongoing video stream.
    pub duration: Option<Time>,

    /// The decode timestamp of the last chunk that was needed to decode this frame.
    ///
    /// None = unknown.
    pub latest_decode_timestamp: Option<Time>,
}
422
423impl FrameInfo {
424 /// Presentation timestamp range in which this frame is valid.
425 ///
426 /// If there's no known duration, the range is open ended.
427 pub fn presentation_time_range(&self) -> std::ops::Range<Time> {
428 if let Some(duration) = self.duration {
429 self.presentation_timestamp..self.presentation_timestamp + duration
430 } else {
431 self.presentation_timestamp..Time::MAX
432 }
433 }
434}
435
/// One decoded video frame.
pub struct Frame {
    /// The decoded pixel data (platform-dependent representation).
    pub content: FrameContent,
    /// Meta information about the frame, as reported by the decoder.
    pub info: FrameInfo,
}
441
442impl re_byte_size::SizeBytes for Frame {
443 fn heap_size_bytes(&self) -> u64 {
444 let Self { content, info: _ } = self;
445 content.heap_size_bytes()
446 }
447}
448
/// Pixel format/layout used by [`FrameContent::data`].
#[derive(Debug, Clone)]
pub enum PixelFormat {
    /// 8-bit-per-channel RGB (24 bits per pixel).
    Rgb8Unorm,

    /// 8-bit-per-channel RGBA (32 bits per pixel).
    Rgba8Unorm,

    /// Planar YUV data — see the module docs for how to interpret it.
    Yuv {
        /// Chroma subsampling layout of the planes.
        layout: YuvPixelLayout,
        /// Whether the values use the limited or full range.
        range: YuvRange,
        // TODO(andreas): Color primaries should also apply to RGB data,
        // but for now we just always assume RGB to be BT.709 ~= sRGB.
        /// Matrix coefficients to use when converting to RGB.
        coefficients: YuvMatrixCoefficients,
        // Note that we don't handle chroma sample location at all so far.
    },
}
464
465impl PixelFormat {
466 pub fn bits_per_pixel(&self) -> u32 {
467 match self {
468 Self::Rgb8Unorm { .. } => 24,
469 Self::Rgba8Unorm { .. } => 32,
470 Self::Yuv { layout, .. } => match layout {
471 YuvPixelLayout::Y_U_V444 => 24,
472 YuvPixelLayout::Y_U_V422 => 16,
473 YuvPixelLayout::Y_U_V420 => 12,
474 YuvPixelLayout::Y400 => 8,
475 },
476 }
477 }
478}
479
/// Pixel layout used by [`PixelFormat::Yuv`].
///
/// For details see `re_renderer`'s `YuvPixelLayout` type.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum YuvPixelLayout {
    /// 4:4:4 — full resolution for chroma, planar.
    Y_U_V444,
    /// 4:2:2 — half horizontal resolution for chroma, planar.
    Y_U_V422,
    /// 4:2:0 — half horizontal and half vertical resolution for chroma, planar.
    Y_U_V420,
    /// 4:0:0 — luma only (grayscale).
    Y400,
}
491
/// Yuv value range used by [`PixelFormat::Yuv`].
///
/// For details see `re_renderer`'s `YuvRange` type.
#[derive(Debug, Clone, Copy)]
pub enum YuvRange {
    /// Only a sub-range of the available bits is nominally valid.
    ///
    /// Note that values outside of it may still occur — see the module docs: do not clamp!
    Limited,

    /// The entire available value range is used.
    Full,
}
500
/// Yuv matrix coefficients used by [`PixelFormat::Yuv`].
///
/// For details see `re_renderer`'s `YuvMatrixCoefficients` type.
#[derive(Debug, Clone, Copy)]
pub enum YuvMatrixCoefficients {
    /// Interpret YUV as GBR.
    Identity,

    /// BT.601 coefficients (typically associated with standard-definition content).
    Bt601,

    /// BT.709 coefficients (typically associated with high-definition content).
    Bt709,
}
513
/// How the video should be decoded.
///
/// Depending on the decoder backend, these settings are merely hints and may be ignored.
/// However, they can be useful in some situations to work around issues.
///
/// Can also be parsed from a human-readable string, see the `FromStr` impl in this module.
///
/// On the web this directly corresponds to
/// <https://www.w3.org/TR/webcodecs/#hardware-acceleration>
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub enum DecodeHardwareAcceleration {
    /// May use hardware acceleration if available and compatible with the codec.
    #[default]
    Auto,

    /// Should use a software decoder even if hardware acceleration is available.
    ///
    /// If no software decoder is present, this may cause decoding to fail.
    PreferSoftware,

    /// Should use a hardware decoder.
    ///
    /// If no hardware decoder is present, this may cause decoding to fail.
    PreferHardware,
}
538
/// Settings for video decoding.
#[derive(Debug, Clone, PartialEq, Eq, Default, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub struct DecodeSettings {
    /// How the video should be decoded (hardware vs. software decoder).
    pub hw_acceleration: DecodeHardwareAcceleration,

    /// Custom path for the ffmpeg binary.
    ///
    /// If not provided, we use the path automatically determined by `ffmpeg_sidecar`.
    #[cfg(not(target_arch = "wasm32"))]
    pub ffmpeg_path: Option<std::path::PathBuf>,
}
552
553impl std::fmt::Display for DecodeHardwareAcceleration {
554 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
555 match self {
556 Self::Auto => write!(f, "Auto"),
557 Self::PreferSoftware => write!(f, "Prefer software"),
558 Self::PreferHardware => write!(f, "Prefer hardware"),
559 }
560 }
561}
562
563impl std::str::FromStr for DecodeHardwareAcceleration {
564 type Err = ();
565
566 fn from_str(s: &str) -> Result<Self, Self::Err> {
567 match s.trim().to_lowercase().replace('-', "_").as_str() {
568 "auto" => Ok(Self::Auto),
569 "prefer_software" | "software" => Ok(Self::PreferSoftware),
570 "prefer_hardware" | "hardware" => Ok(Self::PreferHardware),
571 _ => Err(()),
572 }
573 }
574}