re_video/decode/
mod.rs

1//! Video frame decoding.
2//! =========================
3//!
4//! Whirlwind tour of how to interpret picture data (from a Video perspective)
5//! ---------------------------------------------------------------------------------
6//!
7//! Extracted from the [av1 codec wiki](https://wiki.x266.mov/docs/colorimetry/intro) and other sources.
8//! Follows the trail of information we get from our AV1 decoder.
9//!
10//! ### How to get from YUV to RGB?
11//!
12//! Things to know about the incoming yuv data:
13//! * `picture.bit_depth()`
14//!   * is either 8 or 16
15//!   * that's how the decoder stores for us but the per component we have either 8 or 10 or 12 bits -> see `picture.bits_per_component()`
16//! * `picture.pixel_layout()`
17//!   * `4:0:0` grayscale
18//!   * `4:2:0` half horizontal and half vertical resolution for chroma
19//!   * `4:2:2` half horizontal resolution for chroma
20//!   * `4:4:4` full resolution for chroma
21//!   * note that the AV1 decoder gives us always (!) planar data
22//! * `picture.color_range()`
23//!   * yuv data range may be either `limited` or `full`
//!   * `full` is what you'd naively expect: just fully use the entire 8/10/12 bits!
25//!   * `limited` means that only a certain range of values is valid
26//!      * weirdly enough, DO NOT CLAMP! a lot of software may say it's limited but then use the so-called foot and head space anyways to go outside the regular colors
27//!          * reportedly (read this on some forums ;-)) some players _do_ clamp, so let's not get too concerned about this
28//!      * it's a remnant of the analog age, but it's still very common!
29//!
30//! ### Given a normalized YUV triplet, how do we get color?
31//!
32//! * `picture.matrix_coefficients()` (see <https://wiki.x266.mov/docs/colorimetry/matrix>)
33//!   * this tells us what to multiply the incoming YUV data with to get SOME RGB data
34//!   * there's various standards of how to do this, but the most common is BT.709
35//!   * here's a fun special one: `identity` means it's not actually YUV, but GBR!
36//! * `picture.primaries()`
37//!   * now we have RGB but we kinda have no idea what that means!
38//!   * the color primaries tell us which space we're in
39//!   * ...meaning that if the primaries are anything else we'd have to do some conversion BUT
40//!     it also means that we have no chance of displaying the picture perfectly on a screen taking in sRGB (or any other not-matching color space)
41//!   * [Wikipedia says](https://en.wikipedia.org/wiki/Rec._709#Relationship_to_sRGB) sRGB uses the same primaries as BT.709
42//!       * but I also found other sources (e.g. [this forum post](https://forum.doom9.org/showthread.php?p=1640342#post1640342))
//!         claiming that they're just close enough to be considered the same for practical purposes
44//! * `picture.transfer_characteristics()`
45//!   * until this point everything is "gamma compressed", or more accurately, went through Opto Electric Transfer Function (OETF)
46//!       * i.e. measure of light in, electronic signal out
47//!   * we have to keep in mind the EOTF that our screen at the other end will use which for today's renderpipeline is always sRGB
48//!     (meaning it's a 2.2 gamma curve with a small linear part)
49//!   * Similar to the primaries, BT.709 uses a _similar_ transfer function as sRGB, but not exactly the same
50//!      <https://www.image-engineering.de/library/technotes/714-color-spaces-rec-709-vs-srgb>
51//!        * There's reason to believe players just ignore this:
52//!           * From a [VLC issue](https://code.videolan.org/videolan/vlc/-/issues/26999):
53//!              > We do not support transfers or primaries anyway, so it does not matter
54//!              > (we do support HDR transfer functions PQ and HLG, not SDR ones and we support BT.2020 primaries, but not SMPTE C (which is what BT.601 NTSC is))."
55//!           * …I'm sure I found a report of other video players ignoring this and most of everything except `matrix_coefficients` but I can't find it anymore :(
56//!
57//! All of the above are completely optional for a video to specify and there's sometimes some interplay of relationships with those.
58//! (a standard would often specify several things at once, there's typical and less typical combinations)
//! So naturally, people will use terms sloppily and interchangeably.
//! If anything is lacking, a video player has to make a guess.
//! … and as discussed above, even if it's there, video players often tend to ignore some settings!
62//!
63//! With all this out of the way…
64//!
65//! ### What's the state of us making use of all these things?
66//!
67//! * ❌ `picture.bit_depth()`
68//!   * TODO(#7594): ignored, we just pretend everything is 8 bits
69//! * ✅ `picture.pixel_layout()`
70//! * ✅ `picture.color_range()`
71//! * 🟧 `picture.matrix_coefficients()`
72//!    * we try to figure out whether to use `BT.709` or `BT.601` coefficients, using other characteristics for guessing if nothing else is available.
73//! * ❌ `picture.primaries()`
74//! * ❌ `picture.transfer_characteristics()`
75//!
76//! We'll very likely be good with this until either we get specific feature requests and/or we'll start
77//! supporting HDR content at which point more properties will be important!
78//!
79
80#[cfg(with_dav1d)]
81mod async_decoder_wrapper;
82#[cfg(with_dav1d)]
83mod av1;
84
85#[cfg(with_ffmpeg)]
86mod ffmpeg_h264;
87
88#[cfg(with_ffmpeg)]
89pub use ffmpeg_h264::{
90    Error as FFmpegError, FFmpegVersion, FFmpegVersionParseError, ffmpeg_download_url,
91};
92
93#[cfg(target_arch = "wasm32")]
94mod webcodecs;
95
96mod gop_detection;
97
98pub use gop_detection::{DetectGopStartError, GopStartDetection, detect_gop_start};
99
100use crate::{SampleIndex, Time, VideoDataDescription};
101
102#[derive(thiserror::Error, Debug, Clone)]
103pub enum DecodeError {
104    #[error("Unsupported codec: {0}")]
105    UnsupportedCodec(String),
106
107    #[cfg(with_dav1d)]
108    #[error("dav1d: {0}")]
109    Dav1d(#[from] dav1d::Error),
110
111    #[error("To enabled native AV1 decoding, compile Rerun with the `nasm` feature enabled.")]
112    Dav1dWithoutNasm,
113
114    #[error(
115        "Rerun does not yet support native AV1 decoding on Linux ARM64. See https://github.com/rerun-io/rerun/issues/7755"
116    )]
117    NoDav1dOnLinuxArm64,
118
119    #[cfg(target_arch = "wasm32")]
120    #[error(transparent)]
121    WebDecoder(#[from] webcodecs::WebError),
122
123    #[cfg(with_ffmpeg)]
124    #[error(transparent)]
125    Ffmpeg(std::sync::Arc<FFmpegError>),
126
127    #[error("Unsupported bits per component: {0}")]
128    BadBitsPerComponent(usize),
129}
130
impl DecodeError {
    /// Should the caller keep submitting chunks to the decoder after hitting this error?
    ///
    /// Returns `false` only for errors that can never resolve themselves,
    /// i.e. a missing/unsupported decoder or an unsupported format.
    pub fn should_request_more_frames(&self) -> bool {
        // Decoders often (not always!) recover from errors and will succeed eventually.
        // Gotta keep trying!
        match self {
            // Unsupported codec / decoder not available:
            Self::UnsupportedCodec(_) | Self::Dav1dWithoutNasm | Self::NoDav1dOnLinuxArm64 => false,

            // Issue with AV1 decoding.
            #[cfg(with_dav1d)]
            Self::Dav1d(_) => true,

            // Issue with WebCodecs decoding.
            #[cfg(target_arch = "wasm32")]
            Self::WebDecoder(_) => true,

            // Issue with FFmpeg decoding; defer to the wrapped error's own judgement.
            #[cfg(with_ffmpeg)]
            Self::Ffmpeg(err) => err.should_request_more_frames(),

            // Unsupported format.
            Self::BadBitsPerComponent(_) => false,
        }
    }
}
156
/// Convenience result type for video decoding, with the error type defaulting to [`DecodeError`].
pub type Result<T = (), E = DecodeError> = std::result::Result<T, E>;

/// Callback for decoding a single frame, called by decoders upon decoding a frame or hitting an error.
#[allow(dead_code)] // May be unused in some configurations where we don't have any decoder.
pub type OutputCallback = dyn Fn(Result<Frame>) + Send + Sync;
162
/// Interface for an asynchronous video decoder.
///
/// Output callback is passed in on creation of a concrete type.
pub trait AsyncDecoder: Send + Sync {
    /// Submits a chunk for decoding in the background.
    ///
    /// Chunks are expected to come in the order of their decoding timestamp.
    fn submit_chunk(&mut self, chunk: Chunk) -> Result<()>;

    /// Called after submitting the last chunk.
    ///
    /// Should flush all pending frames.
    /// If you plan on sending more chunks after calling `end_of_video`,
    /// you MUST call [`Self::reset`] FIRST.
    ///
    /// The default implementation does nothing.
    ///
    /// Implementation note:
    /// As of writing there's two decoders that have requirements on what happens for new frames after `end_of_video`:
    /// * WebCodec: The next submitted chunk has to be a key frame.
    /// * FFmpeg-executable: We've shut down stdin, thus we need to restart the process. Doing this without the full context of `reset` is not possible right now.
    fn end_of_video(&mut self) -> Result<()> {
        Ok(())
    }

    /// Resets the decoder.
    ///
    /// Expected to be called for backward seeking and major jumps forward in the video.
    /// A newly created decoder can assume to get reset at least once before any chunks are submitted.
    ///
    /// This does not block, all chunks sent to `decode` before this point will be discarded.
    /// Previously missing [`VideoDataDescription::encoding_details`] may be present now.
    fn reset(&mut self, video_descr: &VideoDataDescription) -> Result<()>;

    /// Minimum number of samples the decoder requests to stay ahead of the currently requested sample.
    ///
    /// I.e. if sample N is requested, then the decoder would like to see at least all the samples from
    /// [start of N's GOP] until [N + `min_num_samples_to_enqueue_ahead`].
    /// Codec specific constraints regarding what samples can be decoded (samples may depend on other samples in their GOP)
    /// still apply independently of this.
    ///
    /// This can be used as a workaround for decoders that are known to need additional samples to produce outputs.
    ///
    /// Defaults to 0, i.e. no extra samples are requested.
    fn min_num_samples_to_enqueue_ahead(&self) -> usize {
        0
    }
}
207
/// Creates a new async decoder for the given `video` data.
///
/// Which backend is used is decided at compile time:
/// * on web (`wasm32`) this is always the browser's WebCodecs API,
/// * on native it depends on the codec: AV1 goes through the built-in dav1d decoder
///   (if compiled in), H.264 through the external FFmpeg executable (if compiled in).
///
/// `on_output` is invoked by the decoder for every decoded frame or decoding error.
///
/// Errors if no decoder backend is available for the codec, or if the decoder fails to start.
pub fn new_decoder(
    debug_name: &str,
    video: &crate::VideoDataDescription,
    decode_settings: &DecodeSettings,
    on_output: impl Fn(Result<Frame>) + Send + Sync + 'static,
) -> Result<Box<dyn AsyncDecoder>> {
    #![allow(unused_variables, clippy::needless_return)] // With some feature flags

    re_tracing::profile_function!();

    re_log::trace!(
        "Looking for decoder for {}",
        video.human_readable_codec_string()
    );

    // On web, WebCodecs handles every codec we support; no further dispatch needed.
    #[cfg(target_arch = "wasm32")]
    return Ok(Box::new(webcodecs::WebVideoDecoder::new(
        video,
        decode_settings.hw_acceleration,
        on_output,
    )?));

    #[cfg(not(target_arch = "wasm32"))]
    match video.codec {
        #[cfg(feature = "av1")]
        crate::VideoCodec::AV1 => {
            #[cfg(linux_arm64)]
            {
                return Err(DecodeError::NoDav1dOnLinuxArm64);
            }

            #[cfg(with_dav1d)]
            {
                re_log::trace!("Decoding AV1…");
                // dav1d decodes synchronously, so wrap it to get the async interface.
                return Ok(Box::new(async_decoder_wrapper::AsyncDecoderWrapper::new(
                    debug_name.to_owned(),
                    Box::new(av1::SyncDav1dDecoder::new(debug_name.to_owned())?),
                    on_output,
                )));
            }
        }

        #[cfg(with_ffmpeg)]
        crate::VideoCodec::H264 => {
            re_log::trace!("Decoding H.264…");
            Ok(Box::new(ffmpeg_h264::FFmpegCliH264Decoder::new(
                debug_name.to_owned(),
                &video.encoding_details,
                on_output,
                decode_settings.ffmpeg_path.clone(),
            )?))
        }

        _ => Err(DecodeError::UnsupportedCodec(
            video.human_readable_codec_string(),
        )),
    }
}
267
/// One chunk of encoded video data, representing a single [`crate::SampleMetadata`].
///
/// For details on how to interpret the data, see [`crate::SampleMetadata`].
///
/// In MP4, one sample is one frame.
pub struct Chunk {
    /// The start of a new [`crate::demux::GroupOfPictures`]?
    ///
    /// This probably means this is a _keyframe_, and that an entire frame
    /// can be decoded from only this one sample (though I'm not 100% sure).
    pub is_sync: bool,

    /// The encoded data of this sample.
    pub data: Vec<u8>,

    /// Which sample (frame) did this chunk come from?
    ///
    /// This is the order of which the samples appear in the container,
    /// which is usually ordered by [`Self::decode_timestamp`].
    pub sample_idx: usize,

    /// Which frame does this chunk belong to?
    ///
    /// This is on the assumption that each sample produces a single frame,
    /// which is true for MP4.
    ///
    /// This is the index of samples ordered by [`Self::presentation_timestamp`].
    ///
    /// Do *not* use this to index into the video data description!
    /// Use [`Self::sample_idx`] instead.
    pub frame_nr: u32,

    /// Decode timestamp of this sample.
    /// Chunks are expected to be submitted in the order of decode timestamp.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub decode_timestamp: Time,

    /// Time at which this sample appears in the frame stream, in time units.
    ///
    /// The frame should be shown at this time.
    /// Often synonymous with `composition_timestamp`.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub presentation_timestamp: Time,

    /// Duration of the sample.
    ///
    /// Typically the time difference in presentation timestamp to the next sample.
    /// May be unknown if this is the last sample in an ongoing video stream.
    pub duration: Option<Time>,
}
319
/// Data for a decoded frame on native targets.
#[cfg(not(target_arch = "wasm32"))]
pub struct FrameContent {
    /// Raw pixel data; its layout is described by [`Self::format`].
    pub data: Vec<u8>,

    /// Width of the frame in pixels.
    pub width: u32,

    /// Height of the frame in pixels.
    pub height: u32,

    /// How to interpret [`Self::data`].
    pub format: PixelFormat,
}
328
#[cfg(not(target_arch = "wasm32"))]
impl re_byte_size::SizeBytes for FrameContent {
    fn heap_size_bytes(&self) -> u64 {
        // Exhaustively destructure (instead of just reading `self.data`) so that
        // adding a field to `FrameContent` forces a compile error here,
        // reminding us to account for its heap usage.
        let Self {
            data,
            width: _,
            height: _,
            format: _,
        } = self;
        data.heap_size_bytes()
    }
}
341
#[cfg(not(target_arch = "wasm32"))]
impl FrameContent {
    /// Width of the frame in pixels.
    ///
    /// Accessor matching the web version of [`FrameContent`], which isn't a struct.
    pub fn width(&self) -> u32 {
        self.width
    }

    /// Height of the frame in pixels.
    ///
    /// Accessor matching the web version of [`FrameContent`], which isn't a struct.
    pub fn height(&self) -> u32 {
        self.height
    }
}
352
/// Data for a decoded frame on the web.
#[cfg(target_arch = "wasm32")]
pub type FrameContent = webcodecs::WebVideoFrame;

#[cfg(target_arch = "wasm32")]
impl FrameContent {
    /// Width of the frame in pixels (the display width of the underlying web video frame).
    pub fn width(&self) -> u32 {
        self.display_width()
    }

    /// Height of the frame in pixels (the display height of the underlying web video frame).
    pub fn height(&self) -> u32 {
        self.display_height()
    }
}
367
/// Meta information about a decoded video frame, as reported by the decoder.
#[derive(Debug, Clone)]
pub struct FrameInfo {
    /// The start of a new [`crate::demux::GroupOfPictures`]?
    ///
    /// This probably means this is a _keyframe_, and that an entire frame
    /// can be decoded from only this one sample (though I'm not 100% sure).
    ///
    /// None = unknown.
    pub is_sync: Option<bool>,

    /// Which sample in the video is this from?
    ///
    /// We always assume one sample produces one frame
    /// (but may provide arbitrary additional information which may be needed for other frames in the GOP).
    ///
    /// This is the order of which the samples appear in the container,
    /// which is ordered by [`Self::latest_decode_timestamp`].
    /// I.e. this is NOT ordered by [`Self::presentation_timestamp`].
    ///
    /// None = unknown.
    pub sample_idx: Option<SampleIndex>,

    /// Which frame is this?
    ///
    /// This is on the assumption that each sample produces a single frame,
    /// which is true for MP4.
    ///
    /// This is the index of frames ordered by [`Self::presentation_timestamp`].
    ///
    /// None = unknown.
    pub frame_nr: Option<u32>,

    /// Time at which this frame appears in the frame stream, in time units.
    ///
    /// The frame should be shown at this time.
    /// We expect this timestamp to be identical to the presentation timestamp of the [`crate::Chunk`]
    /// which is associated with this frame.
    /// Often synonymous with `composition_timestamp`.
    ///
    /// `decode_timestamp <= presentation_timestamp`
    pub presentation_timestamp: Time,

    /// Duration of the frame.
    ///
    /// Typically the time difference in presentation timestamp to the next frame.
    /// May be unknown if this is the last frame in an ongoing video stream.
    pub duration: Option<Time>,

    /// The decode timestamp of the last chunk that was needed to decode this frame.
    ///
    /// None = unknown.
    pub latest_decode_timestamp: Option<Time>,
}
422
423impl FrameInfo {
424    /// Presentation timestamp range in which this frame is valid.
425    ///
426    /// If there's no known duration, the range is open ended.
427    pub fn presentation_time_range(&self) -> std::ops::Range<Time> {
428        if let Some(duration) = self.duration {
429            self.presentation_timestamp..self.presentation_timestamp + duration
430        } else {
431            self.presentation_timestamp..Time::MAX
432        }
433    }
434}
435
/// One decoded video frame.
pub struct Frame {
    /// The decoded pixel data.
    pub content: FrameContent,

    /// Metadata about the frame, as reported by the decoder.
    pub info: FrameInfo,
}
441
impl re_byte_size::SizeBytes for Frame {
    fn heap_size_bytes(&self) -> u64 {
        // Exhaustively destructure so adding a field forces this to be revisited.
        // `info` holds only small value types and doesn't contribute to heap size.
        let Self { content, info: _ } = self;
        content.heap_size_bytes()
    }
}
448
/// Pixel format/layout used by [`FrameContent::data`].
#[derive(Debug, Clone)]
pub enum PixelFormat {
    /// RGB, 8 bits per channel.
    Rgb8Unorm,

    /// RGBA, 8 bits per channel.
    Rgba8Unorm,

    /// YUV pixel data; see the fields for the exact interpretation.
    Yuv {
        /// Chroma subsampling layout of the data.
        layout: YuvPixelLayout,
        /// Whether values use the limited or full range.
        range: YuvRange,
        // TODO(andreas): Color primaries should also apply to RGB data,
        // but for now we just always assume RGB to be BT.709 ~= sRGB.
        /// Matrix coefficients for converting YUV to RGB.
        coefficients: YuvMatrixCoefficients,
        // Note that we don't handle chroma sample location at all so far.
    },
}
464
465impl PixelFormat {
466    pub fn bits_per_pixel(&self) -> u32 {
467        match self {
468            Self::Rgb8Unorm { .. } => 24,
469            Self::Rgba8Unorm { .. } => 32,
470            Self::Yuv { layout, .. } => match layout {
471                YuvPixelLayout::Y_U_V444 => 24,
472                YuvPixelLayout::Y_U_V422 => 16,
473                YuvPixelLayout::Y_U_V420 => 12,
474                YuvPixelLayout::Y400 => 8,
475            },
476        }
477    }
478}
479
/// Pixel layout used by [`PixelFormat::Yuv`].
///
/// For details see `re_renderer`'s `YuvPixelLayout` type.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum YuvPixelLayout {
    /// Full chroma resolution (no subsampling).
    Y_U_V444,

    /// Chroma at half horizontal resolution.
    Y_U_V422,

    /// Chroma at half horizontal and half vertical resolution.
    Y_U_V420,

    /// Grayscale (luma only).
    Y400,
}
491
/// Yuv value range used by [`PixelFormat::Yuv`].
///
/// For details see `re_renderer`'s `YuvRange` type.
#[derive(Debug, Clone, Copy)]
pub enum YuvRange {
    /// Only a sub-range of the representable values is nominally valid.
    ///
    /// Note that values outside this range may still occur in practice
    /// ("foot/head room"), so they should not be clamped — see the module docs.
    Limited,

    /// The entire representable value range is used.
    Full,
}
500
/// Yuv matrix coefficients used by [`PixelFormat::Yuv`].
///
/// These determine how YUV values are converted to RGB,
/// see <https://wiki.x266.mov/docs/colorimetry/matrix> and the module docs.
///
/// For details see `re_renderer`'s `YuvMatrixCoefficients` type.
#[derive(Debug, Clone, Copy)]
pub enum YuvMatrixCoefficients {
    /// Interpret YUV as GBR.
    Identity,

    /// BT.601 matrix coefficients.
    Bt601,

    /// BT.709 matrix coefficients — per the module docs, the most common choice today.
    Bt709,
}
513
/// How the video should be decoded.
///
/// Depending on the decoder backend, these settings are merely hints and may be ignored.
/// However, they can be useful in some situations to work around issues.
///
/// On the web this directly corresponds to
/// <https://www.w3.org/TR/webcodecs/#hardware-acceleration>
///
/// Can be formatted and parsed via its [`std::fmt::Display`] and [`std::str::FromStr`] impls.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub enum DecodeHardwareAcceleration {
    /// May use hardware acceleration if available and compatible with the codec.
    #[default]
    Auto,

    /// Should use a software decoder even if hardware acceleration is available.
    ///
    /// If no software decoder is present, this may cause decoding to fail.
    PreferSoftware,

    /// Should use a hardware decoder.
    ///
    /// If no hardware decoder is present, this may cause decoding to fail.
    PreferHardware,
}
538
/// Settings for video decoding.
#[derive(Debug, Clone, PartialEq, Eq, Default, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
pub struct DecodeSettings {
    /// How the video should be decoded.
    ///
    /// May be ignored depending on the decoder backend, see [`DecodeHardwareAcceleration`].
    pub hw_acceleration: DecodeHardwareAcceleration,

    /// Custom path for the ffmpeg binary.
    ///
    /// If not provided, we use the path automatically determined by `ffmpeg_sidecar`.
    ///
    /// Only present on native targets.
    #[cfg(not(target_arch = "wasm32"))]
    pub ffmpeg_path: Option<std::path::PathBuf>,
}
552
553impl std::fmt::Display for DecodeHardwareAcceleration {
554    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
555        match self {
556            Self::Auto => write!(f, "Auto"),
557            Self::PreferSoftware => write!(f, "Prefer software"),
558            Self::PreferHardware => write!(f, "Prefer hardware"),
559        }
560    }
561}
562
563impl std::str::FromStr for DecodeHardwareAcceleration {
564    type Err = ();
565
566    fn from_str(s: &str) -> Result<Self, Self::Err> {
567        match s.trim().to_lowercase().replace('-', "_").as_str() {
568            "auto" => Ok(Self::Auto),
569            "prefer_software" | "software" => Ok(Self::PreferSoftware),
570            "prefer_hardware" | "hardware" => Ok(Self::PreferHardware),
571            _ => Err(()),
572        }
573    }
574}