Skip to main content

codec/colorspace/
downsample_444.rs

1// =============================================================================
2// 4:4:4 → 4:2:0 chroma downsample (Squad-31, roadmap #6).
3// =============================================================================
4//
5// ProRes 4444 (and other 4:4:4 sources) decode at full chroma resolution —
6// Cb / Cr planes match the luma plane in both dimensions. The encoder side
7// (rav1e + HW backends) only accepts 4:2:0, where chroma is half-resolution
8// in both axes. This module bridges the gap with a 2×2 box-average filter:
9// for each 2×2 block of source chroma, output one chroma sample equal to
10// the rounded mean. Y plane is unchanged (full-resolution luma in both
11// formats — 4:4:4 and 4:2:0 differ only in chroma layout).
12//
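// Filter choice: 2×2 box average, the simplest usable filter for 4:4:4
// → 4:2:0. Siting note: a box average places each output chroma sample at
// the centre of its 2×2 source block (MPEG-1 / JPEG "center" siting), which
// is half a luma sample to the right of MPEG-2 "left" siting. If the output
// is signalled as left-sited, chroma therefore carries that half-sample
// horizontal offset. For each output sample at (cx, cy), input samples are
// (2*cx, 2*cy), (2*cx+1, 2*cy), (2*cx, 2*cy+1), (2*cx+1, 2*cy+1).
// Output is `(s00 + s01 + s10 + s11 + 2) >> 2` — rounding by adding half
// the divisor before the truncating shift.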
18//
19// Higher-quality alternatives (6-tap separable FIR per BT.601/709 H.131,
20// or a Lanczos-2 horizontal+vertical pair) are deferred to a follow-up;
21// they cost ~10× the cycles for ~0.3 dB chroma PSNR improvement, which
22// most consumer transcoders consider not worth it. The box average matches
23// libswscale's default 4:4:4 → 4:2:0 path when no scaler is requested.
24//
25// Odd-dimension policy: when the source width or height is odd, the output
26// dimensions round up (`(src + 1) / 2`), and the rightmost / bottom row of
27// 2×2 blocks straddles a single source row/column. We **clamp** — the
28// missing neighbour reuses the in-bounds sample. Clamping vs replication
29// is identical for a 1-pixel boundary; we pick clamping because it's the
30// simplest scalar implementation and matches what libswscale does.
31//
32// Alpha plane (Yuva444p10le): the 4:2:0 encoder format has no alpha. We
33// **drop** alpha with a single warn-log (in pipeline integration). AV1
34// has alpha support in some experimental profiles but rav1e 0.7 doesn't
35// expose it, and pre-compositing onto a black background changes pixel
36// values — keying / compositing on the source side would have already
37// happened. Documented in SUPPORTED.md.
38
39use anyhow::{Result, bail};
40use bytes::Bytes;
41
42use crate::frame::{PixelFormat, VideoFrame};
43
/// Downsamples 8-bit planar 4:4:4 (`Yuv444p`) to packed 4:2:0 (`Yuv420p`)
/// using a 2×2 box average on the chroma planes.
///
/// The Y plane is copied verbatim. Every output Cb / Cr sample is the
/// rounded mean `(s00 + s01 + s10 + s11 + 2) >> 2` of one 2×2 block of
/// source chroma. When `width` or `height` is odd, the last column / row
/// of blocks has no right / bottom neighbour, so the in-bounds sample is
/// reused (edge clamp). The output chroma plane is therefore always
/// `width.div_ceil(2) × height.div_ceil(2)`; for 1920×1080 that is
/// 960×540 per chroma plane.
///
/// Returns the packed buffer `Y || Cb || Cr`. If `width` or `height` is
/// zero the result is empty.
///
/// # Panics
///
/// Panics if `y`, `cb` or `cr` is not exactly `width * height` bytes long.
/// The check runs in release builds as well: a mis-sized Y plane would
/// otherwise produce a buffer whose plane offsets no longer match
/// `width` / `height`, which downstream consumers cannot detect.
pub fn downsample_chroma_444_to_420(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    width: usize,
    height: usize,
) -> Vec<u8> {
    let luma_len = width * height;
    assert_eq!(y.len(), luma_len, "Y plane size");
    assert_eq!(cb.len(), luma_len, "Cb plane size (4:4:4)");
    assert_eq!(cr.len(), luma_len, "Cr plane size (4:4:4)");

    let cw = width.div_ceil(2);
    let ch = height.div_ceil(2);

    let mut out = Vec::with_capacity(luma_len + 2 * cw * ch);

    // Luma resolution is identical between 4:4:4 and 4:2:0.
    out.extend_from_slice(y);

    // Nothing to average; also keeps `chunks(0)` (which panics) unreachable.
    if luma_len == 0 {
        return out;
    }

    // Cb then Cr — same algorithm per plane.
    for plane in [cb, cr] {
        // Each chunk holds two consecutive source rows, or a single row
        // when `height` is odd and this is the last chunk.
        for row_pair in plane.chunks(2 * width) {
            let (top, rest) = row_pair.split_at(width);
            // Odd height: clamp the missing bottom row to the top row.
            let bottom = if rest.is_empty() { top } else { rest };

            out.extend(top.chunks(2).zip(bottom.chunks(2)).map(|(t, b)| {
                // A 1-element chunk (odd width) clamps the missing right
                // neighbour: first and last element are the same sample.
                // Max sum is 4 × 255 + 2 = 1022, which fits in u16.
                let sum = u16::from(t[0])
                    + u16::from(t[t.len() - 1])
                    + u16::from(b[0])
                    + u16::from(b[b.len() - 1]);
                // Rounded mean of 8-bit samples is at most 255.
                ((sum + 2) >> 2) as u8
            }));
        }
    }

    out
}
96
/// Downsamples 10-bit planar 4:4:4 (`Yuv444p10le`) to packed 4:2:0
/// (`Yuv420p10le`) using a 2×2 box average on the chroma planes.
///
/// Input planes are `u16` samples (nominally 0..=1023). The output is a
/// byte buffer `Y || Cb || Cr` in which every sample is written as two
/// little-endian bytes. Y is passed through unchanged; each Cb / Cr output
/// sample is `(s00 + s01 + s10 + s11 + 2) >> 2` over one 2×2 source block.
/// For odd `width` / `height` the missing right / bottom neighbour reuses
/// the in-bounds sample (edge clamp), giving a chroma plane of
/// `width.div_ceil(2) × height.div_ceil(2)` samples.
///
/// Accumulator: `u32`. In-range 10-bit input would fit in `u16`
/// (4 × 1023 + 2 = 4094), but the sample type does not enforce that range
/// and 4 × 65535 + 2 overflows `u16`, so the sum is widened.
///
/// If `width` or `height` is zero the result is empty.
///
/// # Panics
///
/// Panics if `y`, `cb` or `cr` is not exactly `width * height` samples
/// long. The check runs in release builds as well, because a mis-sized Y
/// plane would otherwise yield a buffer whose plane offsets no longer
/// match `width` / `height`.
pub fn downsample_chroma_444_to_420_10bit(
    y: &[u16],
    cb: &[u16],
    cr: &[u16],
    width: usize,
    height: usize,
) -> Vec<u8> {
    let luma_len = width * height;
    assert_eq!(y.len(), luma_len, "Y plane samples");
    assert_eq!(cb.len(), luma_len, "Cb plane samples (4:4:4)");
    assert_eq!(cr.len(), luma_len, "Cr plane samples (4:4:4)");

    let cw = width.div_ceil(2);
    let ch = height.div_ceil(2);
    let total_samples = luma_len + 2 * cw * ch;
    // Two bytes per output sample.
    let mut out = Vec::with_capacity(total_samples * 2);

    // Y plane: unchanged, re-serialised as little-endian u16.
    for &s in y {
        out.extend_from_slice(&s.to_le_bytes());
    }

    // Nothing to average; also keeps `chunks(0)` (which panics) unreachable.
    if luma_len == 0 {
        return out;
    }

    // Cb then Cr — same algorithm per plane.
    for plane in [cb, cr] {
        // Each chunk holds two consecutive source rows, or a single row
        // when `height` is odd and this is the last chunk.
        for row_pair in plane.chunks(2 * width) {
            let (top, rest) = row_pair.split_at(width);
            // Odd height: clamp the missing bottom row to the top row.
            let bottom = if rest.is_empty() { top } else { rest };

            for (t, b) in top.chunks(2).zip(bottom.chunks(2)) {
                // A 1-element chunk (odd width) clamps the missing right
                // neighbour: first and last element are the same sample.
                let sum = u32::from(t[0])
                    + u32::from(t[t.len() - 1])
                    + u32::from(b[0])
                    + u32::from(b[b.len() - 1]);
                // The rounded mean of four u16 values always fits in u16.
                let avg = ((sum + 2) >> 2) as u16;
                out.extend_from_slice(&avg.to_le_bytes());
            }
        }
    }

    out
}
145
146/// High-level frame-shaped wrapper. Takes a `Yuv444p10le` /
147/// `Yuva444p10le` `VideoFrame` and returns a `Yuv420p10le`
148/// `VideoFrame` ready for the 10-bit AV1 encoder. Alpha plane (if
149/// present) is **dropped** with a warn-log — see module docstring for
150/// rationale. 8-bit equivalent (`Yuv444p` → `Yuv420p`) follows the
151/// same pattern, plumbed through `downsample_chroma_444_to_420`.
152///
153/// Errors if the source format is not 4:4:4.
154pub fn downsample_444_to_420_frame(frame: &VideoFrame) -> Result<VideoFrame> {
155    let w = frame.width as usize;
156    let h = frame.height as usize;
157    if w == 0 || h == 0 {
158        bail!("zero-dimension frame");
159    }
160
161    match frame.format {
162        PixelFormat::Yuv444p => {
163            let plane = w * h;
164            if frame.data.len() < 3 * plane {
165                bail!(
166                    "Yuv444p frame data too short for {}x{}: {} bytes",
167                    w,
168                    h,
169                    frame.data.len()
170                );
171            }
172            let y = &frame.data[..plane];
173            let cb = &frame.data[plane..2 * plane];
174            let cr = &frame.data[2 * plane..3 * plane];
175            let out = downsample_chroma_444_to_420(y, cb, cr, w, h);
176            Ok(VideoFrame::new(
177                Bytes::from(out),
178                frame.width,
179                frame.height,
180                PixelFormat::Yuv420p,
181                frame.color_space,
182                frame.pts,
183            ))
184        }
185        PixelFormat::Yuv444p10le | PixelFormat::Yuva444p10le => {
186            let plane = w * h;
187            // 10-bit (or 16-bit alpha) is 2 bytes/sample. Y/Cb/Cr always
188            // 10-bit, alpha (if present) is 16-bit, but layout is per-
189            // plane LE u16 either way. We only consume the first three
190            // planes; alpha (plane 4) is dropped on the floor.
191            let needed = if frame.format == PixelFormat::Yuva444p10le {
192                4 * plane * 2
193            } else {
194                3 * plane * 2
195            };
196            if frame.data.len() < needed {
197                bail!(
198                    "{:?} frame data too short for {}x{}: {} bytes (need {})",
199                    frame.format,
200                    w,
201                    h,
202                    frame.data.len(),
203                    needed
204                );
205            }
206            // Decode three u16 LE planes from the source bytes.
207            let y = super::read_u16le(&frame.data[..plane * 2]);
208            let cb = super::read_u16le(&frame.data[plane * 2..2 * plane * 2]);
209            let cr = super::read_u16le(&frame.data[2 * plane * 2..3 * plane * 2]);
210
211            if frame.format == PixelFormat::Yuva444p10le {
212                tracing::warn!(
213                    pts = frame.pts,
214                    "dropping alpha plane on 4:4:4→4:2:0 downsample \
215                     (rav1e 0.7 has no alpha; pipeline target is Yuv420p10le)"
216                );
217            }
218
219            let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
220            Ok(VideoFrame::new(
221                Bytes::from(out),
222                frame.width,
223                frame.height,
224                PixelFormat::Yuv420p10le,
225                frame.color_space,
226                frame.pts,
227            ))
228        }
229        other => bail!(
230            "downsample_444_to_420_frame: expected 4:4:4 input, got {:?}",
231            other
232        ),
233    }
234}