Skip to main content

codec/
colorspace.rs

1use anyhow::{Result, bail};
2use bytes::{Bytes, BytesMut};
3
4use crate::frame::{ColorMetadata, ColorSpace, PixelFormat, TransferFn, VideoFrame};
5use crate::tonemap::tonemap_yuv420p10le_bt2020_to_yuv420p_bt709;
6
7/// Normalize a decoder frame for the encoder.
8///
9/// **8-bit path** (target `Yuv420p` / `Bt709`): every supported 8-bit
10/// pixel format is converted to packed 4:2:0 BT.709 limited-range. The
11/// dispatcher does (a) chroma layout normalisation — NV12/NV21
12/// deinterleave, 4:2:2 vertical-2:1 average, 4:4:4 box average — then
13/// (b) RGB → YUV matrix when the source is RGB, then (c) BT.601 → BT.709
14/// matrix correction for tagged-BT.601 YUV sources.
15///
16/// **10-bit path** (target `Yuv420p10le`, HDR-aware): 10-bit and
17/// alpha-bearing 10-bit formats are downsampled to `Yuv420p10le` and
18/// returned as-is on the matrix axis. The pipeline preserves the source's
19/// `ColorMetadata` (primaries / transfer / matrix) so the muxer's
20/// `colr nclx` box and the AV1 sequence header carry the HDR / wide-gamut
21/// signaling unchanged. Squad-19, roadmap #5.
22///
23/// **Format coverage** (input → output):
24/// - `Yuv420p` (BT.709) → passthrough
25/// - `Yuv420p` (BT.601 / BT.2020) → matrix correction to BT.709 (BT.2020
26///   8-bit is rare; treated as BT.601 for the matrix, downstream `colr
27///   nclx` keeps the truth)
28/// - `Yuv422p` / `Yuv422p10le` → vertical 2:1 chroma average
29/// - `Yuv444p` / `Yuv444p10le` / `Yuva444p10le` → 2×2 box average
30///   (alpha dropped for `Yuva444p10le`)
31/// - `Nv12` → UV deinterleave
32/// - `Nv21` → VU deinterleave (same as NV12 with planes swapped)
33/// - `Rgb24` / `Rgba32` → BT.709 RGB→YUV matrix (alpha discarded for
34///   `Rgba32`)
35/// - `Yuv420p10le` → passthrough
36/// - `Yuv420p12le` → not yet wired; `bail!` (no decoder in tree emits
37///   12-bit today)
38/// HDR-aware variant. When the source `ColorMetadata` indicates a PQ /
39/// HLG transfer function, the 10-bit input is tonemapped to 8-bit BT.709
40/// limited via the Hable filmic curve (`crate::tonemap`). For SDR
41/// sources (transfer Bt709 / Bt470Bg / Linear / Unspecified), behaviour
42/// is identical to `convert_to_yuv420p_bt709` — including the existing
43/// 10-bit BT.709 passthrough.
44///
45/// This is the dispatch the pipeline should call when it has access to
46/// the source's `ColorMetadata`. Existing 8-bit-only callers that only
47/// have a frame in scope can continue to use `convert_to_yuv420p_bt709`
48/// directly; SDR semantics there are unchanged.
49pub fn convert_to_sdr_bt709(
50    frame: &VideoFrame,
51    color_metadata: &ColorMetadata,
52) -> Result<VideoFrame> {
53    let is_hdr_transfer = matches!(
54        color_metadata.transfer,
55        TransferFn::St2084 | TransferFn::AribStdB67
56    );
57    if is_hdr_transfer && matches!(frame.format, PixelFormat::Yuv420p10le) {
58        let max_white_nits = color_metadata
59            .mastering_display
60            .as_ref()
61            // mastering_display.max_luminance is in 0.0001 cd/m² ticks
62            // per H.265 SEI 137 / ST 2086. Divide to get nits.
63            .map(|m| (m.max_luminance as f32) / 10_000.0)
64            .filter(|n| *n > 0.0);
65        return tonemap_yuv420p10le_bt2020_to_yuv420p_bt709(
66            frame,
67            color_metadata.transfer,
68            max_white_nits,
69        );
70    }
71    // SDR path — also handles Yuv422p10le / Yuv444p10le HDR by first
72    // funnelling through the existing 10-bit passthrough chain. Those
73    // chroma formats are rarely HDR in practice; if they show up the
74    // mux's colr nclx still tags them PQ / HLG and downstream playback
75    // honours the transfer. Future work: extend the tonemap to accept
76    // those chroma layouts directly.
77    convert_to_yuv420p_bt709(frame)
78}
79
80pub fn convert_to_yuv420p_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
81    use PixelFormat::*;
82
83    // ── 10-bit / wide-gamut path ──────────────────────────────────────
84    // HDR / wide-gamut passthrough on the matrix axis. Chroma layout
85    // gets normalised to 4:2:0 if needed, but matrix coefficients are
86    // preserved on the frame's `color_space` field — the encoder
87    // signals it through the AV1 sequence header and the mux writes
88    // `colr nclx` so a player/browser can reverse the matrix.
89    match frame.format {
90        Yuv420p10le => return Ok(frame.clone()),
91        Yuv422p10le => return yuv422p10le_to_yuv420p10le(frame),
92        Yuv444p10le | Yuva444p10le => return downsample_444_to_420_frame(frame),
93        Yuv420p12le => bail!(
94            "Yuv420p12le not yet supported in convert_to_yuv420p_bt709 \
95             (no decoder in tree emits 12-bit; add a 12→10-bit dither \
96             when a decoder lands that does)"
97        ),
98        _ => {}
99    }
100
101    // ── 8-bit path: RGB sources go straight to Yuv420p/Bt709 ─────────
102    match frame.format {
103        Rgb24 => return rgb_to_yuv420p_bt709(frame, /*has_alpha=*/ false),
104        Rgba32 => return rgb_to_yuv420p_bt709(frame, /*has_alpha=*/ true),
105        _ => {}
106    }
107
108    // ── 8-bit path: YUV chroma-layout normalize → Yuv420p ────────────
109    let yuv420p = match frame.format {
110        Yuv420p => frame.clone(),
111        Nv12 => nv12_to_yuv420p(frame)?,
112        Nv21 => nv21_to_yuv420p(frame)?,
113        Yuv422p => yuv422p_to_yuv420p(frame)?,
114        Yuv444p => downsample_444_to_420_frame(frame)?,
115        other => bail!(
116            "unsupported conversion: {:?}/{:?} → Yuv420p/Bt709",
117            other,
118            frame.color_space
119        ),
120    };
121
122    // ── 8-bit path: matrix correction → Bt709 ────────────────────────
123    if yuv420p.color_space == ColorSpace::Bt709 {
124        Ok(yuv420p)
125    } else {
126        // BT.601 and BT.2020 (rare in 8-bit SDR) both route through the
127        // BT.601 → BT.709 matrix. BT.2020-via-BT.601 produces a slight
128        // hue shift but the alternative — bailing — would block every
129        // BT.2020-tagged 8-bit input from transcoding. The mux's
130        // `colr nclx` carries the post-conversion BT.709 tag so a
131        // downstream player applies the right inverse.
132        recolor_yuv420p_bt601_to_bt709(&yuv420p)
133    }
134}
135
136fn nv12_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
137    deinterleave_semiplanar_to_yuv420p(frame, /*v_first=*/ false)
138}
139
140/// NV21 has the same packed layout as NV12 but the chroma plane carries
141/// `VU` interleaved instead of `UV`. Sharing the implementation reduces
142/// the chance of one path drifting from the other on bug fixes.
143fn nv21_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
144    deinterleave_semiplanar_to_yuv420p(frame, /*v_first=*/ true)
145}
146
147fn deinterleave_semiplanar_to_yuv420p(frame: &VideoFrame, v_first: bool) -> Result<VideoFrame> {
148    let w = frame.width as usize;
149    let h = frame.height as usize;
150    let y_size = w * h;
151    let uv_size = y_size / 4;
152    if frame.data.len() < y_size + 2 * uv_size {
153        bail!(
154            "{} frame too small for {}x{}: need {} bytes got {}",
155            if v_first { "NV21" } else { "NV12" },
156            w,
157            h,
158            y_size + 2 * uv_size,
159            frame.data.len()
160        );
161    }
162    let mut out = BytesMut::with_capacity(y_size + uv_size * 2);
163
164    // Y plane — straight copy.
165    out.extend_from_slice(&frame.data[..y_size]);
166
167    // Deinterleave the packed chroma plane.
168    let uv = &frame.data[y_size..];
169    let mut u_plane = Vec::with_capacity(uv_size);
170    let mut v_plane = Vec::with_capacity(uv_size);
171    for i in 0..uv_size {
172        let (a, b) = (uv[i * 2], uv[i * 2 + 1]);
173        if v_first {
174            v_plane.push(a);
175            u_plane.push(b);
176        } else {
177            u_plane.push(a);
178            v_plane.push(b);
179        }
180    }
181    out.extend_from_slice(&u_plane);
182    out.extend_from_slice(&v_plane);
183
184    Ok(VideoFrame::new(
185        out.freeze(),
186        frame.width,
187        frame.height,
188        PixelFormat::Yuv420p,
189        frame.color_space,
190        frame.pts,
191    ))
192}
193
194/// `Yuv422p` has full-width chroma rows but vertically subsampled to
195/// the SAME row count as luma is HALVED to land 4:2:0. Average two
196/// adjacent vertical chroma rows per output row.
197fn yuv422p_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
198    let w = frame.width as usize;
199    let h = frame.height as usize;
200    let cw = w.div_ceil(2);
201    // 4:2:2 has chroma rows == luma rows, with chroma cols halved.
202    let ch_in = h;
203    let ch_out = h.div_ceil(2);
204    let y_size = w * h;
205    let chroma_in_size = cw * ch_in;
206    let chroma_out_size = cw * ch_out;
207    if frame.data.len() < y_size + 2 * chroma_in_size {
208        bail!(
209            "Yuv422p frame too small for {}x{}: need {} bytes got {}",
210            w,
211            h,
212            y_size + 2 * chroma_in_size,
213            frame.data.len()
214        );
215    }
216
217    let (y_in, rest) = frame.data.split_at(y_size);
218    let (cb_in, cr_in) = rest.split_at(chroma_in_size);
219
220    let mut out = BytesMut::with_capacity(y_size + 2 * chroma_out_size);
221    out.extend_from_slice(y_in);
222
223    for plane in [cb_in, cr_in] {
224        for cy in 0..ch_out {
225            let y0 = 2 * cy;
226            let y1 = (y0 + 1).min(ch_in - 1);
227            for cx in 0..cw {
228                let s0 = plane[y0 * cw + cx] as u16;
229                let s1 = plane[y1 * cw + cx] as u16;
230                out.extend_from_slice(&[((s0 + s1 + 1) >> 1) as u8]);
231            }
232        }
233    }
234
235    Ok(VideoFrame::new(
236        out.freeze(),
237        frame.width,
238        frame.height,
239        PixelFormat::Yuv420p,
240        frame.color_space,
241        frame.pts,
242    ))
243}
244
245/// 10-bit equivalent of `yuv422p_to_yuv420p`. Samples stored u16 LE.
246fn yuv422p10le_to_yuv420p10le(frame: &VideoFrame) -> Result<VideoFrame> {
247    let w = frame.width as usize;
248    let h = frame.height as usize;
249    let cw = w.div_ceil(2);
250    let ch_in = h;
251    let ch_out = h.div_ceil(2);
252    let y_samples = w * h;
253    let chroma_in_samples = cw * ch_in;
254    let chroma_out_samples = cw * ch_out;
255    let need_bytes = (y_samples + 2 * chroma_in_samples) * 2;
256    if frame.data.len() < need_bytes {
257        bail!(
258            "Yuv422p10le frame too small for {}x{}: need {} bytes got {}",
259            w,
260            h,
261            need_bytes,
262            frame.data.len()
263        );
264    }
265    let words = read_u16le(&frame.data[..need_bytes]);
266    let (y_in, rest) = words.split_at(y_samples);
267    let (cb_in, cr_in) = rest.split_at(chroma_in_samples);
268
269    let mut out = BytesMut::with_capacity((y_samples + 2 * chroma_out_samples) * 2);
270    write_u16le(&mut out, y_in);
271
272    for plane in [cb_in, cr_in] {
273        for cy in 0..ch_out {
274            let y0 = 2 * cy;
275            let y1 = (y0 + 1).min(ch_in - 1);
276            for cx in 0..cw {
277                let s0 = plane[y0 * cw + cx] as u32;
278                let s1 = plane[y1 * cw + cx] as u32;
279                let avg = ((s0 + s1 + 1) >> 1) as u16;
280                out.extend_from_slice(&avg.to_le_bytes());
281            }
282        }
283    }
284
285    Ok(VideoFrame::new(
286        out.freeze(),
287        frame.width,
288        frame.height,
289        PixelFormat::Yuv420p10le,
290        frame.color_space,
291        frame.pts,
292    ))
293}
294
295/// RGB (or RGBA, alpha discarded) → BT.709 YCbCr limited-range Yuv420p.
296///
297/// Per ITU-R BT.709 / H.273 matrix coefficient = 1, with the standard
298/// 8-bit studio-range scaling (Y in [16,235], Cb/Cr in [16,240]):
299///
300/// ```text
301/// Y  =  16 + 0.2126·R + 0.7152·G + 0.0722·B  (scaled to 219 swing)
302/// Cb = 128 + (B - Y) / (2·(1 - 0.0722))      (scaled to 224 swing)
303/// Cr = 128 + (R - Y) / (2·(1 - 0.2126))      (scaled to 224 swing)
304/// ```
305///
306/// Implemented as integer fixed-point (Q15) so a per-pixel pass is
307/// branch-free and SIMD-friendly. Chroma is then produced by 2×2
308/// averaging the four RGB pixels per chroma site (matches the BT.709
309/// Annex A box-average prescription used in our 4:4:4 → 4:2:0 path).
310fn rgb_to_yuv420p_bt709(frame: &VideoFrame, has_alpha: bool) -> Result<VideoFrame> {
311    let w = frame.width as usize;
312    let h = frame.height as usize;
313    let stride = if has_alpha { 4 } else { 3 };
314    let need = w * h * stride;
315    if frame.data.len() < need {
316        bail!(
317            "{} frame too small for {}x{}: need {} bytes got {}",
318            if has_alpha { "Rgba32" } else { "Rgb24" },
319            w,
320            h,
321            need,
322            frame.data.len()
323        );
324    }
325    let cw = w.div_ceil(2);
326    let ch = h.div_ceil(2);
327    let y_size = w * h;
328    let chroma_size = cw * ch;
329    let mut out = BytesMut::with_capacity(y_size + 2 * chroma_size);
330    out.resize(y_size + 2 * chroma_size, 0);
331
332    // BT.709 limited-range Q15 fixed-point coefficients.
333    // Y  =  ((kr·R + kg·G + kb·B) · 219 / 255) + 16
334    // We pre-scale into Q15 so integer math gives ≈Y_studio:
335    //   YR = round(0.2126 · 219/255 · 32768) = 5982
336    //   YG = round(0.7152 · 219/255 · 32768) = 20128
337    //   YB = round(0.0722 · 219/255 · 32768) = 2032
338    //   Y  = ((R·YR + G·YG + B·YB + 16384) >> 15) + 16
339    const Y_R: i32 = 5982;
340    const Y_G: i32 = 20128;
341    const Y_B: i32 = 2032;
342    // Cb = ((B - Y_full) / (2·(1-Kb))) · 224/255 + 128
343    // Decompose into per-channel Q15 against R,G,B (acts on full-range
344    // intermediate before the 224 swing, then re-scaled).
345    //   CbR = round(-0.1146 · 224/255 · 32768) = -3299
346    //   CbG = round(-0.3854 · 224/255 · 32768) = -11086
347    //   CbB = round( 0.5000 · 224/255 · 32768) = 14385
348    const CB_R: i32 = -3299;
349    const CB_G: i32 = -11086;
350    const CB_B: i32 = 14385;
351    //   CrR = round( 0.5000 · 224/255 · 32768) = 14385
352    //   CrG = round(-0.4542 · 224/255 · 32768) = -13066
353    //   CrB = round(-0.0458 · 224/255 · 32768) = -1319
354    const CR_R: i32 = 14385;
355    const CR_G: i32 = -13066;
356    const CR_B: i32 = -1319;
357
358    // Y plane: per-pixel scalar pass.
359    for y in 0..h {
360        for x in 0..w {
361            let off = (y * w + x) * stride;
362            let r = frame.data[off] as i32;
363            let g = frame.data[off + 1] as i32;
364            let b = frame.data[off + 2] as i32;
365            let y_val = ((r * Y_R + g * Y_G + b * Y_B + (1 << 14)) >> 15) + 16;
366            out[y * w + x] = y_val.clamp(16, 235) as u8;
367        }
368    }
369
370    // Chroma planes: 2×2 average of the source RGB pixels per chroma
371    // site, then matrix to Cb/Cr.
372    let cb_off = y_size;
373    let cr_off = y_size + chroma_size;
374    for cy in 0..ch {
375        let y0 = 2 * cy;
376        let y1 = (y0 + 1).min(h - 1);
377        for cx in 0..cw {
378            let x0 = 2 * cx;
379            let x1 = (x0 + 1).min(w - 1);
380            // Average the four source RGB pixels.
381            let mut r_sum = 0i32;
382            let mut g_sum = 0i32;
383            let mut b_sum = 0i32;
384            for &(py, px) in &[(y0, x0), (y0, x1), (y1, x0), (y1, x1)] {
385                let off = (py * w + px) * stride;
386                r_sum += frame.data[off] as i32;
387                g_sum += frame.data[off + 1] as i32;
388                b_sum += frame.data[off + 2] as i32;
389            }
390            let r = (r_sum + 2) >> 2;
391            let g = (g_sum + 2) >> 2;
392            let b = (b_sum + 2) >> 2;
393            let cb = ((r * CB_R + g * CB_G + b * CB_B + (1 << 14)) >> 15) + 128;
394            let cr = ((r * CR_R + g * CR_G + b * CR_B + (1 << 14)) >> 15) + 128;
395            out[cb_off + cy * cw + cx] = cb.clamp(16, 240) as u8;
396            out[cr_off + cy * cw + cx] = cr.clamp(16, 240) as u8;
397        }
398    }
399
400    Ok(VideoFrame::new(
401        out.freeze(),
402        frame.width,
403        frame.height,
404        PixelFormat::Yuv420p,
405        ColorSpace::Bt709,
406        frame.pts,
407    ))
408}
409
410// =============================================================================
411// BT.601 → BT.709 YCbCr matrix conversion (limited-range 8-bit).
412// =============================================================================
413//
414// Derived from BT.601 M_YUV→RGB composed with BT.709 M_RGB→YUV, both
415// in 8-bit studio-range form (Y in [16,235], Cb/Cr in [16,240]).
416//
417// Derivation path:
418//   1. BT.601 YCbCr (limited) → R'G'B' in [0,1], using Kr=0.299,
419//      Kg=0.587, Kb=0.114 and the standard limited-range scaling
420//      (Y'=(Y-16)/219, Pb=(Cb-128)/224, Pr=(Cr-128)/224).
421//   2. BT.709 R'G'B' → YCbCr (limited), using Kr=0.2126, Kg=0.7152,
422//      Kb=0.0722 and the inverse scaling (Y_out = 219·Y' + 16,
423//      Cb_out = 224·Pb + 128, Cr_out = 224·Pr + 128).
424//   3. Multiply M_709 · M_601^-1 on the delta vector
425//      (Y-16, Cb-128, Cr-128) to get a single 3×3 with zero offsets
426//      in delta space.
427//
428// Result (matrix applied to deltas):
429//   ΔY709  = 1.00000·ΔY - 0.11555·ΔCb - 0.20794·ΔCr
430//   ΔCb709 = 0·ΔY + 1.01864·ΔCb + 0.11462·ΔCr
431//   ΔCr709 = 0·ΔY + 0.07505·ΔCb + 1.02533·ΔCr
432//
433// where ΔY = Y-16, ΔCb = Cb-128, ΔCr = Cr-128. Chroma rows have NO
434// luma coupling — because BT.601 and BT.709 share the same limited-
435// range chroma basis (Pb,Pr scaled by 224 in both), and the chroma
436// basis vectors change only with Kr/Kb, not with luma.
437//
438// Sanity check under this matrix:
439//   (16, 128, 128) → (16, 128, 128)   [black round-trips]
440//   (235, 128, 128) → (235, 128, 128) [white round-trips]
441//   (128, 128, 128) → (128, 128, 128) [any gray round-trips]
442// because all three inputs have ΔCb=ΔCr=0, so ΔY709 = ΔY → Y
443// unchanged, and ΔCb709 = ΔCr709 = 0 → chroma unchanged.
444
/// Number of fractional bits in the fixed-point BT.601→BT.709 matrix.
/// Multiplying a delta by a Q15 coefficient and shifting right by this
/// amount yields the 709-domain delta.
const Q15: i32 = 15;
/// Half an LSB in Q15; added before `>> Q15` to round to nearest.
const Q15_ROUND: i32 = 1 << (Q15 - 1);

/// Converts a real-valued matrix coefficient to Q15, rounding to the
/// nearest integer (ties away from zero).
///
/// Every coefficient below goes through this helper so none of them can
/// be truncated by a bare `as i32` cast. That is what used to happen to
/// the two luma-row coefficients: -0.20793764 · 32768 = -6813.70 was
/// truncated to -6813 instead of rounding to the documented -6814.
const fn bt601_to_bt709_q15(coeff: f64) -> i32 {
    let scaled = coeff * 32768.0;
    // `as i32` truncates toward zero, so bias by half an LSB away from
    // zero first.
    if scaled < 0.0 {
        (scaled - 0.5) as i32
    } else {
        (scaled + 0.5) as i32
    }
}

// Row 0 (Y): Y709 = Y601·1.0 + M_Y_CB·ΔCb + M_Y_CR·ΔCr. The 1.0
// coefficient is applied as a direct copy (no fixed-point multiply).
#[allow(dead_code)] // documented identity; not emitted into the hot path
const M_Y_Y: i32 = 32768;
const M_Y_CB: i32 = bt601_to_bt709_q15(-0.11554975); // -3786
const M_Y_CR: i32 = bt601_to_bt709_q15(-0.20793764); // -6814
// Row 1 (Cb): no luma coupling. The diagonal coefficient is ≥ 1.0, so
// it exceeds i16 range in Q15 (it still fits in i32).
const M_CB_CB: i32 = bt601_to_bt709_q15(1.01863972); // 33379
const M_CB_CR: i32 = bt601_to_bt709_q15(0.11461795); //  3756
// Row 2 (Cr): no luma coupling. Same i16 caveat for the diagonal.
const M_CR_CB: i32 = bt601_to_bt709_q15(0.07504945); //  2459
const M_CR_CR: i32 = bt601_to_bt709_q15(1.02532707); // 33598
465
/// Saturates a luma value to the 8-bit limited range `[16, 235]`.
#[inline(always)]
fn clamp_y(v: i32) -> u8 {
    const LUMA_MIN: i32 = 16;
    const LUMA_MAX: i32 = 235;
    v.max(LUMA_MIN).min(LUMA_MAX) as u8
}
470
/// Saturates a chroma value to the 8-bit limited range `[16, 240]`.
#[inline(always)]
fn clamp_c(v: i32) -> u8 {
    const CHROMA_MIN: i32 = 16;
    const CHROMA_MAX: i32 = 240;
    v.max(CHROMA_MIN).min(CHROMA_MAX) as u8
}
475
476/// Scalar reference implementation — correctness baseline.
477///
478/// Operates in-place on three planes (Y, Cb, Cr). Chroma planes are
479/// half-width / half-height of luma (4:2:0 subsampling). The matrix
480/// couples chroma into luma (Y709 depends on ΔCb, ΔCr), so for each
481/// luma sample we read the Cb/Cr sample covering it via the 2:1
482/// subsampling grid (chroma-centered between luma rows/cols, but for
483/// this pipeline we use the standard "shared per 2×2 block" mapping
484/// — both decoders in-tree produce this layout).
485///
486/// Order matters: we must read the *original* Cb/Cr values before
487/// overwriting them with Cb709/Cr709. We therefore update luma first
488/// (consuming original chroma deltas) and update chroma last.
489fn bt601_to_bt709_scalar(y: &mut [u8], cb: &mut [u8], cr: &mut [u8], width: usize, height: usize) {
490    debug_assert_eq!(y.len(), width * height);
491    debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
492    debug_assert_eq!(cr.len(), (width / 2) * (height / 2));
493
494    let cw = width / 2;
495
496    // Luma: Y709 = Y601 + M_Y_CB * ΔCb + M_Y_CR * ΔCr  (per-sample).
497    // Each chroma sample covers a 2×2 luma block.
498    for yi in 0..height {
499        let cy = yi >> 1;
500        for xi in 0..width {
501            let cx = xi >> 1;
502            let cbl = cb[cy * cw + cx] as i32 - 128;
503            let crl = cr[cy * cw + cx] as i32 - 128;
504            let y_orig = y[yi * width + xi] as i32;
505            let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
506            y[yi * width + xi] = clamp_y(y_orig + delta);
507        }
508    }
509
510    // Chroma: no luma coupling. Pure 2×2 chroma → chroma transform.
511    for v in cb.iter_mut().zip(cr.iter_mut()) {
512        let (cbp, crp) = v;
513        let cbl = *cbp as i32 - 128;
514        let crl = *crp as i32 - 128;
515        let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
516        let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
517        *cbp = clamp_c(new_cb + 128);
518        *crp = clamp_c(new_cr + 128);
519    }
520}
521
/// Public scalar entry point — for bench / tests.
///
/// Forwards all arguments unchanged to the private scalar routine, so
/// callers get the scalar result regardless of which CPU features are
/// available. The planes are modified in place.
pub fn bt601_to_bt709_planes_scalar(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    bt601_to_bt709_scalar(y, cb, cr, width, height);
}
532
533/// Runtime-dispatched entry point. Uses AVX2 if the CPU advertises
534/// it, scalar fallback otherwise. Safe wrapper around the unsafe
535/// target-feature specialization.
536pub fn bt601_to_bt709_planes(
537    y: &mut [u8],
538    cb: &mut [u8],
539    cr: &mut [u8],
540    width: usize,
541    height: usize,
542) {
543    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
544    {
545        if std::is_x86_feature_detected!("avx2") {
546            // SAFETY: avx2 feature was runtime-detected above.
547            unsafe {
548                bt601_to_bt709_avx2(y, cb, cr, width, height);
549            }
550            return;
551        }
552    }
553    bt601_to_bt709_scalar(y, cb, cr, width, height);
554}
555
/// AVX2 implementation of the in-place BT.601 → BT.709 conversion on
/// 4:2:0 planes: a luma pass that applies the chroma-derived luma
/// correction, followed by a chroma pass. Blocks of 16 chroma samples
/// are vectorised; whatever is left over is handled by scalar tails.
///
/// Luma is only touched for rows `< 2 * (height / 2)` and columns
/// `< 2 * (width / 2)`, so with an odd width or height the trailing
/// luma column / row is left unmodified.
///
/// NOTE(review): in the vector luma path the ΔCb and ΔCr products are
/// rounded separately by `mulhrs` before being added, whereas the scalar
/// tail rounds their sum once. The two may therefore differ by one LSB.
///
/// # Safety
///
/// All vector loads and stores go through raw pointers without bounds
/// checks. The caller must ensure that:
/// - the CPU supports AVX2;
/// - `y` holds at least `width * height` samples;
/// - `cb` and `cr` each hold at least `(width / 2) * (height / 2)`
///   samples;
/// - `cr` is at least as long as `cb` (the chroma pass iterates over
///   `cb.len()` and accesses `cr` at the same offsets).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_avx2(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        // Chroma plane dimensions (4:2:0, rounded down).
        let cw = width / 2;
        let ch = height / 2;

        // Q15 coefficients as packed 16-bit lanes. `_mm256_mulhrs_epi16`
        // computes (a*b + 0x4000) >> 15 on i16 lanes — the fixed-point
        // multiply we want. Inputs must be pre-subtracted to deltas
        // (ΔCb = Cb-128, ΔCr = Cr-128, in [-128, 127]).
        //
        // The two luma-row coefficients and the two cross-chroma
        // coefficients fit in i16 as they are. The two diagonal chroma
        // coefficients (≈1.019 and ≈1.025) are ≥ 32768 in Q15 and do
        // NOT fit, so they are split into an identity part plus a small
        // correction:
        //
        //   Cb709 = ΔCb + (M_CB_CB - 32768)/32768 · ΔCb + M_CB_CR/32768 · ΔCr
        //   Cr709 = ΔCr + M_CR_CB/32768 · ΔCb + (M_CR_CR - 32768)/32768 · ΔCr
        //
        // The ~1.0 part becomes a free add of the delta itself and only
        // the correction goes through `mulhrs`. All i16-safe.
        let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16); // Q15 of ≈ -0.11555
        let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16); // Q15 of ≈ -0.20794
        let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16); // 611
        let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16); // 3756
        let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16); // 2459
        let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16); // 830

        // Chroma centre and the limited-range clamp bounds.
        let v_128 = _mm256_set1_epi16(128);
        let v_chroma_lo = _mm256_set1_epi16(16);
        let v_chroma_hi = _mm256_set1_epi16(240);
        let v_luma_lo = _mm256_set1_epi16(16);
        let v_luma_hi = _mm256_set1_epi16(235);

        // ---- Luma pass ----
        // For each 2×2 luma block we share one (Cb, Cr) sample. Process
        // 16 chroma samples per iteration → 32 luma cols on two rows (64
        // luma outputs per iter). This pass must run before the chroma
        // pass because it needs the original BT.601 chroma values.
        for cy_idx in 0..ch {
            // Offsets of the two luma rows covered by this chroma row.
            let y_row0 = cy_idx * 2 * width;
            let y_row1 = y_row0 + width;
            let c_row = cy_idx * cw;

            let mut cx = 0usize;
            while cx + 16 <= cw {
                // Load 16 Cb/Cr, widen to i16, compute per-chroma delta.
                let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(c_row + cx) as *const _);
                let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(c_row + cx) as *const _);
                let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
                let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
                let cbl = _mm256_sub_epi16(cb_i16, v_128);
                let crl = _mm256_sub_epi16(cr_i16, v_128);

                // Per-chroma Y delta: d = -0.1155·ΔCb - 0.2079·ΔCr in Q15.
                // mulhrs: (a*b + 0x4000) >> 15, applied to each product
                // separately before the add.
                let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
                let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
                let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr); // 16 per-chroma deltas

                // Expand the 16 chroma deltas to 32 per-luma deltas
                // (d0 d0 d1 d1 …), because each chroma sample covers two
                // adjacent luma columns. This is done through stack
                // scratch rather than in-register shuffles: spill the
                // vector, duplicate each lane with a scalar loop, reload
                // as two vectors. Kept simple for maintainability;
                // revisit if this function ever shows up at the top of a
                // profile.
                //
                // Only the first 16 lanes of `dy_luma` are written and
                // read; the rest stay zero.
                let mut dy_luma = [0i16; 32];
                _mm256_storeu_si256(dy_luma.as_mut_ptr().add(0) as *mut _, dy_chroma);
                let mut dy_luma_pair = [0i16; 32];
                for i in 0..16 {
                    dy_luma_pair[i * 2] = dy_luma[i];
                    dy_luma_pair[i * 2 + 1] = dy_luma[i];
                }
                // Deltas for luma columns 0..16 and 16..32 of this block.
                let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(0) as *const _);
                let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);

                // Process both luma rows for this chroma row. Both share
                // dy_luma_* because chroma is 4:2:0.
                for row_off in [y_row0, y_row1] {
                    // Load 32 luma pixels.
                    let y_u8 = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
                    // Widen low 16 bytes and high 16 bytes to i16.
                    let y_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_u8));
                    let y_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(y_u8));

                    let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
                    let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);

                    // Clamp to limited-range luma [16, 235].
                    let y_lo_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
                    let y_hi_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);

                    // Pack i16 → u8 with saturation and store 32 bytes.
                    let packed = _mm256_packus_epi16(y_lo_out, y_hi_out);
                    // packus works per 128-bit lane, so the 64-bit quads
                    // come out as [lo 0..8, hi 0..8, lo 8..16, hi 8..16].
                    // The permute swaps the two middle quads to restore
                    // linear order [lo 0..8, lo 8..16, hi 0..8, hi 8..16].
                    let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
                    _mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, packed);
                }

                cx += 16;
            }

            // Scalar tail for luma of this chroma row: the remaining
            // (< 16) chroma samples, using bounds-checked indexing and a
            // single rounding of the summed products.
            while cx < cw {
                let cb_idx = c_row + cx;
                let cbl = cb[cb_idx] as i32 - 128;
                let crl = cr[cb_idx] as i32 - 128;
                let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
                let xi = cx * 2;
                for row_off in [y_row0, y_row1] {
                    for sub in 0..2 {
                        let idx = row_off + xi + sub;
                        y[idx] = clamp_y(y[idx] as i32 + delta);
                    }
                }
                cx += 1;
            }
        }

        // ---- Chroma pass (no luma coupling) ----
        // 16 samples per iteration, walking the planes linearly over
        // `cb.len()` samples.
        let total_c = cb.len();
        let mut i = 0usize;
        while i + 16 <= total_c {
            let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(i) as *const _);
            let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(i) as *const _);
            let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
            let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
            let cbl = _mm256_sub_epi16(cb_i16, v_128);
            let crl = _mm256_sub_epi16(cr_i16, v_128);

            // Cb709 = ΔCb + (M_CB_CB-32768)·ΔCb·2^-15 + M_CB_CR·ΔCr·2^-15 + 128
            let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
            let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
            let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
            let new_cb = _mm256_add_epi16(new_cb, v_128);

            // Cr709 = ΔCr + (M_CR_CR-32768)·ΔCr·2^-15 + M_CR_CB·ΔCb·2^-15 + 128
            let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
            let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
            let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
            let new_cr = _mm256_add_epi16(new_cr, v_128);

            // Clamp [16, 240].
            let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
            let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);

            // Pack and store. Packing a vector with itself leaves samples
            // 0..8 in quad 0 and samples 8..16 in quad 2; the permute
            // moves quad 2 into quad 1 so the low 128 bits hold all 16
            // bytes in order.
            let cb_packed = _mm256_packus_epi16(new_cb, new_cb);
            let cr_packed = _mm256_packus_epi16(new_cr, new_cr);
            let cb_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cb_packed);
            let cr_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cr_packed);
            _mm_storeu_si128(
                cb.as_mut_ptr().add(i) as *mut _,
                _mm256_castsi256_si128(cb_packed),
            );
            _mm_storeu_si128(
                cr.as_mut_ptr().add(i) as *mut _,
                _mm256_castsi256_si128(cr_packed),
            );

            i += 16;
        }

        // Scalar tail for chroma: the remaining (< 16) samples, with the
        // same formula as the scalar reference path.
        while i < total_c {
            let cbl = cb[i] as i32 - 128;
            let crl = cr[i] as i32 - 128;
            let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
            let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
            cb[i] = clamp_c(new_cb + 128);
            cr[i] = clamp_c(new_cr + 128);
            i += 1;
        }
    }
}
760
761// =============================================================================
762// 10-bit BT.601 → BT.709 (Squad-29, follow-up to Squad-19's 10-bit pipeline).
763// =============================================================================
764//
765// 10-bit limited-range constants (Rec. ITU-R BT.2100-2 Table 9 /
766// the standard "limited-range 10-bit" of BT.709/BT.2020):
767//   luma center  = 64   (16 << 2)
768//   chroma center = 512 (128 << 2)
769//   luma clamp   = [64, 940]   (16<<2 .. 235<<2)
770//   chroma clamp = [64, 960]   (16<<2 .. 240<<2)
771//
772// The matrix coefficients are identical to the 8-bit case — they're
773// derived from the BT.601 / BT.709 spec ratios (Kr / Kg / Kb) and
774// don't depend on bit depth. Only the offsets and clamp range change.
775//
776// Use case: rare. The 10-bit pipeline is HDR-passthrough by default
777// (Squad-19) — for HDR sources (BT.2020 + PQ/HLG) we never convert
778// because the matrix shift would corrupt the wide gamut. This 10-bit
779// BT.601→BT.709 path exists for explicitly-tagged BT.601 10-bit
780// content (some Sony broadcast cameras output 10-bit BT.601). Wired
781// behind a public entry point but not invoked from the default
782// pipeline; callers must opt in explicitly.
783
/// Saturate a signed intermediate to the 10-bit limited-range luma
/// interval `[64, 940]` and narrow it to `u16`.
#[inline(always)]
fn clamp_y_10bit(v: i32) -> u16 {
    const LUMA_MIN: i32 = 64;
    const LUMA_MAX: i32 = 940;
    // After saturation the value is within 64..=940, so the cast is lossless.
    v.max(LUMA_MIN).min(LUMA_MAX) as u16
}
788
/// Saturate a signed intermediate to the 10-bit limited-range chroma
/// interval `[64, 960]` and narrow it to `u16`.
#[inline(always)]
fn clamp_c_10bit(v: i32) -> u16 {
    const CHROMA_MIN: i32 = 64;
    const CHROMA_MAX: i32 = 960;
    // After saturation the value is within 64..=960, so the cast is lossless.
    v.max(CHROMA_MIN).min(CHROMA_MAX) as u16
}
793
/// Neutral (zero-colour-difference) chroma code for 10-bit samples
/// (`128 << 2`). The 10-bit kernels subtract it from Cb/Cr to obtain
/// signed chroma deltas and add it back after the matrix multiply.
const CHROMA_CENTER_10BIT: i32 = 512;
795
796/// Scalar 10-bit BT.601 → BT.709 reference. Same algorithm as the
797/// 8-bit `bt601_to_bt709_scalar`, but operates on `u16` planes
798/// (10-bit values in 0..=1023). `width` / `height` are luma
799/// dimensions; chroma planes are half-resolution per axis (4:2:0).
800pub fn bt601_to_bt709_planes_10bit_scalar(
801    y: &mut [u16],
802    cb: &mut [u16],
803    cr: &mut [u16],
804    width: usize,
805    height: usize,
806) {
807    debug_assert_eq!(y.len(), width * height);
808    debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
809    debug_assert_eq!(cr.len(), (width / 2) * (height / 2));
810
811    let cw = width / 2;
812
813    // Luma: Y709 = Y601 + M_Y_CB * ΔCb + M_Y_CR * ΔCr.
814    for yi in 0..height {
815        let cy = yi >> 1;
816        for xi in 0..width {
817            let cx = xi >> 1;
818            let cbl = cb[cy * cw + cx] as i32 - CHROMA_CENTER_10BIT;
819            let crl = cr[cy * cw + cx] as i32 - CHROMA_CENTER_10BIT;
820            let y_orig = y[yi * width + xi] as i32;
821            let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
822            y[yi * width + xi] = clamp_y_10bit(y_orig + delta);
823        }
824    }
825
826    // Chroma: pure 2×2 chroma → chroma transform (no luma coupling).
827    for v in cb.iter_mut().zip(cr.iter_mut()) {
828        let (cbp, crp) = v;
829        let cbl = *cbp as i32 - CHROMA_CENTER_10BIT;
830        let crl = *crp as i32 - CHROMA_CENTER_10BIT;
831        let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
832        let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
833        *cbp = clamp_c_10bit(new_cb + CHROMA_CENTER_10BIT);
834        *crp = clamp_c_10bit(new_cr + CHROMA_CENTER_10BIT);
835    }
836}
837
838/// Runtime-dispatched 10-bit BT.601 → BT.709. AVX2 on x86_64 when
839/// available, scalar fallback otherwise. Squad-29.
840pub fn bt601_to_bt709_planes_10bit(
841    y: &mut [u16],
842    cb: &mut [u16],
843    cr: &mut [u16],
844    width: usize,
845    height: usize,
846) {
847    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
848    {
849        if std::is_x86_feature_detected!("avx2") {
850            // SAFETY: avx2 feature was runtime-detected above.
851            unsafe {
852                bt601_to_bt709_10bit_avx2(y, cb, cr, width, height);
853            }
854            return;
855        }
856    }
857    bt601_to_bt709_planes_10bit_scalar(y, cb, cr, width, height);
858}
859
/// AVX2 specialization for the 10-bit BT.601 → BT.709 matrix
/// conversion. Mirrors `bt601_to_bt709_avx2` on `u16` lanes
/// (16 chroma samples per 256-bit register vs 16 in the 8-bit
/// path — same lane count because 8-bit path already widened
/// u8 → i16 inside the kernel).
///
/// Q15 fixed-point math via `_mm256_mulhrs_epi16` ((a*b + 0x4000) >> 15).
/// Chroma deltas are in [-512, 511] (10-bit center 512), so values
/// fit in i16 with room to spare. Coefficients ≥ 1 (M_CB_CB=33379,
/// M_CR_CR=33598) split off the identity contribution as the 8-bit
/// path does.
///
/// The planes are converted in place. The luma pass runs first and
/// reads the original chroma; the chroma planes are rewritten
/// afterwards. Only `2 * (height / 2)` luma rows and `2 * (width / 2)`
/// luma columns are visited, so a trailing odd row / column is left
/// untouched.
///
/// NOTE(review): the vector paths round each Q15 product separately
/// (one `mulhrs` per term) while the scalar tails round the summed
/// products once, so vector and scalar results may differ by one LSB —
/// confirm whether bit-exactness with the scalar reference is required.
///
/// # Safety
///
/// The caller must guarantee that:
/// - the CPU supports AVX2;
/// - `y.len() >= 2 * (height / 2) * width`;
/// - `cb.len() >= (width / 2) * (height / 2)`;
/// - `cr.len() >= cb.len()` (the chroma pass walks `cb.len()` samples of
///   both chroma planes).
///
/// The vector loops use unchecked pointer loads/stores; violating the
/// length requirements reads or writes outside the slices.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_10bit_avx2(
    y: &mut [u16],
    cb: &mut [u16],
    cr: &mut [u16],
    width: usize,
    height: usize,
) {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        // Chroma plane dimensions (4:2:0: half resolution per axis).
        let cw = width / 2;
        let ch = height / 2;

        // Matrix coefficients broadcast to all 16 lanes. The two diagonal
        // chroma coefficients have 32768 (1.0 in Q15) subtracted so only
        // the correction term is multiplied; the identity term is added
        // back explicitly in the chroma pass.
        let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16);
        let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16);
        let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16);
        let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16);
        let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16);
        let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16);

        // Chroma center and limited-range clamp bounds, broadcast.
        let v_chroma_center = _mm256_set1_epi16(CHROMA_CENTER_10BIT as i16);
        let v_chroma_lo = _mm256_set1_epi16(64);
        let v_chroma_hi = _mm256_set1_epi16(960);
        let v_luma_lo = _mm256_set1_epi16(64);
        let v_luma_hi = _mm256_set1_epi16(940);

        // ---- Luma pass ----
        // 16 chroma samples per iter → 32 luma cols on two rows (64 luma
        // outputs per iter). Same expand pattern as the 8-bit kernel:
        // duplicate each chroma delta into two adjacent luma slots
        // because chroma is 4:2:0 (one chroma per 2×2 luma block).
        for cy_idx in 0..ch {
            // Offsets of the two luma rows sharing this chroma row, and
            // of the chroma row itself.
            let y_row0 = cy_idx * 2 * width;
            let y_row1 = y_row0 + width;
            let c_row = cy_idx * cw;

            let mut cx = 0usize;
            while cx + 16 <= cw {
                // Load 16 Cb/Cr u16, compute deltas (subtract chroma center).
                let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(c_row + cx) as *const _);
                let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(c_row + cx) as *const _);
                let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
                let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);

                // Per-chroma Y delta in Q15.
                let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
                let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
                let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);

                // Expand 16 chroma deltas → 32 per-luma deltas via stack
                // scratch (same as 8-bit kernel — see comment there for
                // the in-register-vs-scratch tradeoff).
                let mut dy_luma = [0i16; 16];
                _mm256_storeu_si256(dy_luma.as_mut_ptr() as *mut _, dy_chroma);
                let mut dy_luma_pair = [0i16; 32];
                for i in 0..16 {
                    dy_luma_pair[i * 2] = dy_luma[i];
                    dy_luma_pair[i * 2 + 1] = dy_luma[i];
                }
                let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr() as *const _);
                let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);

                // Apply to both luma rows for this chroma row.
                for row_off in [y_row0, y_row1] {
                    // Load 32 luma u16 across two 256-bit registers.
                    let y_lo = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
                    let y_hi =
                        _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2 + 16) as *const _);

                    let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
                    let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);

                    // Clamp to limited-range luma [64, 940].
                    let y_lo_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
                    let y_hi_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);

                    _mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, y_lo_out);
                    _mm256_storeu_si256(
                        y.as_mut_ptr().add(row_off + cx * 2 + 16) as *mut _,
                        y_hi_out,
                    );
                }

                cx += 16;
            }

            // Scalar tail for luma of this chroma row: handles the last
            // `cw % 16` chroma samples with bounds-checked indexing and a
            // single rounding of the summed products.
            while cx < cw {
                let cb_idx = c_row + cx;
                let cbl = cb[cb_idx] as i32 - CHROMA_CENTER_10BIT;
                let crl = cr[cb_idx] as i32 - CHROMA_CENTER_10BIT;
                let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
                let xi = cx * 2;
                // Each chroma sample covers a 2×2 luma block.
                for row_off in [y_row0, y_row1] {
                    for sub in 0..2 {
                        let idx = row_off + xi + sub;
                        y[idx] = clamp_y_10bit(y[idx] as i32 + delta);
                    }
                }
                cx += 1;
            }
        }

        // ---- Chroma pass (no luma coupling) ----
        // 16 samples per iter. Walks the whole `cb` slice linearly (not
        // row by row); `cr` is read and written at the same offsets.
        let total_c = cb.len();
        let mut i = 0usize;
        while i + 16 <= total_c {
            let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(i) as *const _);
            let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(i) as *const _);
            let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
            let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);

            // Cb709 = ΔCb + (M_CB_CB-32768)·ΔCb·2^-15 + M_CB_CR·ΔCr·2^-15 + 512
            let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
            let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
            let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
            let new_cb = _mm256_add_epi16(new_cb, v_chroma_center);

            // Cr709 = ΔCr + (M_CR_CR-32768)·ΔCr·2^-15 + M_CR_CB·ΔCb·2^-15 + 512
            let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
            let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
            let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
            let new_cr = _mm256_add_epi16(new_cr, v_chroma_center);

            // Clamp [64, 960].
            let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
            let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);

            // Lanes are already 16-bit, so results are stored directly
            // without a pack step.
            _mm256_storeu_si256(cb.as_mut_ptr().add(i) as *mut _, new_cb);
            _mm256_storeu_si256(cr.as_mut_ptr().add(i) as *mut _, new_cr);

            i += 16;
        }

        // Scalar tail for chroma: the final `total_c % 16` samples, with
        // bounds-checked indexing.
        while i < total_c {
            let cbl = cb[i] as i32 - CHROMA_CENTER_10BIT;
            let crl = cr[i] as i32 - CHROMA_CENTER_10BIT;
            let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
            let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
            cb[i] = clamp_c_10bit(new_cb + CHROMA_CENTER_10BIT);
            cr[i] = clamp_c_10bit(new_cr + CHROMA_CENTER_10BIT);
            i += 1;
        }
    }
}
1025
1026fn recolor_yuv420p_bt601_to_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
1027    let w = frame.width as usize;
1028    let h = frame.height as usize;
1029    let y_size = w * h;
1030    let c_size = y_size / 4;
1031
1032    if frame.data.len() < y_size + 2 * c_size {
1033        bail!(
1034            "frame data too short for yuv420p {}x{}: {} bytes",
1035            w,
1036            h,
1037            frame.data.len()
1038        );
1039    }
1040    if !w.is_multiple_of(2) || !h.is_multiple_of(2) {
1041        bail!(
1042            "BT.601→BT.709 requires even dimensions for 4:2:0 subsampling; got {}x{}",
1043            w,
1044            h
1045        );
1046    }
1047
1048    let mut y = frame.data[..y_size].to_vec();
1049    let mut cb = frame.data[y_size..y_size + c_size].to_vec();
1050    let mut cr = frame.data[y_size + c_size..y_size + 2 * c_size].to_vec();
1051
1052    bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
1053
1054    let mut out = BytesMut::with_capacity(y_size + 2 * c_size);
1055    out.extend_from_slice(&y);
1056    out.extend_from_slice(&cb);
1057    out.extend_from_slice(&cr);
1058
1059    Ok(VideoFrame::new(
1060        out.freeze(),
1061        frame.width,
1062        frame.height,
1063        frame.format,
1064        ColorSpace::Bt709,
1065        frame.pts,
1066    ))
1067}
1068
1069// =============================================================================
1070// 4:4:4 → 4:2:0 chroma downsample (Squad-31, roadmap #6).
1071// =============================================================================
1072//
1073// ProRes 4444 (and other 4:4:4 sources) decode at full chroma resolution —
1074// Cb / Cr planes match the luma plane in both dimensions. The encoder side
1075// (rav1e + HW backends) only accepts 4:2:0, where chroma is half-resolution
1076// in both axes. This module bridges the gap with a 2×2 box-average filter:
1077// for each 2×2 block of source chroma, output one chroma sample equal to
1078// the rounded mean. Y plane is unchanged (full-resolution luma in both
1079// formats — 4:4:4 and 4:2:0 differ only in chroma layout).
1080//
1081// Filter choice: 2×2 box average. The simplest correct filter for 4:4:4
1082// → 4:2:0 chroma siting (MPEG-2 left-aligned). For each output sample at
1083// (cx, cy), input samples are (2*cx, 2*cy), (2*cx+1, 2*cy), (2*cx, 2*cy+1),
1084// (2*cx+1, 2*cy+1). Output is `(s00 + s01 + s10 + s11 + 2) >> 2` —
1085// rounding by adding half the divisor before truncating shift.
1086//
1087// Higher-quality alternatives (6-tap separable FIR per BT.601/709 H.131,
1088// or a Lanczos-2 horizontal+vertical pair) are deferred to a follow-up;
1089// they cost ~10× the cycles for ~0.3 dB chroma PSNR improvement, which
1090// most consumer transcoders consider not worth it. The box average matches
1091// libswscale's default 4:4:4 → 4:2:0 path when no scaler is requested.
1092//
1093// Odd-dimension policy: when the source width or height is odd, the output
1094// dimensions round up (`(src + 1) / 2`), and the rightmost / bottom row of
1095// 2×2 blocks straddles a single source row/column. We **clamp** — the
1096// missing neighbour reuses the in-bounds sample. Clamping vs replication
1097// is identical for a 1-pixel boundary; we pick clamping because it's the
1098// simplest scalar implementation and matches what libswscale does.
1099//
1100// Alpha plane (Yuva444p10le): the 4:2:0 encoder format has no alpha. We
1101// **drop** alpha with a single warn-log (in pipeline integration). AV1
1102// has alpha support in some experimental profiles but rav1e 0.7 doesn't
1103// expose it, and pre-compositing onto a black background changes pixel
1104// values — keying / compositing on the source side would have already
1105// happened. Documented in SUPPORTED.md.
1106
/// Convert planar 8-bit 4:4:4 (`Yuv444p`) to packed 4:2:0 (`Yuv420p`)
/// with a rounded 2×2 box average on the chroma planes.
///
/// The luma plane is copied verbatim. Each output chroma sample is
/// `(s00 + s01 + s10 + s11 + 2) >> 2` over the corresponding 2×2 source
/// block. Chroma output dimensions are
/// `width.div_ceil(2) × height.div_ceil(2)`; when a dimension is odd,
/// the block on the right / bottom edge reuses the last in-bounds
/// column / row for its missing neighbour.
///
/// Plane sizes are verified with `debug_assert_eq!` only.
///
/// Returns the packed buffer `Y || Cb || Cr`.
pub fn downsample_chroma_444_to_420(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane size");
    debug_assert_eq!(cb.len(), width * height, "Cb plane size (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane size (4:4:4)");

    let cw = width.div_ceil(2);
    let ch = height.div_ceil(2);

    let mut out = Vec::with_capacity(width * height + 2 * cw * ch);

    // Luma has the same resolution in 4:4:4 and 4:2:0.
    out.extend_from_slice(y);

    for plane in [cb, cr] {
        for cy in 0..ch {
            // Start offsets of the two source rows feeding this output
            // row; the lower one is clamped to the last row.
            let top = 2 * cy * width;
            let bottom = (2 * cy + 1).min(height - 1) * width;
            out.extend((0..cw).map(|cx| {
                let left = 2 * cx;
                let right = (left + 1).min(width - 1);
                // Four 8-bit samples plus the rounding term peak at 1022,
                // well inside u16.
                let sum = u16::from(plane[top + left])
                    + u16::from(plane[top + right])
                    + u16::from(plane[bottom + left])
                    + u16::from(plane[bottom + right]);
                ((sum + 2) >> 2) as u8
            }));
        }
    }

    out
}
1159
/// Convert planar 10-bit 4:4:4 (`Yuv444p10le`) samples to a packed
/// `Yuv420p10le` byte buffer with a rounded 2×2 box average on chroma.
///
/// Inputs are `u16` samples; the output is every sample serialised as
/// little-endian `u16`, laid out `Y || Cb || Cr`. Luma is emitted
/// unchanged. Each chroma output is `(s00 + s01 + s10 + s11 + 2) >> 2`,
/// accumulated in `u32`. For odd dimensions the right / bottom edge
/// block reuses the last in-bounds column / row.
///
/// Plane sizes are verified with `debug_assert_eq!` only.
pub fn downsample_chroma_444_to_420_10bit(
    y: &[u16],
    cb: &[u16],
    cr: &[u16],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane samples");
    debug_assert_eq!(cb.len(), width * height, "Cb plane samples (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane samples (4:4:4)");

    let cw = width.div_ceil(2);
    let ch = height.div_ceil(2);
    let total_samples = width * height + 2 * cw * ch;
    let mut out = Vec::with_capacity(total_samples * 2);

    // Luma: same resolution in both layouts, just serialised as LE bytes.
    out.extend(y.iter().flat_map(|sample| sample.to_le_bytes()));

    for plane in [cb, cr] {
        for cy in 0..ch {
            // Row offsets of the 2×2 block; the lower row is clamped.
            let top = 2 * cy * width;
            let bottom = (2 * cy + 1).min(height - 1) * width;
            for cx in 0..cw {
                let left = 2 * cx;
                let right = (left + 1).min(width - 1);
                let sum = u32::from(plane[top + left])
                    + u32::from(plane[top + right])
                    + u32::from(plane[bottom + left])
                    + u32::from(plane[bottom + right]);
                let mean = ((sum + 2) >> 2) as u16;
                out.extend_from_slice(&mean.to_le_bytes());
            }
        }
    }

    out
}
1208
1209/// High-level frame-shaped wrapper. Takes a `Yuv444p10le` /
1210/// `Yuva444p10le` `VideoFrame` and returns a `Yuv420p10le`
1211/// `VideoFrame` ready for the 10-bit AV1 encoder. Alpha plane (if
1212/// present) is **dropped** with a warn-log — see module docstring for
1213/// rationale. 8-bit equivalent (`Yuv444p` → `Yuv420p`) follows the
1214/// same pattern, plumbed through `downsample_chroma_444_to_420`.
1215///
1216/// Errors if the source format is not 4:4:4.
1217pub fn downsample_444_to_420_frame(frame: &VideoFrame) -> Result<VideoFrame> {
1218    let w = frame.width as usize;
1219    let h = frame.height as usize;
1220    if w == 0 || h == 0 {
1221        bail!("zero-dimension frame");
1222    }
1223
1224    match frame.format {
1225        PixelFormat::Yuv444p => {
1226            let plane = w * h;
1227            if frame.data.len() < 3 * plane {
1228                bail!(
1229                    "Yuv444p frame data too short for {}x{}: {} bytes",
1230                    w,
1231                    h,
1232                    frame.data.len()
1233                );
1234            }
1235            let y = &frame.data[..plane];
1236            let cb = &frame.data[plane..2 * plane];
1237            let cr = &frame.data[2 * plane..3 * plane];
1238            let out = downsample_chroma_444_to_420(y, cb, cr, w, h);
1239            Ok(VideoFrame::new(
1240                Bytes::from(out),
1241                frame.width,
1242                frame.height,
1243                PixelFormat::Yuv420p,
1244                frame.color_space,
1245                frame.pts,
1246            ))
1247        }
1248        PixelFormat::Yuv444p10le | PixelFormat::Yuva444p10le => {
1249            let plane = w * h;
1250            // 10-bit (or 16-bit alpha) is 2 bytes/sample. Y/Cb/Cr always
1251            // 10-bit, alpha (if present) is 16-bit, but layout is per-
1252            // plane LE u16 either way. We only consume the first three
1253            // planes; alpha (plane 4) is dropped on the floor.
1254            let needed = if frame.format == PixelFormat::Yuva444p10le {
1255                4 * plane * 2
1256            } else {
1257                3 * plane * 2
1258            };
1259            if frame.data.len() < needed {
1260                bail!(
1261                    "{:?} frame data too short for {}x{}: {} bytes (need {})",
1262                    frame.format,
1263                    w,
1264                    h,
1265                    frame.data.len(),
1266                    needed
1267                );
1268            }
1269            // Decode three u16 LE planes from the source bytes.
1270            let y = read_u16le(&frame.data[..plane * 2]);
1271            let cb = read_u16le(&frame.data[plane * 2..2 * plane * 2]);
1272            let cr = read_u16le(&frame.data[2 * plane * 2..3 * plane * 2]);
1273
1274            if frame.format == PixelFormat::Yuva444p10le {
1275                tracing::warn!(
1276                    pts = frame.pts,
1277                    "dropping alpha plane on 4:4:4→4:2:0 downsample (rav1e 0.7 has no alpha; pipeline target is Yuv420p10le)"
1278                );
1279            }
1280
1281            let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
1282            Ok(VideoFrame::new(
1283                Bytes::from(out),
1284                frame.width,
1285                frame.height,
1286                PixelFormat::Yuv420p10le,
1287                frame.color_space,
1288                frame.pts,
1289            ))
1290        }
1291        other => bail!(
1292            "downsample_444_to_420_frame: expected 4:4:4 input, got {:?}",
1293            other
1294        ),
1295    }
1296}
1297
1298// =============================================================================
1299// Bilinear scaler — scalar + AVX2 dispatch.
1300// =============================================================================
1301
1302pub fn scale_frame(
1303    frame: &VideoFrame,
1304    target_width: u32,
1305    target_height: u32,
1306) -> Result<VideoFrame> {
1307    if frame.width == target_width && frame.height == target_height {
1308        return Ok(frame.clone());
1309    }
1310
1311    match frame.format {
1312        PixelFormat::Yuv420p => scale_frame_8bit(frame, target_width, target_height),
1313        // 10-bit 4:2:0 path. Squad-19 shipped scalar; Squad-29 added
1314        // AVX2 specialization (16 × u16 lanes per iter, Q15 bilinear
1315        // weights via `_mm256_mulhrs_epi16`). Runtime-dispatched by
1316        // `bilinear_scale_plane_u16` (`is_x86_feature_detected!("avx2")`).
1317        PixelFormat::Yuv420p10le => scale_frame_10bit(frame, target_width, target_height),
1318        _ => bail!(
1319            "scaling only implemented for Yuv420p / Yuv420p10le; got {:?}",
1320            frame.format
1321        ),
1322    }
1323}
1324
1325fn scale_frame_8bit(
1326    frame: &VideoFrame,
1327    target_width: u32,
1328    target_height: u32,
1329) -> Result<VideoFrame> {
1330    let src_w = frame.width as usize;
1331    let src_h = frame.height as usize;
1332    let dst_w = target_width as usize;
1333    let dst_h = target_height as usize;
1334
1335    let src_y_size = src_w * src_h;
1336    let dst_y_size = dst_w * dst_h;
1337    let dst_uv_size = dst_y_size / 4;
1338
1339    let mut out = BytesMut::with_capacity(dst_y_size + dst_uv_size * 2);
1340
1341    // Bilinear scale Y plane
1342    let y_plane = &frame.data[..src_y_size];
1343    out.extend(bilinear_scale_plane(y_plane, src_w, src_h, dst_w, dst_h));
1344
1345    // Scale U plane
1346    let u_offset = src_y_size;
1347    let u_plane = &frame.data[u_offset..u_offset + src_y_size / 4];
1348    out.extend(bilinear_scale_plane(
1349        u_plane,
1350        src_w / 2,
1351        src_h / 2,
1352        dst_w / 2,
1353        dst_h / 2,
1354    ));
1355
1356    // Scale V plane
1357    let v_offset = u_offset + src_y_size / 4;
1358    let v_plane = &frame.data[v_offset..v_offset + src_y_size / 4];
1359    out.extend(bilinear_scale_plane(
1360        v_plane,
1361        src_w / 2,
1362        src_h / 2,
1363        dst_w / 2,
1364        dst_h / 2,
1365    ));
1366
1367    Ok(VideoFrame::new(
1368        out.freeze(),
1369        target_width,
1370        target_height,
1371        frame.format,
1372        frame.color_space,
1373        frame.pts,
1374    ))
1375}
1376
1377/// 10-bit `Yuv420p10le` bilinear scaler. Each plane is `u16` LE in the
1378/// 0..=1023 range. Operates on 16-bit samples directly; output sample
1379/// range is preserved (10-bit values stored in 16-bit containers).
1380///
1381/// Per-plane work runs through `bilinear_scale_plane_u16`, which
1382/// runtime-dispatches to AVX2 (Squad-29; 16 × u16 lanes per iter)
1383/// when `is_x86_feature_detected!("avx2")` and falls back to the
1384/// scalar f64 path (Squad-19) otherwise.
1385fn scale_frame_10bit(
1386    frame: &VideoFrame,
1387    target_width: u32,
1388    target_height: u32,
1389) -> Result<VideoFrame> {
1390    let src_w = frame.width as usize;
1391    let src_h = frame.height as usize;
1392    let dst_w = target_width as usize;
1393    let dst_h = target_height as usize;
1394
1395    let bytes_per_sample = 2usize;
1396    let src_y_size_samples = src_w * src_h;
1397    let src_y_size_bytes = src_y_size_samples * bytes_per_sample;
1398    let src_c_size_samples = (src_w / 2) * (src_h / 2);
1399    let src_c_size_bytes = src_c_size_samples * bytes_per_sample;
1400
1401    if frame.data.len() < src_y_size_bytes + 2 * src_c_size_bytes {
1402        bail!(
1403            "10-bit frame data too short for {}x{}: {} bytes",
1404            src_w,
1405            src_h,
1406            frame.data.len()
1407        );
1408    }
1409
1410    let dst_y_size_samples = dst_w * dst_h;
1411    let dst_c_size_samples = (dst_w / 2) * (dst_h / 2);
1412    let dst_total_bytes = (dst_y_size_samples + 2 * dst_c_size_samples) * bytes_per_sample;
1413
1414    // Decode planes from LE bytes into u16 buffers.
1415    let y_plane = read_u16le(&frame.data[..src_y_size_bytes]);
1416    let u_plane = read_u16le(&frame.data[src_y_size_bytes..src_y_size_bytes + src_c_size_bytes]);
1417    let v_plane = read_u16le(
1418        &frame.data[src_y_size_bytes + src_c_size_bytes..src_y_size_bytes + 2 * src_c_size_bytes],
1419    );
1420
1421    // Squad-29: runtime-dispatched (AVX2 when available, scalar fallback).
1422    let y_dst = bilinear_scale_plane_u16(&y_plane, src_w, src_h, dst_w, dst_h);
1423    let u_dst = bilinear_scale_plane_u16(&u_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
1424    let v_dst = bilinear_scale_plane_u16(&v_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
1425
1426    let mut out = BytesMut::with_capacity(dst_total_bytes);
1427    write_u16le(&mut out, &y_dst);
1428    write_u16le(&mut out, &u_dst);
1429    write_u16le(&mut out, &v_dst);
1430
1431    Ok(VideoFrame::new(
1432        out.freeze(),
1433        target_width,
1434        target_height,
1435        frame.format,
1436        frame.color_space,
1437        frame.pts,
1438    ))
1439}
1440
/// Decode a byte slice into little-endian `u16` samples. A trailing odd
/// byte, if any, is ignored.
fn read_u16le(bytes: &[u8]) -> Vec<u16> {
    let mut samples = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        // Low byte first, high byte second.
        samples.push(u16::from(pair[0]) | (u16::from(pair[1]) << 8));
    }
    samples
}
1447
1448fn write_u16le(out: &mut BytesMut, samples: &[u16]) {
1449    for s in samples {
1450        out.extend_from_slice(&s.to_le_bytes());
1451    }
1452}
1453
/// Scalar bilinear resampler for a single plane of `u16` samples
/// (10-bit content in 16-bit containers).
///
/// Destination pixel `(dx, dy)` samples the source at
/// `(dx * src_w / dst_w, dy * src_h / dst_h)`, limited to the last
/// source column / row, and blends the four surrounding samples in
/// `f64`. The result is rounded to nearest and clamped to `0..=1023`.
///
/// Returns a row-major plane of `dst_w * dst_h` samples.
pub fn bilinear_scale_plane_u16_scalar(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    let mut scaled = Vec::with_capacity(dst_w * dst_h);
    let step_x = src_w as f64 / dst_w as f64;
    let step_y = src_h as f64 / dst_h as f64;

    for row in 0..dst_h {
        // Vertical taps: the two source rows straddling this output row
        // and the fractional weight of the lower one.
        let last_row = src_h - 1;
        let pos_y = (row as f64 * step_y).min(last_row as f64);
        let top = pos_y as usize;
        let bottom = (top + 1).min(last_row);
        let wy = pos_y - top as f64;
        let wy_inv = 1.0 - wy;
        let (top_base, bottom_base) = (top * src_w, bottom * src_w);

        for col in 0..dst_w {
            // Horizontal taps, same scheme.
            let last_col = src_w - 1;
            let pos_x = (col as f64 * step_x).min(last_col as f64);
            let left = pos_x as usize;
            let right = (left + 1).min(last_col);
            let wx = pos_x - left as f64;
            let wx_inv = 1.0 - wx;

            let top_left = f64::from(src[top_base + left]);
            let top_right = f64::from(src[top_base + right]);
            let bottom_left = f64::from(src[bottom_base + left]);
            let bottom_right = f64::from(src[bottom_base + right]);

            let blended = top_left * wx_inv * wy_inv
                + top_right * wx * wy_inv
                + bottom_left * wx_inv * wy
                + bottom_right * wx * wy;

            // Rows are produced in order, so pushing yields the row-major
            // layout. The clamp pins the output to the 10-bit range.
            scaled.push(blended.round().clamp(0.0, 1023.0) as u16);
        }
    }
    scaled
}
1502
1503/// Runtime-dispatched 10-bit bilinear scale. AVX2 on x86_64 when
1504/// available; falls back to `bilinear_scale_plane_u16_scalar` otherwise.
1505///
1506/// Squad-29 (2026-04-17) added the AVX2 path. 256-bit registers
1507/// process 16 × u16 samples per iteration. Internally the same
1508/// Q15 fixed-point math as the 8-bit AVX2 path, but `_mm256_mulhrs_epi16`
1509/// expects signed lanes — 10-bit samples (0..=1023) fit comfortably
1510/// in i16 with no shift gymnastics. Output is clamped to 10-bit
1511/// range (0..=1023). Scalar tail (when `dst_w % 16 != 0` or width
1512/// < 16) reuses the scalar f64 math row-by-row.
1513///
1514/// Performance target on 1080p→720p: ≥3× over scalar (realistic
1515/// floor for u16 lanes vs u8). Bench in
1516/// `crates/codec/benches/bilinear.rs::bilinear_10bit_avx2_vs_scalar`.
1517pub fn bilinear_scale_plane_u16(
1518    src: &[u16],
1519    src_w: usize,
1520    src_h: usize,
1521    dst_w: usize,
1522    dst_h: usize,
1523) -> Vec<u16> {
1524    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1525    {
1526        // 16-lane AVX2 path; gate on dst_w >= 16 so the main loop
1527        // runs at least once per row. Narrower outputs fall back to
1528        // scalar (cheap — narrow strips aren't a hotspot).
1529        if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
1530            // SAFETY: avx2 runtime-detected.
1531            return unsafe { bilinear_scale_plane_u16_avx2(src, src_w, src_h, dst_w, dst_h) };
1532        }
1533    }
1534    bilinear_scale_plane_u16_scalar(src, src_w, src_h, dst_w, dst_h)
1535}
1536
/// AVX2 specialization for the 10-bit bilinear scaler. Processes
/// 16 × u16 destination samples per iteration via 256-bit registers.
///
/// Sampling uses 32.32 fixed-point source coordinates
/// (`dx * src_w / dst_w`, `dy * src_h / dst_h`); the fractional part is
/// reduced to a Q15 weight in `[0, 32767]` and the complementary weight
/// is `32767 - w`, so both stay inside the signed range required by
/// `_mm256_mulhrs_epi16` (`(a * b + 0x4000) >> 15`).
///
/// Samples are clamped to the 10-bit maximum and then pre-scaled by 2^5
/// before the multiplies (1023 << 5 = 32736 still fits in `i16`). The
/// five extra fractional bits carry the rounding error of the three
/// `mulhrs` stages, and a single `(x + 16) >> 5` brings the value back to
/// sample units. Without the pre-scale every stage would round to a whole
/// LSB, which — combined with weights that sum to 32767 rather than
/// 32768 — lets a flat field drift by one code value and lets the result
/// differ from the scalar reference by up to two.
///
/// Destination columns left over after the 16-lane main loop are
/// computed with `f64` arithmetic using the same Q15-quantised weights.
///
/// Returns an empty plane when `dst_w` or `dst_h` is zero.
///
/// # Panics
/// Panics if `src_w` or `src_h` is zero while the destination is
/// non-empty, or if `src` holds fewer than `src_w * src_h` samples.
///
/// # Safety
/// The caller must ensure the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_u16_avx2(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    // An empty destination needs no work; returning early also keeps the
    // step computation below from dividing by zero.
    if dst_w == 0 || dst_h == 0 {
        return Vec::new();
    }
    // The gather loop reads `src` without bounds checks, so the plane
    // geometry is validated once up front.
    assert!(
        src_w > 0 && src_h > 0,
        "bilinear_scale_plane_u16_avx2: source plane must be non-empty"
    );
    let src_samples = src_w
        .checked_mul(src_h)
        .expect("bilinear_scale_plane_u16_avx2: src_w * src_h overflows usize");
    assert!(
        src.len() >= src_samples,
        "bilinear_scale_plane_u16_avx2: src holds {} samples, need {}",
        src.len(),
        src_samples
    );

    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u16; dst_w * dst_h];

        // 32.32 fixed-point source step per destination sample.
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per destination column: left/right source columns and the Q15
        // weights of the right / left tap.
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut x1s: Vec<u32> = vec![0; dst_w];
        let mut fxs_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fxs_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step;
            let x0_full = (sx_32_32 >> 32) as usize;
            let x0 = x0_full.min(src_w - 1);
            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u32;
            // Q16 → Q15, capped at 32767 so the weight fits in i16.
            let fx_q15 = ((fx_q16 as i32) >> 1).min(32767) as i16;
            if x0 >= src_w - 1 {
                // Last source column: both taps collapse onto it.
                x0s[dx] = (src_w - 1) as u32;
                x1s[dx] = (src_w - 1) as u32;
                fxs_q15[dx] = 0;
                one_minus_fxs_q15[dx] = 32767;
            } else {
                x0s[dx] = x0 as u32;
                x1s[dx] = (x0 + 1) as u32;
                fxs_q15[dx] = fx_q15;
                one_minus_fxs_q15[dx] = 32767 - fx_q15;
            }
        }

        let v_max = _mm256_set1_epi16(1023);
        let v_zero = _mm256_setzero_si256();
        // Rounding bias for the final `>> 5` (half of 2^5).
        let v_round = _mm256_set1_epi16(16);

        for dy in 0..dst_h {
            let sy_32_32 = (dy as u64) * y_step;
            let y0_full = (sy_32_32 >> 32) as usize;
            let y0 = y0_full.min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;

            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;

            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // The four taps of each output sample sit at arbitrary
                // source columns, so gather them with scalar loads into
                // stack scratch and reload as full registers.
                let mut p00_buf = [0u16; 16];
                let mut p10_buf = [0u16; 16];
                let mut p01_buf = [0u16; 16];
                let mut p11_buf = [0u16; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = x1s[dx + i] as usize;
                    // SAFETY: y0, y1 <= src_h - 1 and x0, x1 <= src_w - 1,
                    // so each index is < src_w * src_h <= src.len()
                    // (asserted at function entry).
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }

                let p00 = _mm256_loadu_si256(p00_buf.as_ptr() as *const _);
                let p10 = _mm256_loadu_si256(p10_buf.as_ptr() as *const _);
                let p01 = _mm256_loadu_si256(p01_buf.as_ptr() as *const _);
                let p11 = _mm256_loadu_si256(p11_buf.as_ptr() as *const _);

                // Clamp to the 10-bit ceiling (unsigned compare), then
                // scale into Q5 so later roundings happen at 1/32 LSB.
                // Max value: 1023 << 5 = 32736 < i16::MAX.
                let p00 = _mm256_slli_epi16::<5>(_mm256_min_epu16(p00, v_max));
                let p10 = _mm256_slli_epi16::<5>(_mm256_min_epu16(p10, v_max));
                let p01 = _mm256_slli_epi16::<5>(_mm256_min_epu16(p01, v_max));
                let p11 = _mm256_slli_epi16::<5>(_mm256_min_epu16(p11, v_max));

                // Per-lane fx / (1-fx). In bounds: dx + 16 <= dst_w.
                let v_fx = _mm256_loadu_si256(fxs_q15.as_ptr().add(dx) as *const _);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fxs_q15.as_ptr().add(dx) as *const _);

                // Horizontal interpolation. The weight pair sums to
                // 32767, so each sum is at most 32736 and cannot
                // overflow i16.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );

                // Vertical interpolation, still in Q5.
                let out_q5 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );

                // Back to sample units with round-to-nearest:
                // (x + 16) >> 5. Worst case 32736 + 16 still fits in i16.
                let rounded = _mm256_srai_epi16::<5>(_mm256_add_epi16(out_q5, v_round));

                // Defensive clamp to [0, 1023].
                let clamped = _mm256_min_epi16(_mm256_max_epi16(rounded, v_zero), v_max);

                // In bounds: dst_row + dx + 16 <= dst_row + dst_w <= dst.len().
                _mm256_storeu_si256(dst.as_mut_ptr().add(dst_row + dx) as *mut _, clamped);

                dx += 16;
            }

            // Scalar tail for the last `dst_w % 16` columns: f64 blend
            // with the same Q15-quantised weights as the SIMD lanes.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = x1s[dx] as usize;
                let fx = fxs_q15[dx] as f64 / 32768.0;
                let fy = fy_q15 as f64 / 32768.0;

                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;

                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round().clamp(0.0, 1023.0) as u16;
                dx += 1;
            }
        }

        dst
    }
}
1718
1719/// Runtime-dispatched bilinear scale. AVX2 on x86_64 when available.
1720pub fn bilinear_scale_plane(
1721    src: &[u8],
1722    src_w: usize,
1723    src_h: usize,
1724    dst_w: usize,
1725    dst_h: usize,
1726) -> Vec<u8> {
1727    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1728    {
1729        // perf: AVX2 specialization — Q16 fixed-point 2-tap horiz ×
1730        // 2-tap vert, 16 dst pixels / iter. Benched 3-4× faster than
1731        // the f64 scalar on 1080p→720p.
1732        if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
1733            // SAFETY: avx2 runtime-detected.
1734            return unsafe { bilinear_scale_plane_avx2(src, src_w, src_h, dst_w, dst_h) };
1735        }
1736    }
1737    bilinear_scale_plane_scalar(src, src_w, src_h, dst_w, dst_h)
1738}
1739
/// Reference (non-SIMD) bilinear resampler for 8-bit planes.
///
/// Destination sample `(dx, dy)` maps to the source position
/// `(dx * src_w / dst_w, dy * src_h / dst_h)`, limited to the last source
/// column / row, and is blended in `f64` from the four surrounding source
/// samples. The blend is rounded to nearest and converted to `u8` with a
/// saturating float-to-integer cast.
///
/// `src` is read as a row-major plane with a stride of `src_w` samples;
/// the result is a row-major plane of `dst_w * dst_h` samples.
///
/// # Panics
/// Panics if a sampled position lies outside `src` (for example when
/// `src` holds fewer than `src_w * src_h` samples).
pub fn bilinear_scale_plane_scalar(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    let step_x = src_w as f64 / dst_w as f64;
    let step_y = src_h as f64 / dst_h as f64;
    let mut out = Vec::with_capacity(dst_w * dst_h);

    for row in 0..dst_h {
        // Vertical taps and weights are shared by the whole output row.
        let last_row = src_h - 1;
        let pos_y = (row as f64 * step_y).min(last_row as f64);
        let top = pos_y as usize;
        let bottom = last_row.min(top + 1);
        let wy = pos_y - top as f64;
        let inv_wy = 1.0 - wy;

        out.extend((0..dst_w).map(|col| {
            let last_col = src_w - 1;
            let pos_x = (col as f64 * step_x).min(last_col as f64);
            let left = pos_x as usize;
            let right = last_col.min(left + 1);
            let wx = pos_x - left as f64;
            let inv_wx = 1.0 - wx;

            let tl = f64::from(src[top * src_w + left]);
            let tr = f64::from(src[top * src_w + right]);
            let bl = f64::from(src[bottom * src_w + left]);
            let br = f64::from(src[bottom * src_w + right]);

            // Term order is kept fixed so the floating-point result (and
            // therefore the rounding of exact .5 cases) is reproducible.
            let blended =
                tl * inv_wx * inv_wy + tr * wx * inv_wy + bl * inv_wx * wy + br * wx * wy;

            blended.round() as u8
        }));
    }
    out
}
1778
/// AVX2 specialization of the 8-bit bilinear scaler. Produces 16
/// destination pixels per main-loop iteration.
///
/// Sampling uses 32.32 fixed-point source coordinates
/// (`dx * src_w / dst_w`, `dy * src_h / dst_h`), mirroring the scalar
/// path's sampling convention. The fractional part is reduced to a Q15
/// weight in `[0, 32767]` with complementary weight `32767 - w`; 32768
/// would overflow `i16`, and the resulting error (≤ 1/32768) is far below
/// one 8-bit LSB. Samples are widened to 16 bits and scaled by 2^7
/// (255 × 128 = 32640 fits in `i16`) so `_mm256_mulhrs_epi16`
/// (`(a * b + 0x4000) >> 15`) keeps sub-LSB precision; the result is
/// rounded back with `(x + 64) >> 7` and saturated to `u8`.
///
/// Destination columns left over after the 16-lane main loop are
/// computed with `f64` arithmetic from the Q16 fractions.
///
/// Returns an empty plane when `dst_w` or `dst_h` is zero.
///
/// # Panics
/// Panics if `src_w` or `src_h` is zero while the destination is
/// non-empty, or if `src` holds fewer than `src_w * src_h` samples.
///
/// # Safety
/// The caller must ensure the executing CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_avx2(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    // An empty destination needs no work; returning early also keeps the
    // step computation below from dividing by zero.
    if dst_w == 0 || dst_h == 0 {
        return Vec::new();
    }
    // The gather loop reads `src` without bounds checks, so the plane
    // geometry is validated once up front.
    assert!(
        src_w > 0 && src_h > 0,
        "bilinear_scale_plane_avx2: source plane must be non-empty"
    );
    let src_samples = src_w
        .checked_mul(src_h)
        .expect("bilinear_scale_plane_avx2: src_w * src_h overflows usize");
    assert!(
        src.len() >= src_samples,
        "bilinear_scale_plane_avx2: src holds {} samples, need {}",
        src.len(),
        src_samples
    );

    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u8; dst_w * dst_h];

        // 32.32 fixed-point source step per destination sample; integer
        // math keeps f64 out of the hot path and stays precise for wide
        // sources.
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per destination column: left source column, the Q16 fraction
        // (used by the scalar tail) and the Q15 weight pair (used by the
        // SIMD lanes).
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut fxs: Vec<u16> = vec![0; dst_w];
        let mut fx_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fx_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step; // source x in 32.32
            let x0 = ((sx_32_32 >> 32) as usize).min(src_w - 1);
            if x0 >= src_w - 1 {
                // Last source column: the right tap collapses onto it.
                x0s[dx] = (src_w - 1) as u32;
                fxs[dx] = 0;
            } else {
                x0s[dx] = x0 as u32;
                fxs[dx] = ((sx_32_32 >> 16) & 0xFFFF) as u16;
            }
            // Q16 → Q15; a u16 shifted right by one is at most 32767.
            let weight = (fxs[dx] >> 1) as i16;
            fx_q15[dx] = weight;
            one_minus_fx_q15[dx] = 32767 - weight;
        }

        // Rounding bias for the final `>> 7` (half of 2^7).
        let v_round = _mm256_set1_epi16(64);

        for dy in 0..dst_h {
            let sy_32_32 = (dy as u64) * y_step;
            let y0_full = (sy_32_32 >> 32) as usize;
            let y0 = y0_full.min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;

            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;

            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // The four taps of each output pixel sit at arbitrary
                // source columns, so gather them with scalar loads into
                // stack scratch and reload as vectors.
                let mut p00_buf = [0u8; 16];
                let mut p10_buf = [0u8; 16];
                let mut p01_buf = [0u8; 16];
                let mut p11_buf = [0u8; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = (x0 + 1).min(src_w - 1);
                    // SAFETY: y0, y1 <= src_h - 1 and x0, x1 <= src_w - 1,
                    // so each index is < src_w * src_h <= src.len()
                    // (asserted at function entry).
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }

                // Widen u8 → i16.
                let p00 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p00_buf.as_ptr() as *const _));
                let p10 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p10_buf.as_ptr() as *const _));
                let p01 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p01_buf.as_ptr() as *const _));
                let p11 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p11_buf.as_ptr() as *const _));

                // Scale into Q7 so mulhrs retains sub-LSB precision;
                // 255 × 128 = 32640 stays within the signed range.
                let p00 = _mm256_slli_epi16::<7>(p00);
                let p10 = _mm256_slli_epi16::<7>(p10);
                let p01 = _mm256_slli_epi16::<7>(p01);
                let p11 = _mm256_slli_epi16::<7>(p11);

                // Per-lane fx / (1-fx). In bounds: dx + 16 <= dst_w.
                let v_fx = _mm256_loadu_si256(fx_q15.as_ptr().add(dx) as *const _);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fx_q15.as_ptr().add(dx) as *const _);

                // Horizontal interpolation: p00*(1-fx) + p10*fx, in Q7.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );

                // Vertical interpolation, still in Q7.
                let out_q7 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );

                // Back to pixel units with round-to-nearest: (x + 64) >> 7.
                let shifted = _mm256_srai_epi16::<7>(_mm256_add_epi16(out_q7, v_round));

                // Saturating pack i16 → u8. packus works per 128-bit
                // half, so the 16 wanted bytes end up in 64-bit quads 0
                // and 2; the permute moves them into the low 128 bits.
                let packed = _mm256_packus_epi16(shifted, shifted);
                let packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(packed);
                // In bounds: dst_row + dx + 16 <= dst_row + dst_w <= dst.len().
                _mm_storeu_si128(
                    dst.as_mut_ptr().add(dst_row + dx) as *mut _,
                    _mm256_castsi256_si128(packed),
                );

                dx += 16;
            }

            // Scalar tail for the last `dst_w % 16` columns.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = (x0 + 1).min(src_w - 1);
                let fx = fxs[dx] as f64 / 65536.0;
                let fy = fy_q16 as f64 / 65536.0;

                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;

                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round() as u8;
                dx += 1;
            }
        }

        dst
    }
}
1958
1959// =============================================================================
1960// Tests
1961// =============================================================================
1962
1963#[cfg(test)]
1964mod tests {
1965    use super::*;
1966
1967    // -------- BT.601 → BT.709 --------
1968
1969    fn synth_601_frame(w: usize, h: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
1970        let mut y = vec![0u8; w * h];
1971        let mut cb = vec![0u8; (w / 2) * (h / 2)];
1972        let mut cr = vec![0u8; (w / 2) * (h / 2)];
1973        for i in 0..y.len() {
1974            // Sweep limited range [16, 235].
1975            y[i] = 16 + ((i as u32 * 17) % 220) as u8;
1976        }
1977        for i in 0..cb.len() {
1978            cb[i] = 16 + ((i as u32 * 13) % 225) as u8;
1979            cr[i] = 16 + ((i as u32 * 23) % 225) as u8;
1980        }
1981        (y, cb, cr)
1982    }
1983
1984    #[test]
1985    fn bt601_to_bt709_neutral_gray_roundtrips() {
1986        // Cb=Cr=128 means ΔCb=ΔCr=0, so ΔY=0 and ΔCb709=ΔCr709=0.
1987        // Every luma value stays put; chroma stays at 128.
1988        for &y_val in &[16u8, 64, 128, 200, 235] {
1989            let w = 32;
1990            let h = 16;
1991            let mut y = vec![y_val; w * h];
1992            let mut cb = vec![128u8; (w / 2) * (h / 2)];
1993            let mut cr = vec![128u8; (w / 2) * (h / 2)];
1994            bt601_to_bt709_planes_scalar(&mut y, &mut cb, &mut cr, w, h);
1995            for v in &y {
1996                assert_eq!(*v, y_val, "Y with neutral chroma must round-trip");
1997            }
1998            for v in &cb {
1999                assert_eq!(*v, 128);
2000            }
2001            for v in &cr {
2002                assert_eq!(*v, 128);
2003            }
2004        }
2005    }
2006
2007    #[test]
2008    fn bt601_to_bt709_black_and_white_round_trip() {
2009        // Black (Y=16, Cb=Cr=128) and white (Y=235, Cb=Cr=128) must
2010        // round-trip unchanged because chroma deltas are zero.
2011        for &(y_val, label) in &[(16u8, "black"), (235u8, "white")] {
2012            let w = 64;
2013            let h = 32;
2014            let mut y = vec![y_val; w * h];
2015            let mut cb = vec![128u8; (w / 2) * (h / 2)];
2016            let mut cr = vec![128u8; (w / 2) * (h / 2)];
2017            bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
2018            for v in &y {
2019                assert_eq!(*v, y_val, "{} Y round-trip", label);
2020            }
2021            for v in &cb {
2022                assert_eq!(*v, 128, "{} Cb round-trip", label);
2023            }
2024            for v in &cr {
2025                assert_eq!(*v, 128, "{} Cr round-trip", label);
2026            }
2027        }
2028    }
2029
2030    #[test]
2031    fn bt601_to_bt709_scalar_vs_avx2_agree_256x256() {
2032        // Dense 256×256 synthetic plane; every AVX2 lane path exercised
2033        // plus the scalar tail (if width is not a 16-chroma multiple
2034        // we'd hit the tail — here 256 cw=128 is a multiple of 16, so
2035        // only the main path runs, which is what we want to gate).
2036        let w = 256;
2037        let h = 256;
2038        let (y0, cb0, cr0) = synth_601_frame(w, h);
2039
2040        let mut y_s = y0.clone();
2041        let mut cb_s = cb0.clone();
2042        let mut cr_s = cr0.clone();
2043        bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2044
2045        let mut y_v = y0.clone();
2046        let mut cb_v = cb0.clone();
2047        let mut cr_v = cr0.clone();
2048        bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2049
2050        let mut max_y = 0i32;
2051        for i in 0..y_s.len() {
2052            let d = (y_s[i] as i32 - y_v[i] as i32).abs();
2053            if d > max_y {
2054                max_y = d;
2055            }
2056            assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, y_s[i], y_v[i]);
2057        }
2058        for i in 0..cb_s.len() {
2059            assert!(
2060                (cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1,
2061                "Cb[{}] scalar={} avx2={}",
2062                i,
2063                cb_s[i],
2064                cb_v[i]
2065            );
2066            assert!(
2067                (cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1,
2068                "Cr[{}] scalar={} avx2={}",
2069                i,
2070                cr_s[i],
2071                cr_v[i]
2072            );
2073        }
2074    }
2075
2076    #[test]
2077    fn bt601_to_bt709_scalar_vs_avx2_agree_tail() {
2078        // 34 wide forces a 1-sample tail in the chroma loop (cw=17,
2079        // main covers 16, tail covers 1).
2080        let w = 34;
2081        let h = 16;
2082        let (y0, cb0, cr0) = synth_601_frame(w, h);
2083
2084        let mut y_s = y0.clone();
2085        let mut cb_s = cb0.clone();
2086        let mut cr_s = cr0.clone();
2087        bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2088
2089        let mut y_v = y0.clone();
2090        let mut cb_v = cb0.clone();
2091        let mut cr_v = cr0.clone();
2092        bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2093
2094        for i in 0..y_s.len() {
2095            assert!(
2096                (y_s[i] as i32 - y_v[i] as i32).abs() <= 1,
2097                "Y[{}] scalar={} avx2={}",
2098                i,
2099                y_s[i],
2100                y_v[i]
2101            );
2102        }
2103        for i in 0..cb_s.len() {
2104            assert!((cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1);
2105            assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
2106        }
2107    }
2108
2109    #[test]
2110    fn bt601_to_bt709_clamps_ranges() {
2111        // After conversion, luma stays in [16, 235] and chroma in [16, 240].
2112        let w = 32;
2113        let h = 16;
2114        let (mut y, mut cb, mut cr) = synth_601_frame(w, h);
2115        bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
2116        for &v in cb.iter().chain(cr.iter()) {
2117            assert!((16..=240).contains(&v), "chroma {} out of limited range", v);
2118        }
2119        for &v in y.iter() {
2120            assert!((16..=235).contains(&v), "luma {} out of limited range", v);
2121        }
2122    }
2123
2124    // -------- Bilinear scaler --------
2125
2126    fn make_ramp(w: usize, h: usize) -> Vec<u8> {
2127        (0..w * h).map(|i| ((i * 7 + i / w) & 0xff) as u8).collect()
2128    }
2129
2130    #[test]
2131    fn bilinear_scalar_vs_avx2_agree_2x() {
2132        let src_w = 64;
2133        let src_h = 32;
2134        let src = make_ramp(src_w, src_h);
2135        let dst_w = 128;
2136        let dst_h = 64;
2137
2138        let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
2139        let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
2140
2141        assert_eq!(scalar.len(), simd.len());
2142        let mut max_diff = 0i32;
2143        for i in 0..scalar.len() {
2144            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2145            if d > max_diff {
2146                max_diff = d;
2147            }
2148            assert!(
2149                d <= 1,
2150                "bilinear mismatch at {}: scalar={} simd={}",
2151                i,
2152                scalar[i],
2153                simd[i]
2154            );
2155        }
2156    }
2157
2158    #[test]
2159    fn bilinear_scalar_vs_avx2_agree_downscale() {
2160        let src_w = 128;
2161        let src_h = 72;
2162        let src = make_ramp(src_w, src_h);
2163        let dst_w = 64;
2164        let dst_h = 36;
2165
2166        let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
2167        let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
2168
2169        for i in 0..scalar.len() {
2170            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2171            assert!(
2172                d <= 1,
2173                "bilinear mismatch at {}: scalar={} simd={}",
2174                i,
2175                scalar[i],
2176                simd[i]
2177            );
2178        }
2179    }
2180
2181    #[test]
2182    fn bilinear_constant_input_yields_constant_output() {
2183        let src = vec![42u8; 64 * 32];
2184        let out = bilinear_scale_plane(&src, 64, 32, 128, 64);
2185        for &v in &out {
2186            assert_eq!(v, 42, "constant input must yield constant output");
2187        }
2188    }
2189
2190    #[test]
2191    fn bilinear_identity_scale() {
2192        let src = make_ramp(32, 32);
2193        let out = bilinear_scale_plane_scalar(&src, 32, 32, 32, 32);
2194        assert_eq!(out, src);
2195    }
2196
2197    // -------- 10-bit (Squad-19) --------
2198
2199    fn make_10bit_frame_planar(w: usize, h: usize, y_val: u16, c_val: u16) -> VideoFrame {
2200        let y_samples = w * h;
2201        let c_samples = (w / 2) * (h / 2);
2202        let total = y_samples + 2 * c_samples;
2203        let mut buf = Vec::with_capacity(total * 2);
2204        for _ in 0..y_samples {
2205            buf.extend_from_slice(&y_val.to_le_bytes());
2206        }
2207        for _ in 0..(2 * c_samples) {
2208            buf.extend_from_slice(&c_val.to_le_bytes());
2209        }
2210        VideoFrame::new(
2211            bytes::Bytes::from(buf),
2212            w as u32,
2213            h as u32,
2214            PixelFormat::Yuv420p10le,
2215            ColorSpace::Bt2020,
2216            0,
2217        )
2218    }
2219
2220    #[test]
2221    fn convert_to_yuv420p_bt709_passthrough_10bit() {
2222        // The HDR-passthrough contract: a 10-bit `Yuv420p10le` frame
2223        // must come out of `convert_to_yuv420p_bt709` byte-identical
2224        // (no tonemap, no matrix conversion). The matrix conversion
2225        // is BT.601→BT.709 on 8-bit; for 10-bit we always passthrough
2226        // because the source could be HDR / wide-gamut and the matrix
2227        // shift would corrupt it.
2228        let frame = make_10bit_frame_planar(16, 16, 600, 512);
2229        let out = convert_to_yuv420p_bt709(&frame).expect("10-bit passthrough");
2230        assert_eq!(out.format, PixelFormat::Yuv420p10le);
2231        assert_eq!(out.width, 16);
2232        assert_eq!(out.height, 16);
2233        assert_eq!(out.data.len(), frame.data.len());
2234        assert_eq!(
2235            &out.data[..],
2236            &frame.data[..],
2237            "10-bit data must be byte-identical (no tonemap)"
2238        );
2239        assert_eq!(
2240            out.color_space,
2241            ColorSpace::Bt2020,
2242            "color space must not change"
2243        );
2244    }
2245
2246    #[test]
2247    fn scale_frame_10bit_constant_input_yields_constant_output() {
2248        let frame = make_10bit_frame_planar(64, 64, 600, 400);
2249        let out = scale_frame(&frame, 32, 32).expect("10-bit scale");
2250        assert_eq!(out.format, PixelFormat::Yuv420p10le);
2251        assert_eq!(out.width, 32);
2252        assert_eq!(out.height, 32);
2253
2254        // Decode the output planes back to u16 and assert constant.
2255        let y_samples = 32 * 32;
2256        let c_samples = 16 * 16;
2257        let y_bytes = y_samples * 2;
2258        let c_bytes = c_samples * 2;
2259        assert_eq!(out.data.len(), y_bytes + 2 * c_bytes);
2260
2261        let y = read_u16le(&out.data[..y_bytes]);
2262        let u = read_u16le(&out.data[y_bytes..y_bytes + c_bytes]);
2263        let v = read_u16le(&out.data[y_bytes + c_bytes..y_bytes + 2 * c_bytes]);
2264        for &s in &y {
2265            assert_eq!(s, 600, "luma must be constant after bilinear");
2266        }
2267        for &s in u.iter().chain(v.iter()) {
2268            assert_eq!(s, 400, "chroma must be constant after bilinear");
2269        }
2270    }
2271
2272    #[test]
2273    fn scale_frame_10bit_identity_yields_byte_identical() {
2274        let frame = make_10bit_frame_planar(32, 32, 768, 256);
2275        // identity scale (same dims) early-returns clone — verify
2276        let out = scale_frame(&frame, 32, 32).expect("identity");
2277        assert_eq!(&out.data[..], &frame.data[..]);
2278    }
2279
2280    #[test]
2281    fn bilinear_10bit_scalar_clamps_inside_10bit_range() {
2282        // Synthetic ramp in 10-bit range; verify output is bounded.
2283        let mut src = vec![0u16; 64 * 32];
2284        for (i, s) in src.iter_mut().enumerate() {
2285            *s = (i as u16) % 1024;
2286        }
2287        let out = bilinear_scale_plane_u16_scalar(&src, 64, 32, 128, 64);
2288        for &v in &out {
2289            assert!(v <= 1023, "10-bit sample {} exceeds 1023", v);
2290        }
2291    }
2292
2293    // -------- 10-bit AVX2 (Squad-29) --------
2294
2295    fn make_10bit_ramp(w: usize, h: usize) -> Vec<u16> {
2296        // Deterministic 10-bit ramp; cycles through 0..=1023.
2297        (0..w * h)
2298            .map(|i| ((i * 7 + i / w) % 1024) as u16)
2299            .collect()
2300    }
2301
2302    #[test]
2303    fn bilinear_10bit_scalar_vs_avx2_agree_2x_upscale() {
2304        // 2× upscale exercises every fractional weight in the source.
2305        let src_w = 64;
2306        let src_h = 32;
2307        let src = make_10bit_ramp(src_w, src_h);
2308        let dst_w = 128;
2309        let dst_h = 64;
2310
2311        let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2312        let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2313
2314        assert_eq!(scalar.len(), simd.len());
2315        let mut max_diff = 0i32;
2316        for i in 0..scalar.len() {
2317            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2318            if d > max_diff {
2319                max_diff = d;
2320            }
2321            assert!(
2322                d <= 1,
2323                "bilinear 10-bit mismatch at {}: scalar={} simd={}",
2324                i,
2325                scalar[i],
2326                simd[i]
2327            );
2328        }
2329    }
2330
2331    #[test]
2332    fn bilinear_10bit_scalar_vs_avx2_agree_downscale_1080p_to_720p() {
2333        // Headline case: 1920×1080 → 1280×720 luma plane. Same pattern
2334        // bench uses; gates the AVX2 main path (16-lane while loop runs
2335        // ~80 iters per row at dst_w=1280).
2336        let src_w = 1920;
2337        let src_h = 1080;
2338        let src = make_10bit_ramp(src_w, src_h);
2339        let dst_w = 1280;
2340        let dst_h = 720;
2341
2342        let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2343        let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2344
2345        for i in 0..scalar.len() {
2346            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2347            assert!(
2348                d <= 1,
2349                "bilinear 10-bit mismatch at {}: scalar={} simd={}",
2350                i,
2351                scalar[i],
2352                simd[i]
2353            );
2354        }
2355    }
2356
2357    #[test]
2358    fn bilinear_10bit_avx2_constant_input_yields_constant_output() {
2359        // Constant 600 (mid-luma in 10-bit limited range) should stay
2360        // exactly 600 through both axes of bilinear interp.
2361        let src = vec![600u16; 128 * 64];
2362        let out = bilinear_scale_plane_u16(&src, 128, 64, 256, 128);
2363        for &v in &out {
2364            assert_eq!(v, 600, "constant 10-bit input must yield constant output");
2365        }
2366    }
2367
2368    #[test]
2369    fn bilinear_10bit_avx2_max_value_clamped() {
2370        // Max-value (1023) input must stay clamped at 1023 — defensive
2371        // against the Q15 round trick pushing exactly-1023 to 1024.
2372        let src = vec![1023u16; 64 * 32];
2373        let out = bilinear_scale_plane_u16(&src, 64, 32, 128, 64);
2374        for &v in &out {
2375            assert!(v <= 1023, "10-bit AVX2 sample {} exceeds 1023", v);
2376            assert_eq!(v, 1023, "constant 1023 should stay 1023");
2377        }
2378    }
2379
2380    #[test]
2381    fn bilinear_10bit_narrow_width_falls_back_to_scalar() {
2382        // dst_w < 16 gates the AVX2 main path; dispatch should fall
2383        // back to scalar without panicking.
2384        let src_w = 8;
2385        let src_h = 8;
2386        let src = make_10bit_ramp(src_w, src_h);
2387        let dst_w = 4;
2388        let dst_h = 4;
2389
2390        let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2391        let dispatched = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2392
2393        assert_eq!(
2394            scalar, dispatched,
2395            "narrow strip should match scalar exactly"
2396        );
2397    }
2398
2399    #[test]
2400    fn bilinear_10bit_odd_dst_dims_handled() {
2401        // dst_w=17 forces a 1-sample tail (16 main + 1 tail).
2402        let src_w = 32;
2403        let src_h = 32;
2404        let src = make_10bit_ramp(src_w, src_h);
2405        let dst_w = 17;
2406        let dst_h = 9;
2407
2408        let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2409        let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2410        assert_eq!(scalar.len(), simd.len());
2411        for i in 0..scalar.len() {
2412            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2413            assert!(
2414                d <= 1,
2415                "tail mismatch at {}: scalar={} simd={}",
2416                i,
2417                scalar[i],
2418                simd[i]
2419            );
2420        }
2421    }
2422
2423    #[test]
2424    fn bilinear_10bit_tall_narrow_strip() {
2425        // 16×512 → 16×256 — main loop runs once per row (dst_w=16),
2426        // many rows.
2427        let src_w = 16;
2428        let src_h = 512;
2429        let src = make_10bit_ramp(src_w, src_h);
2430        let dst_w = 16;
2431        let dst_h = 256;
2432
2433        let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2434        let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2435        for i in 0..scalar.len() {
2436            let d = (scalar[i] as i32 - simd[i] as i32).abs();
2437            assert!(d <= 1, "tall strip mismatch at {}", i);
2438        }
2439    }
2440
2441    // -------- BT.601 → BT.709 10-bit (Squad-29) --------
2442
2443    fn synth_601_frame_10bit(w: usize, h: usize) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
2444        // Sweep limited 10-bit range [64, 940] for luma, [64, 960] for chroma.
2445        let mut y = vec![0u16; w * h];
2446        let mut cb = vec![0u16; (w / 2) * (h / 2)];
2447        let mut cr = vec![0u16; (w / 2) * (h / 2)];
2448        for i in 0..y.len() {
2449            y[i] = 64 + ((i as u32 * 17) % 877) as u16;
2450        }
2451        for i in 0..cb.len() {
2452            cb[i] = 64 + ((i as u32 * 13) % 897) as u16;
2453            cr[i] = 64 + ((i as u32 * 23) % 897) as u16;
2454        }
2455        (y, cb, cr)
2456    }
2457
2458    #[test]
2459    fn bt601_to_bt709_10bit_neutral_gray_roundtrips() {
2460        // Cb=Cr=512 (10-bit chroma center) — every gray luma round-trips.
2461        // 10-bit limited-range luma analogues of 16/64/128/200/235:
2462        //   16 << 2 = 64,  64 << 2 = 256,  128 << 2 = 512,
2463        //   200 << 2 = 800, 235 << 2 = 940.
2464        for &y_val in &[64u16, 256, 512, 800, 940] {
2465            let w = 32;
2466            let h = 16;
2467            let mut y = vec![y_val; w * h];
2468            let mut cb = vec![512u16; (w / 2) * (h / 2)];
2469            let mut cr = vec![512u16; (w / 2) * (h / 2)];
2470            bt601_to_bt709_planes_10bit_scalar(&mut y, &mut cb, &mut cr, w, h);
2471            for v in &y {
2472                assert_eq!(*v, y_val, "Y with neutral chroma must round-trip");
2473            }
2474            for v in &cb {
2475                assert_eq!(*v, 512);
2476            }
2477            for v in &cr {
2478                assert_eq!(*v, 512);
2479            }
2480        }
2481    }
2482
2483    #[test]
2484    fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_256x256() {
2485        // 256×256 → cw=128, multiple of 16 for chroma. Main AVX2 path
2486        // covers the entire plane.
2487        let w = 256;
2488        let h = 256;
2489        let (y0, cb0, cr0) = synth_601_frame_10bit(w, h);
2490
2491        let mut y_s = y0.clone();
2492        let mut cb_s = cb0.clone();
2493        let mut cr_s = cr0.clone();
2494        bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2495
2496        let mut y_v = y0.clone();
2497        let mut cb_v = cb0.clone();
2498        let mut cr_v = cr0.clone();
2499        bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2500
2501        for i in 0..y_s.len() {
2502            let d = (y_s[i] as i32 - y_v[i] as i32).abs();
2503            assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, y_s[i], y_v[i]);
2504        }
2505        for i in 0..cb_s.len() {
2506            assert!(
2507                (cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1,
2508                "Cb[{}] scalar={} avx2={}",
2509                i,
2510                cb_s[i],
2511                cb_v[i]
2512            );
2513            assert!(
2514                (cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1,
2515                "Cr[{}] scalar={} avx2={}",
2516                i,
2517                cr_s[i],
2518                cr_v[i]
2519            );
2520        }
2521    }
2522
2523    #[test]
2524    fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_tail() {
2525        // 34 wide forces a 1-sample chroma tail (cw=17, main covers 16,
2526        // tail covers 1).
2527        let w = 34;
2528        let h = 16;
2529        let (y0, cb0, cr0) = synth_601_frame_10bit(w, h);
2530
2531        let mut y_s = y0.clone();
2532        let mut cb_s = cb0.clone();
2533        let mut cr_s = cr0.clone();
2534        bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2535
2536        let mut y_v = y0.clone();
2537        let mut cb_v = cb0.clone();
2538        let mut cr_v = cr0.clone();
2539        bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2540
2541        for i in 0..y_s.len() {
2542            assert!(
2543                (y_s[i] as i32 - y_v[i] as i32).abs() <= 1,
2544                "Y[{}] scalar={} avx2={}",
2545                i,
2546                y_s[i],
2547                y_v[i]
2548            );
2549        }
2550        for i in 0..cb_s.len() {
2551            assert!((cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1);
2552            assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
2553        }
2554    }
2555
2556    #[test]
2557    fn bt601_to_bt709_10bit_clamps_ranges() {
2558        // After conversion, luma stays in [64, 940] and chroma in [64, 960].
2559        let w = 32;
2560        let h = 16;
2561        let (mut y, mut cb, mut cr) = synth_601_frame_10bit(w, h);
2562        bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
2563        for &v in cb.iter().chain(cr.iter()) {
2564            assert!(
2565                (64..=960).contains(&v),
2566                "chroma {} out of 10-bit limited range",
2567                v
2568            );
2569        }
2570        for &v in y.iter() {
2571            assert!(
2572                (64..=940).contains(&v),
2573                "luma {} out of 10-bit limited range",
2574                v
2575            );
2576        }
2577    }
2578
2579    #[test]
2580    fn bt601_to_bt709_10bit_extreme_chroma_clamped_at_high_end() {
2581        // Chroma at the limited-range max should produce in-range output.
2582        let w = 32;
2583        let h = 16;
2584        let mut y = vec![940u16; w * h];
2585        let mut cb = vec![960u16; (w / 2) * (h / 2)];
2586        let mut cr = vec![960u16; (w / 2) * (h / 2)];
2587        bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
2588        for &v in y.iter() {
2589            assert!(v <= 940, "luma {} > 940 (clamp violated)", v);
2590        }
2591        for &v in cb.iter().chain(cr.iter()) {
2592            assert!(v <= 960, "chroma {} > 960 (clamp violated)", v);
2593        }
2594    }
2595
2596    // -------- 4:4:4 → 4:2:0 chroma downsample (Squad-31, roadmap #6) --------
2597
2598    #[test]
2599    fn downsample_4x4_box_average_8bit_hand_verified() {
2600        // 4×4 chroma plane (16 samples) → 2×2 output. Hand-compute the
2601        // 4 averages so the test is its own oracle.
2602        //
2603        //   Cb = [ 10  20 |  30  40
2604        //          50  60 |  70  80
2605        //          ---------+--------
2606        //          90 100 | 110 120
2607        //         130 140 | 150 160 ]
2608        //
2609        // Block (0,0): (10+20+50+60+2)>>2 = 142>>2 = 35
2610        // Block (1,0): (30+40+70+80+2)>>2 = 222>>2 = 55
2611        // Block (0,1): (90+100+130+140+2)>>2 = 462>>2 = 115
2612        // Block (1,1): (110+120+150+160+2)>>2 = 542>>2 = 135
2613        let cb: Vec<u8> = vec![
2614            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
2615        ];
2616        // Cr distinct so we know the per-plane logic is independent.
2617        let cr: Vec<u8> = vec![
2618            5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105, 115, 125, 135, 145, 155,
2619        ];
2620        // Y plane is unchanged — pick a recognizable ramp so we can verify
2621        // the copy is verbatim.
2622        let y: Vec<u8> = (0..16).map(|i| i as u8 * 8).collect();
2623
2624        let out = downsample_chroma_444_to_420(&y, &cb, &cr, 4, 4);
2625        // Expected layout: 16 Y bytes || 4 Cb bytes || 4 Cr bytes.
2626        assert_eq!(out.len(), 16 + 4 + 4);
2627        assert_eq!(&out[..16], y.as_slice(), "Y must round-trip verbatim");
2628        // Cb output (4 samples in row-major 2×2 order)
2629        assert_eq!(out[16], 35, "Cb block (0,0)");
2630        assert_eq!(out[17], 55, "Cb block (1,0)");
2631        assert_eq!(out[18], 115, "Cb block (0,1)");
2632        assert_eq!(out[19], 135, "Cb block (1,1)");
2633        // Cr output
2634        assert_eq!(out[20], 30, "Cr block (0,0): (5+15+45+55+2)>>2 = 30");
2635        assert_eq!(out[21], 50, "Cr block (1,0): (25+35+65+75+2)>>2 = 50");
2636        assert_eq!(out[22], 110, "Cr block (0,1): (85+95+125+135+2)>>2 = 110");
2637        assert_eq!(out[23], 130, "Cr block (1,1): (105+115+145+155+2)>>2 = 130");
2638    }
2639
2640    #[test]
2641    fn downsample_constant_input_8bit_yields_constant_output() {
2642        // Cb=128 (chroma midpoint) — average of four 128s with rounding
2643        // is still 128. Round-trip identity for any constant.
2644        let w = 16;
2645        let h = 16;
2646        let y = vec![64u8; w * h];
2647        let cb = vec![128u8; w * h];
2648        let cr = vec![128u8; w * h];
2649        let out = downsample_chroma_444_to_420(&y, &cb, &cr, w, h);
2650        let cw = (w + 1) / 2;
2651        let ch = (h + 1) / 2;
2652        assert_eq!(out.len(), w * h + 2 * cw * ch);
2653        // Y unchanged.
2654        for i in 0..w * h {
2655            assert_eq!(out[i], 64, "Y[{}] should be 64", i);
2656        }
2657        // Cb / Cr: each output sample == 128.
2658        for i in (w * h)..(w * h + 2 * cw * ch) {
2659            assert_eq!(out[i], 128, "chroma[{}] should be 128", i - w * h);
2660        }
2661    }
2662
2663    #[test]
2664    fn downsample_odd_dimensions_clamp_policy() {
2665        // 7×7 input → 4×4 output. The rightmost column of 2×2 blocks
2666        // (cx=3) and bottom row (cy=3) straddle exactly one source row /
2667        // column; clamp policy reuses the in-bounds neighbour.
2668        //
2669        //   plane[cx=3, cy=0] takes samples (6, 0), (6, 0), (6, 1), (6, 1)
2670        //     because x1 = min(7, w-1=6) = 6 — both x0 and x1 = 6.
2671        //   So the corner sample reduces to a 1-sample average:
2672        //     (s + s + s' + s' + 2) >> 2 = (s + s')/2 with rounding.
2673        //
2674        // Easiest verification: constant-fill input → constant output
2675        // even at the odd boundary.
2676        let w = 7;
2677        let h = 7;
2678        let y = vec![100u8; w * h];
2679        let cb = vec![128u8; w * h];
2680        let cr = vec![64u8; w * h];
2681        let out = downsample_chroma_444_to_420(&y, &cb, &cr, w, h);
2682        let cw = (w + 1) / 2; // 4
2683        let ch = (h + 1) / 2; // 4
2684        assert_eq!(cw, 4);
2685        assert_eq!(ch, 4);
2686        assert_eq!(out.len(), w * h + 2 * cw * ch);
2687        // Y verbatim.
2688        for i in 0..w * h {
2689            assert_eq!(out[i], 100);
2690        }
2691        // Cb constant 128.
2692        for cx in 0..cw {
2693            for cy in 0..ch {
2694                let idx = w * h + cy * cw + cx;
2695                assert_eq!(out[idx], 128, "Cb[{},{}] expected 128", cx, cy);
2696            }
2697        }
2698        // Cr constant 64.
2699        for cx in 0..cw {
2700            for cy in 0..ch {
2701                let idx = w * h + cw * ch + cy * cw + cx;
2702                assert_eq!(out[idx], 64, "Cr[{},{}] expected 64", cx, cy);
2703            }
2704        }
2705    }
2706
2707    #[test]
2708    fn downsample_10bit_constant_input_yields_constant_output() {
2709        // Cb=512 (10-bit midpoint = 1024/2). Identity for constant input.
2710        let w = 16;
2711        let h = 16;
2712        let y = vec![400u16; w * h];
2713        let cb = vec![512u16; w * h];
2714        let cr = vec![512u16; w * h];
2715        let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
2716        let cw = (w + 1) / 2;
2717        let ch = (h + 1) / 2;
2718        assert_eq!(out.len(), 2 * (w * h + 2 * cw * ch), "10-bit byte count");
2719
2720        // Verify each u16 LE sample. Y plane.
2721        for i in 0..w * h {
2722            let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2723            assert_eq!(s, 400, "Y[{}] should be 400", i);
2724        }
2725        // Cb plane.
2726        let cb_byte_off = w * h * 2;
2727        for i in 0..cw * ch {
2728            let s = u16::from_le_bytes([out[cb_byte_off + i * 2], out[cb_byte_off + i * 2 + 1]]);
2729            assert_eq!(s, 512, "Cb[{}] should be 512", i);
2730        }
2731        // Cr plane.
2732        let cr_byte_off = cb_byte_off + cw * ch * 2;
2733        for i in 0..cw * ch {
2734            let s = u16::from_le_bytes([out[cr_byte_off + i * 2], out[cr_byte_off + i * 2 + 1]]);
2735            assert_eq!(s, 512, "Cr[{}] should be 512", i);
2736        }
2737    }
2738
2739    #[test]
2740    fn downsample_10bit_max_value_no_overflow() {
2741        // 4 × 1023 + 2 = 4094 fits in u16 (max 65535) and even in i16
2742        // (max 32767). The u32 accumulator gives plenty of headroom.
2743        // Verify a full-1023 input doesn't wrap to 0.
2744        let w = 4;
2745        let h = 4;
2746        let y = vec![1023u16; w * h];
2747        let cb = vec![1023u16; w * h];
2748        let cr = vec![1023u16; w * h];
2749        let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
2750        let cw = (w + 1) / 2;
2751        let ch = (h + 1) / 2;
2752
2753        // Y verbatim (1023).
2754        for i in 0..w * h {
2755            let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2756            assert_eq!(s, 1023, "Y[{}]", i);
2757        }
2758        // Cb / Cr: (1023 + 1023 + 1023 + 1023 + 2) >> 2 = 4094 >> 2 = 1023.
2759        let cb_byte_off = w * h * 2;
2760        for i in 0..2 * cw * ch {
2761            let s = u16::from_le_bytes([out[cb_byte_off + i * 2], out[cb_byte_off + i * 2 + 1]]);
2762            assert_eq!(s, 1023, "chroma[{}] should be 1023 (no overflow)", i);
2763        }
2764    }
2765
2766    #[test]
2767    fn downsample_10bit_4x4_box_average_hand_verified() {
2768        // Same 4×4 hand-verified case as 8-bit but in 10-bit.
2769        let cb_u: Vec<u16> = vec![
2770            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
2771        ];
2772        let cr_u: Vec<u16> = vec![
2773            500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800,
2774        ];
2775        let y_u: Vec<u16> = (0..16).map(|i| i as u16 * 50).collect();
2776
2777        let out = downsample_chroma_444_to_420_10bit(&y_u, &cb_u, &cr_u, 4, 4);
2778        // Y bytes: 16 × 2 = 32. Then 4 Cb (8 bytes) + 4 Cr (8 bytes) = 48.
2779        assert_eq!(out.len(), 32 + 8 + 8);
2780
2781        // Y round-trip.
2782        for i in 0..16 {
2783            let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2784            assert_eq!(s, i as u16 * 50, "Y[{}]", i);
2785        }
2786        // Cb expected (35, 55, 115, 135) — same as 8-bit case.
2787        let cb_off = 32;
2788        let cb0 = u16::from_le_bytes([out[cb_off], out[cb_off + 1]]);
2789        let cb1 = u16::from_le_bytes([out[cb_off + 2], out[cb_off + 3]]);
2790        let cb2 = u16::from_le_bytes([out[cb_off + 4], out[cb_off + 5]]);
2791        let cb3 = u16::from_le_bytes([out[cb_off + 6], out[cb_off + 7]]);
2792        assert_eq!(cb0, 35);
2793        assert_eq!(cb1, 55);
2794        assert_eq!(cb2, 115);
2795        assert_eq!(cb3, 135);
2796        // Cr: rows are identical, so each 2×2 block average == row average.
2797        // (500+600+500+600+2)>>2 = 2202>>2 = 550
2798        // (700+800+700+800+2)>>2 = 3002>>2 = 750
2799        let cr_off = cb_off + 8;
2800        let cr0 = u16::from_le_bytes([out[cr_off], out[cr_off + 1]]);
2801        let cr1 = u16::from_le_bytes([out[cr_off + 2], out[cr_off + 3]]);
2802        assert_eq!(cr0, 550);
2803        assert_eq!(cr1, 750);
2804    }
2805
2806    #[test]
2807    fn downsample_frame_yuv444p10le_to_yuv420p10le() {
2808        // High-level frame wrapper. Constant 4:4:4 10-bit → constant
2809        // 4:2:0 10-bit, dims preserved, format flipped.
2810        let w = 16;
2811        let h = 16;
2812        let plane = w * h;
2813        let mut buf = Vec::with_capacity(3 * plane * 2);
2814        for _ in 0..plane {
2815            buf.extend_from_slice(&500u16.to_le_bytes()); // Y
2816        }
2817        for _ in 0..plane {
2818            buf.extend_from_slice(&512u16.to_le_bytes()); // Cb
2819        }
2820        for _ in 0..plane {
2821            buf.extend_from_slice(&512u16.to_le_bytes()); // Cr
2822        }
2823        let frame = VideoFrame::new(
2824            bytes::Bytes::from(buf),
2825            w as u32,
2826            h as u32,
2827            PixelFormat::Yuv444p10le,
2828            ColorSpace::Bt2020,
2829            42,
2830        );
2831        let out = downsample_444_to_420_frame(&frame).expect("downsample");
2832        assert_eq!(out.format, PixelFormat::Yuv420p10le);
2833        assert_eq!(out.width, w as u32);
2834        assert_eq!(out.height, h as u32);
2835        assert_eq!(out.pts, 42, "PTS preserved");
2836        assert_eq!(out.color_space, ColorSpace::Bt2020, "color_space preserved");
2837
2838        // Spot-check the output samples.
2839        let cw = w / 2;
2840        let ch = h / 2;
2841        let expected_bytes = 2 * (w * h + 2 * cw * ch);
2842        assert_eq!(out.data.len(), expected_bytes);
2843
2844        // First Y sample = 500.
2845        let y0 = u16::from_le_bytes([out.data[0], out.data[1]]);
2846        assert_eq!(y0, 500);
2847        // First Cb sample (after Y plane) = 512.
2848        let cb0 = u16::from_le_bytes([out.data[w * h * 2], out.data[w * h * 2 + 1]]);
2849        assert_eq!(cb0, 512);
2850    }
2851
2852    #[test]
2853    fn downsample_frame_yuva444p10le_drops_alpha() {
2854        // 4-plane source, alpha is 16-bit precision. Output is plain
2855        // Yuv420p10le (no alpha plane).
2856        let w = 8;
2857        let h = 8;
2858        let plane = w * h;
2859        let mut buf = Vec::with_capacity(4 * plane * 2);
2860        for _ in 0..plane {
2861            buf.extend_from_slice(&600u16.to_le_bytes());
2862        }
2863        for _ in 0..plane {
2864            buf.extend_from_slice(&500u16.to_le_bytes());
2865        }
2866        for _ in 0..plane {
2867            buf.extend_from_slice(&500u16.to_le_bytes());
2868        }
2869        for _ in 0..plane {
2870            // Alpha — 16-bit, would have value 65535 if it survived.
2871            buf.extend_from_slice(&65535u16.to_le_bytes());
2872        }
2873        let frame = VideoFrame::new(
2874            bytes::Bytes::from(buf),
2875            w as u32,
2876            h as u32,
2877            PixelFormat::Yuva444p10le,
2878            ColorSpace::Bt2020,
2879            7,
2880        );
2881        let out = downsample_444_to_420_frame(&frame).expect("downsample with alpha");
2882        assert_eq!(out.format, PixelFormat::Yuv420p10le);
2883        // Output byte count: only Y/Cb/Cr — NO alpha plane.
2884        let cw = w / 2;
2885        let ch = h / 2;
2886        let expected = 2 * (w * h + 2 * cw * ch);
2887        assert_eq!(out.data.len(), expected);
2888        // Verify alpha wasn't smuggled in (no 65535 samples).
2889        for i in (0..out.data.len()).step_by(2) {
2890            let s = u16::from_le_bytes([out.data[i], out.data[i + 1]]);
2891            assert!(
2892                s < 1024 || s == 65535 && false,
2893                "stray alpha sample {} at {}",
2894                s,
2895                i
2896            );
2897            assert_ne!(s, 65535, "alpha plane leaked into output");
2898        }
2899    }
2900
2901    #[test]
2902    fn downsample_frame_rejects_non_444() {
2903        // 4:2:0 input must error — the frame is already in target format.
2904        let w = 16;
2905        let h = 16;
2906        let plane = w * h;
2907        let mut buf = Vec::with_capacity(plane + 2 * (plane / 4));
2908        buf.resize(plane + 2 * (plane / 4), 128);
2909        let frame = VideoFrame::new(
2910            bytes::Bytes::from(buf),
2911            w as u32,
2912            h as u32,
2913            PixelFormat::Yuv420p,
2914            ColorSpace::Bt709,
2915            0,
2916        );
2917        let err = downsample_444_to_420_frame(&frame).unwrap_err();
2918        assert!(format!("{}", err).contains("expected 4:4:4 input"));
2919    }
2920}