1use anyhow::{Result, bail};
2use bytes::{Bytes, BytesMut};
3
4use crate::frame::{ColorMetadata, ColorSpace, PixelFormat, TransferFn, VideoFrame};
5use crate::tonemap::tonemap_yuv420p10le_bt2020_to_yuv420p_bt709;
6
7pub fn convert_to_sdr_bt709(
50 frame: &VideoFrame,
51 color_metadata: &ColorMetadata,
52) -> Result<VideoFrame> {
53 let is_hdr_transfer = matches!(
54 color_metadata.transfer,
55 TransferFn::St2084 | TransferFn::AribStdB67
56 );
57 if is_hdr_transfer && matches!(frame.format, PixelFormat::Yuv420p10le) {
58 let max_white_nits = color_metadata
59 .mastering_display
60 .as_ref()
61 .map(|m| (m.max_luminance as f32) / 10_000.0)
64 .filter(|n| *n > 0.0);
65 return tonemap_yuv420p10le_bt2020_to_yuv420p_bt709(
66 frame,
67 color_metadata.transfer,
68 max_white_nits,
69 );
70 }
71 convert_to_yuv420p_bt709(frame)
78}
79
80pub fn convert_to_yuv420p_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
81 use PixelFormat::*;
82
83 match frame.format {
90 Yuv420p10le => return Ok(frame.clone()),
91 Yuv422p10le => return yuv422p10le_to_yuv420p10le(frame),
92 Yuv444p10le | Yuva444p10le => return downsample_444_to_420_frame(frame),
93 Yuv420p12le => bail!(
94 "Yuv420p12le not yet supported in convert_to_yuv420p_bt709 \
95 (no decoder in tree emits 12-bit; add a 12→10-bit dither \
96 when a decoder lands that does)"
97 ),
98 _ => {}
99 }
100
101 match frame.format {
103 Rgb24 => return rgb_to_yuv420p_bt709(frame, false),
104 Rgba32 => return rgb_to_yuv420p_bt709(frame, true),
105 _ => {}
106 }
107
108 let yuv420p = match frame.format {
110 Yuv420p => frame.clone(),
111 Nv12 => nv12_to_yuv420p(frame)?,
112 Nv21 => nv21_to_yuv420p(frame)?,
113 Yuv422p => yuv422p_to_yuv420p(frame)?,
114 Yuv444p => downsample_444_to_420_frame(frame)?,
115 other => bail!(
116 "unsupported conversion: {:?}/{:?} → Yuv420p/Bt709",
117 other,
118 frame.color_space
119 ),
120 };
121
122 if yuv420p.color_space == ColorSpace::Bt709 {
124 Ok(yuv420p)
125 } else {
126 recolor_yuv420p_bt601_to_bt709(&yuv420p)
133 }
134}
135
/// Converts an NV12 frame to planar `Yuv420p`.
///
/// Delegates to [`deinterleave_semiplanar_to_yuv420p`] with `v_first = false`,
/// i.e. the first byte of every interleaved chroma pair is treated as U.
fn nv12_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
    deinterleave_semiplanar_to_yuv420p(frame, false)
}
139
/// Converts an NV21 frame to planar `Yuv420p`.
///
/// Delegates to [`deinterleave_semiplanar_to_yuv420p`] with `v_first = true`,
/// i.e. the first byte of every interleaved chroma pair is treated as V.
fn nv21_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
    deinterleave_semiplanar_to_yuv420p(frame, true)
}
146
147fn deinterleave_semiplanar_to_yuv420p(frame: &VideoFrame, v_first: bool) -> Result<VideoFrame> {
148 let w = frame.width as usize;
149 let h = frame.height as usize;
150 let y_size = w * h;
151 let uv_size = y_size / 4;
152 if frame.data.len() < y_size + 2 * uv_size {
153 bail!(
154 "{} frame too small for {}x{}: need {} bytes got {}",
155 if v_first { "NV21" } else { "NV12" },
156 w,
157 h,
158 y_size + 2 * uv_size,
159 frame.data.len()
160 );
161 }
162 let mut out = BytesMut::with_capacity(y_size + uv_size * 2);
163
164 out.extend_from_slice(&frame.data[..y_size]);
166
167 let uv = &frame.data[y_size..];
169 let mut u_plane = Vec::with_capacity(uv_size);
170 let mut v_plane = Vec::with_capacity(uv_size);
171 for i in 0..uv_size {
172 let (a, b) = (uv[i * 2], uv[i * 2 + 1]);
173 if v_first {
174 v_plane.push(a);
175 u_plane.push(b);
176 } else {
177 u_plane.push(a);
178 v_plane.push(b);
179 }
180 }
181 out.extend_from_slice(&u_plane);
182 out.extend_from_slice(&v_plane);
183
184 Ok(VideoFrame::new(
185 out.freeze(),
186 frame.width,
187 frame.height,
188 PixelFormat::Yuv420p,
189 frame.color_space,
190 frame.pts,
191 ))
192}
193
194fn yuv422p_to_yuv420p(frame: &VideoFrame) -> Result<VideoFrame> {
198 let w = frame.width as usize;
199 let h = frame.height as usize;
200 let cw = w.div_ceil(2);
201 let ch_in = h;
203 let ch_out = h.div_ceil(2);
204 let y_size = w * h;
205 let chroma_in_size = cw * ch_in;
206 let chroma_out_size = cw * ch_out;
207 if frame.data.len() < y_size + 2 * chroma_in_size {
208 bail!(
209 "Yuv422p frame too small for {}x{}: need {} bytes got {}",
210 w,
211 h,
212 y_size + 2 * chroma_in_size,
213 frame.data.len()
214 );
215 }
216
217 let (y_in, rest) = frame.data.split_at(y_size);
218 let (cb_in, cr_in) = rest.split_at(chroma_in_size);
219
220 let mut out = BytesMut::with_capacity(y_size + 2 * chroma_out_size);
221 out.extend_from_slice(y_in);
222
223 for plane in [cb_in, cr_in] {
224 for cy in 0..ch_out {
225 let y0 = 2 * cy;
226 let y1 = (y0 + 1).min(ch_in - 1);
227 for cx in 0..cw {
228 let s0 = plane[y0 * cw + cx] as u16;
229 let s1 = plane[y1 * cw + cx] as u16;
230 out.extend_from_slice(&[((s0 + s1 + 1) >> 1) as u8]);
231 }
232 }
233 }
234
235 Ok(VideoFrame::new(
236 out.freeze(),
237 frame.width,
238 frame.height,
239 PixelFormat::Yuv420p,
240 frame.color_space,
241 frame.pts,
242 ))
243}
244
245fn yuv422p10le_to_yuv420p10le(frame: &VideoFrame) -> Result<VideoFrame> {
247 let w = frame.width as usize;
248 let h = frame.height as usize;
249 let cw = w.div_ceil(2);
250 let ch_in = h;
251 let ch_out = h.div_ceil(2);
252 let y_samples = w * h;
253 let chroma_in_samples = cw * ch_in;
254 let chroma_out_samples = cw * ch_out;
255 let need_bytes = (y_samples + 2 * chroma_in_samples) * 2;
256 if frame.data.len() < need_bytes {
257 bail!(
258 "Yuv422p10le frame too small for {}x{}: need {} bytes got {}",
259 w,
260 h,
261 need_bytes,
262 frame.data.len()
263 );
264 }
265 let words = read_u16le(&frame.data[..need_bytes]);
266 let (y_in, rest) = words.split_at(y_samples);
267 let (cb_in, cr_in) = rest.split_at(chroma_in_samples);
268
269 let mut out = BytesMut::with_capacity((y_samples + 2 * chroma_out_samples) * 2);
270 write_u16le(&mut out, y_in);
271
272 for plane in [cb_in, cr_in] {
273 for cy in 0..ch_out {
274 let y0 = 2 * cy;
275 let y1 = (y0 + 1).min(ch_in - 1);
276 for cx in 0..cw {
277 let s0 = plane[y0 * cw + cx] as u32;
278 let s1 = plane[y1 * cw + cx] as u32;
279 let avg = ((s0 + s1 + 1) >> 1) as u16;
280 out.extend_from_slice(&avg.to_le_bytes());
281 }
282 }
283 }
284
285 Ok(VideoFrame::new(
286 out.freeze(),
287 frame.width,
288 frame.height,
289 PixelFormat::Yuv420p10le,
290 frame.color_space,
291 frame.pts,
292 ))
293}
294
295fn rgb_to_yuv420p_bt709(frame: &VideoFrame, has_alpha: bool) -> Result<VideoFrame> {
311 let w = frame.width as usize;
312 let h = frame.height as usize;
313 let stride = if has_alpha { 4 } else { 3 };
314 let need = w * h * stride;
315 if frame.data.len() < need {
316 bail!(
317 "{} frame too small for {}x{}: need {} bytes got {}",
318 if has_alpha { "Rgba32" } else { "Rgb24" },
319 w,
320 h,
321 need,
322 frame.data.len()
323 );
324 }
325 let cw = w.div_ceil(2);
326 let ch = h.div_ceil(2);
327 let y_size = w * h;
328 let chroma_size = cw * ch;
329 let mut out = BytesMut::with_capacity(y_size + 2 * chroma_size);
330 out.resize(y_size + 2 * chroma_size, 0);
331
332 const Y_R: i32 = 5982;
340 const Y_G: i32 = 20128;
341 const Y_B: i32 = 2032;
342 const CB_R: i32 = -3299;
349 const CB_G: i32 = -11086;
350 const CB_B: i32 = 14385;
351 const CR_R: i32 = 14385;
355 const CR_G: i32 = -13066;
356 const CR_B: i32 = -1319;
357
358 for y in 0..h {
360 for x in 0..w {
361 let off = (y * w + x) * stride;
362 let r = frame.data[off] as i32;
363 let g = frame.data[off + 1] as i32;
364 let b = frame.data[off + 2] as i32;
365 let y_val = ((r * Y_R + g * Y_G + b * Y_B + (1 << 14)) >> 15) + 16;
366 out[y * w + x] = y_val.clamp(16, 235) as u8;
367 }
368 }
369
370 let cb_off = y_size;
373 let cr_off = y_size + chroma_size;
374 for cy in 0..ch {
375 let y0 = 2 * cy;
376 let y1 = (y0 + 1).min(h - 1);
377 for cx in 0..cw {
378 let x0 = 2 * cx;
379 let x1 = (x0 + 1).min(w - 1);
380 let mut r_sum = 0i32;
382 let mut g_sum = 0i32;
383 let mut b_sum = 0i32;
384 for &(py, px) in &[(y0, x0), (y0, x1), (y1, x0), (y1, x1)] {
385 let off = (py * w + px) * stride;
386 r_sum += frame.data[off] as i32;
387 g_sum += frame.data[off + 1] as i32;
388 b_sum += frame.data[off + 2] as i32;
389 }
390 let r = (r_sum + 2) >> 2;
391 let g = (g_sum + 2) >> 2;
392 let b = (b_sum + 2) >> 2;
393 let cb = ((r * CB_R + g * CB_G + b * CB_B + (1 << 14)) >> 15) + 128;
394 let cr = ((r * CR_R + g * CR_G + b * CR_B + (1 << 14)) >> 15) + 128;
395 out[cb_off + cy * cw + cx] = cb.clamp(16, 240) as u8;
396 out[cr_off + cy * cw + cx] = cr.clamp(16, 240) as u8;
397 }
398 }
399
400 Ok(VideoFrame::new(
401 out.freeze(),
402 frame.width,
403 frame.height,
404 PixelFormat::Yuv420p,
405 ColorSpace::Bt709,
406 frame.pts,
407 ))
408}
409
410const Q15: i32 = 15;
451const Q15_ROUND: i32 = 1 << (Q15 - 1);
452
453#[allow(dead_code)] const M_Y_Y: i32 = 32768;
457const M_Y_CB: i32 = (-0.11554975_f64 * 32768.0) as i32; const M_Y_CR: i32 = (-0.20793764_f64 * 32768.0) as i32; const M_CB_CB: i32 = (1.01863972_f64 * 32768.0).round() as i32; const M_CB_CR: i32 = (0.11461795_f64 * 32768.0).round() as i32; const M_CR_CB: i32 = (0.07504945_f64 * 32768.0).round() as i32; const M_CR_CR: i32 = (1.02532707_f64 * 32768.0).round() as i32; #[inline(always)]
467fn clamp_y(v: i32) -> u8 {
468 v.clamp(16, 235) as u8
469}
470
/// Clamps a chroma value to `16..=240` and narrows it to `u8`.
#[inline(always)]
fn clamp_c(v: i32) -> u8 {
    v.max(16).min(240) as u8
}
475
476fn bt601_to_bt709_scalar(y: &mut [u8], cb: &mut [u8], cr: &mut [u8], width: usize, height: usize) {
490 debug_assert_eq!(y.len(), width * height);
491 debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
492 debug_assert_eq!(cr.len(), (width / 2) * (height / 2));
493
494 let cw = width / 2;
495
496 for yi in 0..height {
499 let cy = yi >> 1;
500 for xi in 0..width {
501 let cx = xi >> 1;
502 let cbl = cb[cy * cw + cx] as i32 - 128;
503 let crl = cr[cy * cw + cx] as i32 - 128;
504 let y_orig = y[yi * width + xi] as i32;
505 let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
506 y[yi * width + xi] = clamp_y(y_orig + delta);
507 }
508 }
509
510 for v in cb.iter_mut().zip(cr.iter_mut()) {
512 let (cbp, crp) = v;
513 let cbl = *cbp as i32 - 128;
514 let crl = *crp as i32 - 128;
515 let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
516 let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
517 *cbp = clamp_c(new_cb + 128);
518 *crp = clamp_c(new_cr + 128);
519 }
520}
521
/// Public entry point that always runs the scalar 8-bit BT.601 → BT.709
/// conversion, regardless of CPU features.
///
/// Forwards all arguments unchanged to [`bt601_to_bt709_scalar`]; the planes
/// are modified in place.
pub fn bt601_to_bt709_planes_scalar(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    bt601_to_bt709_scalar(y, cb, cr, width, height);
}
532
533pub fn bt601_to_bt709_planes(
537 y: &mut [u8],
538 cb: &mut [u8],
539 cr: &mut [u8],
540 width: usize,
541 height: usize,
542) {
543 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
544 {
545 if std::is_x86_feature_detected!("avx2") {
546 unsafe {
548 bt601_to_bt709_avx2(y, cb, cr, width, height);
549 }
550 return;
551 }
552 }
553 bt601_to_bt709_scalar(y, cb, cr, width, height);
554}
555
/// AVX2 implementation of the in-place 8-bit BT.601 → BT.709 conversion.
///
/// Works in two phases: luma is corrected first (two luma rows per chroma
/// row, 16 chroma samples / 32 luma samples per vector step), then both
/// chroma planes are transformed. Columns and samples that do not fill a
/// whole vector are handled by scalar tail loops using the same formulas as
/// the scalar kernel.
///
/// The vector paths round each coefficient product separately
/// (`_mm256_mulhrs_epi16`) whereas the scalar formulas round the summed
/// products once, so vector and scalar results are not guaranteed to be
/// bit-identical.
///
/// # Safety
/// * The CPU must support AVX2.
/// * The vector loops access memory through raw pointers without bounds
///   checks: `y` must hold at least `width * height` samples, `cb` and `cr`
///   at least `(width / 2) * (height / 2)` samples, and `cr` must be at
///   least as long as `cb`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_avx2(
    y: &mut [u8],
    cb: &mut [u8],
    cr: &mut [u8],
    width: usize,
    height: usize,
) {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let cw = width / 2;
        let ch = height / 2;

        // Broadcast coefficients. The two diagonal chroma terms are stored
        // as (coefficient - 1.0) so they fit in i16; the missing 1.0 * input
        // is added back explicitly in the chroma phase.
        let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16);
        let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16);
        let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16);
        let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16);
        let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16);
        let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16);
        let v_128 = _mm256_set1_epi16(128);
        let v_chroma_lo = _mm256_set1_epi16(16);
        let v_chroma_hi = _mm256_set1_epi16(240);
        let v_luma_lo = _mm256_set1_epi16(16);
        let v_luma_hi = _mm256_set1_epi16(235);

        // Phase 1: luma. Chroma is only read here, so every luma sample sees
        // the original chroma values.
        for cy_idx in 0..ch {
            let y_row0 = cy_idx * 2 * width;
            let y_row1 = y_row0 + width;
            let c_row = cy_idx * cw;

            let mut cx = 0usize;
            while cx + 16 <= cw {
                // Load 16 chroma bytes, widen to i16 and centre on zero.
                let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(c_row + cx) as *const _);
                let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(c_row + cx) as *const _);
                let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
                let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
                let cbl = _mm256_sub_epi16(cb_i16, v_128);
                let crl = _mm256_sub_epi16(cr_i16, v_128);

                // Per-chroma-sample luma delta (rounded Q15 products, summed).
                let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
                let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
                let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);
                // Only the first 16 lanes of this buffer are written and read.
                let mut dy_luma = [0i16; 32];
                _mm256_storeu_si256(dy_luma.as_mut_ptr().add(0) as *mut _, dy_chroma);
                // Duplicate every delta so one chroma sample covers two
                // horizontally adjacent luma samples.
                let mut dy_luma_pair = [0i16; 32];
                for i in 0..16 {
                    dy_luma_pair[i * 2] = dy_luma[i];
                    dy_luma_pair[i * 2 + 1] = dy_luma[i];
                }
                let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(0) as *const _);
                let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);

                // The same deltas apply to both luma rows of this chroma row.
                for row_off in [y_row0, y_row1] {
                    let y_u8 = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
                    let y_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y_u8));
                    let y_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(y_u8));

                    let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
                    let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);

                    let y_lo_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
                    let y_hi_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);

                    // `packus` packs within each 128-bit lane, leaving the
                    // 64-bit quarters ordered lo0, hi0, lo1, hi1; the permute
                    // restores lo0, lo1, hi0, hi1.
                    let packed = _mm256_packus_epi16(y_lo_out, y_hi_out);
                    let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed);
                    _mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, packed);
                }

                cx += 16;
            }

            // Scalar tail for the chroma columns that do not fill a vector.
            while cx < cw {
                let cb_idx = c_row + cx;
                let cbl = cb[cb_idx] as i32 - 128;
                let crl = cr[cb_idx] as i32 - 128;
                let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
                let xi = cx * 2;
                for row_off in [y_row0, y_row1] {
                    for sub in 0..2 {
                        let idx = row_off + xi + sub;
                        y[idx] = clamp_y(y[idx] as i32 + delta);
                    }
                }
                cx += 1;
            }
        }

        // Phase 2: chroma, processed as one flat run of `cb.len()` samples.
        let total_c = cb.len();
        let mut i = 0usize;
        while i + 16 <= total_c {
            let cb_u8 = _mm_loadu_si128(cb.as_ptr().add(i) as *const _);
            let cr_u8 = _mm_loadu_si128(cr.as_ptr().add(i) as *const _);
            let cb_i16 = _mm256_cvtepu8_epi16(cb_u8);
            let cr_i16 = _mm256_cvtepu8_epi16(cr_u8);
            let cbl = _mm256_sub_epi16(cb_i16, v_128);
            let crl = _mm256_sub_epi16(cr_i16, v_128);

            // new_cb = cbl + (M_CB_CB - 1.0) * cbl + M_CB_CR * crl + 128
            let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
            let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
            let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
            let new_cb = _mm256_add_epi16(new_cb, v_128);

            // new_cr = crl + (M_CR_CR - 1.0) * crl + M_CR_CB * cbl + 128
            let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
            let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
            let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
            let new_cr = _mm256_add_epi16(new_cr, v_128);

            let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
            let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);

            // Pack to bytes, then gather the two useful 64-bit quarters
            // (0 and 2) into the low 128 bits for a 16-byte store.
            let cb_packed = _mm256_packus_epi16(new_cb, new_cb);
            let cr_packed = _mm256_packus_epi16(new_cr, new_cr);
            let cb_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cb_packed);
            let cr_packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(cr_packed);
            _mm_storeu_si128(
                cb.as_mut_ptr().add(i) as *mut _,
                _mm256_castsi256_si128(cb_packed),
            );
            _mm_storeu_si128(
                cr.as_mut_ptr().add(i) as *mut _,
                _mm256_castsi256_si128(cr_packed),
            );

            i += 16;
        }

        // Scalar tail for the last `total_c % 16` chroma samples.
        while i < total_c {
            let cbl = cb[i] as i32 - 128;
            let crl = cr[i] as i32 - 128;
            let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
            let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
            cb[i] = clamp_c(new_cb + 128);
            cr[i] = clamp_c(new_cr + 128);
            i += 1;
        }
    }
}
760
/// Clamps a 10-bit luma value to `64..=940` and narrows it to `u16`.
#[inline(always)]
fn clamp_y_10bit(v: i32) -> u16 {
    v.max(64).min(940) as u16
}
788
/// Clamps a 10-bit chroma value to `64..=960` and narrows it to `u16`.
#[inline(always)]
fn clamp_c_10bit(v: i32) -> u16 {
    v.max(64).min(960) as u16
}
793
/// Offset subtracted from 10-bit chroma samples before the matrix multiply
/// and added back afterwards (the 10-bit counterpart of the 8-bit `128`).
const CHROMA_CENTER_10BIT: i32 = 512;
795
796pub fn bt601_to_bt709_planes_10bit_scalar(
801 y: &mut [u16],
802 cb: &mut [u16],
803 cr: &mut [u16],
804 width: usize,
805 height: usize,
806) {
807 debug_assert_eq!(y.len(), width * height);
808 debug_assert_eq!(cb.len(), (width / 2) * (height / 2));
809 debug_assert_eq!(cr.len(), (width / 2) * (height / 2));
810
811 let cw = width / 2;
812
813 for yi in 0..height {
815 let cy = yi >> 1;
816 for xi in 0..width {
817 let cx = xi >> 1;
818 let cbl = cb[cy * cw + cx] as i32 - CHROMA_CENTER_10BIT;
819 let crl = cr[cy * cw + cx] as i32 - CHROMA_CENTER_10BIT;
820 let y_orig = y[yi * width + xi] as i32;
821 let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
822 y[yi * width + xi] = clamp_y_10bit(y_orig + delta);
823 }
824 }
825
826 for v in cb.iter_mut().zip(cr.iter_mut()) {
828 let (cbp, crp) = v;
829 let cbl = *cbp as i32 - CHROMA_CENTER_10BIT;
830 let crl = *crp as i32 - CHROMA_CENTER_10BIT;
831 let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
832 let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
833 *cbp = clamp_c_10bit(new_cb + CHROMA_CENTER_10BIT);
834 *crp = clamp_c_10bit(new_cr + CHROMA_CENTER_10BIT);
835 }
836}
837
838pub fn bt601_to_bt709_planes_10bit(
841 y: &mut [u16],
842 cb: &mut [u16],
843 cr: &mut [u16],
844 width: usize,
845 height: usize,
846) {
847 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
848 {
849 if std::is_x86_feature_detected!("avx2") {
850 unsafe {
852 bt601_to_bt709_10bit_avx2(y, cb, cr, width, height);
853 }
854 return;
855 }
856 }
857 bt601_to_bt709_planes_10bit_scalar(y, cb, cr, width, height);
858}
859
/// AVX2 implementation of the in-place BT.601 → BT.709 conversion for 4:2:0
/// planes of 16-bit samples.
///
/// Works in two phases: luma is corrected first (two luma rows per chroma
/// row, 16 chroma samples / 32 luma samples per vector step), then both
/// chroma planes are transformed. Columns and samples that do not fill a
/// whole vector are handled by scalar tail loops using the same formulas as
/// the scalar kernel.
///
/// Samples are processed as signed 16-bit lanes, and the vector paths round
/// each coefficient product separately (`_mm256_mulhrs_epi16`) whereas the
/// scalar formulas round the summed products once, so vector and scalar
/// results are not guaranteed to be bit-identical.
///
/// # Safety
/// * The CPU must support AVX2.
/// * The vector loops access memory through raw pointers without bounds
///   checks: `y` must hold at least `width * height` samples, `cb` and `cr`
///   at least `(width / 2) * (height / 2)` samples, and `cr` must be at
///   least as long as `cb`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bt601_to_bt709_10bit_avx2(
    y: &mut [u16],
    cb: &mut [u16],
    cr: &mut [u16],
    width: usize,
    height: usize,
) {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let cw = width / 2;
        let ch = height / 2;

        // Broadcast coefficients. The two diagonal chroma terms are stored
        // as (coefficient - 1.0) so they fit in i16; the missing 1.0 * input
        // is added back explicitly in the chroma phase.
        let v_m_y_cb = _mm256_set1_epi16(M_Y_CB as i16);
        let v_m_y_cr = _mm256_set1_epi16(M_Y_CR as i16);
        let v_m_cb_cb_corr = _mm256_set1_epi16((M_CB_CB - 32768) as i16);
        let v_m_cb_cr = _mm256_set1_epi16(M_CB_CR as i16);
        let v_m_cr_cb = _mm256_set1_epi16(M_CR_CB as i16);
        let v_m_cr_cr_corr = _mm256_set1_epi16((M_CR_CR - 32768) as i16);

        let v_chroma_center = _mm256_set1_epi16(CHROMA_CENTER_10BIT as i16);
        let v_chroma_lo = _mm256_set1_epi16(64);
        let v_chroma_hi = _mm256_set1_epi16(960);
        let v_luma_lo = _mm256_set1_epi16(64);
        let v_luma_hi = _mm256_set1_epi16(940);

        // Phase 1: luma. Chroma is only read here, so every luma sample sees
        // the original chroma values.
        for cy_idx in 0..ch {
            let y_row0 = cy_idx * 2 * width;
            let y_row1 = y_row0 + width;
            let c_row = cy_idx * cw;

            let mut cx = 0usize;
            while cx + 16 <= cw {
                // Load 16 chroma samples and centre them on zero.
                let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(c_row + cx) as *const _);
                let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(c_row + cx) as *const _);
                let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
                let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);

                // Per-chroma-sample luma delta (rounded Q15 products, summed).
                let dy_cb = _mm256_mulhrs_epi16(cbl, v_m_y_cb);
                let dy_cr = _mm256_mulhrs_epi16(crl, v_m_y_cr);
                let dy_chroma = _mm256_add_epi16(dy_cb, dy_cr);

                // Spill the deltas and duplicate each one so a chroma sample
                // covers two horizontally adjacent luma samples.
                let mut dy_luma = [0i16; 16];
                _mm256_storeu_si256(dy_luma.as_mut_ptr() as *mut _, dy_chroma);
                let mut dy_luma_pair = [0i16; 32];
                for i in 0..16 {
                    dy_luma_pair[i * 2] = dy_luma[i];
                    dy_luma_pair[i * 2 + 1] = dy_luma[i];
                }
                let dy_luma_lo = _mm256_loadu_si256(dy_luma_pair.as_ptr() as *const _);
                let dy_luma_hi = _mm256_loadu_si256(dy_luma_pair.as_ptr().add(16) as *const _);

                // The same deltas apply to both luma rows of this chroma row;
                // each row needs two 16-sample vectors.
                for row_off in [y_row0, y_row1] {
                    let y_lo = _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2) as *const _);
                    let y_hi =
                        _mm256_loadu_si256(y.as_ptr().add(row_off + cx * 2 + 16) as *const _);

                    let y_lo_out = _mm256_add_epi16(y_lo, dy_luma_lo);
                    let y_hi_out = _mm256_add_epi16(y_hi, dy_luma_hi);

                    let y_lo_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_lo_out, v_luma_lo), v_luma_hi);
                    let y_hi_out =
                        _mm256_min_epi16(_mm256_max_epi16(y_hi_out, v_luma_lo), v_luma_hi);

                    _mm256_storeu_si256(y.as_mut_ptr().add(row_off + cx * 2) as *mut _, y_lo_out);
                    _mm256_storeu_si256(
                        y.as_mut_ptr().add(row_off + cx * 2 + 16) as *mut _,
                        y_hi_out,
                    );
                }

                cx += 16;
            }

            // Scalar tail for the chroma columns that do not fill a vector.
            while cx < cw {
                let cb_idx = c_row + cx;
                let cbl = cb[cb_idx] as i32 - CHROMA_CENTER_10BIT;
                let crl = cr[cb_idx] as i32 - CHROMA_CENTER_10BIT;
                let delta = (M_Y_CB * cbl + M_Y_CR * crl + Q15_ROUND) >> Q15;
                let xi = cx * 2;
                for row_off in [y_row0, y_row1] {
                    for sub in 0..2 {
                        let idx = row_off + xi + sub;
                        y[idx] = clamp_y_10bit(y[idx] as i32 + delta);
                    }
                }
                cx += 1;
            }
        }

        // Phase 2: chroma, processed as one flat run of `cb.len()` samples.
        let total_c = cb.len();
        let mut i = 0usize;
        while i + 16 <= total_c {
            let cb_i16 = _mm256_loadu_si256(cb.as_ptr().add(i) as *const _);
            let cr_i16 = _mm256_loadu_si256(cr.as_ptr().add(i) as *const _);
            let cbl = _mm256_sub_epi16(cb_i16, v_chroma_center);
            let crl = _mm256_sub_epi16(cr_i16, v_chroma_center);

            // new_cb = cbl + (M_CB_CB - 1.0) * cbl + M_CB_CR * crl + centre
            let cb_corr = _mm256_mulhrs_epi16(cbl, v_m_cb_cb_corr);
            let cb_cross = _mm256_mulhrs_epi16(crl, v_m_cb_cr);
            let new_cb = _mm256_add_epi16(_mm256_add_epi16(cbl, cb_corr), cb_cross);
            let new_cb = _mm256_add_epi16(new_cb, v_chroma_center);

            // new_cr = crl + (M_CR_CR - 1.0) * crl + M_CR_CB * cbl + centre
            let cr_corr = _mm256_mulhrs_epi16(crl, v_m_cr_cr_corr);
            let cr_cross = _mm256_mulhrs_epi16(cbl, v_m_cr_cb);
            let new_cr = _mm256_add_epi16(_mm256_add_epi16(crl, cr_corr), cr_cross);
            let new_cr = _mm256_add_epi16(new_cr, v_chroma_center);

            let new_cb = _mm256_min_epi16(_mm256_max_epi16(new_cb, v_chroma_lo), v_chroma_hi);
            let new_cr = _mm256_min_epi16(_mm256_max_epi16(new_cr, v_chroma_lo), v_chroma_hi);

            _mm256_storeu_si256(cb.as_mut_ptr().add(i) as *mut _, new_cb);
            _mm256_storeu_si256(cr.as_mut_ptr().add(i) as *mut _, new_cr);

            i += 16;
        }

        // Scalar tail for the last `total_c % 16` chroma samples.
        while i < total_c {
            let cbl = cb[i] as i32 - CHROMA_CENTER_10BIT;
            let crl = cr[i] as i32 - CHROMA_CENTER_10BIT;
            let new_cb = (M_CB_CB * cbl + M_CB_CR * crl + Q15_ROUND) >> Q15;
            let new_cr = (M_CR_CB * cbl + M_CR_CR * crl + Q15_ROUND) >> Q15;
            cb[i] = clamp_c_10bit(new_cb + CHROMA_CENTER_10BIT);
            cr[i] = clamp_c_10bit(new_cr + CHROMA_CENTER_10BIT);
            i += 1;
        }
    }
}
1025
1026fn recolor_yuv420p_bt601_to_bt709(frame: &VideoFrame) -> Result<VideoFrame> {
1027 let w = frame.width as usize;
1028 let h = frame.height as usize;
1029 let y_size = w * h;
1030 let c_size = y_size / 4;
1031
1032 if frame.data.len() < y_size + 2 * c_size {
1033 bail!(
1034 "frame data too short for yuv420p {}x{}: {} bytes",
1035 w,
1036 h,
1037 frame.data.len()
1038 );
1039 }
1040 if !w.is_multiple_of(2) || !h.is_multiple_of(2) {
1041 bail!(
1042 "BT.601→BT.709 requires even dimensions for 4:2:0 subsampling; got {}x{}",
1043 w,
1044 h
1045 );
1046 }
1047
1048 let mut y = frame.data[..y_size].to_vec();
1049 let mut cb = frame.data[y_size..y_size + c_size].to_vec();
1050 let mut cr = frame.data[y_size + c_size..y_size + 2 * c_size].to_vec();
1051
1052 bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
1053
1054 let mut out = BytesMut::with_capacity(y_size + 2 * c_size);
1055 out.extend_from_slice(&y);
1056 out.extend_from_slice(&cb);
1057 out.extend_from_slice(&cr);
1058
1059 Ok(VideoFrame::new(
1060 out.freeze(),
1061 frame.width,
1062 frame.height,
1063 frame.format,
1064 ColorSpace::Bt709,
1065 frame.pts,
1066 ))
1067}
1068
/// Box-filters full-resolution 8-bit chroma down to 4:2:0.
///
/// Returns one buffer containing `y` unchanged, followed by the downsampled
/// `cb` plane and then the downsampled `cr` plane, each
/// `ceil(width / 2) * ceil(height / 2)` bytes. Every output sample is the
/// rounded mean of a 2x2 block; blocks on an odd right/bottom edge repeat the
/// last column/row.
///
/// Debug builds assert that all three input planes hold `width * height`
/// samples.
pub fn downsample_chroma_444_to_420(
    y: &[u8],
    cb: &[u8],
    cr: &[u8],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane size");
    debug_assert_eq!(cb.len(), width * height, "Cb plane size (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane size (4:4:4)");

    let chroma_w = width.div_ceil(2);
    let chroma_h = height.div_ceil(2);

    let mut out = Vec::with_capacity(width * height + 2 * chroma_w * chroma_h);
    out.extend_from_slice(y);

    for plane in [cb, cr] {
        for block_row in 0..chroma_h {
            let top = 2 * block_row * width;
            // Clamp so an odd final row is averaged with itself.
            let bottom = (2 * block_row + 1).min(height - 1) * width;
            out.extend((0..chroma_w).map(|block_col| {
                let left = 2 * block_col;
                // Clamp so an odd final column is averaged with itself.
                let right = (left + 1).min(width - 1);
                let total = plane[top + left] as u16
                    + plane[top + right] as u16
                    + plane[bottom + left] as u16
                    + plane[bottom + right] as u16;
                ((total + 2) >> 2) as u8
            }));
        }
    }

    out
}
1159
/// Box-filters full-resolution 16-bit chroma down to 4:2:0 and serialises
/// the result as little-endian bytes.
///
/// Returns one buffer containing every `y` sample, followed by the
/// downsampled `cb` plane and then the downsampled `cr` plane, each sample
/// written as two little-endian bytes. Every chroma output sample is the
/// rounded mean of a 2x2 block; blocks on an odd right/bottom edge repeat the
/// last column/row.
///
/// Debug builds assert that all three input planes hold `width * height`
/// samples.
pub fn downsample_chroma_444_to_420_10bit(
    y: &[u16],
    cb: &[u16],
    cr: &[u16],
    width: usize,
    height: usize,
) -> Vec<u8> {
    debug_assert_eq!(y.len(), width * height, "Y plane samples");
    debug_assert_eq!(cb.len(), width * height, "Cb plane samples (4:4:4)");
    debug_assert_eq!(cr.len(), width * height, "Cr plane samples (4:4:4)");

    let chroma_w = width.div_ceil(2);
    let chroma_h = height.div_ceil(2);
    let sample_count = width * height + 2 * chroma_w * chroma_h;
    let mut out = Vec::with_capacity(sample_count * 2);

    out.extend(y.iter().flat_map(|sample| sample.to_le_bytes()));

    for plane in [cb, cr] {
        for block_row in 0..chroma_h {
            let top = 2 * block_row * width;
            // Clamp so an odd final row is averaged with itself.
            let bottom = (2 * block_row + 1).min(height - 1) * width;
            for block_col in 0..chroma_w {
                let left = 2 * block_col;
                // Clamp so an odd final column is averaged with itself.
                let right = (left + 1).min(width - 1);
                let total = plane[top + left] as u32
                    + plane[top + right] as u32
                    + plane[bottom + left] as u32
                    + plane[bottom + right] as u32;
                let mean = ((total + 2) >> 2) as u16;
                out.extend_from_slice(&mean.to_le_bytes());
            }
        }
    }

    out
}
1208
1209pub fn downsample_444_to_420_frame(frame: &VideoFrame) -> Result<VideoFrame> {
1218 let w = frame.width as usize;
1219 let h = frame.height as usize;
1220 if w == 0 || h == 0 {
1221 bail!("zero-dimension frame");
1222 }
1223
1224 match frame.format {
1225 PixelFormat::Yuv444p => {
1226 let plane = w * h;
1227 if frame.data.len() < 3 * plane {
1228 bail!(
1229 "Yuv444p frame data too short for {}x{}: {} bytes",
1230 w,
1231 h,
1232 frame.data.len()
1233 );
1234 }
1235 let y = &frame.data[..plane];
1236 let cb = &frame.data[plane..2 * plane];
1237 let cr = &frame.data[2 * plane..3 * plane];
1238 let out = downsample_chroma_444_to_420(y, cb, cr, w, h);
1239 Ok(VideoFrame::new(
1240 Bytes::from(out),
1241 frame.width,
1242 frame.height,
1243 PixelFormat::Yuv420p,
1244 frame.color_space,
1245 frame.pts,
1246 ))
1247 }
1248 PixelFormat::Yuv444p10le | PixelFormat::Yuva444p10le => {
1249 let plane = w * h;
1250 let needed = if frame.format == PixelFormat::Yuva444p10le {
1255 4 * plane * 2
1256 } else {
1257 3 * plane * 2
1258 };
1259 if frame.data.len() < needed {
1260 bail!(
1261 "{:?} frame data too short for {}x{}: {} bytes (need {})",
1262 frame.format,
1263 w,
1264 h,
1265 frame.data.len(),
1266 needed
1267 );
1268 }
1269 let y = read_u16le(&frame.data[..plane * 2]);
1271 let cb = read_u16le(&frame.data[plane * 2..2 * plane * 2]);
1272 let cr = read_u16le(&frame.data[2 * plane * 2..3 * plane * 2]);
1273
1274 if frame.format == PixelFormat::Yuva444p10le {
1275 tracing::warn!(
1276 pts = frame.pts,
1277 "dropping alpha plane on 4:4:4→4:2:0 downsample (rav1e 0.7 has no alpha; pipeline target is Yuv420p10le)"
1278 );
1279 }
1280
1281 let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
1282 Ok(VideoFrame::new(
1283 Bytes::from(out),
1284 frame.width,
1285 frame.height,
1286 PixelFormat::Yuv420p10le,
1287 frame.color_space,
1288 frame.pts,
1289 ))
1290 }
1291 other => bail!(
1292 "downsample_444_to_420_frame: expected 4:4:4 input, got {:?}",
1293 other
1294 ),
1295 }
1296}
1297
1298pub fn scale_frame(
1303 frame: &VideoFrame,
1304 target_width: u32,
1305 target_height: u32,
1306) -> Result<VideoFrame> {
1307 if frame.width == target_width && frame.height == target_height {
1308 return Ok(frame.clone());
1309 }
1310
1311 match frame.format {
1312 PixelFormat::Yuv420p => scale_frame_8bit(frame, target_width, target_height),
1313 PixelFormat::Yuv420p10le => scale_frame_10bit(frame, target_width, target_height),
1318 _ => bail!(
1319 "scaling only implemented for Yuv420p / Yuv420p10le; got {:?}",
1320 frame.format
1321 ),
1322 }
1323}
1324
1325fn scale_frame_8bit(
1326 frame: &VideoFrame,
1327 target_width: u32,
1328 target_height: u32,
1329) -> Result<VideoFrame> {
1330 let src_w = frame.width as usize;
1331 let src_h = frame.height as usize;
1332 let dst_w = target_width as usize;
1333 let dst_h = target_height as usize;
1334
1335 let src_y_size = src_w * src_h;
1336 let dst_y_size = dst_w * dst_h;
1337 let dst_uv_size = dst_y_size / 4;
1338
1339 let mut out = BytesMut::with_capacity(dst_y_size + dst_uv_size * 2);
1340
1341 let y_plane = &frame.data[..src_y_size];
1343 out.extend(bilinear_scale_plane(y_plane, src_w, src_h, dst_w, dst_h));
1344
1345 let u_offset = src_y_size;
1347 let u_plane = &frame.data[u_offset..u_offset + src_y_size / 4];
1348 out.extend(bilinear_scale_plane(
1349 u_plane,
1350 src_w / 2,
1351 src_h / 2,
1352 dst_w / 2,
1353 dst_h / 2,
1354 ));
1355
1356 let v_offset = u_offset + src_y_size / 4;
1358 let v_plane = &frame.data[v_offset..v_offset + src_y_size / 4];
1359 out.extend(bilinear_scale_plane(
1360 v_plane,
1361 src_w / 2,
1362 src_h / 2,
1363 dst_w / 2,
1364 dst_h / 2,
1365 ));
1366
1367 Ok(VideoFrame::new(
1368 out.freeze(),
1369 target_width,
1370 target_height,
1371 frame.format,
1372 frame.color_space,
1373 frame.pts,
1374 ))
1375}
1376
1377fn scale_frame_10bit(
1386 frame: &VideoFrame,
1387 target_width: u32,
1388 target_height: u32,
1389) -> Result<VideoFrame> {
1390 let src_w = frame.width as usize;
1391 let src_h = frame.height as usize;
1392 let dst_w = target_width as usize;
1393 let dst_h = target_height as usize;
1394
1395 let bytes_per_sample = 2usize;
1396 let src_y_size_samples = src_w * src_h;
1397 let src_y_size_bytes = src_y_size_samples * bytes_per_sample;
1398 let src_c_size_samples = (src_w / 2) * (src_h / 2);
1399 let src_c_size_bytes = src_c_size_samples * bytes_per_sample;
1400
1401 if frame.data.len() < src_y_size_bytes + 2 * src_c_size_bytes {
1402 bail!(
1403 "10-bit frame data too short for {}x{}: {} bytes",
1404 src_w,
1405 src_h,
1406 frame.data.len()
1407 );
1408 }
1409
1410 let dst_y_size_samples = dst_w * dst_h;
1411 let dst_c_size_samples = (dst_w / 2) * (dst_h / 2);
1412 let dst_total_bytes = (dst_y_size_samples + 2 * dst_c_size_samples) * bytes_per_sample;
1413
1414 let y_plane = read_u16le(&frame.data[..src_y_size_bytes]);
1416 let u_plane = read_u16le(&frame.data[src_y_size_bytes..src_y_size_bytes + src_c_size_bytes]);
1417 let v_plane = read_u16le(
1418 &frame.data[src_y_size_bytes + src_c_size_bytes..src_y_size_bytes + 2 * src_c_size_bytes],
1419 );
1420
1421 let y_dst = bilinear_scale_plane_u16(&y_plane, src_w, src_h, dst_w, dst_h);
1423 let u_dst = bilinear_scale_plane_u16(&u_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
1424 let v_dst = bilinear_scale_plane_u16(&v_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
1425
1426 let mut out = BytesMut::with_capacity(dst_total_bytes);
1427 write_u16le(&mut out, &y_dst);
1428 write_u16le(&mut out, &u_dst);
1429 write_u16le(&mut out, &v_dst);
1430
1431 Ok(VideoFrame::new(
1432 out.freeze(),
1433 target_width,
1434 target_height,
1435 frame.format,
1436 frame.color_space,
1437 frame.pts,
1438 ))
1439}
1440
/// Decodes consecutive little-endian byte pairs into `u16` samples.
///
/// A trailing unpaired byte, if any, is ignored.
fn read_u16le(bytes: &[u8]) -> Vec<u16> {
    let mut samples = Vec::with_capacity(bytes.len() / 2);
    for pair in bytes.chunks_exact(2) {
        samples.push(u16::from_le_bytes([pair[0], pair[1]]));
    }
    samples
}
1447
1448fn write_u16le(out: &mut BytesMut, samples: &[u16]) {
1449 for s in samples {
1450 out.extend_from_slice(&s.to_le_bytes());
1451 }
1452}
1453
/// Scalar bilinear resampling of one plane of 16-bit samples.
///
/// Destination pixel `(dx, dy)` samples the source at
/// `(dx * src_w / dst_w, dy * src_h / dst_h)`, limited to the last source
/// column/row, and blends the four surrounding source samples in `f64`. The
/// result is rounded and clamped to `0..=1023`.
///
/// Returns `dst_w * dst_h` samples in row-major order.
pub fn bilinear_scale_plane_u16_scalar(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    let scale_x = src_w as f64 / dst_w as f64;
    let scale_y = src_h as f64 / dst_h as f64;
    let mut dst = vec![0u16; dst_w * dst_h];

    for out_row in 0..dst_h {
        // Source row position, its two neighbouring rows and their weights.
        let src_y = (out_row as f64 * scale_y).min((src_h - 1) as f64);
        let top = src_y as usize;
        let bottom = (top + 1).min(src_h - 1);
        let weight_bottom = src_y - top as f64;
        let weight_top = 1.0 - weight_bottom;

        for out_col in 0..dst_w {
            let src_x = (out_col as f64 * scale_x).min((src_w - 1) as f64);
            let left = src_x as usize;
            let right = (left + 1).min(src_w - 1);
            let weight_right = src_x - left as f64;
            let weight_left = 1.0 - weight_right;

            let top_left = src[top * src_w + left] as f64;
            let top_right = src[top * src_w + right] as f64;
            let bottom_left = src[bottom * src_w + left] as f64;
            let bottom_right = src[bottom * src_w + right] as f64;

            let blended = top_left * weight_left * weight_top
                + top_right * weight_right * weight_top
                + bottom_left * weight_left * weight_bottom
                + bottom_right * weight_right * weight_bottom;

            dst[out_row * dst_w + out_col] = blended.round().clamp(0.0, 1023.0) as u16;
        }
    }
    dst
}
1502
1503pub fn bilinear_scale_plane_u16(
1518 src: &[u16],
1519 src_w: usize,
1520 src_h: usize,
1521 dst_w: usize,
1522 dst_h: usize,
1523) -> Vec<u16> {
1524 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1525 {
1526 if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
1530 return unsafe { bilinear_scale_plane_u16_avx2(src, src_w, src_h, dst_w, dst_h) };
1532 }
1533 }
1534 bilinear_scale_plane_u16_scalar(src, src_w, src_h, dst_w, dst_h)
1535}
1536
/// AVX2 bilinear rescale of one row-major plane of 16-bit samples.
///
/// Source positions are stepped in 32.32 fixed point; interpolation weights
/// are Q15 and applied with `_mm256_mulhrs_epi16`. Sixteen output samples are
/// produced per vector iteration, and any remaining columns of a row are
/// finished with an `f64` blend that reuses the same quantized weights.
/// Results are clamped to `0..=1023`.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src_w` and `src_h` must be non-zero and `src` must hold at least
///   `src_w * src_h` samples: the vector loop reads `src` with
///   `get_unchecked`.
///
/// # Panics
///
/// Panics with a division by zero if `dst_w` or `dst_h` is zero.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_u16_avx2(
    src: &[u16],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u16> {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u16; dst_w * dst_h];

        // Source advance per destination pixel, as 32.32 fixed point.
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per-column lookup tables (left tap, right tap, Q15 weights); they
        // are the same for every row, so they are built once up front.
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut x1s: Vec<u32> = vec![0; dst_w];
        let mut fxs_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fxs_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step;
            let x0_full = (sx_32_32 >> 32) as usize;
            let x0 = x0_full.min(src_w - 1);
            // Top 16 fractional bits, then halved to fit a signed Q15 weight.
            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u32;
            let fx_q15 = ((fx_q16 as i32) >> 1).min(32767) as i16;
            if x0 >= src_w - 1 {
                // Right edge: both taps collapse onto the last column and the
                // whole weight goes to the left tap.
                x0s[dx] = (src_w - 1) as u32;
                x1s[dx] = (src_w - 1) as u32;
                fxs_q15[dx] = 0;
                one_minus_fxs_q15[dx] = 32767;
            } else {
                x0s[dx] = x0 as u32;
                x1s[dx] = (x0 + 1) as u32;
                fxs_q15[dx] = fx_q15;
                one_minus_fxs_q15[dx] = 32767 - fx_q15;
            }
        }

        // Clamp bounds applied to every vector result.
        let v_max = _mm256_set1_epi16(1023);
        let v_zero = _mm256_setzero_si256();

        for dy in 0..dst_h {
            // Vertical position and Q15 weights for this output row.
            let sy_32_32 = (dy as u64) * y_step;
            let y0_full = (sy_32_32 >> 32) as usize;
            let y0 = y0_full.min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;

            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;

            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // Gather the four neighbours of 16 output pixels into
                // contiguous stack buffers so they can be loaded as vectors.
                let mut p00_buf = [0u16; 16];
                let mut p10_buf = [0u16; 16];
                let mut p01_buf = [0u16; 16];
                let mut p11_buf = [0u16; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = x1s[dx + i] as usize;
                    // SAFETY: x0/x1 <= src_w - 1 and row0/row1 address rows
                    // below src_h, so the indices are in bounds as long as
                    // the caller upholds src.len() >= src_w * src_h.
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }

                let p00 = _mm256_loadu_si256(p00_buf.as_ptr() as *const _);
                let p10 = _mm256_loadu_si256(p10_buf.as_ptr() as *const _);
                let p01 = _mm256_loadu_si256(p01_buf.as_ptr() as *const _);
                let p11 = _mm256_loadu_si256(p11_buf.as_ptr() as *const _);

                // The loop condition keeps dx + 16 <= dst_w, so these loads
                // stay inside the weight tables.
                let v_fx = _mm256_loadu_si256(fxs_q15.as_ptr().add(dx) as *const _);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fxs_q15.as_ptr().add(dx) as *const _);

                // Horizontal blend of the upper and lower sample pairs.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );

                // Vertical blend of the two horizontal results.
                let out_i16 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );

                let clamped = _mm256_min_epi16(_mm256_max_epi16(out_i16, v_zero), v_max);

                _mm256_storeu_si256(dst.as_mut_ptr().add(dst_row + dx) as *mut _, clamped);

                dx += 16;
            }

            // Remaining (< 16) columns of this row: floating-point blend that
            // reuses the quantized Q15 weights from the tables above.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = x1s[dx] as usize;
                let fx = fxs_q15[dx] as f64 / 32768.0;
                let fy = fy_q15 as f64 / 32768.0;

                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;

                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round().clamp(0.0, 1023.0) as u16;
                dx += 1;
            }
        }

        dst
    }
}
1718
1719pub fn bilinear_scale_plane(
1721 src: &[u8],
1722 src_w: usize,
1723 src_h: usize,
1724 dst_w: usize,
1725 dst_h: usize,
1726) -> Vec<u8> {
1727 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1728 {
1729 if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
1733 return unsafe { bilinear_scale_plane_avx2(src, src_w, src_h, dst_w, dst_h) };
1735 }
1736 }
1737 bilinear_scale_plane_scalar(src, src_w, src_h, dst_w, dst_h)
1738}
1739
/// Portable bilinear rescale of one plane of 8-bit samples.
///
/// `src` is read as a row-major `src_w` x `src_h` plane and the result is a
/// row-major `dst_w` x `dst_h` plane. Each output sample maps back to the
/// source position `(dx * src_w / dst_w, dy * src_h / dst_h)`, clamped to the
/// last row/column, and is blended from the four surrounding source samples
/// in `f64`, then rounded and converted to `u8`.
///
/// # Panics
///
/// Panics if a sampled index falls outside `src`.
pub fn bilinear_scale_plane_scalar(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    let mut scaled = Vec::with_capacity(dst_w * dst_h);
    let step_x = src_w as f64 / dst_w as f64;
    let step_y = src_h as f64 / dst_h as f64;
    let tap = |row: usize, col: usize| src[row * src_w + col] as f64;

    for out_row in 0..dst_h {
        // Vertical source position, pinned to the last source row.
        let last_row = src_h - 1;
        let pos_y = (out_row as f64 * step_y).min(last_row as f64);
        let top = pos_y as usize;
        let bottom = (top + 1).min(last_row);
        let wy1 = pos_y - top as f64;
        let wy0 = 1.0 - wy1;

        for out_col in 0..dst_w {
            // Horizontal source position, pinned to the last source column.
            let last_col = src_w - 1;
            let pos_x = (out_col as f64 * step_x).min(last_col as f64);
            let left = pos_x as usize;
            let right = (left + 1).min(last_col);
            let wx1 = pos_x - left as f64;
            let wx0 = 1.0 - wx1;

            let blended = tap(top, left) * wx0 * wy0
                + tap(top, right) * wx1 * wy0
                + tap(bottom, left) * wx0 * wy1
                + tap(bottom, right) * wx1 * wy1;

            // Output is written in row-major order, so pushing keeps layout.
            scaled.push(blended.round() as u8);
        }
    }
    scaled
}
1778
/// AVX2 bilinear rescale of one row-major plane of 8-bit samples.
///
/// Source positions are stepped in 32.32 fixed point. Samples are widened to
/// 16 bits and shifted left by 7 so the Q15 `_mm256_mulhrs_epi16` blends keep
/// fractional precision; the result is rounded, shifted back and packed to
/// `u8` with unsigned saturation. Sixteen output samples are produced per
/// vector iteration, and any remaining columns of a row are finished with an
/// `f64` blend using the Q16 fractions.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src_w` and `src_h` must be non-zero and `src` must hold at least
///   `src_w * src_h` samples: the vector loop reads `src` with
///   `get_unchecked`.
///
/// # Panics
///
/// Panics with a division by zero if `dst_w` or `dst_h` is zero.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn bilinear_scale_plane_avx2(
    src: &[u8],
    src_w: usize,
    src_h: usize,
    dst_w: usize,
    dst_h: usize,
) -> Vec<u8> {
    unsafe {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let mut dst = vec![0u8; dst_w * dst_h];

        // Source advance per destination pixel, as 32.32 fixed point.
        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
        let y_step = ((src_h as u64) << 32) / (dst_h as u64);

        // Per-column left tap and Q16 horizontal fraction; identical for
        // every row, so computed once.
        let mut x0s: Vec<u32> = vec![0; dst_w];
        let mut fxs: Vec<u16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let sx_32_32 = (dx as u64) * x_step;
            let x0_full = (sx_32_32 >> 32) as usize;
            let x0 = x0_full.min(src_w - 1);
            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u16;
            if x0 >= src_w - 1 {
                // Right edge: pin to the last column with a zero fraction.
                x0s[dx] = (src_w - 1) as u32;
                fxs[dx] = 0;
            } else {
                x0s[dx] = x0 as u32;
                fxs[dx] = fx_q16;
            }
        }

        // Q15 versions of the horizontal weights for `_mm256_mulhrs_epi16`.
        let mut fx_q15: Vec<i16> = vec![0; dst_w];
        let mut one_minus_fx_q15: Vec<i16> = vec![0; dst_w];
        for dx in 0..dst_w {
            let fxq15 = (fxs[dx] as i32 >> 1).min(32767) as i16;
            fx_q15[dx] = fxq15;
            one_minus_fx_q15[dx] = 32767 - fxq15;
        }

        for dy in 0..dst_h {
            // Vertical position and Q15 weights for this output row.
            let sy_32_32 = (dy as u64) * y_step;
            let y0_full = (sy_32_32 >> 32) as usize;
            let y0 = y0_full.min(src_h - 1);
            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
            let y1 = (y0 + 1).min(src_h - 1);
            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
            let one_minus_fy_q15 = 32767i16 - fy_q15;

            let row0 = y0 * src_w;
            let row1 = y1 * src_w;
            let dst_row = dy * dst_w;

            let v_fy = _mm256_set1_epi16(fy_q15);
            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);

            let mut dx = 0usize;
            while dx + 16 <= dst_w {
                // Gather the four neighbours of 16 output pixels into
                // contiguous stack buffers so they can be loaded as vectors.
                let mut p00_buf = [0u8; 16];
                let mut p10_buf = [0u8; 16];
                let mut p01_buf = [0u8; 16];
                let mut p11_buf = [0u8; 16];
                for i in 0..16 {
                    let x0 = x0s[dx + i] as usize;
                    let x1 = (x0 + 1).min(src_w - 1);
                    // SAFETY: x0/x1 <= src_w - 1 and row0/row1 address rows
                    // below src_h, so the indices are in bounds as long as
                    // the caller upholds src.len() >= src_w * src_h.
                    p00_buf[i] = *src.get_unchecked(row0 + x0);
                    p10_buf[i] = *src.get_unchecked(row0 + x1);
                    p01_buf[i] = *src.get_unchecked(row1 + x0);
                    p11_buf[i] = *src.get_unchecked(row1 + x1);
                }

                // Zero-extend the 16 bytes of each buffer to 16-bit lanes.
                let p00 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p00_buf.as_ptr() as *const _));
                let p10 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p10_buf.as_ptr() as *const _));
                let p01 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p01_buf.as_ptr() as *const _));
                let p11 = _mm256_cvtepu8_epi16(_mm_loadu_si128(p11_buf.as_ptr() as *const _));

                // Scale samples by 2^7 (255 << 7 = 32640 still fits in i16)
                // so the Q15 multiplies below retain fractional bits.
                let p00 = _mm256_slli_epi16::<7>(p00);
                let p10 = _mm256_slli_epi16::<7>(p10);
                let p01 = _mm256_slli_epi16::<7>(p01);
                let p11 = _mm256_slli_epi16::<7>(p11);

                // The loop condition keeps dx + 16 <= dst_w, so these loads
                // stay inside the weight tables.
                let v_fx = _mm256_loadu_si256(fx_q15.as_ptr().add(dx) as *const _);
                let v_one_minus_fx =
                    _mm256_loadu_si256(one_minus_fx_q15.as_ptr().add(dx) as *const _);

                // Horizontal blend of the upper and lower sample pairs.
                let top = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p10, v_fx),
                );
                let bottom = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
                    _mm256_mulhrs_epi16(p11, v_fx),
                );

                // Vertical blend; the result is still scaled by 2^7.
                let out_q7 = _mm256_add_epi16(
                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
                    _mm256_mulhrs_epi16(bottom, v_fy),
                );
                // Add half of 2^7, then shift to round back to integers.
                let rounded = _mm256_add_epi16(out_q7, _mm256_set1_epi16(64));
                let shifted = _mm256_srai_epi16::<7>(rounded);

                // packus works per 128-bit lane, leaving results 0..7 in
                // qword 0 and results 8..15 in qword 2; the permute moves
                // those two qwords into the low 128 bits for a single store.
                let packed = _mm256_packus_epi16(shifted, shifted);
                let packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(packed);
                _mm_storeu_si128(
                    dst.as_mut_ptr().add(dst_row + dx) as *mut _,
                    _mm256_castsi256_si128(packed),
                );

                dx += 16;
            }

            // Remaining (< 16) columns of this row: floating-point blend
            // using the full Q16 fractions.
            while dx < dst_w {
                let x0 = x0s[dx] as usize;
                let x1 = (x0 + 1).min(src_w - 1);
                let fx = fxs[dx] as f64 / 65536.0;
                let fy = fy_q16 as f64 / 65536.0;

                let p00 = src[row0 + x0] as f64;
                let p10 = src[row0 + x1] as f64;
                let p01 = src[row1 + x0] as f64;
                let p11 = src[row1 + x1] as f64;

                let val = p00 * (1.0 - fx) * (1.0 - fy)
                    + p10 * fx * (1.0 - fy)
                    + p01 * (1.0 - fx) * fy
                    + p11 * fx * fy;
                dst[dst_row + dx] = val.round() as u8;
                dx += 1;
            }
        }

        dst
    }
}
1958
1959#[cfg(test)]
1964mod tests {
1965 use super::*;
1966
1967 fn synth_601_frame(w: usize, h: usize) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
1970 let mut y = vec![0u8; w * h];
1971 let mut cb = vec![0u8; (w / 2) * (h / 2)];
1972 let mut cr = vec![0u8; (w / 2) * (h / 2)];
1973 for i in 0..y.len() {
1974 y[i] = 16 + ((i as u32 * 17) % 220) as u8;
1976 }
1977 for i in 0..cb.len() {
1978 cb[i] = 16 + ((i as u32 * 13) % 225) as u8;
1979 cr[i] = 16 + ((i as u32 * 23) % 225) as u8;
1980 }
1981 (y, cb, cr)
1982 }
1983
1984 #[test]
1985 fn bt601_to_bt709_neutral_gray_roundtrips() {
1986 for &y_val in &[16u8, 64, 128, 200, 235] {
1989 let w = 32;
1990 let h = 16;
1991 let mut y = vec![y_val; w * h];
1992 let mut cb = vec![128u8; (w / 2) * (h / 2)];
1993 let mut cr = vec![128u8; (w / 2) * (h / 2)];
1994 bt601_to_bt709_planes_scalar(&mut y, &mut cb, &mut cr, w, h);
1995 for v in &y {
1996 assert_eq!(*v, y_val, "Y with neutral chroma must round-trip");
1997 }
1998 for v in &cb {
1999 assert_eq!(*v, 128);
2000 }
2001 for v in &cr {
2002 assert_eq!(*v, 128);
2003 }
2004 }
2005 }
2006
2007 #[test]
2008 fn bt601_to_bt709_black_and_white_round_trip() {
2009 for &(y_val, label) in &[(16u8, "black"), (235u8, "white")] {
2012 let w = 64;
2013 let h = 32;
2014 let mut y = vec![y_val; w * h];
2015 let mut cb = vec![128u8; (w / 2) * (h / 2)];
2016 let mut cr = vec![128u8; (w / 2) * (h / 2)];
2017 bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
2018 for v in &y {
2019 assert_eq!(*v, y_val, "{} Y round-trip", label);
2020 }
2021 for v in &cb {
2022 assert_eq!(*v, 128, "{} Cb round-trip", label);
2023 }
2024 for v in &cr {
2025 assert_eq!(*v, 128, "{} Cr round-trip", label);
2026 }
2027 }
2028 }
2029
2030 #[test]
2031 fn bt601_to_bt709_scalar_vs_avx2_agree_256x256() {
2032 let w = 256;
2037 let h = 256;
2038 let (y0, cb0, cr0) = synth_601_frame(w, h);
2039
2040 let mut y_s = y0.clone();
2041 let mut cb_s = cb0.clone();
2042 let mut cr_s = cr0.clone();
2043 bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2044
2045 let mut y_v = y0.clone();
2046 let mut cb_v = cb0.clone();
2047 let mut cr_v = cr0.clone();
2048 bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2049
2050 let mut max_y = 0i32;
2051 for i in 0..y_s.len() {
2052 let d = (y_s[i] as i32 - y_v[i] as i32).abs();
2053 if d > max_y {
2054 max_y = d;
2055 }
2056 assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, y_s[i], y_v[i]);
2057 }
2058 for i in 0..cb_s.len() {
2059 assert!(
2060 (cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1,
2061 "Cb[{}] scalar={} avx2={}",
2062 i,
2063 cb_s[i],
2064 cb_v[i]
2065 );
2066 assert!(
2067 (cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1,
2068 "Cr[{}] scalar={} avx2={}",
2069 i,
2070 cr_s[i],
2071 cr_v[i]
2072 );
2073 }
2074 }
2075
2076 #[test]
2077 fn bt601_to_bt709_scalar_vs_avx2_agree_tail() {
2078 let w = 34;
2081 let h = 16;
2082 let (y0, cb0, cr0) = synth_601_frame(w, h);
2083
2084 let mut y_s = y0.clone();
2085 let mut cb_s = cb0.clone();
2086 let mut cr_s = cr0.clone();
2087 bt601_to_bt709_planes_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2088
2089 let mut y_v = y0.clone();
2090 let mut cb_v = cb0.clone();
2091 let mut cr_v = cr0.clone();
2092 bt601_to_bt709_planes(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2093
2094 for i in 0..y_s.len() {
2095 assert!(
2096 (y_s[i] as i32 - y_v[i] as i32).abs() <= 1,
2097 "Y[{}] scalar={} avx2={}",
2098 i,
2099 y_s[i],
2100 y_v[i]
2101 );
2102 }
2103 for i in 0..cb_s.len() {
2104 assert!((cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1);
2105 assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
2106 }
2107 }
2108
2109 #[test]
2110 fn bt601_to_bt709_clamps_ranges() {
2111 let w = 32;
2113 let h = 16;
2114 let (mut y, mut cb, mut cr) = synth_601_frame(w, h);
2115 bt601_to_bt709_planes(&mut y, &mut cb, &mut cr, w, h);
2116 for &v in cb.iter().chain(cr.iter()) {
2117 assert!((16..=240).contains(&v), "chroma {} out of limited range", v);
2118 }
2119 for &v in y.iter() {
2120 assert!((16..=235).contains(&v), "luma {} out of limited range", v);
2121 }
2122 }
2123
2124 fn make_ramp(w: usize, h: usize) -> Vec<u8> {
2127 (0..w * h).map(|i| ((i * 7 + i / w) & 0xff) as u8).collect()
2128 }
2129
2130 #[test]
2131 fn bilinear_scalar_vs_avx2_agree_2x() {
2132 let src_w = 64;
2133 let src_h = 32;
2134 let src = make_ramp(src_w, src_h);
2135 let dst_w = 128;
2136 let dst_h = 64;
2137
2138 let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
2139 let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
2140
2141 assert_eq!(scalar.len(), simd.len());
2142 let mut max_diff = 0i32;
2143 for i in 0..scalar.len() {
2144 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2145 if d > max_diff {
2146 max_diff = d;
2147 }
2148 assert!(
2149 d <= 1,
2150 "bilinear mismatch at {}: scalar={} simd={}",
2151 i,
2152 scalar[i],
2153 simd[i]
2154 );
2155 }
2156 }
2157
2158 #[test]
2159 fn bilinear_scalar_vs_avx2_agree_downscale() {
2160 let src_w = 128;
2161 let src_h = 72;
2162 let src = make_ramp(src_w, src_h);
2163 let dst_w = 64;
2164 let dst_h = 36;
2165
2166 let scalar = bilinear_scale_plane_scalar(&src, src_w, src_h, dst_w, dst_h);
2167 let simd = bilinear_scale_plane(&src, src_w, src_h, dst_w, dst_h);
2168
2169 for i in 0..scalar.len() {
2170 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2171 assert!(
2172 d <= 1,
2173 "bilinear mismatch at {}: scalar={} simd={}",
2174 i,
2175 scalar[i],
2176 simd[i]
2177 );
2178 }
2179 }
2180
2181 #[test]
2182 fn bilinear_constant_input_yields_constant_output() {
2183 let src = vec![42u8; 64 * 32];
2184 let out = bilinear_scale_plane(&src, 64, 32, 128, 64);
2185 for &v in &out {
2186 assert_eq!(v, 42, "constant input must yield constant output");
2187 }
2188 }
2189
2190 #[test]
2191 fn bilinear_identity_scale() {
2192 let src = make_ramp(32, 32);
2193 let out = bilinear_scale_plane_scalar(&src, 32, 32, 32, 32);
2194 assert_eq!(out, src);
2195 }
2196
2197 fn make_10bit_frame_planar(w: usize, h: usize, y_val: u16, c_val: u16) -> VideoFrame {
2200 let y_samples = w * h;
2201 let c_samples = (w / 2) * (h / 2);
2202 let total = y_samples + 2 * c_samples;
2203 let mut buf = Vec::with_capacity(total * 2);
2204 for _ in 0..y_samples {
2205 buf.extend_from_slice(&y_val.to_le_bytes());
2206 }
2207 for _ in 0..(2 * c_samples) {
2208 buf.extend_from_slice(&c_val.to_le_bytes());
2209 }
2210 VideoFrame::new(
2211 bytes::Bytes::from(buf),
2212 w as u32,
2213 h as u32,
2214 PixelFormat::Yuv420p10le,
2215 ColorSpace::Bt2020,
2216 0,
2217 )
2218 }
2219
2220 #[test]
2221 fn convert_to_yuv420p_bt709_passthrough_10bit() {
2222 let frame = make_10bit_frame_planar(16, 16, 600, 512);
2229 let out = convert_to_yuv420p_bt709(&frame).expect("10-bit passthrough");
2230 assert_eq!(out.format, PixelFormat::Yuv420p10le);
2231 assert_eq!(out.width, 16);
2232 assert_eq!(out.height, 16);
2233 assert_eq!(out.data.len(), frame.data.len());
2234 assert_eq!(
2235 &out.data[..],
2236 &frame.data[..],
2237 "10-bit data must be byte-identical (no tonemap)"
2238 );
2239 assert_eq!(
2240 out.color_space,
2241 ColorSpace::Bt2020,
2242 "color space must not change"
2243 );
2244 }
2245
2246 #[test]
2247 fn scale_frame_10bit_constant_input_yields_constant_output() {
2248 let frame = make_10bit_frame_planar(64, 64, 600, 400);
2249 let out = scale_frame(&frame, 32, 32).expect("10-bit scale");
2250 assert_eq!(out.format, PixelFormat::Yuv420p10le);
2251 assert_eq!(out.width, 32);
2252 assert_eq!(out.height, 32);
2253
2254 let y_samples = 32 * 32;
2256 let c_samples = 16 * 16;
2257 let y_bytes = y_samples * 2;
2258 let c_bytes = c_samples * 2;
2259 assert_eq!(out.data.len(), y_bytes + 2 * c_bytes);
2260
2261 let y = read_u16le(&out.data[..y_bytes]);
2262 let u = read_u16le(&out.data[y_bytes..y_bytes + c_bytes]);
2263 let v = read_u16le(&out.data[y_bytes + c_bytes..y_bytes + 2 * c_bytes]);
2264 for &s in &y {
2265 assert_eq!(s, 600, "luma must be constant after bilinear");
2266 }
2267 for &s in u.iter().chain(v.iter()) {
2268 assert_eq!(s, 400, "chroma must be constant after bilinear");
2269 }
2270 }
2271
2272 #[test]
2273 fn scale_frame_10bit_identity_yields_byte_identical() {
2274 let frame = make_10bit_frame_planar(32, 32, 768, 256);
2275 let out = scale_frame(&frame, 32, 32).expect("identity");
2277 assert_eq!(&out.data[..], &frame.data[..]);
2278 }
2279
2280 #[test]
2281 fn bilinear_10bit_scalar_clamps_inside_10bit_range() {
2282 let mut src = vec![0u16; 64 * 32];
2284 for (i, s) in src.iter_mut().enumerate() {
2285 *s = (i as u16) % 1024;
2286 }
2287 let out = bilinear_scale_plane_u16_scalar(&src, 64, 32, 128, 64);
2288 for &v in &out {
2289 assert!(v <= 1023, "10-bit sample {} exceeds 1023", v);
2290 }
2291 }
2292
2293 fn make_10bit_ramp(w: usize, h: usize) -> Vec<u16> {
2296 (0..w * h)
2298 .map(|i| ((i * 7 + i / w) % 1024) as u16)
2299 .collect()
2300 }
2301
2302 #[test]
2303 fn bilinear_10bit_scalar_vs_avx2_agree_2x_upscale() {
2304 let src_w = 64;
2306 let src_h = 32;
2307 let src = make_10bit_ramp(src_w, src_h);
2308 let dst_w = 128;
2309 let dst_h = 64;
2310
2311 let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2312 let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2313
2314 assert_eq!(scalar.len(), simd.len());
2315 let mut max_diff = 0i32;
2316 for i in 0..scalar.len() {
2317 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2318 if d > max_diff {
2319 max_diff = d;
2320 }
2321 assert!(
2322 d <= 1,
2323 "bilinear 10-bit mismatch at {}: scalar={} simd={}",
2324 i,
2325 scalar[i],
2326 simd[i]
2327 );
2328 }
2329 }
2330
2331 #[test]
2332 fn bilinear_10bit_scalar_vs_avx2_agree_downscale_1080p_to_720p() {
2333 let src_w = 1920;
2337 let src_h = 1080;
2338 let src = make_10bit_ramp(src_w, src_h);
2339 let dst_w = 1280;
2340 let dst_h = 720;
2341
2342 let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2343 let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2344
2345 for i in 0..scalar.len() {
2346 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2347 assert!(
2348 d <= 1,
2349 "bilinear 10-bit mismatch at {}: scalar={} simd={}",
2350 i,
2351 scalar[i],
2352 simd[i]
2353 );
2354 }
2355 }
2356
2357 #[test]
2358 fn bilinear_10bit_avx2_constant_input_yields_constant_output() {
2359 let src = vec![600u16; 128 * 64];
2362 let out = bilinear_scale_plane_u16(&src, 128, 64, 256, 128);
2363 for &v in &out {
2364 assert_eq!(v, 600, "constant 10-bit input must yield constant output");
2365 }
2366 }
2367
2368 #[test]
2369 fn bilinear_10bit_avx2_max_value_clamped() {
2370 let src = vec![1023u16; 64 * 32];
2373 let out = bilinear_scale_plane_u16(&src, 64, 32, 128, 64);
2374 for &v in &out {
2375 assert!(v <= 1023, "10-bit AVX2 sample {} exceeds 1023", v);
2376 assert_eq!(v, 1023, "constant 1023 should stay 1023");
2377 }
2378 }
2379
2380 #[test]
2381 fn bilinear_10bit_narrow_width_falls_back_to_scalar() {
2382 let src_w = 8;
2385 let src_h = 8;
2386 let src = make_10bit_ramp(src_w, src_h);
2387 let dst_w = 4;
2388 let dst_h = 4;
2389
2390 let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2391 let dispatched = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2392
2393 assert_eq!(
2394 scalar, dispatched,
2395 "narrow strip should match scalar exactly"
2396 );
2397 }
2398
2399 #[test]
2400 fn bilinear_10bit_odd_dst_dims_handled() {
2401 let src_w = 32;
2403 let src_h = 32;
2404 let src = make_10bit_ramp(src_w, src_h);
2405 let dst_w = 17;
2406 let dst_h = 9;
2407
2408 let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2409 let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2410 assert_eq!(scalar.len(), simd.len());
2411 for i in 0..scalar.len() {
2412 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2413 assert!(
2414 d <= 1,
2415 "tail mismatch at {}: scalar={} simd={}",
2416 i,
2417 scalar[i],
2418 simd[i]
2419 );
2420 }
2421 }
2422
2423 #[test]
2424 fn bilinear_10bit_tall_narrow_strip() {
2425 let src_w = 16;
2428 let src_h = 512;
2429 let src = make_10bit_ramp(src_w, src_h);
2430 let dst_w = 16;
2431 let dst_h = 256;
2432
2433 let scalar = bilinear_scale_plane_u16_scalar(&src, src_w, src_h, dst_w, dst_h);
2434 let simd = bilinear_scale_plane_u16(&src, src_w, src_h, dst_w, dst_h);
2435 for i in 0..scalar.len() {
2436 let d = (scalar[i] as i32 - simd[i] as i32).abs();
2437 assert!(d <= 1, "tall strip mismatch at {}", i);
2438 }
2439 }
2440
2441 fn synth_601_frame_10bit(w: usize, h: usize) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
2444 let mut y = vec![0u16; w * h];
2446 let mut cb = vec![0u16; (w / 2) * (h / 2)];
2447 let mut cr = vec![0u16; (w / 2) * (h / 2)];
2448 for i in 0..y.len() {
2449 y[i] = 64 + ((i as u32 * 17) % 877) as u16;
2450 }
2451 for i in 0..cb.len() {
2452 cb[i] = 64 + ((i as u32 * 13) % 897) as u16;
2453 cr[i] = 64 + ((i as u32 * 23) % 897) as u16;
2454 }
2455 (y, cb, cr)
2456 }
2457
2458 #[test]
2459 fn bt601_to_bt709_10bit_neutral_gray_roundtrips() {
2460 for &y_val in &[64u16, 256, 512, 800, 940] {
2465 let w = 32;
2466 let h = 16;
2467 let mut y = vec![y_val; w * h];
2468 let mut cb = vec![512u16; (w / 2) * (h / 2)];
2469 let mut cr = vec![512u16; (w / 2) * (h / 2)];
2470 bt601_to_bt709_planes_10bit_scalar(&mut y, &mut cb, &mut cr, w, h);
2471 for v in &y {
2472 assert_eq!(*v, y_val, "Y with neutral chroma must round-trip");
2473 }
2474 for v in &cb {
2475 assert_eq!(*v, 512);
2476 }
2477 for v in &cr {
2478 assert_eq!(*v, 512);
2479 }
2480 }
2481 }
2482
2483 #[test]
2484 fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_256x256() {
2485 let w = 256;
2488 let h = 256;
2489 let (y0, cb0, cr0) = synth_601_frame_10bit(w, h);
2490
2491 let mut y_s = y0.clone();
2492 let mut cb_s = cb0.clone();
2493 let mut cr_s = cr0.clone();
2494 bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2495
2496 let mut y_v = y0.clone();
2497 let mut cb_v = cb0.clone();
2498 let mut cr_v = cr0.clone();
2499 bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2500
2501 for i in 0..y_s.len() {
2502 let d = (y_s[i] as i32 - y_v[i] as i32).abs();
2503 assert!(d <= 1, "Y[{}] scalar={} avx2={}", i, y_s[i], y_v[i]);
2504 }
2505 for i in 0..cb_s.len() {
2506 assert!(
2507 (cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1,
2508 "Cb[{}] scalar={} avx2={}",
2509 i,
2510 cb_s[i],
2511 cb_v[i]
2512 );
2513 assert!(
2514 (cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1,
2515 "Cr[{}] scalar={} avx2={}",
2516 i,
2517 cr_s[i],
2518 cr_v[i]
2519 );
2520 }
2521 }
2522
2523 #[test]
2524 fn bt601_to_bt709_10bit_scalar_vs_avx2_agree_tail() {
2525 let w = 34;
2528 let h = 16;
2529 let (y0, cb0, cr0) = synth_601_frame_10bit(w, h);
2530
2531 let mut y_s = y0.clone();
2532 let mut cb_s = cb0.clone();
2533 let mut cr_s = cr0.clone();
2534 bt601_to_bt709_planes_10bit_scalar(&mut y_s, &mut cb_s, &mut cr_s, w, h);
2535
2536 let mut y_v = y0.clone();
2537 let mut cb_v = cb0.clone();
2538 let mut cr_v = cr0.clone();
2539 bt601_to_bt709_planes_10bit(&mut y_v, &mut cb_v, &mut cr_v, w, h);
2540
2541 for i in 0..y_s.len() {
2542 assert!(
2543 (y_s[i] as i32 - y_v[i] as i32).abs() <= 1,
2544 "Y[{}] scalar={} avx2={}",
2545 i,
2546 y_s[i],
2547 y_v[i]
2548 );
2549 }
2550 for i in 0..cb_s.len() {
2551 assert!((cb_s[i] as i32 - cb_v[i] as i32).abs() <= 1);
2552 assert!((cr_s[i] as i32 - cr_v[i] as i32).abs() <= 1);
2553 }
2554 }
2555
2556 #[test]
2557 fn bt601_to_bt709_10bit_clamps_ranges() {
2558 let w = 32;
2560 let h = 16;
2561 let (mut y, mut cb, mut cr) = synth_601_frame_10bit(w, h);
2562 bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
2563 for &v in cb.iter().chain(cr.iter()) {
2564 assert!(
2565 (64..=960).contains(&v),
2566 "chroma {} out of 10-bit limited range",
2567 v
2568 );
2569 }
2570 for &v in y.iter() {
2571 assert!(
2572 (64..=940).contains(&v),
2573 "luma {} out of 10-bit limited range",
2574 v
2575 );
2576 }
2577 }
2578
2579 #[test]
2580 fn bt601_to_bt709_10bit_extreme_chroma_clamped_at_high_end() {
2581 let w = 32;
2583 let h = 16;
2584 let mut y = vec![940u16; w * h];
2585 let mut cb = vec![960u16; (w / 2) * (h / 2)];
2586 let mut cr = vec![960u16; (w / 2) * (h / 2)];
2587 bt601_to_bt709_planes_10bit(&mut y, &mut cb, &mut cr, w, h);
2588 for &v in y.iter() {
2589 assert!(v <= 940, "luma {} > 940 (clamp violated)", v);
2590 }
2591 for &v in cb.iter().chain(cr.iter()) {
2592 assert!(v <= 960, "chroma {} > 960 (clamp violated)", v);
2593 }
2594 }
2595
2596 #[test]
2599 fn downsample_4x4_box_average_8bit_hand_verified() {
2600 let cb: Vec<u8> = vec![
2614 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
2615 ];
2616 let cr: Vec<u8> = vec![
2618 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105, 115, 125, 135, 145, 155,
2619 ];
2620 let y: Vec<u8> = (0..16).map(|i| i as u8 * 8).collect();
2623
2624 let out = downsample_chroma_444_to_420(&y, &cb, &cr, 4, 4);
2625 assert_eq!(out.len(), 16 + 4 + 4);
2627 assert_eq!(&out[..16], y.as_slice(), "Y must round-trip verbatim");
2628 assert_eq!(out[16], 35, "Cb block (0,0)");
2630 assert_eq!(out[17], 55, "Cb block (1,0)");
2631 assert_eq!(out[18], 115, "Cb block (0,1)");
2632 assert_eq!(out[19], 135, "Cb block (1,1)");
2633 assert_eq!(out[20], 30, "Cr block (0,0): (5+15+45+55+2)>>2 = 30");
2635 assert_eq!(out[21], 50, "Cr block (1,0): (25+35+65+75+2)>>2 = 50");
2636 assert_eq!(out[22], 110, "Cr block (0,1): (85+95+125+135+2)>>2 = 110");
2637 assert_eq!(out[23], 130, "Cr block (1,1): (105+115+145+155+2)>>2 = 130");
2638 }
2639
2640 #[test]
2641 fn downsample_constant_input_8bit_yields_constant_output() {
2642 let w = 16;
2645 let h = 16;
2646 let y = vec![64u8; w * h];
2647 let cb = vec![128u8; w * h];
2648 let cr = vec![128u8; w * h];
2649 let out = downsample_chroma_444_to_420(&y, &cb, &cr, w, h);
2650 let cw = (w + 1) / 2;
2651 let ch = (h + 1) / 2;
2652 assert_eq!(out.len(), w * h + 2 * cw * ch);
2653 for i in 0..w * h {
2655 assert_eq!(out[i], 64, "Y[{}] should be 64", i);
2656 }
2657 for i in (w * h)..(w * h + 2 * cw * ch) {
2659 assert_eq!(out[i], 128, "chroma[{}] should be 128", i - w * h);
2660 }
2661 }
2662
2663 #[test]
2664 fn downsample_odd_dimensions_clamp_policy() {
2665 let w = 7;
2677 let h = 7;
2678 let y = vec![100u8; w * h];
2679 let cb = vec![128u8; w * h];
2680 let cr = vec![64u8; w * h];
2681 let out = downsample_chroma_444_to_420(&y, &cb, &cr, w, h);
2682 let cw = (w + 1) / 2; let ch = (h + 1) / 2; assert_eq!(cw, 4);
2685 assert_eq!(ch, 4);
2686 assert_eq!(out.len(), w * h + 2 * cw * ch);
2687 for i in 0..w * h {
2689 assert_eq!(out[i], 100);
2690 }
2691 for cx in 0..cw {
2693 for cy in 0..ch {
2694 let idx = w * h + cy * cw + cx;
2695 assert_eq!(out[idx], 128, "Cb[{},{}] expected 128", cx, cy);
2696 }
2697 }
2698 for cx in 0..cw {
2700 for cy in 0..ch {
2701 let idx = w * h + cw * ch + cy * cw + cx;
2702 assert_eq!(out[idx], 64, "Cr[{},{}] expected 64", cx, cy);
2703 }
2704 }
2705 }
2706
2707 #[test]
2708 fn downsample_10bit_constant_input_yields_constant_output() {
2709 let w = 16;
2711 let h = 16;
2712 let y = vec![400u16; w * h];
2713 let cb = vec![512u16; w * h];
2714 let cr = vec![512u16; w * h];
2715 let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
2716 let cw = (w + 1) / 2;
2717 let ch = (h + 1) / 2;
2718 assert_eq!(out.len(), 2 * (w * h + 2 * cw * ch), "10-bit byte count");
2719
2720 for i in 0..w * h {
2722 let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2723 assert_eq!(s, 400, "Y[{}] should be 400", i);
2724 }
2725 let cb_byte_off = w * h * 2;
2727 for i in 0..cw * ch {
2728 let s = u16::from_le_bytes([out[cb_byte_off + i * 2], out[cb_byte_off + i * 2 + 1]]);
2729 assert_eq!(s, 512, "Cb[{}] should be 512", i);
2730 }
2731 let cr_byte_off = cb_byte_off + cw * ch * 2;
2733 for i in 0..cw * ch {
2734 let s = u16::from_le_bytes([out[cr_byte_off + i * 2], out[cr_byte_off + i * 2 + 1]]);
2735 assert_eq!(s, 512, "Cr[{}] should be 512", i);
2736 }
2737 }
2738
2739 #[test]
2740 fn downsample_10bit_max_value_no_overflow() {
2741 let w = 4;
2745 let h = 4;
2746 let y = vec![1023u16; w * h];
2747 let cb = vec![1023u16; w * h];
2748 let cr = vec![1023u16; w * h];
2749 let out = downsample_chroma_444_to_420_10bit(&y, &cb, &cr, w, h);
2750 let cw = (w + 1) / 2;
2751 let ch = (h + 1) / 2;
2752
2753 for i in 0..w * h {
2755 let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2756 assert_eq!(s, 1023, "Y[{}]", i);
2757 }
2758 let cb_byte_off = w * h * 2;
2760 for i in 0..2 * cw * ch {
2761 let s = u16::from_le_bytes([out[cb_byte_off + i * 2], out[cb_byte_off + i * 2 + 1]]);
2762 assert_eq!(s, 1023, "chroma[{}] should be 1023 (no overflow)", i);
2763 }
2764 }
2765
2766 #[test]
2767 fn downsample_10bit_4x4_box_average_hand_verified() {
2768 let cb_u: Vec<u16> = vec![
2770 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
2771 ];
2772 let cr_u: Vec<u16> = vec![
2773 500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800, 500, 600, 700, 800,
2774 ];
2775 let y_u: Vec<u16> = (0..16).map(|i| i as u16 * 50).collect();
2776
2777 let out = downsample_chroma_444_to_420_10bit(&y_u, &cb_u, &cr_u, 4, 4);
2778 assert_eq!(out.len(), 32 + 8 + 8);
2780
2781 for i in 0..16 {
2783 let s = u16::from_le_bytes([out[i * 2], out[i * 2 + 1]]);
2784 assert_eq!(s, i as u16 * 50, "Y[{}]", i);
2785 }
2786 let cb_off = 32;
2788 let cb0 = u16::from_le_bytes([out[cb_off], out[cb_off + 1]]);
2789 let cb1 = u16::from_le_bytes([out[cb_off + 2], out[cb_off + 3]]);
2790 let cb2 = u16::from_le_bytes([out[cb_off + 4], out[cb_off + 5]]);
2791 let cb3 = u16::from_le_bytes([out[cb_off + 6], out[cb_off + 7]]);
2792 assert_eq!(cb0, 35);
2793 assert_eq!(cb1, 55);
2794 assert_eq!(cb2, 115);
2795 assert_eq!(cb3, 135);
2796 let cr_off = cb_off + 8;
2800 let cr0 = u16::from_le_bytes([out[cr_off], out[cr_off + 1]]);
2801 let cr1 = u16::from_le_bytes([out[cr_off + 2], out[cr_off + 3]]);
2802 assert_eq!(cr0, 550);
2803 assert_eq!(cr1, 750);
2804 }
2805
2806 #[test]
2807 fn downsample_frame_yuv444p10le_to_yuv420p10le() {
2808 let w = 16;
2811 let h = 16;
2812 let plane = w * h;
2813 let mut buf = Vec::with_capacity(3 * plane * 2);
2814 for _ in 0..plane {
2815 buf.extend_from_slice(&500u16.to_le_bytes()); }
2817 for _ in 0..plane {
2818 buf.extend_from_slice(&512u16.to_le_bytes()); }
2820 for _ in 0..plane {
2821 buf.extend_from_slice(&512u16.to_le_bytes()); }
2823 let frame = VideoFrame::new(
2824 bytes::Bytes::from(buf),
2825 w as u32,
2826 h as u32,
2827 PixelFormat::Yuv444p10le,
2828 ColorSpace::Bt2020,
2829 42,
2830 );
2831 let out = downsample_444_to_420_frame(&frame).expect("downsample");
2832 assert_eq!(out.format, PixelFormat::Yuv420p10le);
2833 assert_eq!(out.width, w as u32);
2834 assert_eq!(out.height, h as u32);
2835 assert_eq!(out.pts, 42, "PTS preserved");
2836 assert_eq!(out.color_space, ColorSpace::Bt2020, "color_space preserved");
2837
2838 let cw = w / 2;
2840 let ch = h / 2;
2841 let expected_bytes = 2 * (w * h + 2 * cw * ch);
2842 assert_eq!(out.data.len(), expected_bytes);
2843
2844 let y0 = u16::from_le_bytes([out.data[0], out.data[1]]);
2846 assert_eq!(y0, 500);
2847 let cb0 = u16::from_le_bytes([out.data[w * h * 2], out.data[w * h * 2 + 1]]);
2849 assert_eq!(cb0, 512);
2850 }
2851
2852 #[test]
2853 fn downsample_frame_yuva444p10le_drops_alpha() {
2854 let w = 8;
2857 let h = 8;
2858 let plane = w * h;
2859 let mut buf = Vec::with_capacity(4 * plane * 2);
2860 for _ in 0..plane {
2861 buf.extend_from_slice(&600u16.to_le_bytes());
2862 }
2863 for _ in 0..plane {
2864 buf.extend_from_slice(&500u16.to_le_bytes());
2865 }
2866 for _ in 0..plane {
2867 buf.extend_from_slice(&500u16.to_le_bytes());
2868 }
2869 for _ in 0..plane {
2870 buf.extend_from_slice(&65535u16.to_le_bytes());
2872 }
2873 let frame = VideoFrame::new(
2874 bytes::Bytes::from(buf),
2875 w as u32,
2876 h as u32,
2877 PixelFormat::Yuva444p10le,
2878 ColorSpace::Bt2020,
2879 7,
2880 );
2881 let out = downsample_444_to_420_frame(&frame).expect("downsample with alpha");
2882 assert_eq!(out.format, PixelFormat::Yuv420p10le);
2883 let cw = w / 2;
2885 let ch = h / 2;
2886 let expected = 2 * (w * h + 2 * cw * ch);
2887 assert_eq!(out.data.len(), expected);
2888 for i in (0..out.data.len()).step_by(2) {
2890 let s = u16::from_le_bytes([out.data[i], out.data[i + 1]]);
2891 assert!(
2892 s < 1024 || s == 65535 && false,
2893 "stray alpha sample {} at {}",
2894 s,
2895 i
2896 );
2897 assert_ne!(s, 65535, "alpha plane leaked into output");
2898 }
2899 }
2900
2901 #[test]
2902 fn downsample_frame_rejects_non_444() {
2903 let w = 16;
2905 let h = 16;
2906 let plane = w * h;
2907 let mut buf = Vec::with_capacity(plane + 2 * (plane / 4));
2908 buf.resize(plane + 2 * (plane / 4), 128);
2909 let frame = VideoFrame::new(
2910 bytes::Bytes::from(buf),
2911 w as u32,
2912 h as u32,
2913 PixelFormat::Yuv420p,
2914 ColorSpace::Bt709,
2915 0,
2916 );
2917 let err = downsample_444_to_420_frame(&frame).unwrap_err();
2918 assert!(format!("{}", err).contains("expected 4:4:4 input"));
2919 }
2920}