#![allow(clippy::too_many_arguments)]
use archmage::incant;
use archmage::prelude::*;
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64::{_mm_loadu_si64, _mm_loadu_si128, _mm256_storeu_si256};
#[cfg(target_arch = "aarch64")]
use super::color_convert_neon::convert_420_to_rgb_neon;
/// Returns the fixed-point constants used to turn one Y/Cb/Cr sample triple into RGB.
///
/// The tuple is `(cr_r, cb_g, cr_g, cb_b, y_bias, y_scale, rnd, shr)`. The converters in
/// this file combine the values as
/// `((y - y_bias) * y_scale + coef * (chroma - 128) + rnd) >> shr`, so `shr` is the number
/// of fractional bits and `rnd` is half of `1 << shr`.
///
/// * `full_range` — selects the 8-fractional-bit table with no luma bias when `true`,
///   otherwise the 13-fractional-bit table with a luma bias of 16.
/// * `matrix_coeffs` — `1` and `9` pick dedicated chroma weights; every other value uses
///   the fallback row.
///   NOTE(review): 1 / 9 / fallback look like the CICP codes for BT.709 / BT.2020 with a
///   BT.601 default — confirm against the caller.
#[inline]
pub(crate) fn get_coefficients(
    full_range: bool,
    matrix_coeffs: u8,
) -> (i32, i32, i32, i32, i32, i32, i32, i32) {
    // Chroma weights in output order: [cr_r, cb_g, cr_g, cb_b].
    let [cr_r, cb_g, cr_g, cb_b]: [i32; 4] = match (full_range, matrix_coeffs) {
        (true, 1) => [403, -48, -120, 475],
        (true, 9) => [377, -42, -146, 482],
        (true, _) => [359, -88, -183, 454],
        (false, 1) => [14744, -1754, -4383, 17373],
        (false, 9) => [13806, -1541, -5349, 17615],
        (false, _) => [13126, -3222, -6686, 16591],
    };

    // Luma bias/scale plus the rounding term and shift that go with the chosen precision.
    let (y_bias, y_scale, rnd, shr) = if full_range {
        (0, 256, 128, 8)
    } else {
        (16, 9576, 4096, 13)
    };

    (cr_r, cb_g, cr_g, cb_b, y_bias, y_scale, rnd, shr)
}
/// Converts a rectangular region of 4:2:0 subsampled Y/Cb/Cr planes to packed 8-bit RGB.
///
/// This is the public entry point. It forwards every argument unchanged through
/// `incant!`, naming the tiers `[v3, neon, wasm128, scalar]`. Those tier names line up
/// with `convert_420_to_rgb_v3`, `convert_420_to_rgb_wasm128` and
/// `convert_420_to_rgb_scalar` in this file and with the imported
/// `convert_420_to_rgb_neon`.
/// NOTE(review): how `incant!` chooses among the tiers (runtime CPU detection vs.
/// compile-time target) is defined by archmage and is not visible here.
///
/// The parameter descriptions below reflect the scalar and `v3` implementations in this
/// file:
///
/// * `y_plane`, `cb_plane`, `cr_plane` — sample planes; chroma is read at `(y / 2, x / 2)`.
/// * `y_stride`, `c_stride` — row pitch of the luma / chroma planes, in `u16` elements.
/// * `y_start..y_end`, `x_start..x_end` — half-open row and column ranges to convert.
/// * `shift` — right shift applied to every sample before conversion.
/// * `full_range`, `matrix_coeffs` — passed to `get_coefficients` to pick the constants.
/// * `rgb` — output; pixels are written consecutively from index 0, three bytes (R, G, B)
///   per pixel, row after row of the selected region.
///
/// # Panics
///
/// The implementations in this file use bounds-checked indexing, so they panic if a plane
/// or `rgb` is too short for the requested region.
#[allow(clippy::too_many_arguments)]
pub fn convert_420_to_rgb(
y_plane: &[u16],
cb_plane: &[u16],
cr_plane: &[u16],
y_stride: usize,
c_stride: usize,
y_start: u32,
y_end: u32,
x_start: u32,
x_end: u32,
shift: u32,
full_range: bool,
matrix_coeffs: u8,
rgb: &mut [u8],
) {
// Argument order must match the tier-specific functions (after their leading token).
incant!(
convert_420_to_rgb(
y_plane,
cb_plane,
cr_plane,
y_stride,
c_stride,
y_start,
y_end,
x_start,
x_end,
shift,
full_range,
matrix_coeffs,
rgb
),
[v3, neon, wasm128, scalar]
)
}
/// `wasm128` tier of `convert_420_to_rgb`, compiled only for `wasm32` targets.
///
/// There is no dedicated SIMD kernel here: the token is ignored and all arguments are
/// forwarded unchanged to `convert_420_to_rgb_scalar`, so the output is exactly what the
/// scalar path produces.
#[cfg(target_arch = "wasm32")]
#[allow(clippy::too_many_arguments)]
fn convert_420_to_rgb_wasm128(
_token: Wasm128Token,
y_plane: &[u16],
cb_plane: &[u16],
cr_plane: &[u16],
y_stride: usize,
c_stride: usize,
y_start: u32,
y_end: u32,
x_start: u32,
x_end: u32,
shift: u32,
full_range: bool,
matrix_coeffs: u8,
rgb: &mut [u8],
) {
// Delegate to the portable implementation, supplying the scalar token it expects.
convert_420_to_rgb_scalar(
ScalarToken,
y_plane,
cb_plane,
cr_plane,
y_stride,
c_stride,
y_start,
y_end,
x_start,
x_end,
shift,
full_range,
matrix_coeffs,
rgb,
);
}
#[allow(clippy::too_many_arguments)]
fn convert_420_to_rgb_scalar(
_token: ScalarToken,
y_plane: &[u16],
cb_plane: &[u16],
cr_plane: &[u16],
y_stride: usize,
c_stride: usize,
y_start: u32,
y_end: u32,
x_start: u32,
x_end: u32,
shift: u32,
full_range: bool,
matrix_coeffs: u8,
rgb: &mut [u8],
) {
let (cr_r, cb_g, cr_g, cb_b, y_bias, y_scale, rnd, shr) =
get_coefficients(full_range, matrix_coeffs);
let mut out_idx = 0;
for y in y_start..y_end {
let y_row = y as usize * y_stride;
let c_row = (y as usize / 2) * c_stride;
for x in x_start..x_end {
let y_val = (y_plane[y_row + x as usize] >> shift) as i32;
let cx = x as usize / 2;
let c_idx = c_row + cx;
let cb_val = (cb_plane[c_idx] >> shift) as i32;
let cr_val = (cr_plane[c_idx] >> shift) as i32;
let cb = cb_val - 128;
let cr = cr_val - 128;
let yv = (y_val - y_bias) * y_scale;
let r = (yv + cr_r * cr + rnd) >> shr;
let g = (yv + cb_g * cb + cr_g * cr + rnd) >> shr;
let b = (yv + cb_b * cb + rnd) >> shr;
rgb[out_idx] = r.clamp(0, 255) as u8;
rgb[out_idx + 1] = g.clamp(0, 255) as u8;
rgb[out_idx + 2] = b.clamp(0, 255) as u8;
out_idx += 3;
}
}
}
/// `v3` tier of `convert_420_to_rgb`: converts 8 pixels per iteration with 256-bit
/// `_mm256_*` integer intrinsics and falls back to `scalar_pixel` for edge columns.
///
/// Each row is processed in three phases:
/// 1. a scalar prologue for a leading odd column (so the vector loop starts on an even
///    `x`, where a pair of luma columns shares one chroma sample),
/// 2. the vector loop over as many whole groups of 8 pixels as fit,
/// 3. a scalar tail for whatever columns remain.
///
/// The arithmetic mirrors `scalar_pixel` lane by lane:
/// `((y - y_bias) * y_scale + coef * (chroma - 128) + rnd) >> shr`, clamped to `0..=255`.
///
/// # Panics
///
/// Panics if a plane or `rgb` is too short for the requested region (bounds-checked
/// slicing and indexing). The `try_into().unwrap()` calls cannot fail because the slices
/// they convert are taken with exactly 8 and 4 elements.
#[arcane]
#[allow(clippy::too_many_arguments)]
fn convert_420_to_rgb_v3(
_token: X64V3Token,
y_plane: &[u16],
cb_plane: &[u16],
cr_plane: &[u16],
y_stride: usize,
c_stride: usize,
y_start: u32,
y_end: u32,
x_start: u32,
x_end: u32,
shift: u32,
full_range: bool,
matrix_coeffs: u8,
rgb: &mut [u8],
) {
let (cr_r, cb_g, cr_g, cb_b, y_bias, y_scale, rnd, shr) =
get_coefficients(full_range, matrix_coeffs);
// Broadcast every scalar constant to all eight 32-bit lanes.
let cr_r_v = _mm256_set1_epi32(cr_r);
let cb_g_v = _mm256_set1_epi32(cb_g);
let cr_g_v = _mm256_set1_epi32(cr_g);
let cb_b_v = _mm256_set1_epi32(cb_b);
let y_bias_v = _mm256_set1_epi32(y_bias);
let y_scale_v = _mm256_set1_epi32(y_scale);
let rnd_v = _mm256_set1_epi32(rnd);
let bias128_v = _mm256_set1_epi32(128);
let zero = _mm256_setzero_si256();
let max255 = _mm256_set1_epi32(255);
// The `sra`/`srl` intrinsics take their shift count from a 128-bit register.
let shr_v = _mm_cvtsi32_si128(shr);
let shift_v = _mm_cvtsi32_si128(shift as i32);
let needs_shift = shift > 0;
// Byte shuffle applied to each 128-bit half: input bytes are R0-3, G0-3, B0-3, pad;
// the pattern reorders them to R0 G0 B0 R1 G1 B1 ... and zeroes the last four bytes (-1).
let shuffle = _mm256_setr_epi8(
0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, -1, -1, -1, -1, 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11,
-1, -1, -1, -1,
);
// Vector loop bounds: start at the first even column, cover a multiple of 8 pixels.
let x_simd_start = x_start.next_multiple_of(2);
let row_pixels = x_end.saturating_sub(x_simd_start) as usize;
let simd_count = (row_pixels / 8) * 8;
let x_simd_end = x_simd_start + simd_count as u32;
// Running write position in `rgb`, shared by the scalar and vector phases.
let mut out_idx = 0;
for y in y_start..y_end {
let y_row = y as usize * y_stride;
// Two luma rows share one chroma row.
let c_row = (y as usize / 2) * c_stride;
// Scalar prologue: at most one pixel, present only when `x_start` is odd.
for x in x_start..x_simd_start.min(x_end) {
scalar_pixel(
y_plane,
cb_plane,
cr_plane,
y_row,
c_row,
x as usize,
shift,
y_bias,
y_scale,
cr_r,
cb_g,
cr_g,
cb_b,
rnd,
shr,
rgb,
&mut out_idx,
);
}
let mut x = x_simd_start as usize;
let x_end_simd = x_simd_end as usize;
while x < x_end_simd {
// `x` is even here, so 8 luma columns map onto exactly 4 chroma columns.
let cx = x / 2;
// Load 8 luma samples and widen them to 32-bit lanes.
let y_arr: &[u16; 8] = (&y_plane[y_row + x..y_row + x + 8]).try_into().unwrap();
let y_raw = _mm_loadu_si128(y_arr);
let mut y_i32 = _mm256_cvtepu16_epi32(y_raw);
// Load 4 Cb and 4 Cr samples (64 bits each).
let cb_arr: &[u16; 4] = (&cb_plane[c_row + cx..c_row + cx + 4]).try_into().unwrap();
let cr_arr: &[u16; 4] = (&cr_plane[c_row + cx..c_row + cx + 4]).try_into().unwrap();
let cb_raw = _mm_loadu_si64(cb_arr);
let cr_raw = _mm_loadu_si64(cr_arr);
// Interleave each register with itself so every chroma sample appears twice
// (c0 c0 c1 c1 ...), matching the two luma columns it belongs to.
let cb_dup = _mm_unpacklo_epi16(cb_raw, cb_raw);
let cr_dup = _mm_unpacklo_epi16(cr_raw, cr_raw);
let mut cb_i32 = _mm256_cvtepu16_epi32(cb_dup);
let mut cr_i32 = _mm256_cvtepu16_epi32(cr_dup);
// Apply the caller's sample shift; skipped entirely when it is zero.
if needs_shift {
y_i32 = _mm256_srl_epi32(y_i32, shift_v);
cb_i32 = _mm256_srl_epi32(cb_i32, shift_v);
cr_i32 = _mm256_srl_epi32(cr_i32, shift_v);
}
// Luma term and centered chroma, as in `scalar_pixel`.
let yv = _mm256_mullo_epi32(_mm256_sub_epi32(y_i32, y_bias_v), y_scale_v);
let cb_adj = _mm256_sub_epi32(cb_i32, bias128_v);
let cr_adj = _mm256_sub_epi32(cr_i32, bias128_v);
// R = (yv + cr_r * cr + rnd) >> shr
let r = _mm256_sra_epi32(
_mm256_add_epi32(
_mm256_add_epi32(yv, _mm256_mullo_epi32(cr_r_v, cr_adj)),
rnd_v,
),
shr_v,
);
// G = (yv + cb_g * cb + cr_g * cr + rnd) >> shr
let g = _mm256_sra_epi32(
_mm256_add_epi32(
_mm256_add_epi32(
_mm256_add_epi32(yv, _mm256_mullo_epi32(cb_g_v, cb_adj)),
_mm256_mullo_epi32(cr_g_v, cr_adj),
),
rnd_v,
),
shr_v,
);
// B = (yv + cb_b * cb + rnd) >> shr
let b = _mm256_sra_epi32(
_mm256_add_epi32(
_mm256_add_epi32(yv, _mm256_mullo_epi32(cb_b_v, cb_adj)),
rnd_v,
),
shr_v,
);
// Clamp to 0..=255 while still in 32-bit lanes, so the saturating packs below
// never have to saturate.
let r = _mm256_min_epi32(_mm256_max_epi32(r, zero), max255);
let g = _mm256_min_epi32(_mm256_max_epi32(g, zero), max255);
let b = _mm256_min_epi32(_mm256_max_epi32(b, zero), max255);
// Narrow 32 -> 16 -> 8 bits. The packs work on each 128-bit half separately, so the
// low half ends up as R0-3 G0-3 B0-3 0000 and the high half as R4-7 G4-7 B4-7 0000.
let rg = _mm256_packs_epi32(r, g);
let bz = _mm256_packs_epi32(b, zero);
let packed = _mm256_packus_epi16(rg, bz);
// Reorder each half into interleaved RGB triplets.
let interleaved = _mm256_shuffle_epi8(packed, shuffle);
// Spill to a stack buffer, then copy the 12 useful bytes from each half
// (bytes 0..12 and 16..28); the 4 padding bytes per half are dropped.
let mut buf = [0u8; 32];
_mm256_storeu_si256(&mut buf, interleaved);
rgb[out_idx..out_idx + 12].copy_from_slice(&buf[..12]);
rgb[out_idx + 12..out_idx + 24].copy_from_slice(&buf[16..28]);
out_idx += 24;
x += 8;
}
// Scalar tail: columns left over after the last full group of 8.
for x in x_simd_end..x_end {
scalar_pixel(
y_plane,
cb_plane,
cr_plane,
y_row,
c_row,
x as usize,
shift,
y_bias,
y_scale,
cr_r,
cb_g,
cr_g,
cb_b,
rnd,
shr,
rgb,
&mut out_idx,
);
}
}
}
/// Converts the single pixel at column `x` of the current row and appends it to `rgb`.
///
/// Luma is read from `y_plane[y_row + x]`; both chroma samples come from index
/// `c_row + x / 2` of their planes. Every sample is shifted right by `shift`, chroma is
/// centered by subtracting 128, and each channel is computed as
/// `((y - y_bias) * y_scale + coef * chroma + rnd) >> shr`, clamped to `0..=255`.
///
/// The three bytes are stored as R, G, B at `rgb[*out_idx..]`, after which `*out_idx` is
/// advanced by 3.
///
/// # Panics
///
/// Panics if any plane index or the output position is out of bounds.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub(crate) fn scalar_pixel(
    y_plane: &[u16],
    cb_plane: &[u16],
    cr_plane: &[u16],
    y_row: usize,
    c_row: usize,
    x: usize,
    shift: u32,
    y_bias: i32,
    y_scale: i32,
    cr_r: i32,
    cb_g: i32,
    cr_g: i32,
    cb_b: i32,
    rnd: i32,
    shr: i32,
    rgb: &mut [u8],
    out_idx: &mut usize,
) {
    // Sample loads, in luma / Cb / Cr order.
    let luma = i32::from(y_plane[y_row + x] >> shift);
    let chroma_at = c_row + x / 2;
    let cb_centered = i32::from(cb_plane[chroma_at] >> shift) - 128;
    let cr_centered = i32::from(cr_plane[chroma_at] >> shift) - 128;

    let luma_term = (luma - y_bias) * y_scale;

    // Round, drop the fractional bits, and saturate to one byte.
    let finish = |acc: i32| -> u8 { ((acc + rnd) >> shr).clamp(0, 255) as u8 };

    let red = finish(luma_term + cr_r * cr_centered);
    let green = finish(luma_term + cb_g * cb_centered + cr_g * cr_centered);
    let blue = finish(luma_term + cb_b * cb_centered);

    let base = *out_idx;
    rgb[base] = red;
    rgb[base + 1] = green;
    rgb[base + 2] = blue;
    *out_idx = base + 3;
}