use core::arch::x86_64::*;
use super::super::{
x86_common::{write_rgb_u8_8, write_rgba_u8_8},
x86_sse41::endian::load_endian_u16x8,
};
use crate::{
DcpTargetGamut,
row::scalar::{
self,
xyz12::{oetf_srgb, smpte428_inverse_oetf},
xyz12_constants::xyz_to_rgb_matrix,
},
};
/// Number of pixels consumed by one iteration of the vector loops in this file.
const PIXELS_PER_ITER: usize = 8;
/// Mask ANDed onto every 16-bit sample after it has been shifted right by four
/// bits; it keeps the low 12 bits of the shifted word.
const SAMPLE_MASK_U16: u16 = 0x0FFF;
/// Splits 24 interleaved 16-bit samples into three planar vectors.
///
/// `v0`, `v1`, `v2` are consecutive 8-lane `u16` vectors holding the sequence
/// `c0 c1 c2 c0 c1 c2 …` for eight pixels. The returned tuple holds, in order,
/// the eight `c0` samples, the eight `c1` samples and the eight `c2` samples,
/// each in pixel order.
///
/// # Safety
/// Uses `_mm_shuffle_epi8`, so the caller must ensure SSSE3 is available.
#[inline(always)]
unsafe fn deinterleave_xyz12_8px(
    v0: __m128i,
    v1: __m128i,
    v2: __m128i,
) -> (__m128i, __m128i, __m128i) {
    // A shuffle-control byte with its high bit set makes `pshufb` emit zero for
    // that output byte, which is what lets the three partial results be ORed.
    const NIL: i8 = -1;

    // Shuffles each source vector by its own byte mask and ORs the three
    // partial results into one plane.
    #[inline(always)]
    unsafe fn gather(
        v0: __m128i,
        v1: __m128i,
        v2: __m128i,
        m0: __m128i,
        m1: __m128i,
        m2: __m128i,
    ) -> __m128i {
        unsafe {
            let from_v0 = _mm_shuffle_epi8(v0, m0);
            let from_v1 = _mm_shuffle_epi8(v1, m1);
            let from_v2 = _mm_shuffle_epi8(v2, m2);
            _mm_or_si128(_mm_or_si128(from_v0, from_v1), from_v2)
        }
    }

    unsafe {
        // First channel: words 0, 3, 6 of v0; 1, 4, 7 of v1; 2, 5 of v2.
        let first = gather(
            v0,
            v1,
            v2,
            _mm_setr_epi8(0, 1, 6, 7, 12, 13, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, NIL, NIL, 2, 3, 8, 9, 14, 15, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, 4, 5, 10, 11),
        );
        // Second channel: words 1, 4, 7 of v0; 2, 5 of v1; 0, 3, 6 of v2.
        let second = gather(
            v0,
            v1,
            v2,
            _mm_setr_epi8(2, 3, 8, 9, 14, 15, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, NIL, NIL, 4, 5, 10, 11, NIL, NIL, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, 0, 1, 6, 7, 12, 13),
        );
        // Third channel: words 2, 5 of v0; 0, 3, 6 of v1; 1, 4, 7 of v2.
        let third = gather(
            v0,
            v1,
            v2,
            _mm_setr_epi8(4, 5, 10, 11, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, 0, 1, 6, 7, 12, 13, NIL, NIL, NIL, NIL, NIL, NIL),
            _mm_setr_epi8(NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, NIL, 2, 3, 8, 9, 14, 15),
        );
        (first, second, third)
    }
}
/// Converts eight unsigned 16-bit lanes to eight `f32` lanes.
///
/// The lanes are first zero-extended to 32-bit integers and then converted to
/// single precision, so every `u16` value maps to the same-valued float.
///
/// # Safety
/// Uses AVX2 intrinsics; the caller must ensure AVX2 is available.
#[inline(always)]
unsafe fn u16x8_to_f32x8(v: __m128i) -> __m256 {
    unsafe {
        let widened = _mm256_cvtepu16_epi32(v);
        _mm256_cvtepi32_ps(widened)
    }
}
/// Applies the scalar `smpte428_inverse_oetf` to each of the eight lanes of `v`.
///
/// Every lane is cast from `f32` to `u16` before being handed to the scalar
/// routine, so lanes are expected to carry integer code values. The vector is
/// spilled to a stack array, processed lane by lane and reloaded.
///
/// # Safety
/// Uses AVX intrinsics; the caller must ensure they are available.
#[inline(always)]
unsafe fn smpte428_inv_oetf_scalar8(v: __m256) -> __m256 {
    unsafe {
        let mut codes = [0.0_f32; 8];
        _mm256_storeu_ps(codes.as_mut_ptr(), v);
        let linear: [f32; 8] = codes.map(|code| smpte428_inverse_oetf(code as u16));
        _mm256_loadu_ps(linear.as_ptr())
    }
}
/// Applies the scalar `oetf_srgb` to each of the eight lanes of `v`.
///
/// The vector is spilled to a stack array, each lane is passed through the
/// scalar routine, and the results are reloaded as a vector.
///
/// # Safety
/// Uses AVX intrinsics; the caller must ensure they are available.
#[inline(always)]
unsafe fn oetf_srgb_scalar8(v: __m256) -> __m256 {
    unsafe {
        let mut linear = [0.0_f32; 8];
        _mm256_storeu_ps(linear.as_mut_ptr(), v);
        let encoded: [f32; 8] = linear.map(|value| oetf_srgb(value));
        _mm256_loadu_ps(encoded.as_ptr())
    }
}
/// Multiplies eight `(x, y, z)` triples by the 3×3 matrix `m`.
///
/// For each lane the result is `m[row][0] * x + m[row][1] * y + m[row][2] * z`,
/// returned as `(row 0, row 1, row 2)`. Products are combined with separate
/// multiply and add instructions, in the order `(c0*x + c1*y) + c2*z`.
///
/// # Safety
/// Uses AVX intrinsics; the caller must ensure they are available.
#[inline(always)]
unsafe fn matmul_xyz_to_rgb_8lane(
    m: &[[f32; 3]; 3],
    x: __m256,
    y: __m256,
    z: __m256,
) -> (__m256, __m256, __m256) {
    // Dot product of one matrix row with the (x, y, z) lanes.
    #[inline(always)]
    unsafe fn row_dot(coeffs: &[f32; 3], x: __m256, y: __m256, z: __m256) -> __m256 {
        unsafe {
            let [cx, cy, cz] = *coeffs;
            let x_term = _mm256_mul_ps(_mm256_set1_ps(cx), x);
            let y_term = _mm256_mul_ps(_mm256_set1_ps(cy), y);
            let z_term = _mm256_mul_ps(_mm256_set1_ps(cz), z);
            _mm256_add_ps(_mm256_add_ps(x_term, y_term), z_term)
        }
    }

    unsafe {
        let [row_r, row_g, row_b] = m;
        (
            row_dot(row_r, x, y, z),
            row_dot(row_g, x, y, z),
            row_dot(row_b, x, y, z),
        )
    }
}
#[inline(always)]
unsafe fn load_and_matmul_8px<const BE: bool>(
p: *const u8,
m: &[[f32; 3]; 3],
) -> (__m256, __m256, __m256) {
unsafe {
let v0 = load_endian_u16x8::<BE>(p);
let v1 = load_endian_u16x8::<BE>(p.add(16));
let v2 = load_endian_u16x8::<BE>(p.add(32));
let (x_u, y_u, z_u) = deinterleave_xyz12_8px(v0, v1, v2);
let mask = _mm_set1_epi16(SAMPLE_MASK_U16 as i16);
let x_shr = _mm_and_si128(_mm_srli_epi16::<4>(x_u), mask);
let y_shr = _mm_and_si128(_mm_srli_epi16::<4>(y_u), mask);
let z_shr = _mm_and_si128(_mm_srli_epi16::<4>(z_u), mask);
let x_lin = smpte428_inv_oetf_scalar8(u16x8_to_f32x8(x_shr));
let y_lin = smpte428_inv_oetf_scalar8(u16x8_to_f32x8(y_shr));
let z_lin = smpte428_inv_oetf_scalar8(u16x8_to_f32x8(z_shr));
matmul_xyz_to_rgb_8lane(m, x_lin, y_lin, z_lin)
}
}
/// Loads eight interleaved pixels starting at `p` and decodes each channel to
/// linear `f32` lanes.
///
/// Three vectors are fetched through `load_endian_u16x8::<BE>` at byte offsets
/// 0, 16 and 32, deinterleaved into planes, and every 16-bit word is shifted
/// right by four bits and masked with [`SAMPLE_MASK_U16`] to obtain its 12-bit
/// code. The codes are then passed through the scalar SMPTE 428 inverse OETF.
///
/// Returns the three decoded planes in channel order.
///
/// # Safety
/// - `p` must be valid for the three loads described above (presumably 16 bytes
///   each, i.e. 48 bytes in total — confirm against `load_endian_u16x8`).
/// - Uses AVX2 intrinsics; the caller must ensure AVX2 is available.
#[inline(always)]
unsafe fn load_xyz_linear_8px<const BE: bool>(p: *const u8) -> (__m256, __m256, __m256) {
    // Raw 16-bit words -> 12-bit codes -> f32 lanes -> linear values.
    #[inline(always)]
    unsafe fn decode_plane(raw: __m128i, keep12: __m128i) -> __m256 {
        unsafe {
            let code = _mm_and_si128(_mm_srli_epi16::<4>(raw), keep12);
            smpte428_inv_oetf_scalar8(u16x8_to_f32x8(code))
        }
    }

    unsafe {
        let words = (
            load_endian_u16x8::<BE>(p),
            load_endian_u16x8::<BE>(p.add(16)),
            load_endian_u16x8::<BE>(p.add(32)),
        );
        let (x_raw, y_raw, z_raw) = deinterleave_xyz12_8px(words.0, words.1, words.2);
        let keep12 = _mm_set1_epi16(SAMPLE_MASK_U16 as i16);
        (
            decode_plane(x_raw, keep12),
            decode_plane(y_raw, keep12),
            decode_plane(z_raw, keep12),
        )
    }
}
/// Clamps each lane of `v` to `[zero, one]`, multiplies by `scale`, adds 0.5 and
/// truncates to a 32-bit integer (round-half-up for non-negative values).
///
/// `zero`, `one` and `scale` are passed in as pre-broadcast vectors so callers
/// can build them once outside their loops. Because `v` is the first operand of
/// the max, a NaN lane yields the `zero` operand and therefore ends up as 0 when
/// `zero` is all zeros.
///
/// # Safety
/// Uses AVX intrinsics; the caller must ensure they are available.
#[inline(always)]
unsafe fn clamp_scale_to_u32x8(v: __m256, zero: __m256, one: __m256, scale: __m256) -> __m256i {
    unsafe {
        let bounded = _mm256_min_ps(_mm256_max_ps(v, zero), one);
        let product = _mm256_mul_ps(bounded, scale);
        let biased = _mm256_add_ps(product, _mm256_set1_ps(0.5));
        _mm256_cvttps_epi32(biased)
    }
}
/// Narrows eight 32-bit lanes to eight 16-bit lanes with unsigned saturation,
/// preserving lane order.
///
/// Lanes are treated as signed 32-bit inputs: negative values become 0 and
/// values above 65535 become 65535.
///
/// # Safety
/// Uses AVX2 and SSE4.1 intrinsics; the caller must ensure AVX2 is available.
#[inline(always)]
unsafe fn narrow_u32x8_to_u16x8(lo: __m256i) -> __m128i {
    unsafe {
        // Packing the two 128-bit halves directly keeps lanes 0..3 ahead of
        // lanes 4..7, so no cross-lane permute is needed afterwards.
        let lower = _mm256_castsi256_si128(lo);
        let upper = _mm256_extracti128_si256::<1>(lo);
        _mm_packus_epi32(lower, upper)
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved 8-bit RGB.
///
/// Full groups of eight pixels are decoded, multiplied by the matrix for
/// `target_gamut`, passed through the sRGB OETF, clamped to `[0, 1]`, scaled by
/// 255 with rounding and written as `R G B` bytes. Any remaining pixels are
/// handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` must hold at least `width * 3` samples and `rgb_out` at least
///   `width * 3` bytes. This is only checked in debug builds, and the vector
///   loop accesses both buffers through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgb_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let full_scale = _mm256_set1_ps(255.0);
        let pad = _mm_setzero_si128();
        let src = xyz.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            // Per channel: OETF -> clamp/scale/round -> u32 -> u16 -> u8.
            let r_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(r_lin), floor, ceil, full_scale);
            let g_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(g_lin), floor, ceil, full_scale);
            let b_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(b_lin), floor, ceil, full_scale);
            let r8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(r_q), pad);
            let g8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(g_q), pad);
            let b8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(b_q), pad);
            write_rgb_u8_8(r8, g8, b8, dst.add(px * 3));
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved 8-bit RGBA.
///
/// The colour channels go through the same pipeline as the RGB kernel (matrix
/// for `target_gamut`, sRGB OETF, clamp, scale by 255 with rounding); the alpha
/// byte of every pixel is set to `0xFF`. Pixels beyond the last full group of
/// eight are handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` must hold at least `width * 3` samples and `rgba_out` at least
///   `width * 4` bytes. This is only checked in debug builds, and the vector
///   loop accesses both buffers through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgba_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [u8],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let full_scale = _mm256_set1_ps(255.0);
        let pad = _mm_setzero_si128();
        // All-ones bytes: fully opaque alpha for every pixel.
        let opaque = _mm_set1_epi8(-1_i8);
        let src = xyz.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            let r_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(r_lin), floor, ceil, full_scale);
            let g_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(g_lin), floor, ceil, full_scale);
            let b_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(b_lin), floor, ceil, full_scale);
            let r8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(r_q), pad);
            let g8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(g_q), pad);
            let b8 = _mm_packus_epi16(narrow_u32x8_to_u16x8(b_q), pad);
            write_rgba_u8_8(r8, g8, b8, opaque, dst.add(px * 4));
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved 16-bit RGB.
///
/// Full groups of eight pixels are decoded, multiplied by the matrix for
/// `target_gamut`, passed through the sRGB OETF, clamped to `[0, 1]` and scaled
/// by 65535 with rounding. The three planar results are spilled to the stack
/// and interleaved with scalar stores. Remaining pixels are handed to the
/// scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` and `rgb_out` must each hold at least `width * 3` elements. This is
///   only checked in debug builds, and the vector loop accesses both buffers
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgb_u16_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let full_scale = _mm256_set1_ps(65535.0);
        let src = xyz.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            let r_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(r_lin), floor, ceil, full_scale);
            let g_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(g_lin), floor, ceil, full_scale);
            let b_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(b_lin), floor, ceil, full_scale);
            // planes[0] = R, planes[1] = G, planes[2] = B, eight lanes each.
            let mut planes = [[0u16; 8]; 3];
            _mm_storeu_si128(
                planes[0].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(r_q),
            );
            _mm_storeu_si128(
                planes[1].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(g_q),
            );
            _mm_storeu_si128(
                planes[2].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(b_q),
            );
            let group = dst.add(px * 3);
            for lane in 0..PIXELS_PER_ITER {
                let texel = group.add(lane * 3);
                texel.write(planes[0][lane]);
                texel.add(1).write(planes[1][lane]);
                texel.add(2).write(planes[2][lane]);
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_u16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved 16-bit RGBA.
///
/// The colour channels follow the 16-bit RGB pipeline (matrix for
/// `target_gamut`, sRGB OETF, clamp, scale by 65535 with rounding); the alpha
/// element of every pixel is set to `0xFFFF`. Pixels beyond the last full group
/// of eight are handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` must hold at least `width * 3` samples and `rgba_out` at least
///   `width * 4` elements. This is only checked in debug builds, and the vector
///   loop accesses both buffers through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgba_u16_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [u16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let full_scale = _mm256_set1_ps(65535.0);
        let src = xyz.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            let r_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(r_lin), floor, ceil, full_scale);
            let g_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(g_lin), floor, ceil, full_scale);
            let b_q = clamp_scale_to_u32x8(oetf_srgb_scalar8(b_lin), floor, ceil, full_scale);
            // planes[0] = R, planes[1] = G, planes[2] = B, eight lanes each.
            let mut planes = [[0u16; 8]; 3];
            _mm_storeu_si128(
                planes[0].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(r_q),
            );
            _mm_storeu_si128(
                planes[1].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(g_q),
            );
            _mm_storeu_si128(
                planes[2].as_mut_ptr() as *mut __m128i,
                narrow_u32x8_to_u16x8(b_q),
            );
            let group = dst.add(px * 4);
            for lane in 0..PIXELS_PER_ITER {
                let texel = group.add(lane * 4);
                texel.write(planes[0][lane]);
                texel.add(1).write(planes[1][lane]);
                texel.add(2).write(planes[2][lane]);
                // Fully opaque alpha.
                texel.add(3).write(0xFFFF);
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_u16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved `f32` RGB.
///
/// Full groups of eight pixels are decoded to linear values and multiplied by
/// the matrix for `target_gamut`; the matrix output is written as-is. Unlike
/// the integer and f16 kernels in this file, this path applies no OETF and no
/// clamp. NOTE(review): confirm this matches the scalar kernel used for the
/// tail. Remaining pixels are handed to that scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` and `rgb_out` must each hold at least `width * 3` elements. This is
///   only checked in debug builds, and the vector loop accesses both buffers
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgb_f32_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [f32],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let src = xyz.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            // planes[0] = R, planes[1] = G, planes[2] = B, eight lanes each.
            let mut planes = [[0.0_f32; 8]; 3];
            _mm256_storeu_ps(planes[0].as_mut_ptr(), r_lin);
            _mm256_storeu_ps(planes[1].as_mut_ptr(), g_lin);
            _mm256_storeu_ps(planes[2].as_mut_ptr(), b_lin);
            let group = dst.add(px * 3);
            for lane in 0..PIXELS_PER_ITER {
                let texel = group.add(lane * 3);
                texel.write(planes[0][lane]);
                texel.add(1).write(planes[1][lane]);
                texel.add(2).write(planes[2][lane]);
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_f32_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Decodes one row of interleaved XYZ12 samples to interleaved linear `f32`
/// XYZ.
///
/// Full groups of eight pixels are loaded, reduced to their 12-bit codes and
/// passed through the SMPTE 428 inverse OETF; no matrix is applied. Remaining
/// pixels are handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` and `xyz_out` must each hold at least `width * 3` elements. This is
///   only checked in debug builds, and the vector loop accesses both buffers
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_xyz_f32_row<const BE: bool>(
    xyz: &[u16],
    xyz_out: &mut [f32],
    width: usize,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(xyz_out.len() >= width * 3, "xyz_out row too short");
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let src = xyz.as_ptr();
        let dst = xyz_out.as_mut_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (x_lin, y_lin, z_lin) =
                load_xyz_linear_8px::<BE>(src.add(px * 3) as *const u8);
            // planes[0] = X, planes[1] = Y, planes[2] = Z, eight lanes each.
            let mut planes = [[0.0_f32; 8]; 3];
            _mm256_storeu_ps(planes[0].as_mut_ptr(), x_lin);
            _mm256_storeu_ps(planes[1].as_mut_ptr(), y_lin);
            _mm256_storeu_ps(planes[2].as_mut_ptr(), z_lin);
            let group = dst.add(px * 3);
            for lane in 0..PIXELS_PER_ITER {
                let texel = group.add(lane * 3);
                texel.write(planes[0][lane]);
                texel.add(1).write(planes[1][lane]);
                texel.add(2).write(planes[2][lane]);
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_xyz_f32_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut xyz_out[simd_end * 3..width * 3],
                width - simd_end,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved half-float RGB.
///
/// Full groups of eight pixels are decoded, multiplied by the matrix for
/// `target_gamut`, passed through the sRGB OETF and clamped to `[0, 1]`; each
/// value is then converted with `half::f16::from_f32`. Remaining pixels are
/// handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` must hold at least `width * 3` samples; this is only checked in
///   debug builds and the vector loop reads it through a raw pointer.
///   `rgb_out` is written with bounds-checked indexing.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgb_f16_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [half::f16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let src = xyz.as_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            // Per channel: OETF, then clamp to [0, 1].
            let r_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(r_lin), floor), ceil);
            let g_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(g_lin), floor), ceil);
            let b_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(b_lin), floor), ceil);
            let mut red = [0.0_f32; 8];
            let mut green = [0.0_f32; 8];
            let mut blue = [0.0_f32; 8];
            _mm256_storeu_ps(red.as_mut_ptr(), r_enc);
            _mm256_storeu_ps(green.as_mut_ptr(), g_enc);
            _mm256_storeu_ps(blue.as_mut_ptr(), b_enc);
            for (lane, ((&r, &g), &b)) in red.iter().zip(&green).zip(&blue).enumerate() {
                let base = (px + lane) * 3;
                rgb_out[base] = half::f16::from_f32(r);
                rgb_out[base + 1] = half::f16::from_f32(g);
                rgb_out[base + 2] = half::f16::from_f32(b);
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_f16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of interleaved XYZ12 samples to interleaved half-float
/// RGBA.
///
/// The colour channels follow the half-float RGB pipeline (matrix for
/// `target_gamut`, sRGB OETF, clamp to `[0, 1]`, `half::f16::from_f32`); the
/// alpha element of every pixel is set to 1.0. Pixels beyond the last full
/// group of eight are handed to the scalar kernel of the same name.
///
/// # Safety
/// - The CPU must support AVX2.
/// - `xyz` must hold at least `width * 3` samples; this is only checked in
///   debug builds and the vector loop reads it through a raw pointer.
///   `rgba_out` is written with bounds-checked indexing.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn xyz12_to_rgba_f16_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [half::f16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    let opaque = half::f16::from_f32(1.0);
    // First pixel index that no longer belongs to a full 8-pixel group.
    let simd_end = width - width % PIXELS_PER_ITER;
    unsafe {
        let floor = _mm256_setzero_ps();
        let ceil = _mm256_set1_ps(1.0);
        let src = xyz.as_ptr();
        let mut px = 0usize;
        while px < simd_end {
            let (r_lin, g_lin, b_lin) =
                load_and_matmul_8px::<BE>(src.add(px * 3) as *const u8, &matrix);
            // Per channel: OETF, then clamp to [0, 1].
            let r_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(r_lin), floor), ceil);
            let g_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(g_lin), floor), ceil);
            let b_enc = _mm256_min_ps(_mm256_max_ps(oetf_srgb_scalar8(b_lin), floor), ceil);
            let mut red = [0.0_f32; 8];
            let mut green = [0.0_f32; 8];
            let mut blue = [0.0_f32; 8];
            _mm256_storeu_ps(red.as_mut_ptr(), r_enc);
            _mm256_storeu_ps(green.as_mut_ptr(), g_enc);
            _mm256_storeu_ps(blue.as_mut_ptr(), b_enc);
            for (lane, ((&r, &g), &b)) in red.iter().zip(&green).zip(&blue).enumerate() {
                let base = (px + lane) * 4;
                rgba_out[base] = half::f16::from_f32(r);
                rgba_out[base + 1] = half::f16::from_f32(g);
                rgba_out[base + 2] = half::f16::from_f32(b);
                rgba_out[base + 3] = opaque;
            }
            px += PIXELS_PER_ITER;
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_f16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}