use core::arch::x86_64::*;
use super::{endian::load_endian_u16x32, *};
use crate::row::arch::x86_sse41::endian::load_endian_u16x8;
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
rgb_out: &mut [u8],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
unsafe {
let shr_count = _mm_cvtsi32_si128((BITS - 8) as i32);
let zero128 = _mm_setzero_si128();
let mask512 = _mm512_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mut x = 0usize;
while x + 32 <= width {
let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
let r_sh = _mm512_srl_epi16(r_v, shr_count);
let g_sh = _mm512_srl_epi16(g_v, shr_count);
let b_sh = _mm512_srl_epi16(b_v, shr_count);
{
let r_q = _mm512_extracti32x4_epi32::<0>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<0>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<0>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 48];
write_rgb_16(r_u8, g_u8, b_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add(x * 3), 24);
}
{
let r_q = _mm512_extracti32x4_epi32::<1>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<1>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<1>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 48];
write_rgb_16(r_u8, g_u8, b_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add((x + 8) * 3), 24);
}
{
let r_q = _mm512_extracti32x4_epi32::<2>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<2>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<2>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 48];
write_rgb_16(r_u8, g_u8, b_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add((x + 16) * 3), 24);
}
{
let r_q = _mm512_extracti32x4_epi32::<3>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<3>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<3>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 48];
write_rgb_16(r_u8, g_u8, b_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add((x + 24) * 3), 24);
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
let r_sh = _mm_srl_epi16(r_v, shr_count);
let g_sh = _mm_srl_epi16(g_v, shr_count);
let b_sh = _mm_srl_epi16(b_v, shr_count);
let r_u8 = _mm_packus_epi16(r_sh, zero128);
let g_u8 = _mm_packus_epi16(g_sh, zero128);
let b_u8 = _mm_packus_epi16(b_sh, zero128);
let mut tmp = [0u8; 48];
write_rgb_16(r_u8, g_u8, b_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add(x * 3), 24);
x += 8;
}
if x < width {
scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&mut rgb_out[x * 3..width * 3],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
rgba_out: &mut [u8],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let shr_count = _mm_cvtsi32_si128((BITS - 8) as i32);
let zero128 = _mm_setzero_si128();
let mask512 = _mm512_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let opaque_u16 = _mm_set1_epi16(0x00FF_u16 as i16);
let opaque_u8 = _mm_packus_epi16(opaque_u16, zero128);
let mut x = 0usize;
while x + 32 <= width {
let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
let r_sh = _mm512_srl_epi16(r_v, shr_count);
let g_sh = _mm512_srl_epi16(g_v, shr_count);
let b_sh = _mm512_srl_epi16(b_v, shr_count);
{
let r_q = _mm512_extracti32x4_epi32::<0>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<0>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<0>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<1>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<1>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<1>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 8) * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<2>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<2>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<2>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 16) * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<3>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<3>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<3>(b_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 24) * 4), 32);
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
let r_sh = _mm_srl_epi16(r_v, shr_count);
let g_sh = _mm_srl_epi16(g_v, shr_count);
let b_sh = _mm_srl_epi16(b_v, shr_count);
let r_u8 = _mm_packus_epi16(r_sh, zero128);
let g_u8 = _mm_packus_epi16(g_sh, zero128);
let b_u8 = _mm_packus_epi16(b_sh, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32);
x += 8;
}
if x < width {
scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&mut rgba_out[x * 4..width * 4],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
a: &[u16],
rgba_out: &mut [u8],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(a.len() >= width, "a row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let shr_count = _mm_cvtsi32_si128((BITS - 8) as i32);
let zero128 = _mm_setzero_si128();
let mask512 = _mm512_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mut x = 0usize;
while x + 32 <= width {
let r_v = _mm512_and_si512(load_endian_u16x32::<BE>(r.as_ptr().add(x).cast()), mask512);
let g_v = _mm512_and_si512(load_endian_u16x32::<BE>(g.as_ptr().add(x).cast()), mask512);
let b_v = _mm512_and_si512(load_endian_u16x32::<BE>(b.as_ptr().add(x).cast()), mask512);
let a_v = _mm512_and_si512(load_endian_u16x32::<BE>(a.as_ptr().add(x).cast()), mask512);
let r_sh = _mm512_srl_epi16(r_v, shr_count);
let g_sh = _mm512_srl_epi16(g_v, shr_count);
let b_sh = _mm512_srl_epi16(b_v, shr_count);
let a_sh = _mm512_srl_epi16(a_v, shr_count);
{
let r_q = _mm512_extracti32x4_epi32::<0>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<0>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<0>(b_sh);
let a_q = _mm512_extracti32x4_epi32::<0>(a_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let a_u8 = _mm_packus_epi16(a_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, a_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<1>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<1>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<1>(b_sh);
let a_q = _mm512_extracti32x4_epi32::<1>(a_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let a_u8 = _mm_packus_epi16(a_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, a_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 8) * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<2>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<2>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<2>(b_sh);
let a_q = _mm512_extracti32x4_epi32::<2>(a_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let a_u8 = _mm_packus_epi16(a_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, a_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 16) * 4), 32);
}
{
let r_q = _mm512_extracti32x4_epi32::<3>(r_sh);
let g_q = _mm512_extracti32x4_epi32::<3>(g_sh);
let b_q = _mm512_extracti32x4_epi32::<3>(b_sh);
let a_q = _mm512_extracti32x4_epi32::<3>(a_sh);
let r_u8 = _mm_packus_epi16(r_q, zero128);
let g_u8 = _mm_packus_epi16(g_q, zero128);
let b_u8 = _mm_packus_epi16(b_q, zero128);
let a_u8 = _mm_packus_epi16(a_q, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, a_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add((x + 24) * 4), 32);
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
let r_sh = _mm_srl_epi16(r_v, shr_count);
let g_sh = _mm_srl_epi16(g_v, shr_count);
let b_sh = _mm_srl_epi16(b_v, shr_count);
let a_sh = _mm_srl_epi16(a_v, shr_count);
let r_u8 = _mm_packus_epi16(r_sh, zero128);
let g_u8 = _mm_packus_epi16(g_sh, zero128);
let b_u8 = _mm_packus_epi16(b_sh, zero128);
let a_u8 = _mm_packus_epi16(a_sh, zero128);
let mut tmp = [0u8; 64];
write_rgba_16(r_u8, g_u8, b_u8, a_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32);
x += 8;
}
if x < width {
scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&a[x..width],
&mut rgba_out[x * 4..width * 4],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
rgb_u16_out: &mut [u16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
unsafe {
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mut x = 0usize;
while x + 32 <= width {
{
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
mask128,
);
write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 8) * 3));
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
mask128,
);
write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 16) * 3));
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
mask128,
);
write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add((x + 24) * 3));
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
write_rgb_u16_8(r_v, g_v, b_v, rgb_u16_out.as_mut_ptr().add(x * 3));
x += 8;
}
if x < width {
scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&mut rgb_u16_out[x * 3..width * 3],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
rgba_u16_out: &mut [u16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(
rgba_u16_out.len() >= width * 4,
"rgba_u16_out row too short"
);
unsafe {
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let opaque = mask128;
let mut x = 0usize;
while x + 32 <= width {
{
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
opaque,
rgba_u16_out.as_mut_ptr().add((x + 8) * 4),
);
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
opaque,
rgba_u16_out.as_mut_ptr().add((x + 16) * 4),
);
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
opaque,
rgba_u16_out.as_mut_ptr().add((x + 24) * 4),
);
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
write_rgba_u16_8(r_v, g_v, b_v, opaque, rgba_u16_out.as_mut_ptr().add(x * 4));
x += 8;
}
if x < width {
scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&mut rgba_u16_out[x * 4..width * 4],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
g: &[u16],
b: &[u16],
r: &[u16],
a: &[u16],
rgba_u16_out: &mut [u16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(a.len() >= width, "a row too short");
debug_assert!(
rgba_u16_out.len() >= width * 4,
"rgba_u16_out row too short"
);
unsafe {
let mask128 = _mm_set1_epi16(((1u32 << BITS) - 1) as u16 as i16);
let mut x = 0usize;
while x + 32 <= width {
{
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 8).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 8).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 8).cast()),
mask128,
);
let a_v = _mm_and_si128(
load_endian_u16x8::<BE>(a.as_ptr().add(x + 8).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
a_v,
rgba_u16_out.as_mut_ptr().add((x + 8) * 4),
);
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 16).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 16).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 16).cast()),
mask128,
);
let a_v = _mm_and_si128(
load_endian_u16x8::<BE>(a.as_ptr().add(x + 16).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
a_v,
rgba_u16_out.as_mut_ptr().add((x + 16) * 4),
);
}
{
let r_v = _mm_and_si128(
load_endian_u16x8::<BE>(r.as_ptr().add(x + 24).cast()),
mask128,
);
let g_v = _mm_and_si128(
load_endian_u16x8::<BE>(g.as_ptr().add(x + 24).cast()),
mask128,
);
let b_v = _mm_and_si128(
load_endian_u16x8::<BE>(b.as_ptr().add(x + 24).cast()),
mask128,
);
let a_v = _mm_and_si128(
load_endian_u16x8::<BE>(a.as_ptr().add(x + 24).cast()),
mask128,
);
write_rgba_u16_8(
r_v,
g_v,
b_v,
a_v,
rgba_u16_out.as_mut_ptr().add((x + 24) * 4),
);
}
x += 32;
}
while x + 8 <= width {
let r_v = _mm_and_si128(load_endian_u16x8::<BE>(r.as_ptr().add(x).cast()), mask128);
let g_v = _mm_and_si128(load_endian_u16x8::<BE>(g.as_ptr().add(x).cast()), mask128);
let b_v = _mm_and_si128(load_endian_u16x8::<BE>(b.as_ptr().add(x).cast()), mask128);
let a_v = _mm_and_si128(load_endian_u16x8::<BE>(a.as_ptr().add(x).cast()), mask128);
write_rgba_u16_8(r_v, g_v, b_v, a_v, rgba_u16_out.as_mut_ptr().add(x * 4));
x += 8;
}
if x < width {
scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
&g[x..width],
&b[x..width],
&r[x..width],
&a[x..width],
&mut rgba_u16_out[x * 4..width * 4],
width - x,
);
}
}
}