#![allow(dead_code)]
use core::arch::x86_64::*;
use super::*;
/// Splits 8 packed three-channel pixels (24 `u16` samples spread over three
/// 128-bit registers) into three planar vectors of 8 `u16` lanes each.
///
/// `v0`, `v1`, `v2` are the consecutive 16-byte chunks of the pixel group.
/// The result is `(channel 0, channel 1, channel 2)` in memory order, so the
/// caller decides whether that means R,G,B or B,G,R.
///
/// # Safety
/// Uses `pshufb`, so the executing CPU must support SSSE3.
#[inline(always)]
unsafe fn deinterleave_rgb48_8px(
    v0: __m128i,
    v1: __m128i,
    v2: __m128i,
) -> (__m128i, __m128i, __m128i) {
    // A selector byte with its high bit set makes `pshufb` emit a zero byte,
    // which lets the three partial shuffles be merged with a plain OR.
    const Z: i8 = -1;
    unsafe {
        // Shuffle each input chunk with its own selector and OR the pieces.
        macro_rules! gather {
            ($sel0:expr, $sel1:expr, $sel2:expr) => {{
                let from_v0 = _mm_shuffle_epi8(v0, $sel0);
                let from_v1 = _mm_shuffle_epi8(v1, $sel1);
                let from_v2 = _mm_shuffle_epi8(v2, $sel2);
                _mm_or_si128(_mm_or_si128(from_v0, from_v1), from_v2)
            }};
        }

        // Channel 0 lives at sample offsets 0, 3, 6, ... of the 24-sample group.
        let first = gather!(
            _mm_setr_epi8(0, 1, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 14, 15, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 4, 5, 10, 11)
        );
        // Channel 1 lives at sample offsets 1, 4, 7, ...
        let second = gather!(
            _mm_setr_epi8(2, 3, 8, 9, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, Z, Z, 4, 5, 10, 11, Z, Z, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, 6, 7, 12, 13)
        );
        // Channel 2 lives at sample offsets 2, 5, 8, ...
        let third = gather!(
            _mm_setr_epi8(4, 5, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, 0, 1, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z),
            _mm_setr_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 14, 15)
        );
        (first, second, third)
    }
}
/// Builds the 32-lane selector that pulls one channel of 16 four-channel
/// pixels out of a pair of 512-bit registers (64 `u16` samples in total).
///
/// Lanes 0..16 hold `channel + 4 * pixel`; lanes 16..32 are filler that simply
/// repeats `channel`.
const fn channel_from_pair_idx(channel: i16) -> [i16; 32] {
    let mut table = [channel; 32];
    let mut px = 0usize;
    while px < 16 {
        table[px] = channel + 4 * px as i16;
        px += 1;
    }
    table
}

/// Builds the selector that joins two half-filled vectors: lanes 0..16 take
/// indices 0..16 and lanes 16..32 take indices 32..48.
const fn combine_halves_idx() -> [i16; 32] {
    let mut table = [0i16; 32];
    let mut lane = 0usize;
    while lane < 16 {
        table[lane] = lane as i16;
        table[lane + 16] = 32 + lane as i16;
        lane += 1;
    }
    table
}

/// Selector for channel 0 (sample offsets 0, 4, 8, ...) of a register pair.
static C0_FROM_PAIR_IDX: [i16; 32] = channel_from_pair_idx(0);
/// Selector for channel 1 (sample offsets 1, 5, 9, ...) of a register pair.
static C1_FROM_PAIR_IDX: [i16; 32] = channel_from_pair_idx(1);
/// Selector for channel 2 (sample offsets 2, 6, 10, ...) of a register pair.
static C2_FROM_PAIR_IDX: [i16; 32] = channel_from_pair_idx(2);
/// Selector for channel 3 (sample offsets 3, 7, 11, ...) of a register pair.
static C3_FROM_PAIR_IDX: [i16; 32] = channel_from_pair_idx(3);
/// Selector that concatenates the low 16 lanes of two vectors.
static COMBINE_HALVES_IDX: [i16; 32] = combine_halves_idx();
#[inline(always)]
unsafe fn deinterleave_rgba64_32px(
raw0: __m512i,
raw1: __m512i,
raw2: __m512i,
raw3: __m512i,
) -> (__m512i, __m512i, __m512i, __m512i) {
unsafe {
let c0_idx = _mm512_loadu_si512(C0_FROM_PAIR_IDX.as_ptr().cast());
let c1_idx = _mm512_loadu_si512(C1_FROM_PAIR_IDX.as_ptr().cast());
let c2_idx = _mm512_loadu_si512(C2_FROM_PAIR_IDX.as_ptr().cast());
let c3_idx = _mm512_loadu_si512(C3_FROM_PAIR_IDX.as_ptr().cast());
let comb_idx = _mm512_loadu_si512(COMBINE_HALVES_IDX.as_ptr().cast());
let ch0_lo = _mm512_permutex2var_epi16(raw0, c0_idx, raw1);
let ch0_hi = _mm512_permutex2var_epi16(raw2, c0_idx, raw3);
let ch1_lo = _mm512_permutex2var_epi16(raw0, c1_idx, raw1);
let ch1_hi = _mm512_permutex2var_epi16(raw2, c1_idx, raw3);
let ch2_lo = _mm512_permutex2var_epi16(raw0, c2_idx, raw1);
let ch2_hi = _mm512_permutex2var_epi16(raw2, c2_idx, raw3);
let ch3_lo = _mm512_permutex2var_epi16(raw0, c3_idx, raw1);
let ch3_hi = _mm512_permutex2var_epi16(raw2, c3_idx, raw3);
let ch0 = _mm512_permutex2var_epi16(ch0_lo, comb_idx, ch0_hi);
let ch1 = _mm512_permutex2var_epi16(ch1_lo, comb_idx, ch1_hi);
let ch2 = _mm512_permutex2var_epi16(ch2_lo, comb_idx, ch2_hi);
let ch3 = _mm512_permutex2var_epi16(ch3_lo, comb_idx, ch3_hi);
(ch0, ch1, ch2, ch3)
}
}
/// Reduces 32 `u16` lanes to 32 `u8` lanes by keeping each lane's high byte.
///
/// # Safety
/// The CPU must support AVX-512F and AVX-512BW.
#[inline(always)]
unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i {
    unsafe {
        // After the shift every lane is at most 0xFF, so the saturating
        // down-conversion never actually clamps.
        let high_bytes = _mm512_srli_epi16::<8>(v);
        _mm512_cvtusepi16_epi8(high_bytes)
    }
}
/// `true` when the compilation target is big-endian.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Swaps the two bytes of every 16-bit lane of `v` when the sample byte order
/// selected by `BE` differs from the host's; otherwise returns `v` unchanged.
///
/// # Safety
/// The swapping path executes `pshufb`, so the CPU must support SSSE3.
#[inline(always)]
unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i {
    // Input already matches host order: nothing to do.
    if BE == HOST_NATIVE_BE {
        return v;
    }
    unsafe {
        // Exchange each even/odd byte pair.
        let swap_pairs = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        _mm_shuffle_epi8(v, swap_pairs)
    }
}
/// Swaps the two bytes of every 16-bit lane of `v` when the sample byte order
/// selected by `BE` differs from the host's; otherwise returns `v` unchanged.
///
/// # Safety
/// The swapping path executes a 512-bit byte shuffle, so the CPU must support
/// AVX-512F and AVX-512BW.
#[inline(always)]
unsafe fn byteswap512_if_be<const BE: bool>(v: __m512i) -> __m512i {
    // Input already matches host order: nothing to do.
    if BE == HOST_NATIVE_BE {
        return v;
    }

    // The 16-byte pattern 1,0,3,2,...,15,14 repeated four times, once per
    // 128-bit lane (the shuffle selects within each lane independently).
    const SWAP_PAIRS: [u8; 64] = {
        let mut bytes = [0u8; 64];
        let mut i = 0usize;
        while i < 64 {
            bytes[i] = ((i ^ 1) & 15) as u8;
            i += 1;
        }
        bytes
    };
    const MASK: __m512i = unsafe { core::mem::transmute(SWAP_PAIRS) };

    unsafe { _mm512_shuffle_epi8(v, MASK) }
}
/// Converts one row of packed 16-bit RGB samples to packed 8-bit RGB by
/// keeping the high byte of every sample.
///
/// Full groups of 32 pixels are handled with SIMD, as four 8-pixel
/// sub-blocks; the remaining pixels are passed to `scalar::rgb48_to_rgb_row`.
/// `BE` selects the byte order of the input samples; they are byte-swapped
/// after loading when it differs from the host's.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `rgb48` and `rgb_out` must each hold at least `width * 3` elements: the
///   SIMD path accesses them through raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgb48_to_rgb_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let zero = _mm_setzero_si128();
        let mut x = 0usize;
        while x + 32 <= width {
            for sub in 0..4usize {
                let px = x + sub * 8;
                let src = rgb48.as_ptr().add(px * 3);
                let lo = byteswap128_if_be::<BE>(_mm_loadu_si128(src.cast()));
                let mid = byteswap128_if_be::<BE>(_mm_loadu_si128(src.add(8).cast()));
                let hi = byteswap128_if_be::<BE>(_mm_loadu_si128(src.add(16).cast()));
                let (r16, g16, b16) = deinterleave_rgb48_8px(lo, mid, hi);
                let r8 = narrow_u16x8_to_u8x8(r16, zero);
                let g8 = narrow_u16x8_to_u8x8(g16, zero);
                let b8 = narrow_u16x8_to_u8x8(b16, zero);
                // The writer is given a 48-byte scratch area; only the first
                // 24 bytes (8 pixels) are copied to the real output.
                let mut scratch = [0u8; 48];
                write_rgb_16(r8, g8, b8, scratch.as_mut_ptr());
                core::ptr::copy_nonoverlapping(
                    scratch.as_ptr(),
                    rgb_out.as_mut_ptr().add(px * 3),
                    24,
                );
            }
            x += 32;
        }
        if x < width {
            scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgb48_to_rgba_row<const BE: bool>(
rgb48: &[u16],
rgba_out: &mut [u8],
width: usize,
) {
debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let zero = _mm_setzero_si128();
let opaque_u16 = _mm_set1_epi16(0x00FFu16 as i16);
let opaque_u8 = _mm_packus_epi16(opaque_u16, zero);
let mut x = 0usize;
while x + 32 <= width {
let ptr = rgb48.as_ptr().add(x * 3);
macro_rules! process_half {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2);
let ru8 = narrow_u16x8_to_u8x8(r, zero);
let gu8 = narrow_u16x8_to_u8x8(g, zero);
let bu8 = narrow_u16x8_to_u8x8(b, zero);
let mut tmp = [0u8; 64];
write_rgba_16(ru8, gu8, bu8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add($out_off), 32);
}};
}
process_half!(ptr, x * 4);
process_half!(ptr.add(24), (x + 8) * 4);
process_half!(ptr.add(48), (x + 16) * 4);
process_half!(ptr.add(72), (x + 24) * 4);
x += 32;
}
if x < width {
scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row<const BE: bool>(
rgb48: &[u16],
rgb_out: &mut [u16],
width: usize,
) {
debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
unsafe {
let mut x = 0usize;
while x + 32 <= width {
let ptr = rgb48.as_ptr().add(x * 3);
macro_rules! process_half_u16 {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2);
write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off));
}};
}
process_half_u16!(ptr, x * 3);
process_half_u16!(ptr.add(24), (x + 8) * 3);
process_half_u16!(ptr.add(48), (x + 16) * 3);
process_half_u16!(ptr.add(72), (x + 24) * 3);
x += 32;
}
if x < width {
scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row<const BE: bool>(
rgb48: &[u16],
rgba_out: &mut [u16],
width: usize,
) {
debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let opaque = _mm_set1_epi16(0xFFFFu16 as i16);
let mut x = 0usize;
while x + 32 <= width {
let ptr = rgb48.as_ptr().add(x * 3);
macro_rules! process_half_rgba_u16 {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2);
write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off));
}};
}
process_half_rgba_u16!(ptr, x * 4);
process_half_rgba_u16!(ptr.add(24), (x + 8) * 4);
process_half_rgba_u16!(ptr.add(48), (x + 16) * 4);
process_half_rgba_u16!(ptr.add(72), (x + 24) * 4);
x += 32;
}
if x < width {
scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgr48_to_rgb_row<const BE: bool>(
bgr48: &[u16],
rgb_out: &mut [u8],
width: usize,
) {
debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
unsafe {
let zero = _mm_setzero_si128();
let mut x = 0usize;
while x + 32 <= width {
let ptr = bgr48.as_ptr().add(x * 3);
macro_rules! process_half_bgr {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2);
let ru8 = narrow_u16x8_to_u8x8(r, zero);
let gu8 = narrow_u16x8_to_u8x8(g, zero);
let bu8 = narrow_u16x8_to_u8x8(b, zero);
let mut tmp = [0u8; 48];
write_rgb_16(ru8, gu8, bu8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgb_out.as_mut_ptr().add($out_off), 24);
}};
}
process_half_bgr!(ptr, x * 3);
process_half_bgr!(ptr.add(24), (x + 8) * 3);
process_half_bgr!(ptr.add(48), (x + 16) * 3);
process_half_bgr!(ptr.add(72), (x + 24) * 3);
x += 32;
}
if x < width {
scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgr48_to_rgba_row<const BE: bool>(
bgr48: &[u16],
rgba_out: &mut [u8],
width: usize,
) {
debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let zero = _mm_setzero_si128();
let opaque_u16 = _mm_set1_epi16(0x00FFu16 as i16);
let opaque_u8 = _mm_packus_epi16(opaque_u16, zero);
let mut x = 0usize;
while x + 32 <= width {
let ptr = bgr48.as_ptr().add(x * 3);
macro_rules! process_half_bgr_rgba {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2);
let ru8 = narrow_u16x8_to_u8x8(r, zero);
let gu8 = narrow_u16x8_to_u8x8(g, zero);
let bu8 = narrow_u16x8_to_u8x8(b, zero);
let mut tmp = [0u8; 64];
write_rgba_16(ru8, gu8, bu8, opaque_u8, tmp.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp.as_ptr(), rgba_out.as_mut_ptr().add($out_off), 32);
}};
}
process_half_bgr_rgba!(ptr, x * 4);
process_half_bgr_rgba!(ptr.add(24), (x + 8) * 4);
process_half_bgr_rgba!(ptr.add(48), (x + 16) * 4);
process_half_bgr_rgba!(ptr.add(72), (x + 24) * 4);
x += 32;
}
if x < width {
scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row<const BE: bool>(
bgr48: &[u16],
rgb_out: &mut [u16],
width: usize,
) {
debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
unsafe {
let mut x = 0usize;
while x + 32 <= width {
let ptr = bgr48.as_ptr().add(x * 3);
macro_rules! process_half_bgr_u16 {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2);
write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off));
}};
}
process_half_bgr_u16!(ptr, x * 3);
process_half_bgr_u16!(ptr.add(24), (x + 8) * 3);
process_half_bgr_u16!(ptr.add(48), (x + 16) * 3);
process_half_bgr_u16!(ptr.add(72), (x + 24) * 3);
x += 32;
}
if x < width {
scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
}
}
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row<const BE: bool>(
bgr48: &[u16],
rgba_out: &mut [u16],
width: usize,
) {
debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let opaque = _mm_set1_epi16(0xFFFFu16 as i16);
let mut x = 0usize;
while x + 32 <= width {
let ptr = bgr48.as_ptr().add(x * 3);
macro_rules! process_half_bgr_rgba_u16 {
($ptr:expr, $out_off:expr) => {{
let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast()));
let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast()));
let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast()));
let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2);
write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off));
}};
}
process_half_bgr_rgba_u16!(ptr, x * 4);
process_half_bgr_rgba_u16!(ptr.add(24), (x + 8) * 4);
process_half_bgr_rgba_u16!(ptr.add(48), (x + 16) * 4);
process_half_bgr_rgba_u16!(ptr.add(72), (x + 24) * 4);
x += 32;
}
if x < width {
scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
}
}
}
/// Converts one row of packed 16-bit RGBA samples to packed 8-bit RGB by
/// keeping the high byte of every colour sample and dropping alpha.
///
/// Full groups of 32 pixels are handled with SIMD; the remaining pixels are
/// passed to `scalar::rgba64_to_rgb_row`. `BE` selects the byte order of the
/// input samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `rgba64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`: the SIMD path uses raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgba64_to_rgb_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            let (red, green, blue, _alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let red = narrow_u16x32_to_u8x32(red);
            let green = narrow_u16x32_to_u8x32(green);
            let blue = narrow_u16x32_to_u8x32(blue);
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            // Low 128 bits: pixels 0..16 of this step.
            write_rgb_16(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                dst,
            );
            // High 128 bits: pixels 16..32, i.e. 16 * 3 bytes further on.
            write_rgb_16(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                dst.add(48),
            );
            x += 32;
        }
        if x < width {
            scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit RGBA samples to packed 8-bit RGBA by
/// keeping the high byte of every sample, alpha included.
///
/// Full groups of 32 pixels are handled with SIMD; the remaining pixels are
/// passed to `scalar::rgba64_to_rgba_row`. `BE` selects the byte order of the
/// input samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `rgba64` and `rgba_out` must each hold at least `width * 4` elements:
///   the SIMD path uses raw pointers and the lengths are only verified by
///   `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgba64_to_rgba_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            let (red, green, blue, alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let red = narrow_u16x32_to_u8x32(red);
            let green = narrow_u16x32_to_u8x32(green);
            let blue = narrow_u16x32_to_u8x32(blue);
            let alpha = narrow_u16x32_to_u8x32(alpha);
            let dst = rgba_out.as_mut_ptr().add(x * 4);
            // Low 128 bits: pixels 0..16 of this step.
            write_rgba_16(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                _mm256_castsi256_si128(alpha),
                dst,
            );
            // High 128 bits: pixels 16..32, i.e. 16 * 4 bytes further on.
            write_rgba_16(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                _mm256_extracti128_si256::<1>(alpha),
                dst.add(64),
            );
            x += 32;
        }
        if x < width {
            scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit RGBA samples to packed 16-bit RGB in
/// host byte order, dropping alpha.
///
/// Full groups of 32 pixels are de-interleaved with SIMD and handed to
/// `write_rgb_u16_32`; the remaining pixels are passed to
/// `scalar::rgba64_to_rgb_u16_row`. `BE` selects the byte order of the input
/// samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `rgba64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`: the SIMD path uses raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            let (red, green, blue, _alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_32(red, green, blue, dst);
            x += 32;
        }
        if x < width {
            scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit RGBA samples to packed 16-bit RGBA in
/// host byte order, carrying the source alpha through.
///
/// Full groups of 32 pixels are de-interleaved with SIMD and written back in
/// four 8-pixel pieces via `write_rgba_u16_8`; the remaining pixels are passed
/// to `scalar::rgba64_to_rgba_u16_row`. `BE` selects the byte order of the
/// input samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `rgba64` and `rgba_out` must each hold at least `width * 4` elements:
///   the SIMD path uses raw pointers and the lengths are only verified by
///   `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let mut x = 0usize;
        while x + 32 <= width {
            let ptr = rgba64.as_ptr().add(x * 4);
            let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast()));
            let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast()));
            let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast()));
            let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast()));
            let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3);
            // Alpha comes from the source, so no synthetic opaque value is
            // needed here.
            let out_ptr = rgba_out.as_mut_ptr().add(x * 4);
            // Each 128-bit lane holds 8 pixels, i.e. 32 output samples.
            write_rgba_u16_8(
                _mm512_extracti32x4_epi32::<0>(r_u16),
                _mm512_extracti32x4_epi32::<0>(g_u16),
                _mm512_extracti32x4_epi32::<0>(b_u16),
                _mm512_extracti32x4_epi32::<0>(a_u16),
                out_ptr,
            );
            write_rgba_u16_8(
                _mm512_extracti32x4_epi32::<1>(r_u16),
                _mm512_extracti32x4_epi32::<1>(g_u16),
                _mm512_extracti32x4_epi32::<1>(b_u16),
                _mm512_extracti32x4_epi32::<1>(a_u16),
                out_ptr.add(32),
            );
            write_rgba_u16_8(
                _mm512_extracti32x4_epi32::<2>(r_u16),
                _mm512_extracti32x4_epi32::<2>(g_u16),
                _mm512_extracti32x4_epi32::<2>(b_u16),
                _mm512_extracti32x4_epi32::<2>(a_u16),
                out_ptr.add(64),
            );
            write_rgba_u16_8(
                _mm512_extracti32x4_epi32::<3>(r_u16),
                _mm512_extracti32x4_epi32::<3>(g_u16),
                _mm512_extracti32x4_epi32::<3>(b_u16),
                _mm512_extracti32x4_epi32::<3>(a_u16),
                out_ptr.add(96),
            );
            x += 32;
        }
        if x < width {
            scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit BGRA samples to packed 8-bit RGB by
/// keeping the high byte of every colour sample, reversing the colour order
/// and dropping alpha.
///
/// Full groups of 32 pixels are handled with SIMD; the remaining pixels are
/// passed to `scalar::bgra64_to_rgb_row`. `BE` selects the byte order of the
/// input samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `bgra64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`: the SIMD path uses raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgra64_to_rgb_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            // Memory order is B, G, R, A.
            let (blue, green, red, _alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let red = narrow_u16x32_to_u8x32(red);
            let green = narrow_u16x32_to_u8x32(green);
            let blue = narrow_u16x32_to_u8x32(blue);
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            // Low 128 bits: pixels 0..16 of this step.
            write_rgb_16(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                dst,
            );
            // High 128 bits: pixels 16..32, i.e. 16 * 3 bytes further on.
            write_rgb_16(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                dst.add(48),
            );
            x += 32;
        }
        if x < width {
            scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit BGRA samples to packed 8-bit RGBA by
/// keeping the high byte of every sample and reversing the colour order;
/// alpha is carried through.
///
/// Full groups of 32 pixels are handled with SIMD; the remaining pixels are
/// passed to `scalar::bgra64_to_rgba_row`. `BE` selects the byte order of the
/// input samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `bgra64` and `rgba_out` must each hold at least `width * 4` elements:
///   the SIMD path uses raw pointers and the lengths are only verified by
///   `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgra64_to_rgba_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            // Memory order is B, G, R, A.
            let (blue, green, red, alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let red = narrow_u16x32_to_u8x32(red);
            let green = narrow_u16x32_to_u8x32(green);
            let blue = narrow_u16x32_to_u8x32(blue);
            let alpha = narrow_u16x32_to_u8x32(alpha);
            let dst = rgba_out.as_mut_ptr().add(x * 4);
            // Low 128 bits: pixels 0..16 of this step.
            write_rgba_16(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                _mm256_castsi256_si128(alpha),
                dst,
            );
            // High 128 bits: pixels 16..32, i.e. 16 * 4 bytes further on.
            write_rgba_16(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                _mm256_extracti128_si256::<1>(alpha),
                dst.add(64),
            );
            x += 32;
        }
        if x < width {
            scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Converts one row of packed 16-bit BGRA samples to packed 16-bit RGB in
/// host byte order, reversing the colour order and dropping alpha.
///
/// Full groups of 32 pixels are de-interleaved with SIMD and handed to
/// `write_rgb_u16_32`; the remaining pixels are passed to
/// `scalar::bgra64_to_rgb_u16_row`. `BE` selects the byte order of the input
/// samples.
///
/// # Safety
/// * The CPU must support AVX-512F and AVX-512BW.
/// * `bgra64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`: the SIMD path uses raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        // First pixel index that no longer fits a full 32-pixel SIMD step.
        let simd_end = width - width % 32;
        let mut x = 0usize;
        while x < simd_end {
            let src = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.cast()));
            let q1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(32).cast()));
            let q2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(64).cast()));
            let q3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(src.add(96).cast()));
            // Memory order is B, G, R, A.
            let (blue, green, red, _alpha) = deinterleave_rgba64_32px(q0, q1, q2, q3);
            let dst = rgb_out.as_mut_ptr().add(x * 3);
            write_rgb_u16_32(red, green, blue, dst);
            x += 32;
        }
        if x < width {
            scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row<const BE: bool>(
bgra64: &[u16],
rgba_out: &mut [u16],
width: usize,
) {
debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
unsafe {
let mut x = 0usize;
while x + 32 <= width {
let ptr = bgra64.as_ptr().add(x * 4);
let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast()));
let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast()));
let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast()));
let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast()));
let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3);
let out_ptr = rgba_out.as_mut_ptr().add(x * 4);
write_rgba_u16_8(
_mm512_extracti32x4_epi32::<0>(r_u16),
_mm512_extracti32x4_epi32::<0>(g_u16),
_mm512_extracti32x4_epi32::<0>(b_u16),
_mm512_extracti32x4_epi32::<0>(a_u16),
out_ptr,
);
write_rgba_u16_8(
_mm512_extracti32x4_epi32::<1>(r_u16),
_mm512_extracti32x4_epi32::<1>(g_u16),
_mm512_extracti32x4_epi32::<1>(b_u16),
_mm512_extracti32x4_epi32::<1>(a_u16),
out_ptr.add(32),
);
write_rgba_u16_8(
_mm512_extracti32x4_epi32::<2>(r_u16),
_mm512_extracti32x4_epi32::<2>(g_u16),
_mm512_extracti32x4_epi32::<2>(b_u16),
_mm512_extracti32x4_epi32::<2>(a_u16),
out_ptr.add(64),
);
write_rgba_u16_8(
_mm512_extracti32x4_epi32::<3>(r_u16),
_mm512_extracti32x4_epi32::<3>(g_u16),
_mm512_extracti32x4_epi32::<3>(b_u16),
_mm512_extracti32x4_epi32::<3>(a_u16),
out_ptr.add(96),
);
x += 32;
}
if x < width {
scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
}
}
}
/// Reduces 8 `u16` lanes to bytes by keeping each lane's high byte.
///
/// The 8 result bytes occupy the low half of the returned register; the high
/// half is produced by packing `zero`, which callers pass as an all-zero
/// vector.
///
/// # Safety
/// Only SSE2 instructions are used; the function is `unsafe` because it wraps
/// raw intrinsics.
#[inline(always)]
unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
    unsafe {
        // After the shift every lane is at most 0xFF, so the saturating pack
        // never clamps.
        let high_bytes = _mm_srli_epi16::<8>(v);
        _mm_packus_epi16(high_bytes, zero)
    }
}