#![allow(dead_code)]
use core::arch::x86_64::*;
use super::*;
/// Separates eight interleaved three-channel `u16` pixels into planar form.
///
/// `v0`, `v1` and `v2` are three consecutive 16-byte loads that together hold
/// the 24 samples of eight pixels. The result is `(ch0, ch1, ch2)`: for each
/// of the three channels, one vector of eight `u16` lanes in pixel order.
/// Which channel is red and which is blue is decided by the caller.
///
/// # Safety
///
/// Executes `_mm_shuffle_epi8`, so the running CPU must support SSSE3.
#[inline(always)]
unsafe fn deinterleave_rgb48_8px(
    v0: __m128i,
    v1: __m128i,
    v2: __m128i,
) -> (__m128i, __m128i, __m128i) {
    // SAFETY: register-only intrinsics; the caller guarantees SSSE3.
    unsafe {
        // A shuffle index of -1 zeroes the destination byte, so the three
        // partial gathers of a channel fill disjoint bytes and can be OR-ed.

        // First channel: samples 0, 3, 6 of `v0`; 1, 4, 7 of `v1`; 2, 5 of `v2`.
        let c0_head = _mm_shuffle_epi8(
            v0,
            _mm_setr_epi8(0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
        );
        let c0_mid = _mm_shuffle_epi8(
            v1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1),
        );
        let c0_tail = _mm_shuffle_epi8(
            v2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11),
        );

        // Second channel: samples 1, 4, 7 of `v0`; 2, 5 of `v1`; 0, 3, 6 of `v2`.
        let c1_head = _mm_shuffle_epi8(
            v0,
            _mm_setr_epi8(2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
        );
        let c1_mid = _mm_shuffle_epi8(
            v1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1),
        );
        let c1_tail = _mm_shuffle_epi8(
            v2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13),
        );

        // Third channel: samples 2, 5 of `v0`; 0, 3, 6 of `v1`; 1, 4, 7 of `v2`.
        let c2_head = _mm_shuffle_epi8(
            v0,
            _mm_setr_epi8(4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
        );
        let c2_mid = _mm_shuffle_epi8(
            v1,
            _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1),
        );
        let c2_tail = _mm_shuffle_epi8(
            v2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15),
        );

        (
            _mm_or_si128(_mm_or_si128(c0_head, c0_mid), c0_tail),
            _mm_or_si128(_mm_or_si128(c1_head, c1_mid), c1_tail),
            _mm_or_si128(_mm_or_si128(c2_head, c2_mid), c2_tail),
        )
    }
}
/// Regroups the 128-bit halves of four 256-bit vectors for the unpack cascade.
///
/// Returns, in order:
/// `[raw0.low, raw2.low]`, `[raw0.high, raw2.high]`,
/// `[raw1.low, raw3.low]`, `[raw1.high, raw3.high]`.
///
/// # Safety
///
/// Executes AVX2 intrinsics; the running CPU must support AVX2.
#[inline(always)]
unsafe fn reshape_rgba64_for_cascade(
    raw0: __m256i,
    raw1: __m256i,
    raw2: __m256i,
    raw3: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
    // SAFETY: register-only intrinsics; the caller guarantees AVX2.
    unsafe {
        (
            // 0x20: low half of the first operand, then low half of the second.
            _mm256_permute2x128_si256::<0x20>(raw0, raw2),
            // 0x31: high half of the first operand, then high half of the second.
            _mm256_permute2x128_si256::<0x31>(raw0, raw2),
            _mm256_permute2x128_si256::<0x20>(raw1, raw3),
            _mm256_permute2x128_si256::<0x31>(raw1, raw3),
        )
    }
}
/// Transposes sixteen four-channel `u16` pixels into four planar vectors.
///
/// Expects the lane arrangement produced by `reshape_rgba64_for_cascade`:
/// per 128-bit lane, `r0` holds pixels (0, 1 | 8, 9), `r1` (2, 3 | 10, 11),
/// `r2` (4, 5 | 12, 13) and `r3` (6, 7 | 14, 15). Returns
/// `(ch0, ch1, ch2, ch3)`, each with sixteen `u16` lanes in pixel order.
///
/// # Safety
///
/// Executes AVX2 intrinsics; the running CPU must support AVX2.
#[inline(always)]
unsafe fn deinterleave_rgba64_cascade(
    r0: __m256i,
    r1: __m256i,
    r2: __m256i,
    r3: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
    // SAFETY: register-only intrinsics; the caller guarantees AVX2.
    unsafe {
        // Round 1: pair samples of pixels two apart (0 with 2, 1 with 3, ...).
        let even_a = _mm256_unpacklo_epi16(r0, r1);
        let odd_a = _mm256_unpackhi_epi16(r0, r1);
        let even_b = _mm256_unpacklo_epi16(r2, r3);
        let odd_b = _mm256_unpackhi_epi16(r2, r3);

        // Round 2: four consecutive pixels per channel, two channels per vector.
        let ch01_a = _mm256_unpacklo_epi16(even_a, odd_a);
        let ch23_a = _mm256_unpackhi_epi16(even_a, odd_a);
        let ch01_b = _mm256_unpacklo_epi16(even_b, odd_b);
        let ch23_b = _mm256_unpackhi_epi16(even_b, odd_b);

        // Round 3: join the 64-bit groups so each vector carries one channel.
        (
            _mm256_unpacklo_epi64(ch01_a, ch01_b),
            _mm256_unpackhi_epi64(ch01_a, ch01_b),
            _mm256_unpacklo_epi64(ch23_a, ch23_b),
            _mm256_unpackhi_epi64(ch23_a, ch23_b),
        )
    }
}
/// Deinterleaves sixteen four-channel `u16` pixels held in four consecutive
/// 256-bit loads.
///
/// Runs `reshape_rgba64_for_cascade` followed by
/// `deinterleave_rgba64_cascade` and returns the four channel vectors in
/// source channel order.
///
/// # Safety
///
/// Both stages execute AVX2 intrinsics; the running CPU must support AVX2.
#[inline(always)]
unsafe fn deinterleave_rgba64_16px(
    raw0: __m256i,
    raw1: __m256i,
    raw2: __m256i,
    raw3: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
    // SAFETY: the caller guarantees AVX2, which is all both helpers require.
    unsafe {
        let regrouped = reshape_rgba64_for_cascade(raw0, raw1, raw2, raw3);
        deinterleave_rgba64_cascade(regrouped.0, regrouped.1, regrouped.2, regrouped.3)
    }
}
/// Reduces sixteen `u16` lanes to sixteen `u8` lanes by keeping each high byte.
///
/// `zero` is the second operand of the pack. It only fills the quadwords that
/// the final cast to 128 bits discards, so it does not affect the result.
///
/// # Safety
///
/// Executes AVX2 intrinsics; the running CPU must support AVX2.
#[inline(always)]
unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i {
    // SAFETY: register-only intrinsics; the caller guarantees AVX2.
    unsafe {
        let high_bytes = _mm256_srli_epi16::<8>(v);
        // The pack works per 128-bit lane: quadwords come out as
        // [v.low, zero, v.high, zero].
        let per_lane = _mm256_packus_epi16(high_bytes, zero);
        // 0xD8 reorders quadwords to (0, 2, 1, 3), moving both halves of `v`
        // into the low 128 bits.
        let gathered = _mm256_permute4x64_epi64::<0xD8>(per_lane);
        _mm256_castsi256_si128(gathered)
    }
}
/// `true` when the compilation target is big-endian.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Swaps the two bytes of every `u16` lane when `BE` differs from the host
/// byte order; otherwise returns `v` unchanged.
///
/// # Safety
///
/// The swapping path executes `_mm_shuffle_epi8`; the running CPU must
/// support SSSE3.
#[inline(always)]
unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i {
    if BE == HOST_NATIVE_BE {
        return v;
    }
    // SAFETY: register-only intrinsics; the caller guarantees SSSE3.
    unsafe {
        // Exchange each even byte with its odd neighbour.
        let swap_pairs = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        _mm_shuffle_epi8(v, swap_pairs)
    }
}
/// Swaps the two bytes of every `u16` lane when `BE` differs from the host
/// byte order; otherwise returns `v` unchanged.
///
/// # Safety
///
/// The swapping path executes `_mm256_shuffle_epi8`; the running CPU must
/// support AVX2.
#[inline(always)]
unsafe fn byteswap256_if_be<const BE: bool>(v: __m256i) -> __m256i {
    if BE == HOST_NATIVE_BE {
        return v;
    }
    // SAFETY: register-only intrinsics; the caller guarantees AVX2.
    unsafe {
        // The shuffle works per 128-bit lane, so the same pair-swap pattern is
        // repeated for both lanes.
        let swap_pairs = _mm256_setr_epi8(
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, //
            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
        );
        _mm256_shuffle_epi8(v, swap_pairs)
    }
}
/// Converts one row of packed three-channel `u16` pixels (R, G, B order) to
/// packed 8-bit RGB.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample (`sample >> 8`) after the optional byte swap done by
/// `byteswap128_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::rgb48_to_rgb_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgb48` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgb48_to_rgb_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every iteration reads 48 `u16` from `x * 3` and writes 48 bytes
    // at `x * 3` with `x + 16 <= width`, inside the lengths the caller
    // guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let mut x = 0usize;
        while x + 16 <= width {
            let ptr = rgb48.as_ptr().add(x * 3);

            // Pixels x .. x + 8.
            let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
            let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
            let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
            let (r_lo, g_lo, b_lo) = deinterleave_rgb48_8px(v0, v1, v2);

            // Pixels x + 8 .. x + 16.
            let ptr8 = ptr.add(24);
            let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast()));
            let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast()));
            let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast()));
            let (r_hi, g_hi, b_hi) = deinterleave_rgb48_8px(v3, v4, v5);

            // After the shift every lane is <= 0xFF, so the saturating pack is
            // exact. Packing both halves together yields a full 16-lane vector,
            // which lets the 16-pixel writer store straight into the output
            // instead of going through half-filled stack buffers.
            let r_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(r_lo), _mm_srli_epi16::<8>(r_hi));
            let g_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(g_lo), _mm_srli_epi16::<8>(g_hi));
            let b_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(b_lo), _mm_srli_epi16::<8>(b_hi));
            write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));
            x += 16;
        }
        if x < width {
            scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed three-channel `u16` pixels (R, G, B order) to
/// packed 8-bit RGBA with alpha set to `0xFF`.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample (`sample >> 8`) after the optional byte swap done by
/// `byteswap128_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::rgb48_to_rgba_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgb48` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgb48_to_rgba_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every iteration reads 48 `u16` from `x * 3` and writes 64 bytes
    // at `x * 4` with `x + 16 <= width`, inside the lengths the caller
    // guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        // Opaque alpha for all sixteen pixels of an iteration.
        let opaque_u8 = _mm_set1_epi8(-1);
        let mut x = 0usize;
        while x + 16 <= width {
            let ptr = rgb48.as_ptr().add(x * 3);

            // Pixels x .. x + 8.
            let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
            let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
            let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
            let (r_lo, g_lo, b_lo) = deinterleave_rgb48_8px(v0, v1, v2);

            // Pixels x + 8 .. x + 16.
            let ptr8 = ptr.add(24);
            let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast()));
            let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast()));
            let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast()));
            let (r_hi, g_hi, b_hi) = deinterleave_rgb48_8px(v3, v4, v5);

            // After the shift every lane is <= 0xFF, so the saturating pack is
            // exact. Packing both halves together yields a full 16-lane vector,
            // which lets the 16-pixel writer store straight into the output
            // instead of going through half-filled stack buffers.
            let r_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(r_lo), _mm_srli_epi16::<8>(r_hi));
            let g_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(g_lo), _mm_srli_epi16::<8>(g_hi));
            let b_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(b_lo), _mm_srli_epi16::<8>(b_hi));
            write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, rgba_out.as_mut_ptr().add(x * 4));
            x += 16;
        }
        if x < width {
            scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Copies one row of packed three-channel `u16` pixels (R, G, B order) to
/// packed `u16` RGB, applying `byteswap128_if_be::<BE>` to every sample.
///
/// The vector path handles sixteen pixels per iteration as two groups of
/// eight. The remaining `width % 16` pixels are delegated to
/// `scalar::rgb48_to_rgb_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgb48` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `blocks * 16 <= width`, inside
    // the lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let src = rgb48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let blocks = width / 16;
        for block in 0..blocks {
            let first = block * 16;
            for half in 0..2usize {
                let px = first + half * 8;
                let p = src.add(px * 3);
                let s0 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.cast()));
                let s1 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(8).cast()));
                let s2 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(16).cast()));
                let (r, g, b) = deinterleave_rgb48_8px(s0, s1, s2);
                write_rgb_u16_8(r, g, b, dst.add(px * 3));
            }
        }
        let done = blocks * 16;
        if done < width {
            scalar::rgb48_to_rgb_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Copies one row of packed three-channel `u16` pixels (R, G, B order) to
/// packed `u16` RGBA with alpha set to `0xFFFF`, applying
/// `byteswap128_if_be::<BE>` to every source sample.
///
/// The vector path handles sixteen pixels per iteration as two groups of
/// eight. The remaining `width % 16` pixels are delegated to
/// `scalar::rgb48_to_rgba_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgb48` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `blocks * 16 <= width`, inside
    // the lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let full_alpha = _mm_set1_epi16(0xFFFFu16 as i16);
        let src = rgb48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let blocks = width / 16;
        for block in 0..blocks {
            let first = block * 16;
            for half in 0..2usize {
                let px = first + half * 8;
                let p = src.add(px * 3);
                let s0 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.cast()));
                let s1 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(8).cast()));
                let s2 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(16).cast()));
                let (r, g, b) = deinterleave_rgb48_8px(s0, s1, s2);
                write_rgba_u16_8(r, g, b, full_alpha, dst.add(px * 4));
            }
        }
        let done = blocks * 16;
        if done < width {
            scalar::rgb48_to_rgba_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of packed three-channel `u16` pixels (B, G, R order) to
/// packed 8-bit RGB.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample (`sample >> 8`) after the optional byte swap done by
/// `byteswap128_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::bgr48_to_rgb_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgr48` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgr48_to_rgb_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every iteration reads 48 `u16` from `x * 3` and writes 48 bytes
    // at `x * 3` with `x + 16 <= width`, inside the lengths the caller
    // guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let mut x = 0usize;
        while x + 16 <= width {
            let ptr = bgr48.as_ptr().add(x * 3);

            // Pixels x .. x + 8; the source's first channel is blue.
            let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
            let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
            let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
            let (b_lo, g_lo, r_lo) = deinterleave_rgb48_8px(v0, v1, v2);

            // Pixels x + 8 .. x + 16.
            let ptr8 = ptr.add(24);
            let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast()));
            let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast()));
            let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast()));
            let (b_hi, g_hi, r_hi) = deinterleave_rgb48_8px(v3, v4, v5);

            // After the shift every lane is <= 0xFF, so the saturating pack is
            // exact. Packing both halves together yields a full 16-lane vector,
            // which lets the 16-pixel writer store straight into the output
            // instead of going through half-filled stack buffers.
            let r_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(r_lo), _mm_srli_epi16::<8>(r_hi));
            let g_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(g_lo), _mm_srli_epi16::<8>(g_hi));
            let b_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(b_lo), _mm_srli_epi16::<8>(b_hi));
            write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));
            x += 16;
        }
        if x < width {
            scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed three-channel `u16` pixels (B, G, R order) to
/// packed 8-bit RGBA with alpha set to `0xFF`.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample (`sample >> 8`) after the optional byte swap done by
/// `byteswap128_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::bgr48_to_rgba_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgr48` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgr48_to_rgba_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every iteration reads 48 `u16` from `x * 3` and writes 64 bytes
    // at `x * 4` with `x + 16 <= width`, inside the lengths the caller
    // guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        // Opaque alpha for all sixteen pixels of an iteration.
        let opaque_u8 = _mm_set1_epi8(-1);
        let mut x = 0usize;
        while x + 16 <= width {
            let ptr = bgr48.as_ptr().add(x * 3);

            // Pixels x .. x + 8; the source's first channel is blue.
            let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
            let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
            let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
            let (b_lo, g_lo, r_lo) = deinterleave_rgb48_8px(v0, v1, v2);

            // Pixels x + 8 .. x + 16.
            let ptr8 = ptr.add(24);
            let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast()));
            let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast()));
            let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast()));
            let (b_hi, g_hi, r_hi) = deinterleave_rgb48_8px(v3, v4, v5);

            // After the shift every lane is <= 0xFF, so the saturating pack is
            // exact. Packing both halves together yields a full 16-lane vector,
            // which lets the 16-pixel writer store straight into the output
            // instead of going through half-filled stack buffers.
            let r_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(r_lo), _mm_srli_epi16::<8>(r_hi));
            let g_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(g_lo), _mm_srli_epi16::<8>(g_hi));
            let b_u8 = _mm_packus_epi16(_mm_srli_epi16::<8>(b_lo), _mm_srli_epi16::<8>(b_hi));
            write_rgba_16(r_u8, g_u8, b_u8, opaque_u8, rgba_out.as_mut_ptr().add(x * 4));
            x += 16;
        }
        if x < width {
            scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Copies one row of packed three-channel `u16` pixels (B, G, R order) to
/// packed `u16` RGB, applying `byteswap128_if_be::<BE>` to every sample.
///
/// The vector path handles sixteen pixels per iteration as two groups of
/// eight. The remaining `width % 16` pixels are delegated to
/// `scalar::bgr48_to_rgb_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgr48` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `blocks * 16 <= width`, inside
    // the lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let src = bgr48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let blocks = width / 16;
        for block in 0..blocks {
            let first = block * 16;
            for half in 0..2usize {
                let px = first + half * 8;
                let p = src.add(px * 3);
                let s0 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.cast()));
                let s1 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(8).cast()));
                let s2 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(16).cast()));
                // The source's first channel is blue, its third is red.
                let (b, g, r) = deinterleave_rgb48_8px(s0, s1, s2);
                write_rgb_u16_8(r, g, b, dst.add(px * 3));
            }
        }
        let done = blocks * 16;
        if done < width {
            scalar::bgr48_to_rgb_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Copies one row of packed three-channel `u16` pixels (B, G, R order) to
/// packed `u16` RGBA with alpha set to `0xFFFF`, applying
/// `byteswap128_if_be::<BE>` to every source sample.
///
/// The vector path handles sixteen pixels per iteration as two groups of
/// eight. The remaining `width % 16` pixels are delegated to
/// `scalar::bgr48_to_rgba_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgr48` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `blocks * 16 <= width`, inside
    // the lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let full_alpha = _mm_set1_epi16(0xFFFFu16 as i16);
        let src = bgr48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let blocks = width / 16;
        for block in 0..blocks {
            let first = block * 16;
            for half in 0..2usize {
                let px = first + half * 8;
                let p = src.add(px * 3);
                let s0 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.cast()));
                let s1 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(8).cast()));
                let s2 = byteswap128_if_be::<BE>(_mm_loadu_si128(p.add(16).cast()));
                // The source's first channel is blue, its third is red.
                let (b, g, r) = deinterleave_rgb48_8px(s0, s1, s2);
                write_rgba_u16_8(r, g, b, full_alpha, dst.add(px * 4));
            }
        }
        let done = blocks * 16;
        if done < width {
            scalar::bgr48_to_rgba_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of packed four-channel `u16` pixels (R, G, B, A order) to
/// packed 8-bit RGB, dropping the fourth channel.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample after the optional byte swap done by
/// `byteswap256_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::rgba64_to_rgb_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgba64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgba64_to_rgb_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let zero = _mm256_setzero_si256();
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            let (r16, g16, b16, _alpha) = deinterleave_rgba64_16px(q0, q1, q2, q3);
            write_rgb_16(
                narrow_u16x16_to_u8x16(r16, zero),
                narrow_u16x16_to_u8x16(g16, zero),
                narrow_u16x16_to_u8x16(b16, zero),
                rgb_out.as_mut_ptr().add(x * 3),
            );
            x += 16;
        }
        if x < width {
            scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed four-channel `u16` pixels (R, G, B, A order) to
/// packed 8-bit RGBA.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample, alpha included, after the optional byte swap done by
/// `byteswap256_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::rgba64_to_rgba_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgba64` and `rgba_out` must each hold at least `width * 4` elements.
///   This is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgba64_to_rgba_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let zero = _mm256_setzero_si256();
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            let (r16, g16, b16, a16) = deinterleave_rgba64_16px(q0, q1, q2, q3);
            write_rgba_16(
                narrow_u16x16_to_u8x16(r16, zero),
                narrow_u16x16_to_u8x16(g16, zero),
                narrow_u16x16_to_u8x16(b16, zero),
                narrow_u16x16_to_u8x16(a16, zero),
                rgba_out.as_mut_ptr().add(x * 4),
            );
            x += 16;
        }
        if x < width {
            scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Copies one row of packed four-channel `u16` pixels (R, G, B, A order) to
/// packed `u16` RGB, dropping the fourth channel and applying
/// `byteswap256_if_be::<BE>` to every sample.
///
/// The vector path deinterleaves sixteen pixels per iteration and stores
/// them as two groups of eight. The remaining `width % 16` pixels are
/// delegated to `scalar::rgba64_to_rgb_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgba64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            let (r16, g16, b16, _alpha) = deinterleave_rgba64_16px(q0, q1, q2, q3);

            let out = rgb_out.as_mut_ptr().add(x * 3);
            // Low 128 bits: pixels x .. x + 8.
            let (r_lo, g_lo, b_lo) = (
                _mm256_castsi256_si128(r16),
                _mm256_castsi256_si128(g16),
                _mm256_castsi256_si128(b16),
            );
            write_rgb_u16_8(r_lo, g_lo, b_lo, out);
            // High 128 bits: pixels x + 8 .. x + 16, 24 samples further on.
            let (r_hi, g_hi, b_hi) = (
                _mm256_extracti128_si256::<1>(r16),
                _mm256_extracti128_si256::<1>(g16),
                _mm256_extracti128_si256::<1>(b16),
            );
            write_rgb_u16_8(r_hi, g_hi, b_hi, out.add(24));
            x += 16;
        }
        if x < width {
            scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Copies one row of packed four-channel `u16` pixels (R, G, B, A order) to
/// packed `u16` RGBA, applying `byteswap256_if_be::<BE>` to every sample.
///
/// The vector path deinterleaves sixteen pixels per iteration and stores
/// them as two groups of eight. The remaining `width % 16` pixels are
/// delegated to `scalar::rgba64_to_rgba_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `rgba64` and `rgba_out` must each hold at least `width * 4` elements.
///   This is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = rgba64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            let (r16, g16, b16, a16) = deinterleave_rgba64_16px(q0, q1, q2, q3);

            let out = rgba_out.as_mut_ptr().add(x * 4);
            // Low 128 bits: pixels x .. x + 8.
            let (r_lo, g_lo, b_lo, a_lo) = (
                _mm256_castsi256_si128(r16),
                _mm256_castsi256_si128(g16),
                _mm256_castsi256_si128(b16),
                _mm256_castsi256_si128(a16),
            );
            write_rgba_u16_8(r_lo, g_lo, b_lo, a_lo, out);
            // High 128 bits: pixels x + 8 .. x + 16, 32 samples further on.
            let (r_hi, g_hi, b_hi, a_hi) = (
                _mm256_extracti128_si256::<1>(r16),
                _mm256_extracti128_si256::<1>(g16),
                _mm256_extracti128_si256::<1>(b16),
                _mm256_extracti128_si256::<1>(a16),
            );
            write_rgba_u16_8(r_hi, g_hi, b_hi, a_hi, out.add(32));
            x += 16;
        }
        if x < width {
            scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Converts one row of packed four-channel `u16` pixels (B, G, R, A order) to
/// packed 8-bit RGB, dropping the fourth channel.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample after the optional byte swap done by
/// `byteswap256_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::bgra64_to_rgb_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgra64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgra64_to_rgb_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let zero = _mm256_setzero_si256();
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            // The source's first channel is blue, its third is red.
            let (b16, g16, r16, _alpha) = deinterleave_rgba64_16px(q0, q1, q2, q3);
            write_rgb_16(
                narrow_u16x16_to_u8x16(r16, zero),
                narrow_u16x16_to_u8x16(g16, zero),
                narrow_u16x16_to_u8x16(b16, zero),
                rgb_out.as_mut_ptr().add(x * 3),
            );
            x += 16;
        }
        if x < width {
            scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Converts one row of packed four-channel `u16` pixels (B, G, R, A order) to
/// packed 8-bit RGBA.
///
/// The vector path handles sixteen pixels per iteration and keeps the high
/// byte of each sample, alpha included, after the optional byte swap done by
/// `byteswap256_if_be::<BE>`. The remaining `width % 16` pixels are delegated
/// to `scalar::bgra64_to_rgba_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgra64` and `rgba_out` must each hold at least `width * 4` elements.
///   This is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgra64_to_rgba_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let zero = _mm256_setzero_si256();
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            // The source's first channel is blue, its third is red.
            let (b16, g16, r16, a16) = deinterleave_rgba64_16px(q0, q1, q2, q3);
            write_rgba_16(
                narrow_u16x16_to_u8x16(r16, zero),
                narrow_u16x16_to_u8x16(g16, zero),
                narrow_u16x16_to_u8x16(b16, zero),
                narrow_u16x16_to_u8x16(a16, zero),
                rgba_out.as_mut_ptr().add(x * 4),
            );
            x += 16;
        }
        if x < width {
            scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Copies one row of packed four-channel `u16` pixels (B, G, R, A order) to
/// packed `u16` RGB, dropping the fourth channel and applying
/// `byteswap256_if_be::<BE>` to every sample.
///
/// The vector path deinterleaves sixteen pixels per iteration and stores
/// them as two groups of eight. The remaining `width % 16` pixels are
/// delegated to `scalar::bgra64_to_rgb_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgra64` must hold at least `width * 4` elements and `rgb_out` at least
///   `width * 3`. This is only checked by `debug_assert!`; the vector loop
///   accesses both through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            // The source's first channel is blue, its third is red.
            let (b16, g16, r16, _alpha) = deinterleave_rgba64_16px(q0, q1, q2, q3);

            let out = rgb_out.as_mut_ptr().add(x * 3);
            // Low 128 bits: pixels x .. x + 8.
            let (r_lo, g_lo, b_lo) = (
                _mm256_castsi256_si128(r16),
                _mm256_castsi256_si128(g16),
                _mm256_castsi256_si128(b16),
            );
            write_rgb_u16_8(r_lo, g_lo, b_lo, out);
            // High 128 bits: pixels x + 8 .. x + 16, 24 samples further on.
            let (r_hi, g_hi, b_hi) = (
                _mm256_extracti128_si256::<1>(r16),
                _mm256_extracti128_si256::<1>(g16),
                _mm256_extracti128_si256::<1>(b16),
            );
            write_rgb_u16_8(r_hi, g_hi, b_hi, out.add(24));
            x += 16;
        }
        if x < width {
            scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
    }
}
/// Copies one row of packed four-channel `u16` pixels (B, G, R, A order) to
/// packed `u16` RGBA, applying `byteswap256_if_be::<BE>` to every sample.
///
/// The vector path deinterleaves sixteen pixels per iteration and stores
/// them as two groups of eight. The remaining `width % 16` pixels are
/// delegated to `scalar::bgra64_to_rgba_u16_row`.
///
/// # Safety
///
/// * The running CPU must support AVX2.
/// * `bgra64` and `rgba_out` must each hold at least `width * 4` elements.
///   This is only checked by `debug_assert!`; the vector loop accesses both
///   through raw pointers.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: all raw accesses stay below pixel `simd_end <= width`, inside the
    // lengths the caller guarantees. AVX2 is guaranteed by the caller.
    unsafe {
        let simd_end = width - width % 16;
        let mut x = 0usize;
        while x < simd_end {
            let p = bgra64.as_ptr().add(x * 4);
            let q0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.cast()));
            let q1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(16).cast()));
            let q2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(32).cast()));
            let q3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(p.add(48).cast()));
            // The source's first channel is blue, its third is red.
            let (b16, g16, r16, a16) = deinterleave_rgba64_16px(q0, q1, q2, q3);

            let out = rgba_out.as_mut_ptr().add(x * 4);
            // Low 128 bits: pixels x .. x + 8.
            let (r_lo, g_lo, b_lo, a_lo) = (
                _mm256_castsi256_si128(r16),
                _mm256_castsi256_si128(g16),
                _mm256_castsi256_si128(b16),
                _mm256_castsi256_si128(a16),
            );
            write_rgba_u16_8(r_lo, g_lo, b_lo, a_lo, out);
            // High 128 bits: pixels x + 8 .. x + 16, 32 samples further on.
            let (r_hi, g_hi, b_hi, a_hi) = (
                _mm256_extracti128_si256::<1>(r16),
                _mm256_extracti128_si256::<1>(g16),
                _mm256_extracti128_si256::<1>(b16),
                _mm256_extracti128_si256::<1>(a16),
            );
            write_rgba_u16_8(r_hi, g_hi, b_hi, a_hi, out.add(32));
            x += 16;
        }
        if x < width {
            scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
        }
    }
}
/// Reduces eight `u16` lanes to bytes by keeping each high byte.
///
/// The eight results occupy the low 8 bytes of the return value; the high
/// 8 bytes are `zero` packed with unsigned saturation.
///
/// # Safety
///
/// Executes SSE2 intrinsics only.
#[inline(always)]
unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
    // SAFETY: register-only SSE2 intrinsics.
    unsafe {
        let high_bytes = _mm_srli_epi16::<8>(v);
        _mm_packus_epi16(high_bytes, zero)
    }
}