#![allow(dead_code)]
use core::arch::x86_64::*;
use super::*;
/// Splits 8 interleaved 3-channel, 16-bit pixels into three planar vectors.
///
/// `v0`, `v1` and `v2` are the 48 consecutive bytes (24 `u16` samples) of the
/// 8 pixels, in memory order. The result is `(c0, c1, c2)` where `cN` holds
/// the N-th sample of every pixel, in pixel order. No colour meaning is
/// attached to the channels here; callers decide whether memory order is
/// R,G,B or B,G,R.
///
/// # Safety
///
/// Uses `_mm_shuffle_epi8`, so the CPU must support SSSE3.
#[inline(always)]
unsafe fn deinterleave_rgb48_8px(
    v0: __m128i,
    v1: __m128i,
    v2: __m128i,
) -> (__m128i, __m128i, __m128i) {
    unsafe {
        // A mask byte of -1 has its top bit set, which makes `_mm_shuffle_epi8`
        // emit zero for that output byte. Each source vector therefore
        // contributes only its own samples and the three partial results can
        // simply be OR-ed together.

        // Channel 0: three samples from v0, three from v1, two from v2.
        let first_lo =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let first_mid =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1));
        let first_hi =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11));
        let first = _mm_or_si128(_mm_or_si128(first_lo, first_mid), first_hi);

        // Channel 1: three samples from v0, two from v1, three from v2.
        let second_lo =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let second_mid =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1));
        let second_hi =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13));
        let second = _mm_or_si128(_mm_or_si128(second_lo, second_mid), second_hi);

        // Channel 2: two samples from v0, three from v1, three from v2.
        let third_lo =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let third_mid =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1));
        let third_hi =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15));
        let third = _mm_or_si128(_mm_or_si128(third_lo, third_mid), third_hi);

        (first, second, third)
    }
}
/// Splits 8 interleaved 4-channel, 16-bit pixels into four planar vectors.
///
/// `v0`..`v3` are the 64 consecutive bytes (32 `u16` samples) of the 8 pixels,
/// in memory order, so each input vector carries exactly two pixels. The
/// result is `(c0, c1, c2, c3)` where `cN` holds the N-th sample of every
/// pixel, in pixel order. Callers decide what each channel means.
///
/// # Safety
///
/// Uses `_mm_shuffle_epi8`, so the CPU must support SSSE3.
#[inline(always)]
unsafe fn deinterleave_rgba64_8px(
    v0: __m128i,
    v1: __m128i,
    v2: __m128i,
    v3: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i) {
    unsafe {
        // For every channel the same two byte pairs are picked from each input
        // vector; only the destination slot moves (bytes 0-3 for v0, 4-7 for
        // v1, 8-11 for v2, 12-15 for v3). Mask bytes of -1 produce zero, so
        // the four partial results are combined with OR.

        // Channel 0: source bytes 0-1 and 8-9.
        let first_a =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let first_b =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1));
        let first_c =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 8, 9, -1, -1, -1, -1));
        let first_d =
            _mm_shuffle_epi8(v3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 8, 9));
        let first = _mm_or_si128(
            _mm_or_si128(first_a, first_b),
            _mm_or_si128(first_c, first_d),
        );

        // Channel 1: source bytes 2-3 and 10-11.
        let second_a =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let second_b =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, 2, 3, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1));
        let second_c =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 10, 11, -1, -1, -1, -1));
        let second_d =
            _mm_shuffle_epi8(v3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 10, 11));
        let second = _mm_or_si128(
            _mm_or_si128(second_a, second_b),
            _mm_or_si128(second_c, second_d),
        );

        // Channel 2: source bytes 4-5 and 12-13.
        let third_a =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let third_b =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, 4, 5, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1));
        let third_c =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 12, 13, -1, -1, -1, -1));
        let third_d =
            _mm_shuffle_epi8(v3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 12, 13));
        let third = _mm_or_si128(
            _mm_or_si128(third_a, third_b),
            _mm_or_si128(third_c, third_d),
        );

        // Channel 3: source bytes 6-7 and 14-15.
        let fourth_a =
            _mm_shuffle_epi8(v0, _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
        let fourth_b =
            _mm_shuffle_epi8(v1, _mm_setr_epi8(-1, -1, -1, -1, 6, 7, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1));
        let fourth_c =
            _mm_shuffle_epi8(v2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 6, 7, 14, 15, -1, -1, -1, -1));
        let fourth_d =
            _mm_shuffle_epi8(v3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, 14, 15));
        let fourth = _mm_or_si128(
            _mm_or_si128(fourth_a, fourth_b),
            _mm_or_si128(fourth_c, fourth_d),
        );

        (first, second, third, fourth)
    }
}
/// Narrows eight 16-bit lanes to eight bytes by keeping each lane's high byte.
///
/// The eight results land in the low half of the returned vector. The high
/// half is the saturating pack of `zero`, i.e. all zero bytes when the caller
/// passes an all-zero vector.
///
/// # Safety
///
/// Only uses SSE2 intrinsics; marked `unsafe` like the other helpers here.
#[inline(always)]
unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
    unsafe {
        // After the logical shift every lane is in 0..=255, so the unsigned
        // saturating pack never clamps.
        let high_bytes = _mm_srli_epi16::<8>(v);
        _mm_packus_epi16(high_bytes, zero)
    }
}
/// `true` when the compilation target is big-endian.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Swaps the two bytes of every 16-bit lane when `BE` differs from the host's
/// endianness, and returns `v` untouched otherwise.
///
/// Despite the name, the swap is keyed on a *mismatch* between `BE` and
/// [`HOST_NATIVE_BE`], not on `BE` alone.
///
/// # Safety
///
/// The swapping path uses `_mm_shuffle_epi8`, so the CPU must support SSSE3.
#[inline(always)]
unsafe fn byteswap_if_be<const BE: bool>(v: __m128i) -> __m128i {
    if BE == HOST_NATIVE_BE {
        return v;
    }
    unsafe {
        // Exchange bytes pairwise: (0,1) -> (1,0), (2,3) -> (3,2), and so on.
        let swap_pairs = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
        _mm_shuffle_epi8(v, swap_pairs)
    }
}
/// Converts a row of packed 16-bit R,G,B pixels to packed 8-bit R,G,B.
///
/// Full groups of 8 pixels go through the vector path: load, optional byte
/// swap (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each
/// sample, re-interleave. Any remaining pixels are passed to
/// `scalar::rgb48_to_rgb_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb48` and `rgb_out` must each hold at least
/// `width * 3` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgb48_to_rgb_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = rgb48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let (r16, g16, b16) = deinterleave_rgb48_8px(lo, mid, hi);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // The helper is handed a 48-byte scratch buffer (presumably it
            // emits a full 16-pixel group); only the low 8 lanes carry pixels,
            // so just the first 24 bytes are copied out.
            let mut scratch = [0u8; 48];
            write_rgb_16(r8, g8, b8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgb_out.as_mut_ptr().add(px * 3), 24);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgb48_to_rgb_row::<BE>(
                &rgb48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B pixels to packed 8-bit R,G,B,A with
/// alpha set to `0xFF`.
///
/// Full groups of 8 pixels use the vector path (load, optional byte swap via
/// `byteswap_if_be::<BE>`, deinterleave, keep the high byte of each sample,
/// re-interleave with opaque alpha). Any remaining pixels are passed to
/// `scalar::rgb48_to_rgba_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb48` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgb48_to_rgba_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        // 0xFF in the low eight bytes (the lanes that carry pixels), zero above.
        let alpha8 = _mm_packus_epi16(_mm_set1_epi16(0x00FF), zeros);
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = rgb48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let (r16, g16, b16) = deinterleave_rgb48_8px(lo, mid, hi);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // 64-byte scratch for the helper; only the first 32 bytes
            // (8 pixels * 4) are meaningful and copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(r8, g8, b8, alpha8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgba_out.as_mut_ptr().add(px * 4), 32);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgb48_to_rgba_row::<BE>(
                &rgb48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B pixels to packed 16-bit R,G,B,
/// byte-swapping each sample when `BE` differs from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::rgb48_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb48` and `rgb_out` must each hold at least
/// `width * 3` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = rgb48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let (r16, g16, b16) = deinterleave_rgb48_8px(lo, mid, hi);
            write_rgb_u16_8(r16, g16, b16, rgb_out.as_mut_ptr().add(px * 3));
        }
        let done = groups * 8;
        if done != width {
            scalar::rgb48_to_rgb_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B pixels to packed 16-bit R,G,B,A with
/// alpha set to `0xFFFF`, byte-swapping each source sample when `BE` differs
/// from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::rgb48_to_rgba_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb48` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        // All bits set, i.e. 0xFFFF in every 16-bit lane.
        let alpha16 = _mm_set1_epi16(-1);
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = rgb48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let (r16, g16, b16) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r16, g16, b16, alpha16, dst);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgb48_to_rgba_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R pixels to packed 8-bit R,G,B.
///
/// Full groups of 8 pixels use the vector path: load, optional byte swap
/// (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each sample,
/// re-interleave in R,G,B order. Any remaining pixels are passed to
/// `scalar::bgr48_to_rgb_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgr48` and `rgb_out` must each hold at least
/// `width * 3` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgr48_to_rgb_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = bgr48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            // Memory order is B,G,R, so the first plane is blue.
            let (b16, g16, r16) = deinterleave_rgb48_8px(lo, mid, hi);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // 48-byte scratch for the helper; only the first 24 bytes
            // (8 pixels * 3) are meaningful and copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(r8, g8, b8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgb_out.as_mut_ptr().add(px * 3), 24);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgr48_to_rgb_row::<BE>(
                &bgr48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R pixels to packed 8-bit R,G,B,A with
/// alpha set to `0xFF`.
///
/// Full groups of 8 pixels use the vector path (load, optional byte swap via
/// `byteswap_if_be::<BE>`, deinterleave, keep the high byte of each sample,
/// re-interleave as R,G,B,A). Any remaining pixels are passed to
/// `scalar::bgr48_to_rgba_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgr48` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgr48_to_rgba_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        // 0xFF in the low eight bytes (the lanes that carry pixels), zero above.
        let alpha8 = _mm_packus_epi16(_mm_set1_epi16(0x00FF), zeros);
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = bgr48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            // Memory order is B,G,R, so the first plane is blue.
            let (b16, g16, r16) = deinterleave_rgb48_8px(lo, mid, hi);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // 64-byte scratch for the helper; only the first 32 bytes
            // (8 pixels * 4) are meaningful and copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(r8, g8, b8, alpha8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgba_out.as_mut_ptr().add(px * 4), 32);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgr48_to_rgba_row::<BE>(
                &bgr48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R pixels to packed 16-bit R,G,B,
/// byte-swapping each sample when `BE` differs from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::bgr48_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgr48` and `rgb_out` must each hold at least
/// `width * 3` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = bgr48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            // Memory order is B,G,R, so the first plane is blue.
            let (b16, g16, r16) = deinterleave_rgb48_8px(lo, mid, hi);
            write_rgb_u16_8(r16, g16, b16, rgb_out.as_mut_ptr().add(px * 3));
        }
        let done = groups * 8;
        if done != width {
            scalar::bgr48_to_rgb_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R pixels to packed 16-bit R,G,B,A with
/// alpha set to `0xFFFF`, byte-swapping each source sample when `BE` differs
/// from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::bgr48_to_rgba_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgr48` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        // All bits set, i.e. 0xFFFF in every 16-bit lane.
        let alpha16 = _mm_set1_epi16(-1);
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 3 samples * 2 bytes = three unaligned 16-byte loads.
            let src = bgr48.as_ptr().add(px * 3).cast::<__m128i>();
            let lo = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let mid = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let hi = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            // Memory order is B,G,R, so the first plane is blue.
            let (b16, g16, r16) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r16, g16, b16, alpha16, dst);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgr48_to_rgba_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B,A pixels to packed 8-bit R,G,B,
/// discarding alpha.
///
/// Full groups of 8 pixels use the vector path: load, optional byte swap
/// (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each colour
/// sample, re-interleave. Any remaining pixels are passed to
/// `scalar::rgba64_to_rgb_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgba64` must hold at least `width * 4`
/// elements and `rgb_out` at least `width * 3`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgba64_to_rgb_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = rgba64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            let (r16, g16, b16, _alpha) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // 48-byte scratch for the helper; only the first 24 bytes
            // (8 pixels * 3) are meaningful and copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(r8, g8, b8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgb_out.as_mut_ptr().add(px * 3), 24);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgba64_to_rgb_row::<BE>(
                &rgba64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B,A pixels to packed 8-bit R,G,B,A.
///
/// Full groups of 8 pixels use the vector path: load, optional byte swap
/// (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each sample
/// (alpha included), re-interleave. Any remaining pixels are passed to
/// `scalar::rgba64_to_rgba_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgba64` and `rgba_out` must each hold at
/// least `width * 4` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgba64_to_rgba_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = rgba64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            let (r16, g16, b16, a16) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            let a8 = narrow_u16x8_to_u8x8(a16, zeros);
            // 64-byte scratch for the helper; only the first 32 bytes
            // (8 pixels * 4) are meaningful and copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(r8, g8, b8, a8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgba_out.as_mut_ptr().add(px * 4), 32);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgba64_to_rgba_row::<BE>(
                &rgba64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B,A pixels to packed 16-bit R,G,B,
/// discarding alpha and byte-swapping each sample when `BE` differs from the
/// host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::rgba64_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgba64` must hold at least `width * 4`
/// elements and `rgb_out` at least `width * 3`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = rgba64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            let (r16, g16, b16, _alpha) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            write_rgb_u16_8(r16, g16, b16, rgb_out.as_mut_ptr().add(px * 3));
        }
        let done = groups * 8;
        if done != width {
            scalar::rgba64_to_rgb_u16_row::<BE>(
                &rgba64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit R,G,B,A pixels to packed 16-bit R,G,B,A,
/// byte-swapping each sample when `BE` differs from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::rgba64_to_rgba_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgba64` and `rgba_out` must each hold at
/// least `width * 4` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = rgba64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            let (r16, g16, b16, a16) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r16, g16, b16, a16, dst);
        }
        let done = groups * 8;
        if done != width {
            scalar::rgba64_to_rgba_u16_row::<BE>(
                &rgba64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R,A pixels to packed 8-bit R,G,B,
/// discarding alpha.
///
/// Full groups of 8 pixels use the vector path: load, optional byte swap
/// (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each colour
/// sample, re-interleave in R,G,B order. Any remaining pixels are passed to
/// `scalar::bgra64_to_rgb_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgra64` must hold at least `width * 4`
/// elements and `rgb_out` at least `width * 3`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgra64_to_rgb_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = bgra64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            // Memory order is B,G,R,A, so the first plane is blue.
            let (b16, g16, r16, _alpha) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            // 48-byte scratch for the helper; only the first 24 bytes
            // (8 pixels * 3) are meaningful and copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(r8, g8, b8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgb_out.as_mut_ptr().add(px * 3), 24);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgra64_to_rgb_row::<BE>(
                &bgra64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R,A pixels to packed 8-bit R,G,B,A.
///
/// Full groups of 8 pixels use the vector path: load, optional byte swap
/// (`byteswap_if_be::<BE>`), deinterleave, keep the high byte of each sample
/// (alpha included), re-interleave in R,G,B,A order. Any remaining pixels are
/// passed to `scalar::bgra64_to_rgba_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgra64` and `rgba_out` must each hold at
/// least `width * 4` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgra64_to_rgba_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        let zeros = _mm_setzero_si128();
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = bgra64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            // Memory order is B,G,R,A, so the first plane is blue.
            let (b16, g16, r16, a16) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let r8 = narrow_u16x8_to_u8x8(r16, zeros);
            let g8 = narrow_u16x8_to_u8x8(g16, zeros);
            let b8 = narrow_u16x8_to_u8x8(b16, zeros);
            let a8 = narrow_u16x8_to_u8x8(a16, zeros);
            // 64-byte scratch for the helper; only the first 32 bytes
            // (8 pixels * 4) are meaningful and copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(r8, g8, b8, a8, scratch.as_mut_ptr());
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), rgba_out.as_mut_ptr().add(px * 4), 32);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgra64_to_rgba_row::<BE>(
                &bgra64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R,A pixels to packed 16-bit R,G,B,
/// discarding alpha and byte-swapping each sample when `BE` differs from the
/// host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::bgra64_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgra64` must hold at least `width * 4`
/// elements and `rgb_out` at least `width * 3`; this is only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = bgra64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            // Memory order is B,G,R,A, so the first plane is blue.
            let (b16, g16, r16, _alpha) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            write_rgb_u16_8(r16, g16, b16, rgb_out.as_mut_ptr().add(px * 3));
        }
        let done = groups * 8;
        if done != width {
            scalar::bgra64_to_rgb_u16_row::<BE>(
                &bgra64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 16-bit B,G,R,A pixels to packed 16-bit R,G,B,A,
/// byte-swapping each sample when `BE` differs from the host's endianness.
///
/// Full groups of 8 pixels use the vector path; any remaining pixels are
/// passed to `scalar::bgra64_to_rgba_u16_row::<BE>`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `bgra64` and `rgba_out` must each hold at
/// least `width * 4` elements; this is only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Number of complete 8-pixel groups covered by the vector loop.
    let groups = width / 8;
    unsafe {
        for group in 0..groups {
            let px = group * 8;
            // 8 pixels * 4 samples * 2 bytes = four unaligned 16-byte loads.
            let src = bgra64.as_ptr().add(px * 4).cast::<__m128i>();
            let q0 = byteswap_if_be::<BE>(_mm_loadu_si128(src));
            let q1 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(1)));
            let q2 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(2)));
            let q3 = byteswap_if_be::<BE>(_mm_loadu_si128(src.add(3)));
            // Memory order is B,G,R,A, so the first plane is blue.
            let (b16, g16, r16, a16) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r16, g16, b16, a16, dst);
        }
        let done = groups * 8;
        if done != width {
            scalar::bgra64_to_rgba_u16_row::<BE>(
                &bgra64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}