use core::arch::x86_64::*;
use super::*;
/// Widens 5-bit values held in 16-bit lanes to 8 bits by bit replication.
///
/// Each lane is computed as `(c << 3) | (c >> 2)`, so `0` maps to `0` and `31`
/// maps to `255`. Lanes are expected to already be masked to 5 bits.
///
/// # Safety
///
/// Uses x86 SIMD intrinsics; the caller must run on a CPU that supports them.
#[inline(always)]
unsafe fn expand5(c: __m128i) -> __m128i {
    // SAFETY: the intrinsics only operate on register values; no memory is touched.
    unsafe {
        let upper = _mm_slli_epi16(c, 3);
        let lower = _mm_srli_epi16(c, 2);
        _mm_or_si128(upper, lower)
    }
}
/// Widens 6-bit values held in 16-bit lanes to 8 bits by bit replication.
///
/// Each lane is computed as `(c << 2) | (c >> 4)`, so `0` maps to `0` and `63`
/// maps to `255`. Lanes are expected to already be masked to 6 bits.
///
/// # Safety
///
/// Uses x86 SIMD intrinsics; the caller must run on a CPU that supports them.
#[inline(always)]
unsafe fn expand6(c: __m128i) -> __m128i {
    // SAFETY: the intrinsics only operate on register values; no memory is touched.
    unsafe {
        let upper = _mm_slli_epi16(c, 2);
        let lower = _mm_srli_epi16(c, 4);
        _mm_or_si128(upper, lower)
    }
}
/// Widens 4-bit values held in 16-bit lanes to 8 bits by duplicating the nibble.
///
/// Each lane is computed as `(c << 4) | c`, so `0` maps to `0` and `15` maps to
/// `255`. Lanes are expected to already be masked to 4 bits.
///
/// # Safety
///
/// Uses x86 SIMD intrinsics; the caller must run on a CPU that supports them.
#[inline(always)]
unsafe fn expand4(c: __m128i) -> __m128i {
    // SAFETY: the intrinsics only operate on register values; no memory is touched.
    unsafe {
        let high_nibble = _mm_slli_epi16(c, 4);
        _mm_or_si128(high_nibble, c)
    }
}
/// Converts one row of RGB565 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 15..11, green in bits 10..5, blue in
/// bits 4..0. Channels are widened to 8 bits with `expand5` / `expand6`. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::rgb565_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb565_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand5(_mm_and_si128(_mm_srli_epi16(words, 11), five_bits));
            let green = expand6(_mm_and_si128(_mm_srli_epi16(words, 5), six_bits));
            let blue = expand5(_mm_and_si128(words, five_bits));
            // Packing against zero leaves the 8 pixel bytes in the low half of
            // each vector. `write_rgb_16` gets a 48-byte scratch buffer and only
            // the first 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb565_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB565 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 15..11, green in bits 10..5, blue in
/// bits 4..0. Channels are widened to 8 bits with `expand5` / `expand6`, and an
/// all-ones (0xFF per byte) alpha vector is supplied. Eight pixels are handled
/// per vector step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::rgb565_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb565_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand5(_mm_and_si128(_mm_srli_epi16(words, 11), five_bits));
            let green = expand6(_mm_and_si128(_mm_srli_epi16(words, 5), six_bits));
            let blue = expand5(_mm_and_si128(words, five_bits));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb565_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB565 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 15..11, green in bits 10..5, blue in
/// bits 4..0. The masked channel values are handed to `write_rgb_u16_8` as-is,
/// i.e. without any bit expansion. Eight pixels are handled per vector step; the
/// remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::rgb565_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb565_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 11), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), six_bits);
            let blue = _mm_and_si128(words, five_bits);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb565_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB565 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 15..11, green in bits 10..5, blue in
/// bits 4..0. The masked channel values are handed to `write_rgba_u16_8` as-is
/// (no bit expansion) together with an alpha vector of 0xFFFF per lane. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::rgb565_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb565_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 11), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), six_bits);
            let blue = _mm_and_si128(words, five_bits);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb565_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR565 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 15..11, green in bits 10..5, red in
/// bits 4..0. Channels are widened to 8 bits with `expand5` / `expand6` and
/// passed to `write_rgb_16` in red, green, blue order. Eight pixels are handled
/// per vector step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr565_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr565_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand5(_mm_and_si128(_mm_srli_epi16(words, 11), five_bits));
            let green = expand6(_mm_and_si128(_mm_srli_epi16(words, 5), six_bits));
            let red = expand5(_mm_and_si128(words, five_bits));
            // `write_rgb_16` gets a 48-byte scratch buffer; only the first
            // 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr565_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR565 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 15..11, green in bits 10..5, red in
/// bits 4..0. Channels are widened to 8 bits with `expand5` / `expand6` and
/// passed to `write_rgba_16` in red, green, blue order with an all-ones (0xFF
/// per byte) alpha vector. Eight pixels are handled per vector step; the
/// remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr565_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr565_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand5(_mm_and_si128(_mm_srli_epi16(words, 11), five_bits));
            let green = expand6(_mm_and_si128(_mm_srli_epi16(words, 5), six_bits));
            let red = expand5(_mm_and_si128(words, five_bits));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr565_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR565 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 15..11, green in bits 10..5, red in
/// bits 4..0. The masked channel values are handed to `write_rgb_u16_8` in red,
/// green, blue order without any bit expansion. Eight pixels are handled per
/// vector step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr565_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr565_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 11), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), six_bits);
            let red = _mm_and_si128(words, five_bits);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr565_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR565 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 15..11, green in bits 10..5, red in
/// bits 4..0. The masked channel values are handed to `write_rgba_u16_8` in red,
/// green, blue order (no bit expansion) together with an alpha vector of 0xFFFF
/// per lane. Eight pixels are handled per vector step; the remaining
/// `width % 8` pixels go through `scalar::legacy_rgb::bgr565_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr565_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let six_bits = _mm_set1_epi16(0x3F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 11), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), six_bits);
            let red = _mm_and_si128(words, five_bits);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr565_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB555 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 14..10, green in bits 9..5, blue in
/// bits 4..0; bit 15 is masked off. Channels are widened to 8 bits with
/// `expand5`. Eight pixels are handled per vector step; the remaining
/// `width % 8` pixels go through `scalar::legacy_rgb::rgb555_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb555_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand5(_mm_and_si128(_mm_srli_epi16(words, 10), five_bits));
            let green = expand5(_mm_and_si128(_mm_srli_epi16(words, 5), five_bits));
            let blue = expand5(_mm_and_si128(words, five_bits));
            // `write_rgb_16` gets a 48-byte scratch buffer; only the first
            // 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb555_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB555 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 14..10, green in bits 9..5, blue in
/// bits 4..0; bit 15 is masked off. Channels are widened to 8 bits with
/// `expand5`, and an all-ones (0xFF per byte) alpha vector is supplied. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::rgb555_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb555_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand5(_mm_and_si128(_mm_srli_epi16(words, 10), five_bits));
            let green = expand5(_mm_and_si128(_mm_srli_epi16(words, 5), five_bits));
            let blue = expand5(_mm_and_si128(words, five_bits));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb555_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB555 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 14..10, green in bits 9..5, blue in
/// bits 4..0; bit 15 is masked off. The masked 5-bit values are handed to
/// `write_rgb_u16_8` without any bit expansion. Eight pixels are handled per
/// vector step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::rgb555_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb555_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 10), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), five_bits);
            let blue = _mm_and_si128(words, five_bits);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb555_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB555 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 14..10, green in bits 9..5, blue in
/// bits 4..0; bit 15 is masked off. The masked 5-bit values are handed to
/// `write_rgba_u16_8` (no bit expansion) together with an alpha vector of
/// 0xFFFF per lane. Eight pixels are handled per vector step; the remaining
/// `width % 8` pixels go through `scalar::legacy_rgb::rgb555_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb555_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 10), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), five_bits);
            let blue = _mm_and_si128(words, five_bits);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb555_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR555 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 14..10, green in bits 9..5, red in
/// bits 4..0; bit 15 is masked off. Channels are widened to 8 bits with
/// `expand5` and passed to `write_rgb_16` in red, green, blue order. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::bgr555_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr555_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand5(_mm_and_si128(_mm_srli_epi16(words, 10), five_bits));
            let green = expand5(_mm_and_si128(_mm_srli_epi16(words, 5), five_bits));
            let red = expand5(_mm_and_si128(words, five_bits));
            // `write_rgb_16` gets a 48-byte scratch buffer; only the first
            // 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr555_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR555 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 14..10, green in bits 9..5, red in
/// bits 4..0; bit 15 is masked off. Channels are widened to 8 bits with
/// `expand5` and passed to `write_rgba_16` in red, green, blue order with an
/// all-ones (0xFF per byte) alpha vector. Eight pixels are handled per vector
/// step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr555_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr555_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand5(_mm_and_si128(_mm_srli_epi16(words, 10), five_bits));
            let green = expand5(_mm_and_si128(_mm_srli_epi16(words, 5), five_bits));
            let red = expand5(_mm_and_si128(words, five_bits));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr555_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR555 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 14..10, green in bits 9..5, red in
/// bits 4..0; bit 15 is masked off. The masked 5-bit values are handed to
/// `write_rgb_u16_8` in red, green, blue order without any bit expansion. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::bgr555_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr555_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 10), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), five_bits);
            let red = _mm_and_si128(words, five_bits);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr555_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR555 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 14..10, green in bits 9..5, red in
/// bits 4..0; bit 15 is masked off. The masked 5-bit values are handed to
/// `write_rgba_u16_8` in red, green, blue order (no bit expansion) together
/// with an alpha vector of 0xFFFF per lane. Eight pixels are handled per vector
/// step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr555_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr555_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let five_bits = _mm_set1_epi16(0x1F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 10), five_bits);
            let green = _mm_and_si128(_mm_srli_epi16(words, 5), five_bits);
            let red = _mm_and_si128(words, five_bits);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr555_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB444 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 11..8, green in bits 7..4, blue in
/// bits 3..0; bits 15..12 are masked off. Channels are widened to 8 bits with
/// `expand4`. Eight pixels are handled per vector step; the remaining
/// `width % 8` pixels go through `scalar::legacy_rgb::rgb444_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb444_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand4(_mm_and_si128(_mm_srli_epi16(words, 8), nibble));
            let green = expand4(_mm_and_si128(_mm_srli_epi16(words, 4), nibble));
            let blue = expand4(_mm_and_si128(words, nibble));
            // `write_rgb_16` gets a 48-byte scratch buffer; only the first
            // 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb444_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB444 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 11..8, green in bits 7..4, blue in
/// bits 3..0; bits 15..12 are masked off. Channels are widened to 8 bits with
/// `expand4`, and an all-ones (0xFF per byte) alpha vector is supplied. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::rgb444_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb444_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = expand4(_mm_and_si128(_mm_srli_epi16(words, 8), nibble));
            let green = expand4(_mm_and_si128(_mm_srli_epi16(words, 4), nibble));
            let blue = expand4(_mm_and_si128(words, nibble));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb444_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB444 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 11..8, green in bits 7..4, blue in
/// bits 3..0; bits 15..12 are masked off. The masked 4-bit values are handed to
/// `write_rgb_u16_8` without any bit expansion. Eight pixels are handled per
/// vector step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::rgb444_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb444_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 8), nibble);
            let green = _mm_and_si128(_mm_srli_epi16(words, 4), nibble);
            let blue = _mm_and_si128(words, nibble);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb444_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of RGB444 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: red in bits 11..8, green in bits 7..4, blue in
/// bits 3..0; bits 15..12 are masked off. The masked 4-bit values are handed to
/// `write_rgba_u16_8` (no bit expansion) together with an alpha vector of
/// 0xFFFF per lane. Eight pixels are handled per vector step; the remaining
/// `width % 8` pixels go through `scalar::legacy_rgb::rgb444_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgb444_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let red = _mm_and_si128(_mm_srli_epi16(words, 8), nibble);
            let green = _mm_and_si128(_mm_srli_epi16(words, 4), nibble);
            let blue = _mm_and_si128(words, nibble);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::rgb444_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR444 pixels to 8-bit RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 11..8, green in bits 7..4, red in
/// bits 3..0; bits 15..12 are masked off. Channels are widened to 8 bits with
/// `expand4` and passed to `write_rgb_16` in red, green, blue order. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::bgr444_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_out` at least `width * 3` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr444_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 24 bytes at `done * 3` with `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand4(_mm_and_si128(_mm_srli_epi16(words, 8), nibble));
            let green = expand4(_mm_and_si128(_mm_srli_epi16(words, 4), nibble));
            let red = expand4(_mm_and_si128(words, nibble));
            // `write_rgb_16` gets a 48-byte scratch buffer; only the first
            // 24 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 48];
            write_rgb_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(done * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr444_to_rgb_row(
                &src[done * 2..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR444 pixels to 8-bit RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 11..8, green in bits 7..4, red in
/// bits 3..0; bits 15..12 are masked off. Channels are widened to 8 bits with
/// `expand4` and passed to `write_rgba_16` in red, green, blue order with an
/// all-ones (0xFF per byte) alpha vector. Eight pixels are handled per vector
/// step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr444_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_out` at least `width * 4` bytes; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr444_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and writes 32 bytes at `done * 4` with `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let zeros = _mm_setzero_si128();
        let opaque = _mm_set1_epi8(-1i8);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = expand4(_mm_and_si128(_mm_srli_epi16(words, 8), nibble));
            let green = expand4(_mm_and_si128(_mm_srli_epi16(words, 4), nibble));
            let red = expand4(_mm_and_si128(words, nibble));
            // `write_rgba_16` gets a 64-byte scratch buffer; only the first
            // 32 bytes (8 pixels) are copied to the output.
            let mut scratch = [0u8; 64];
            write_rgba_16(
                _mm_packus_epi16(red, zeros),
                _mm_packus_epi16(green, zeros),
                _mm_packus_epi16(blue, zeros),
                opaque,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(done * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr444_to_rgba_row(
                &src[done * 2..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR444 pixels to 16-bit-per-channel RGB using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 11..8, green in bits 7..4, red in
/// bits 3..0; bits 15..12 are masked off. The masked 4-bit values are handed to
/// `write_rgb_u16_8` in red, green, blue order without any bit expansion. Eight
/// pixels are handled per vector step; the remaining `width % 8` pixels go
/// through `scalar::legacy_rgb::bgr444_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgb_u16_out` at least `width * 3` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr444_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 3` with
    // `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 8), nibble);
            let green = _mm_and_si128(_mm_srli_epi16(words, 4), nibble);
            let red = _mm_and_si128(words, nibble);
            let dst = rgb_u16_out.as_mut_ptr().add(done * 3);
            write_rgb_u16_8(red, green, blue, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr444_to_rgb_u16_row(
                &src[done * 2..],
                &mut rgb_u16_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts one row of BGR444 pixels to 16-bit-per-channel RGBA using SSE4.1.
///
/// Every source pixel is two bytes, loaded as a 16-bit lane in x86 native
/// (little-endian) byte order: blue in bits 11..8, green in bits 7..4, red in
/// bits 3..0; bits 15..12 are masked off. The masked 4-bit values are handed to
/// `write_rgba_u16_8` in red, green, blue order (no bit expansion) together
/// with an alpha vector of 0xFFFF per lane. Eight pixels are handled per vector
/// step; the remaining `width % 8` pixels go through
/// `scalar::legacy_rgb::bgr444_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `src` must hold at least `width * 2` bytes and
/// `rgba_u16_out` at least `width * 4` elements; these are only checked by
/// `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn bgr444_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Pixels covered by whole 8-pixel vector steps.
    let simd_pixels = width - width % 8;
    // SAFETY: per the caller contract above, each step reads 16 bytes at
    // `done * 2` and passes an output pointer at element `done * 4` with
    // `done + 8 <= width`.
    unsafe {
        let nibble = _mm_set1_epi16(0x0F_u16 as i16);
        let opaque = _mm_set1_epi16(-1i16);
        let mut done = 0usize;
        while done < simd_pixels {
            let words = _mm_loadu_si128(src.as_ptr().add(done * 2).cast());
            let blue = _mm_and_si128(_mm_srli_epi16(words, 8), nibble);
            let green = _mm_and_si128(_mm_srli_epi16(words, 4), nibble);
            let red = _mm_and_si128(words, nibble);
            let dst = rgba_u16_out.as_mut_ptr().add(done * 4);
            write_rgba_u16_8(red, green, blue, opaque, dst);
            done += 8;
        }
        if done < width {
            scalar::legacy_rgb::bgr444_to_rgba_u16_row(
                &src[done * 2..],
                &mut rgba_u16_out[done * 4..],
                width - done,
            );
        }
    }
}