use core::arch::x86_64::*;
use super::*;
/// Widens a 5-bit value stored in every 16-bit lane to 8 bits.
///
/// Each lane becomes `(c << 3) | (c >> 2)`, i.e. the top bits are replicated
/// into the low bits so that `0` maps to `0` and `31` maps to `255`. The input
/// is not masked here; lanes are expected to already be in `0..=31`.
///
/// # Safety
///
/// Executes AVX2 instructions, so the running CPU must support AVX2.
#[inline(always)]
unsafe fn expand5(c: __m256i) -> __m256i {
    unsafe {
        let upper = _mm256_slli_epi16::<3>(c);
        let replicated = _mm256_srli_epi16::<2>(c);
        _mm256_or_si256(upper, replicated)
    }
}
/// Widens a 6-bit value stored in every 16-bit lane to 8 bits.
///
/// Each lane becomes `(c << 2) | (c >> 4)`, replicating the top two bits into
/// the low bits so that `0` maps to `0` and `63` maps to `255`. The input is
/// not masked here; lanes are expected to already be in `0..=63`.
///
/// # Safety
///
/// Executes AVX2 instructions, so the running CPU must support AVX2.
#[inline(always)]
unsafe fn expand6(c: __m256i) -> __m256i {
    unsafe {
        let upper = _mm256_slli_epi16::<2>(c);
        let replicated = _mm256_srli_epi16::<4>(c);
        _mm256_or_si256(upper, replicated)
    }
}
/// Widens a 4-bit value stored in every 16-bit lane to 8 bits.
///
/// Each lane becomes `(c << 4) | c`, duplicating the nibble so that `0` maps
/// to `0` and `15` maps to `255`. The input is not masked here; lanes are
/// expected to already be in `0..=15`.
///
/// # Safety
///
/// Executes AVX2 instructions, so the running CPU must support AVX2.
#[inline(always)]
unsafe fn expand4(c: __m256i) -> __m256i {
    unsafe {
        let high_nibble = _mm256_slli_epi16::<4>(c);
        _mm256_or_si256(high_nibble, c)
    }
}
/// Narrows sixteen 16-bit lanes to bytes with unsigned saturation.
///
/// `_mm256_packus_epi16` packs within each 128-bit half, which leaves the two
/// groups of eight result bytes in 64-bit quadwords 0 and 2. The `0xD8`
/// permute swaps quadwords 1 and 2 so the sixteen bytes end up contiguous, in
/// source order, in the low 128 bits. The upper 128 bits receive the packed
/// contents of `zero256` (all zero when a zero vector is passed, as the
/// callers in this file do).
///
/// # Safety
///
/// Executes AVX2 instructions, so the running CPU must support AVX2.
#[inline(always)]
unsafe fn pack_u8(v: __m256i, zero256: __m256i) -> __m256i {
    unsafe {
        let per_half = _mm256_packus_epi16(v, zero256);
        _mm256_permute4x64_epi64::<0xD8>(per_half)
    }
}
/// Converts a row of 16-bit RGB565 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as red, bits 10..5 as green and bits 4..0 as blue. The channels are widened
/// to 8 bits by bit replication and stored through `write_rgb_16`, sixteen
/// pixels per iteration. Pixels left after the last full group of sixteen are
/// delegated to `scalar::legacy_rgb::rgb565_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb565_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let blue = _mm256_and_si256(packed, low5);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand6(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb565_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB565 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as red, bits 10..5 as green and bits 4..0 as blue. The channels are widened
/// to 8 bits by bit replication and stored, together with an all-ones (`0xFF`)
/// alpha vector, through `write_rgba_16`, sixteen pixels per iteration. Pixels
/// left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb565_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb565_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let blue = _mm256_and_si256(packed, low5);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand6(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb565_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB565 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as red, bits 10..5 as green and bits 4..0 as blue. The masked field values
/// are passed on unchanged (no widening is applied in this function) to
/// `write_rgb_u16_8`, eight pixels per call and two calls per iteration.
/// Pixels left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb565_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb565_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let blue = _mm256_and_si256(packed, low5);
            // The low 128-bit halves carry the first eight pixels of the group.
            write_rgb_u16_8(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                dst_ptr.add(col * 3),
            );
            // The high 128-bit halves carry the remaining eight pixels.
            write_rgb_u16_8(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                dst_ptr.add((col + 8) * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb565_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB565 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as red, bits 10..5 as green and bits 4..0 as blue. The masked field values
/// are passed on unchanged (no widening is applied in this function), together
/// with an all-ones (`0xFFFF`) alpha vector, to `write_rgba_u16_8`, eight
/// pixels per call and two calls per iteration. Pixels left after the last
/// full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb565_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 5/6-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb565_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let blue = _mm256_and_si256(packed, low5);
            // The low 128-bit halves carry the first eight pixels of the group.
            write_rgba_u16_8(
                _mm256_castsi256_si128(red),
                _mm256_castsi256_si128(green),
                _mm256_castsi256_si128(blue),
                opaque,
                dst_ptr.add(col * 4),
            );
            // The high 128-bit halves carry the remaining eight pixels.
            write_rgba_u16_8(
                _mm256_extracti128_si256::<1>(red),
                _mm256_extracti128_si256::<1>(green),
                _mm256_extracti128_si256::<1>(blue),
                opaque,
                dst_ptr.add((col + 8) * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb565_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR565 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as blue, bits 10..5 as green and bits 4..0 as red. The channels are widened
/// to 8 bits by bit replication and stored through `write_rgb_16` in
/// red/green/blue argument order, sixteen pixels per iteration. Pixels left
/// after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::bgr565_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr565_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let red = _mm256_and_si256(packed, low5);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand6(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr565_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR565 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as blue, bits 10..5 as green and bits 4..0 as red. The channels are widened
/// to 8 bits by bit replication and stored, together with an all-ones (`0xFF`)
/// alpha vector, through `write_rgba_16` in red/green/blue/alpha argument
/// order, sixteen pixels per iteration. Pixels left after the last full group
/// of sixteen are delegated to `scalar::legacy_rgb::bgr565_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr565_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let red = _mm256_and_si256(packed, low5);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand6(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr565_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR565 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as blue, bits 10..5 as green and bits 4..0 as red. The masked field values
/// are passed on unchanged (no widening is applied in this function) to
/// `write_rgb_u16_8` in red/green/blue argument order, eight pixels per call
/// and two calls per iteration. Pixels left after the last full group of
/// sixteen are delegated to `scalar::legacy_rgb::bgr565_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr565_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let red = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgb_u16_8(red_lo, green_lo, blue_lo, dst_ptr.add(col * 3));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgb_u16_8(red_hi, green_hi, blue_hi, dst_ptr.add((col + 8) * 3));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr565_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR565 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 15..11 are taken
/// as blue, bits 10..5 as green and bits 4..0 as red. The masked field values
/// are passed on unchanged (no widening is applied in this function), together
/// with an all-ones (`0xFFFF`) alpha vector, to `write_rgba_u16_8` in
/// red/green/blue/alpha argument order, eight pixels per call and two calls
/// per iteration. Pixels left after the last full group of sixteen are
/// delegated to `scalar::legacy_rgb::bgr565_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 5/6-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr565_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let low6 = _mm256_set1_epi16(0x3F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<11>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low6);
            let red = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgba_u16_8(red_lo, green_lo, blue_lo, opaque, dst_ptr.add(col * 4));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgba_u16_8(red_hi, green_hi, blue_hi, opaque, dst_ptr.add((col + 8) * 4));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr565_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB555 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as red, bits 9..5 as green and bits 4..0 as blue; bit 15 is ignored. The
/// channels are widened to 8 bits by bit replication and stored through
/// `write_rgb_16`, sixteen pixels per iteration. Pixels left after the last
/// full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb555_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb555_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let blue = _mm256_and_si256(packed, low5);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb555_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB555 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as red, bits 9..5 as green and bits 4..0 as blue; bit 15 is ignored. The
/// channels are widened to 8 bits by bit replication and stored, together with
/// an all-ones (`0xFF`) alpha vector, through `write_rgba_16`, sixteen pixels
/// per iteration. Pixels left after the last full group of sixteen are
/// delegated to `scalar::legacy_rgb::rgb555_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb555_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let blue = _mm256_and_si256(packed, low5);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb555_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB555 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as red, bits 9..5 as green and bits 4..0 as blue; bit 15 is ignored. The
/// masked 5-bit values are passed on unchanged (no widening is applied in this
/// function) to `write_rgb_u16_8`, eight pixels per call and two calls per
/// iteration. Pixels left after the last full group of sixteen are delegated
/// to `scalar::legacy_rgb::rgb555_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb555_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let blue = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgb_u16_8(red_lo, green_lo, blue_lo, dst_ptr.add(col * 3));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgb_u16_8(red_hi, green_hi, blue_hi, dst_ptr.add((col + 8) * 3));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb555_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB555 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as red, bits 9..5 as green and bits 4..0 as blue; bit 15 is ignored. The
/// masked 5-bit values are passed on unchanged (no widening is applied in this
/// function), together with an all-ones (`0xFFFF`) alpha vector, to
/// `write_rgba_u16_8`, eight pixels per call and two calls per iteration.
/// Pixels left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb555_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 5-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb555_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let blue = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgba_u16_8(red_lo, green_lo, blue_lo, opaque, dst_ptr.add(col * 4));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgba_u16_8(red_hi, green_hi, blue_hi, opaque, dst_ptr.add((col + 8) * 4));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb555_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR555 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as blue, bits 9..5 as green and bits 4..0 as red; bit 15 is ignored. The
/// channels are widened to 8 bits by bit replication and stored through
/// `write_rgb_16` in red/green/blue argument order, sixteen pixels per
/// iteration. Pixels left after the last full group of sixteen are delegated
/// to `scalar::legacy_rgb::bgr555_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr555_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let red = _mm256_and_si256(packed, low5);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr555_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR555 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as blue, bits 9..5 as green and bits 4..0 as red; bit 15 is ignored. The
/// channels are widened to 8 bits by bit replication and stored, together with
/// an all-ones (`0xFF`) alpha vector, through `write_rgba_16` in
/// red/green/blue/alpha argument order, sixteen pixels per iteration. Pixels
/// left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::bgr555_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr555_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let red = _mm256_and_si256(packed, low5);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand5(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand5(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr555_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR555 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as blue, bits 9..5 as green and bits 4..0 as red; bit 15 is ignored. The
/// masked 5-bit values are passed on unchanged (no widening is applied in this
/// function) to `write_rgb_u16_8` in red/green/blue argument order, eight
/// pixels per call and two calls per iteration. Pixels left after the last
/// full group of sixteen are delegated to
/// `scalar::legacy_rgb::bgr555_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr555_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let red = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgb_u16_8(red_lo, green_lo, blue_lo, dst_ptr.add(col * 3));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgb_u16_8(red_hi, green_hi, blue_hi, dst_ptr.add((col + 8) * 3));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr555_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR555 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 14..10 are taken
/// as blue, bits 9..5 as green and bits 4..0 as red; bit 15 is ignored. The
/// masked 5-bit values are passed on unchanged (no widening is applied in this
/// function), together with an all-ones (`0xFFFF`) alpha vector, to
/// `write_rgba_u16_8` in red/green/blue/alpha argument order, eight pixels per
/// call and two calls per iteration. Pixels left after the last full group of
/// sixteen are delegated to `scalar::legacy_rgb::bgr555_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 5-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr555_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low5 = _mm256_set1_epi16(0x1F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<10>(packed), low5);
            let green = _mm256_and_si256(_mm256_srli_epi16::<5>(packed), low5);
            let red = _mm256_and_si256(packed, low5);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgba_u16_8(red_lo, green_lo, blue_lo, opaque, dst_ptr.add(col * 4));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgba_u16_8(red_hi, green_hi, blue_hi, opaque, dst_ptr.add((col + 8) * 4));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr555_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB444 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// red, bits 7..4 as green and bits 3..0 as blue; bits 15..12 are ignored. The
/// channels are widened to 8 bits by nibble duplication and stored through
/// `write_rgb_16`, sixteen pixels per iteration. Pixels left after the last
/// full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb444_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb444_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let blue = _mm256_and_si256(packed, low4);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand4(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb444_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB444 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// red, bits 7..4 as green and bits 3..0 as blue; bits 15..12 are ignored. The
/// channels are widened to 8 bits by nibble duplication and stored, together
/// with an all-ones (`0xFF`) alpha vector, through `write_rgba_16`, sixteen
/// pixels per iteration. Pixels left after the last full group of sixteen are
/// delegated to `scalar::legacy_rgb::rgb444_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb444_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let blue = _mm256_and_si256(packed, low4);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand4(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb444_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB444 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// red, bits 7..4 as green and bits 3..0 as blue; bits 15..12 are ignored. The
/// masked 4-bit values are passed on unchanged (no widening is applied in this
/// function) to `write_rgb_u16_8`, eight pixels per call and two calls per
/// iteration. Pixels left after the last full group of sixteen are delegated
/// to `scalar::legacy_rgb::rgb444_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb444_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let blue = _mm256_and_si256(packed, low4);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgb_u16_8(red_lo, green_lo, blue_lo, dst_ptr.add(col * 3));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgb_u16_8(red_hi, green_hi, blue_hi, dst_ptr.add((col + 8) * 3));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb444_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit RGB444 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// red, bits 7..4 as green and bits 3..0 as blue; bits 15..12 are ignored. The
/// masked 4-bit values are passed on unchanged (no widening is applied in this
/// function), together with an all-ones (`0xFFFF`) alpha vector, to
/// `write_rgba_u16_8`, eight pixels per call and two calls per iteration.
/// Pixels left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::rgb444_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 4-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgb444_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let red = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let blue = _mm256_and_si256(packed, low4);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgba_u16_8(red_lo, green_lo, blue_lo, opaque, dst_ptr.add(col * 4));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgba_u16_8(red_hi, green_hi, blue_hi, opaque, dst_ptr.add((col + 8) * 4));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::rgb444_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR444 pixels to 8-bit-per-channel RGB with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// blue, bits 7..4 as green and bits 3..0 as red; bits 15..12 are ignored. The
/// channels are widened to 8 bits by nibble duplication and stored through
/// `write_rgb_16` in red/green/blue argument order, sixteen pixels per
/// iteration. Pixels left after the last full group of sixteen are delegated
/// to `scalar::legacy_rgb::bgr444_to_rgb_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_out` at least `width * 3` bytes; the vector loop
/// uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr444_to_rgb_row(src: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let zeros = _mm256_setzero_si256();
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let red = _mm256_and_si256(packed, low4);
            write_rgb_16(
                _mm256_castsi256_si128(pack_u8(expand4(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(blue), zeros)),
                dst_ptr.add(col * 3),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr444_to_rgb_row(
                &src[vector_px * 2..],
                &mut rgb_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR444 pixels to 8-bit-per-channel RGBA with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// blue, bits 7..4 as green and bits 3..0 as red; bits 15..12 are ignored. The
/// channels are widened to 8 bits by nibble duplication and stored, together
/// with an all-ones (`0xFF`) alpha vector, through `write_rgba_16` in
/// red/green/blue/alpha argument order, sixteen pixels per iteration. Pixels
/// left after the last full group of sixteen are delegated to
/// `scalar::legacy_rgb::bgr444_to_rgba_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_out` at least `width * 4` bytes; the vector
/// loop uses raw pointers and these bounds are only checked by `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr444_to_rgba_row(src: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let zeros = _mm256_setzero_si256();
        let opaque = _mm_set1_epi8(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let red = _mm256_and_si256(packed, low4);
            write_rgba_16(
                _mm256_castsi256_si128(pack_u8(expand4(red), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(green), zeros)),
                _mm256_castsi256_si128(pack_u8(expand4(blue), zeros)),
                opaque,
                dst_ptr.add(col * 4),
            );
            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr444_to_rgba_row(
                &src[vector_px * 2..],
                &mut rgba_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR444 pixels to `u16` RGB samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// blue, bits 7..4 as green and bits 3..0 as red; bits 15..12 are ignored. The
/// masked 4-bit values are passed on unchanged (no widening is applied in this
/// function) to `write_rgb_u16_8` in red/green/blue argument order, eight
/// pixels per call and two calls per iteration. Pixels left after the last
/// full group of sixteen are delegated to
/// `scalar::legacy_rgb::bgr444_to_rgb_u16_row`.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgb_u16_out` at least `width * 3` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr444_to_rgb_u16_row(src: &[u8], rgb_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgb_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let red = _mm256_and_si256(packed, low4);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgb_u16_8(red_lo, green_lo, blue_lo, dst_ptr.add(col * 3));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgb_u16_8(red_hi, green_hi, blue_hi, dst_ptr.add((col + 8) * 3));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr444_to_rgb_u16_row(
                &src[vector_px * 2..],
                &mut rgb_u16_out[vector_px * 3..],
                width - vector_px,
            );
        }
    }
}
/// Converts a row of 16-bit BGR444 pixels to `u16` RGBA samples with AVX2.
///
/// Every pixel is read as a little-endian 16-bit word: bits 11..8 are taken as
/// blue, bits 7..4 as green and bits 3..0 as red; bits 15..12 are ignored. The
/// masked 4-bit values are passed on unchanged (no widening is applied in this
/// function), together with an all-ones (`0xFFFF`) alpha vector, to
/// `write_rgba_u16_8` in red/green/blue/alpha argument order, eight pixels per
/// call and two calls per iteration. Pixels left after the last full group of
/// sixteen are delegated to `scalar::legacy_rgb::bgr444_to_rgba_u16_row`.
///
/// NOTE(review): alpha is full-scale 16-bit while the colour samples keep
/// their native 4-bit range — confirm this matches the scalar path.
///
/// # Safety
///
/// AVX2 must be available on the executing CPU. `src` must hold at least
/// `width * 2` bytes and `rgba_u16_out` at least `width * 4` elements; the
/// vector loop uses raw pointers and these bounds are only checked by
/// `debug_assert!`.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn bgr444_to_rgba_u16_row(src: &[u8], rgba_u16_out: &mut [u16], width: usize) {
    debug_assert!(src.len() >= width * 2, "src row too short");
    debug_assert!(rgba_u16_out.len() >= width * 4, "rgba_u16_out too short");
    // Number of leading pixels that fall into whole 16-pixel groups.
    let vector_px = width - width % 16;
    unsafe {
        let low4 = _mm256_set1_epi16(0x0F);
        let opaque = _mm_set1_epi16(-1);
        let src_ptr = src.as_ptr();
        let dst_ptr = rgba_u16_out.as_mut_ptr();
        let mut col = 0usize;
        while col < vector_px {
            let packed = _mm256_loadu_si256(src_ptr.add(col * 2).cast());
            let blue = _mm256_and_si256(_mm256_srli_epi16::<8>(packed), low4);
            let green = _mm256_and_si256(_mm256_srli_epi16::<4>(packed), low4);
            let red = _mm256_and_si256(packed, low4);

            // First eight pixels of the group: low 128-bit halves.
            let red_lo = _mm256_castsi256_si128(red);
            let green_lo = _mm256_castsi256_si128(green);
            let blue_lo = _mm256_castsi256_si128(blue);
            write_rgba_u16_8(red_lo, green_lo, blue_lo, opaque, dst_ptr.add(col * 4));

            // Remaining eight pixels: high 128-bit halves.
            let red_hi = _mm256_extracti128_si256::<1>(red);
            let green_hi = _mm256_extracti128_si256::<1>(green);
            let blue_hi = _mm256_extracti128_si256::<1>(blue);
            write_rgba_u16_8(red_hi, green_hi, blue_hi, opaque, dst_ptr.add((col + 8) * 4));

            col += 16;
        }
        if vector_px < width {
            scalar::legacy_rgb::bgr444_to_rgba_u16_row(
                &src[vector_px * 2..],
                &mut rgba_u16_out[vector_px * 4..],
                width - vector_px,
            );
        }
    }
}