use crate::shuffle::ShuffleConverter;
use crate::sse::{
_mm_load_deinterleave_half_rgbx, _mm_load_deinterleave_rgbx,
_mm_store_interleave_half_rgb_for_yuv, _mm_store_interleave_rgb_for_yuv, _xx_load_si64,
};
use crate::yuv_support::YuvSourceChannels;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Stateless channel-shuffling converter for 8-bit data.
///
/// `SRC` and `DST` are `u8` tags identifying the source and destination
/// channel layouts. The type holds no data, so the derived `Default` is the
/// only constructor it needs.
#[derive(Default)]
pub(crate) struct ShuffleConverterSse<const SRC: u8, const DST: u8> {}
impl<const SRC: u8, const DST: u8> ShuffleConverter<u8, SRC, DST>
    for ShuffleConverterSse<SRC, DST>
{
    /// Reorders the channels of `src` into `dst` by delegating to the SSE
    /// routine `shuffle_channels8_impl`.
    ///
    /// `width` is forwarded unchanged; the routine itself ignores it and sizes
    /// its work from the slice lengths.
    fn convert(&self, src: &[u8], dst: &mut [u8], width: usize) {
        // SAFETY: the callee is compiled with `target_feature(enable = "sse4.1")`,
        // so the CPU must support SSE4.1.
        // NOTE(review): presumably guaranteed by whoever selects this converter;
        // confirm at the construction site.
        unsafe {
            shuffle_channels8_impl::<SRC, DST>(src, dst, width);
        }
    }
}
/// Rewrites interleaved 8-bit pixels from the `SRC` channel layout into the
/// `DST` channel layout.
///
/// The work is split into three stages, each driven by the slice lengths:
///
/// 1. blocks of `16 * channels` bytes (16 pixels),
/// 2. blocks of `8 * channels` bytes (8 pixels) taken from what stage 1 left,
/// 3. a final partial block, staged through 64-byte zero-initialised stack
///    buffers so the half-width helpers work on buffers of known size instead
///    of the short slices.
///
/// The unnamed `usize` parameter is not used.
///
/// # Safety
///
/// The CPU must support SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn shuffle_channels8_impl<const SRC: u8, const DST: u8>(
    src: &[u8],
    dst: &mut [u8],
    _: usize,
) {
    let src_channels: YuvSourceChannels = SRC.into();
    let dst_channels: YuvSourceChannels = DST.into();
    let src_cn = src_channels.get_channels_count();
    let dst_cn = dst_channels.get_channels_count();

    // Stage 1: 16 pixels per iteration.
    let mut wide_src = src.chunks_exact(16 * src_cn);
    let mut wide_dst = dst.chunks_exact_mut(16 * dst_cn);
    for (block_in, block_out) in wide_src.by_ref().zip(wide_dst.by_ref()) {
        let (c0, c1, c2, c3) = _mm_load_deinterleave_rgbx::<SRC>(block_in.as_ptr());
        _mm_store_interleave_rgb_for_yuv::<DST>(block_out.as_mut_ptr(), c0, c1, c2, c3);
    }

    // `remainder` / `into_remainder` return the trailing bytes that do not fill
    // a whole chunk, independent of how far the iterators were advanced.
    let src_rest = wide_src.remainder();
    let dst_rest = wide_dst.into_remainder();

    // Stage 2: 8 pixels per iteration.
    let mut half_src = src_rest.chunks_exact(8 * src_cn);
    let mut half_dst = dst_rest.chunks_exact_mut(8 * dst_cn);
    for (block_in, block_out) in half_src.by_ref().zip(half_dst.by_ref()) {
        let (c0, c1, c2, c3) = _mm_load_deinterleave_half_rgbx::<SRC>(block_in.as_ptr());
        _mm_store_interleave_half_rgb_for_yuv::<DST>(block_out.as_mut_ptr(), c0, c1, c2, c3);
    }

    let src = half_src.remainder();
    let dst = half_dst.into_remainder();

    // Stage 3: nothing to do unless both sides still have bytes left.
    if src.is_empty() || dst.is_empty() {
        return;
    }

    assert!(src.len() < 64);
    assert!(dst.len() < 64);

    let mut transient_src = [0u8; 64];
    let mut transient_dst = [0u8; 64];

    // Only the leading `src.len()` bytes carry data; the rest stays zero.
    transient_src[..src.len()].copy_from_slice(src);

    let (c0, c1, c2, c3) = _mm_load_deinterleave_half_rgbx::<SRC>(transient_src.as_ptr());
    _mm_store_interleave_half_rgb_for_yuv::<DST>(transient_dst.as_mut_ptr(), c0, c1, c2, c3);

    // Hand back exactly as many bytes as the destination tail can hold.
    dst.copy_from_slice(&transient_dst[..dst.len()]);
}
/// Channel-shuffling converter for 8-bit data that reorders bytes with a
/// precomputed 16-entry byte-index table.
///
/// `SRC` and `DST` are `u8` tags identifying the source and destination
/// channel layouts.
pub(crate) struct ShuffleQTableConverterSse<const SRC: u8, const DST: u8> {
// Byte indices handed to the SSE routine, which loads them into a vector and
// uses them as the `_mm_shuffle_epi8` control mask.
q_table: [u8; 16],
}
/// Byte-shuffle indices that swap bytes 0 and 2 of every 4-byte group and keep
/// bytes 1 and 3 in place, i.e. exchange the first and third channel of each
/// four-channel pixel. One row below corresponds to one 4-byte group.
const RGBA_TO_BGRA_TABLE: [u8; 16] = [
    2, 1, 0, 3, // group 0
    6, 5, 4, 7, // group 1
    10, 9, 8, 11, // group 2
    14, 13, 12, 15, // group 3
];
impl<const SRC: u8, const DST: u8> ShuffleQTableConverterSse<SRC, DST> {
    /// Builds the converter, selecting the byte-shuffle table for the
    /// `SRC` → `DST` layout pair.
    ///
    /// Matching layouts (Rgba → Rgba, Bgra → Bgra) get an identity table so the
    /// data passes through unchanged; Rgba ↔ Bgra gets the table that swaps the
    /// first and third channel.
    ///
    /// # Panics
    ///
    /// Panics via `unimplemented!` when either layout does not have exactly
    /// four channels.
    pub(crate) fn create() -> Self {
        /// Shuffle indices that leave every byte where it is.
        const IDENTITY_TABLE: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];

        let src_channels: YuvSourceChannels = SRC.into();
        let dst_channels: YuvSourceChannels = DST.into();
        if src_channels.get_channels_count() != 4 || dst_channels.get_channels_count() != 4 {
            unimplemented!("Shuffle table implemented only for 4 channels");
        }

        let new_table: [u8; 16] = match (src_channels, dst_channels) {
            // Same layout on both sides: swapping channels here would corrupt
            // the output, so bytes must stay in place.
            (YuvSourceChannels::Rgba, YuvSourceChannels::Rgba)
            | (YuvSourceChannels::Bgra, YuvSourceChannels::Bgra) => IDENTITY_TABLE,
            // The swap is its own inverse, so one table serves both directions.
            (YuvSourceChannels::Rgba, YuvSourceChannels::Bgra)
            | (YuvSourceChannels::Bgra, YuvSourceChannels::Rgba) => RGBA_TO_BGRA_TABLE,
            // Three-channel layouts were rejected by the channel-count check above.
            (YuvSourceChannels::Rgb, _)
            | (YuvSourceChannels::Bgr, _)
            | (_, YuvSourceChannels::Rgb)
            | (_, YuvSourceChannels::Bgr) => unreachable!(),
        };

        ShuffleQTableConverterSse { q_table: new_table }
    }
}
impl<const SRC: u8, const DST: u8> ShuffleConverter<u8, SRC, DST>
    for ShuffleQTableConverterSse<SRC, DST>
{
    /// Reorders the bytes of `src` into `dst` with this converter's shuffle
    /// table by delegating to `shuffle_qtable_channels8_impl`.
    ///
    /// `width` is forwarded unchanged; the routine itself ignores it and sizes
    /// its work from the slice lengths.
    fn convert(&self, src: &[u8], dst: &mut [u8], width: usize) {
        // SAFETY: the callee is compiled with `target_feature(enable = "sse4.1")`,
        // so the CPU must support SSE4.1.
        // NOTE(review): presumably guaranteed by whoever selects this converter;
        // confirm at the construction site.
        unsafe {
            shuffle_qtable_channels8_impl::<SRC, DST>(src, dst, width, self.q_table);
        }
    }
}
/// Reorders the bytes of four-channel 8-bit pixels by applying the byte-index
/// table `vq_table` to the data with `_mm_shuffle_epi8`.
///
/// The slices are consumed in four stages, each working on what the previous
/// one left over: 64-byte blocks (four vectors), 16-byte blocks (one vector),
/// 8-byte blocks (low half of a vector), and finally a partial block staged
/// through 16-byte zero-initialised stack buffers.
///
/// The unnamed `usize` parameter is not used.
///
/// # Panics
///
/// Panics if `SRC` or `DST` does not describe a four-channel layout.
///
/// # Safety
///
/// The CPU must support SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn shuffle_qtable_channels8_impl<const SRC: u8, const DST: u8>(
    src: &[u8],
    dst: &mut [u8],
    _: usize,
    vq_table: [u8; 16],
) {
    let src_channels: YuvSourceChannels = SRC.into();
    let dst_channels: YuvSourceChannels = DST.into();
    assert_eq!(src_channels.get_channels_count(), 4);
    assert_eq!(dst_channels.get_channels_count(), 4);

    // The same control mask is applied to every vector below.
    let q_table = _mm_loadu_si128(vq_table.as_ptr() as *const __m128i);

    // Stage 1: four 16-byte vectors per iteration.
    let mut src_x64 = src.chunks_exact(16 * 4);
    let mut dst_x64 = dst.chunks_exact_mut(16 * 4);
    for (block_in, block_out) in src_x64.by_ref().zip(dst_x64.by_ref()) {
        // Typed pointers: `.add(k)` steps by whole 16-byte vectors.
        let in_ptr = block_in.as_ptr() as *const __m128i;
        let out_ptr = block_out.as_mut_ptr() as *mut __m128i;

        let v0 = _mm_shuffle_epi8(_mm_loadu_si128(in_ptr), q_table);
        let v1 = _mm_shuffle_epi8(_mm_loadu_si128(in_ptr.add(1)), q_table);
        let v2 = _mm_shuffle_epi8(_mm_loadu_si128(in_ptr.add(2)), q_table);
        let v3 = _mm_shuffle_epi8(_mm_loadu_si128(in_ptr.add(3)), q_table);

        _mm_storeu_si128(out_ptr, v0);
        _mm_storeu_si128(out_ptr.add(1), v1);
        _mm_storeu_si128(out_ptr.add(2), v2);
        _mm_storeu_si128(out_ptr.add(3), v3);
    }
    let src_rest = src_x64.remainder();
    let dst_rest = dst_x64.into_remainder();

    // Stage 2: one 16-byte vector per iteration.
    let mut src_x16 = src_rest.chunks_exact(16);
    let mut dst_x16 = dst_rest.chunks_exact_mut(16);
    for (block_in, block_out) in src_x16.by_ref().zip(dst_x16.by_ref()) {
        let shuffled =
            _mm_shuffle_epi8(_mm_loadu_si128(block_in.as_ptr() as *const __m128i), q_table);
        _mm_storeu_si128(block_out.as_mut_ptr() as *mut __m128i, shuffled);
    }
    let src_rest = src_x16.remainder();
    let dst_rest = dst_x16.into_remainder();

    // Stage 3: 8 bytes per iteration; only the low 64 bits are stored back.
    let mut src_x8 = src_rest.chunks_exact(8);
    let mut dst_x8 = dst_rest.chunks_exact_mut(8);
    for (block_in, block_out) in src_x8.by_ref().zip(dst_x8.by_ref()) {
        let shuffled = _mm_shuffle_epi8(_xx_load_si64(block_in.as_ptr()), q_table);
        _mm_storeu_si64(block_out.as_mut_ptr(), shuffled);
    }
    let src = src_x8.remainder();
    let dst = dst_x8.into_remainder();

    // Stage 4: nothing to do unless both sides still have bytes left.
    if src.is_empty() || dst.is_empty() {
        return;
    }

    assert!(src.len() < 16);
    assert!(dst.len() < 16);

    let mut transient_src = [0u8; 16];
    let mut transient_dst = [0u8; 16];

    // Only the leading `src.len()` bytes carry data; the rest stays zero.
    transient_src[..src.len()].copy_from_slice(src);

    let shuffled = _mm_shuffle_epi8(_xx_load_si64(transient_src.as_ptr()), q_table);
    _mm_storeu_si64(transient_dst.as_mut_ptr(), shuffled);

    // Hand back exactly as many bytes as the destination tail can hold.
    dst.copy_from_slice(&transient_dst[..dst.len()]);
}