#[cfg(target_arch = "x86")]
use core::arch::x86::{__m128i, _mm_set_epi8, _mm_shuffle_epi8};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m128i, _mm_set_epi8, _mm_shuffle_epi8};
/// Permutes the four bytes of every `[u8; 4]` element in place using SSSE3 `pshufb`.
///
/// Output byte `i` of each element is taken from input byte `idx[i]`. In the vectorised
/// path an index outside `0..4` produces a zero byte. The parts of the slice that cannot
/// be viewed as `__m128i` are handed to `super::ShuffleOps::shuffle_u8x4`.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn shuffle_u8x4(u8s: &mut [[u8; 4]], idx: [u8; 4]) {
    // Valid lanes keep their index; anything else becomes 0x80 (bit 7 set), which
    // `pshufb` interprets as "write zero". Adding 4/8/12 below keeps bit 7 set.
    let [s0, s1, s2, s3] = idx.map(|lane| if lane < 4 { lane as i8 } else { i8::MIN });

    // One register holds four `[u8; 4]` elements; the same pattern is repeated for
    // each of them with the matching byte offset. Arguments run from byte 15 to byte 0.
    #[rustfmt::skip]
    let mask = _mm_set_epi8(
        s3 + 12, s2 + 12, s1 + 12, s0 + 12,
        s3 + 8,  s2 + 8,  s1 + 8,  s0 + 8,
        s3 + 4,  s2 + 4,  s1 + 4,  s0 + 4,
        s3,      s2,      s1,      s0,
    );

    let (head, body, tail) = bytemuck::pod_align_to_mut::<_, __m128i>(u8s);

    if !head.is_empty() {
        super::ShuffleOps::shuffle_u8x4(head, idx);
    }

    // Process two registers per iteration, then the odd one out (if any).
    let mut pairs = body.chunks_exact_mut(2);
    for pair in pairs.by_ref() {
        if let [lo, hi] = pair {
            let shuffled_lo = _mm_shuffle_epi8(*lo, mask);
            let shuffled_hi = _mm_shuffle_epi8(*hi, mask);
            *lo = shuffled_lo;
            *hi = shuffled_hi;
        }
    }
    if let [last] = pairs.into_remainder() {
        *last = _mm_shuffle_epi8(*last, mask);
    }

    if !tail.is_empty() {
        super::ShuffleOps::shuffle_u8x4(tail, idx);
    }
}
/// Permutes the four `u16` lanes of every `[u16; 4]` element in place using SSSE3 `pshufb`.
///
/// Output lane `i` of each element is taken from input lane `idx[i]`. In the vectorised
/// path an index outside `0..4` zeroes that lane. The parts of the slice that cannot be
/// viewed as `__m128i` are handed to `super::ShuffleOps::shuffle_u16x4`.
/// NOTE(review): confirm the fallback treats out-of-range indices the same way.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
pub unsafe fn shuffle_u16x4(u16s: &mut [[u16; 4]], idx: [u8; 4]) {
    // Builds one `pshufb` control byte: byte `offset` of the `lane`-th u16 within the
    // register, or 0x80 (bit 7 set => output byte is zeroed) for an out-of-range lane.
    //
    // The sentinel must be produced here rather than fed through `2 * x + offset` in
    // `i8`: `2 * -128` overflows, which panics in debug builds and wraps to 0 in
    // release builds, silently selecting lane 0 instead of zero.
    fn byte_selector(lane: u8, offset: u8) -> i8 {
        if lane < 4 {
            // At most 2 * 3 + 9 == 15, so the cast is lossless.
            (2 * lane + offset) as i8
        } else {
            i8::MIN
        }
    }

    let (pre, aligned, post) = bytemuck::pod_align_to_mut::<_, __m128i>(u16s);
    if !pre.is_empty() {
        super::ShuffleOps::shuffle_u16x4(pre, idx);
    }

    let [a, b, c, d] = idx;
    // One register holds two `[u16; 4]` elements: bytes 0..8 and bytes 8..16.
    // Arguments run from byte 15 down to byte 0.
    #[rustfmt::skip]
    let shuffle = _mm_set_epi8(
        byte_selector(d, 9), byte_selector(d, 8), byte_selector(c, 9), byte_selector(c, 8),
        byte_selector(b, 9), byte_selector(b, 8), byte_selector(a, 9), byte_selector(a, 8),
        byte_selector(d, 1), byte_selector(d, 0), byte_selector(c, 1), byte_selector(c, 0),
        byte_selector(b, 1), byte_selector(b, 0), byte_selector(a, 1), byte_selector(a, 0),
    );

    // Process two registers per iteration, then the odd one out (if any).
    let mut chunks = aligned.chunks_exact_mut(2);
    for vec in &mut chunks {
        let lo = _mm_shuffle_epi8(vec[0], shuffle);
        let hi = _mm_shuffle_epi8(vec[1], shuffle);
        vec[0] = lo;
        vec[1] = hi;
    }
    for vec in chunks.into_remainder() {
        *vec = _mm_shuffle_epi8(*vec, shuffle);
    }

    if !post.is_empty() {
        super::ShuffleOps::shuffle_u16x4(post, idx);
    }
}