#[cfg(target_arch = "x86")]
use core::arch::x86 as simd;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as simd;
use std::mem;
use simd::{
__m128i,
__m256i,
_mm256_blend_epi32,
_mm256_loadu2_m128i,
_mm256_loadu_si256,
_mm256_permute2x128_si256,
_mm256_permute4x64_epi64,
_mm256_set_epi8,
_mm256_shuffle_epi8,
_mm256_storeu2_m128i,
_mm256_storeu_si256,
_mm256_unpackhi_epi16,
_mm256_unpackhi_epi32,
_mm256_unpackhi_epi64,
_mm256_unpackhi_epi8,
_mm256_unpacklo_epi16,
_mm256_unpacklo_epi32,
_mm256_unpacklo_epi64,
_mm256_unpacklo_epi8,
};
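// Byte widths of one 256-bit and one 128-bit vector (32 and 16 bytes).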
const SO256I: usize = mem::size_of::<__m256i>();
const SO128I: usize = mem::size_of::<__m128i>();
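/// Byte-shuffle for 2-byte elements: each iteration reads 32 elements (two
/// 256-bit vectors), gathers the low bytes of all elements into one vector
/// and the high bytes into another, and writes the two byte planes at
/// `dst[j]` and `dst[j + total_elements]`.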
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn shuffle2(
vectorizable_elements: usize,
total_elements: usize,
src: *const u8,
dst: *mut u8,
) {
const TS: usize = 2;
let mut ymm0: [__m256i; 2] = mem::zeroed();
let mut ymm1: [__m256i; 2] = mem::zeroed();
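// Within each 128-bit lane: even-indexed bytes (low byte of each element)
// go to the lower half, odd-indexed bytes (high byte) to the upper half.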
#[rustfmt::skip]
let shmask = _mm256_set_epi8(
0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00,
0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01,
0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
for j in (0..vectorizable_elements).step_by(SO256I) {
for k in 0..2 {
let p = src.add(j * TS + k * SO256I) as *const __m256i;
ymm0[k] = _mm256_loadu_si256(p);
ymm1[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
}
ymm0[0] = _mm256_permute4x64_epi64(ymm1[0], 0xd8);
ymm0[1] = _mm256_permute4x64_epi64(ymm1[1], 0x8d);
ymm1[0] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0xf0);
ymm0[1] = _mm256_blend_epi32(ymm0[0], ymm0[1], 0x0f);
ymm1[1] = _mm256_permute4x64_epi64(ymm0[1], 0x4e);
for k in 0..2 {
let p = dst.add(j + k * total_elements) as *mut __m256i;
_mm256_storeu_si256(p, ymm1[k]);
}
}
}
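/// Byte-shuffle for 16-byte elements: each iteration transposes a 32x16-byte
/// tile (32 elements, 512 bytes) with a network of 8/16/32/64-bit unpacks,
/// then a cross-lane permute and an in-lane byte shuffle, storing one 32-byte
/// slice of each of the 16 byte planes.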
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn shuffle16(
vectorizable_elements: usize,
total_elements: usize,
src: *const u8,
dst: *mut u8,
) {
const TS: usize = 16;
let mut ymm0: [__m256i; 16] = mem::zeroed();
let mut ymm1: [__m256i; 16] = mem::zeroed();
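// Within each 128-bit lane: interleave the lower 8 bytes with the upper 8
// bytes (0, 8, 1, 9, ...), finishing the transpose after the unpack network.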
#[rustfmt::skip]
let shmask: __m256i = _mm256_set_epi8(
0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
for j in (0..vectorizable_elements).step_by(SO256I) {
for k in 0..16 {
let p = src.add(j * TS + k * SO256I) as *const __m256i;
ymm0[k] = _mm256_loadu_si256(p);
}
for k in 0..8 {
let l = k * 2;
ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
}
let mut l = 0;
for k in 0..8 {
ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
l += 1;
if k % 2 == 1 {
l += 2;
}
}
l = 0;
for k in 0..8 {
ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
l += 1;
if k % 4 == 3 {
l += 4;
}
}
for k in 0..8 {
ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
}
for k in 0..16 {
ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
}
for k in 0..16 {
let p = dst.add(j + k * total_elements) as *mut __m256i;
_mm256_storeu_si256(p, ymm0[k]);
}
}
}
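/// Byte-shuffle for element sizes larger than 16 bytes: each element is
/// processed as 16-byte columns using the same transpose network as
/// `shuffle16`. When `ts` is not a multiple of 16, the first step advances by
/// only `ts % 16`, so later columns overlap slightly but end exactly at the
/// element boundary.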
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn shuffle_tiled(
vectorizable_elements: usize,
total_elements: usize,
ts: usize,
src: *const u8,
dst: *mut u8,
) {
let mut ymm0: [__m256i; 16] = mem::zeroed();
let mut ymm1: [__m256i; 16] = mem::zeroed();
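// Leftover bytes when `ts` is not a multiple of a 128-bit vector.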
let vecs_rem = ts % SO128I;
#[rustfmt::skip]
let shmask = _mm256_set_epi8(
0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);
for j in (0..vectorizable_elements).step_by(SO256I) {
let mut offset_into_type = 0;
while offset_into_type < ts {
for k in 0..16 {
let p0 = src.add(offset_into_type + (j + 2 * k + 1) * ts) as *const __m128i;
let p1 = src.add(offset_into_type + (j + 2 * k) * ts) as *const __m128i;
ymm0[k] = _mm256_loadu2_m128i(p0, p1);
}
for k in 0..8 {
let l = 2 * k;
ymm1[k * 2] = _mm256_unpacklo_epi8(ymm0[l], ymm0[l + 1]);
ymm1[k * 2 + 1] = _mm256_unpackhi_epi8(ymm0[l], ymm0[l + 1]);
}
let mut l = 0;
for k in 0..8 {
ymm0[k * 2] = _mm256_unpacklo_epi16(ymm1[l], ymm1[l + 2]);
ymm0[k * 2 + 1] = _mm256_unpackhi_epi16(ymm1[l], ymm1[l + 2]);
l += 1;
if k % 2 == 1 {
l += 2;
}
}
l = 0;
for k in 0..8 {
ymm1[k * 2] = _mm256_unpacklo_epi32(ymm0[l], ymm0[l + 4]);
ymm1[k * 2 + 1] = _mm256_unpackhi_epi32(ymm0[l], ymm0[l + 4]);
l += 1;
if k % 4 == 3 {
l += 4;
}
}
for k in 0..8 {
ymm0[k * 2] = _mm256_unpacklo_epi64(ymm1[k], ymm1[k + 8]);
ymm0[k * 2 + 1] = _mm256_unpackhi_epi64(ymm1[k], ymm1[k + 8]);
}
for k in 0..16 {
ymm0[k] = _mm256_permute4x64_epi64(ymm0[k], 0xd8);
ymm0[k] = _mm256_shuffle_epi8(ymm0[k], shmask);
}
for k in 0..16 {
let p = dst.add(j + total_elements * (offset_into_type + k)) as *mut __m256i;
_mm256_storeu_si256(p, ymm0[k]);
}
offset_into_type += if offset_into_type == 0 && vecs_rem > 0 {
vecs_rem
} else {
SO128I
};
}
}
}
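/// Shuffle (byte-transpose) `len` bytes of `typesize`-byte elements from
/// `src` into `dst`, dispatching to the AVX2 kernel matching `typesize`.
/// Inputs shorter than one vectorized chunk fall back to the SSE2 kernel,
/// element sizes without an AVX2 kernel fall back to the generic one, and any
/// non-vectorizable tail is finished by `generic::shuffle_partial`.
///
/// # Safety
///
/// The caller must ensure AVX2 is available, `typesize` is nonzero, and that
/// `src` and `dst` each point to `len` valid, non-overlapping bytes.
///
/// A minimal sketch of a safe wrapper (hypothetical, not part of this crate):
///
/// ```ignore
/// fn shuffle_checked(typesize: usize, src: &[u8], dst: &mut [u8]) {
///     assert!(is_x86_feature_detected!("avx2"));
///     assert_eq!(src.len(), dst.len());
///     assert_eq!(src.len() % typesize, 0);
///     unsafe { shuffle(typesize, src.len(), src.as_ptr(), dst.as_mut_ptr()) }
/// }
/// ```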
pub unsafe fn shuffle(typesize: usize, len: usize, src: *const u8, dst: *mut u8) {
let vectorized_chunk_size = typesize * mem::size_of::<__m256i>();
let vectorizable_bytes = len - (len % vectorized_chunk_size);
let vectorizable_elements = vectorizable_bytes / typesize;
let total_elements = len / typesize;
if len < vectorized_chunk_size {
crate::sse2::shuffle(typesize, len, src, dst);
return;
}
if typesize == 2 {
shuffle2(vectorizable_elements, total_elements, src, dst);
} else if typesize == 16 {
shuffle16(vectorizable_elements, total_elements, src, dst);
} else if typesize > SO128I {
shuffle_tiled(vectorizable_elements, total_elements, typesize, src, dst);
} else {
crate::generic::shuffle(typesize, len, src, dst);
return;
}
if vectorizable_bytes < len {
crate::generic::shuffle_partial(typesize, vectorizable_bytes, len, src, dst);
}
}
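/// Inverse of `shuffle2`: read 32 bytes from each of the two byte planes,
/// then interleave them back into 32 contiguous 2-byte elements.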
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn unshuffle2(
vectorizable_elements: usize,
total_elements: usize,
src: *const u8,
dst: *mut u8,
) {
const TS: usize = 2;
let mut ymm0: [__m256i; 2] = mem::zeroed();
let mut ymm1: [__m256i; 2] = mem::zeroed();
for i in (0..vectorizable_elements).step_by(SO256I) {
for j in 0..2 {
ymm0[j] = _mm256_loadu_si256(src.add(i + (j * total_elements)) as *const __m256i);
}
for j in 0..2 {
ymm0[j] = _mm256_permute4x64_epi64(ymm0[j], 0xd8);
}
ymm1[0] = _mm256_unpacklo_epi8(ymm0[0], ymm0[1]);
ymm1[1] = _mm256_unpackhi_epi8(ymm0[0], ymm0[1]);
#[allow(clippy::erasing_op)]
_mm256_storeu_si256(dst.add(i * TS + 0 * SO256I) as *mut __m256i, ymm1[0]);
#[allow(clippy::identity_op)]
_mm256_storeu_si256(dst.add(i * TS + 1 * SO256I) as *mut __m256i, ymm1[1]);
}
}
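/// Inverse of `shuffle16`: load one 32-byte slice from each of the 16 byte
/// planes, run the unpack network in reverse, and reassemble 32 contiguous
/// 16-byte elements.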
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn unshuffle16(
vectorizable_elements: usize,
total_elements: usize,
src: *const u8,
dst: *mut u8,
) {
const TS: usize = 16;
let mut ymm0: [__m256i; 16] = mem::zeroed();
let mut ymm1: [__m256i; 16] = mem::zeroed();
for i in (0..vectorizable_elements).step_by(SO256I) {
for j in 0..16 {
ymm0[j] = _mm256_loadu_si256(src.add(i + (j * total_elements)) as *const __m256i);
}
for j in 0..8 {
ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
}
for j in 0..8 {
ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
}
for j in 0..8 {
ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
}
for j in 0..8 {
ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
}
for j in 0..8 {
ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
}
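// The unpack network leaves rows out of order, so the stores below write
// them back in element order (taking registers 0, 4, 2, 6, 1, 5, 3, 7, ...).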
#[allow(clippy::erasing_op)]
#[allow(clippy::identity_op)]
{
_mm256_storeu_si256(dst.add(i * TS + 0 * SO256I) as *mut __m256i, ymm1[0]);
_mm256_storeu_si256(dst.add(i * TS + 1 * SO256I) as *mut __m256i, ymm1[4]);
_mm256_storeu_si256(dst.add(i * TS + 2 * SO256I) as *mut __m256i, ymm1[2]);
_mm256_storeu_si256(dst.add(i * TS + 3 * SO256I) as *mut __m256i, ymm1[6]);
_mm256_storeu_si256(dst.add(i * TS + 4 * SO256I) as *mut __m256i, ymm1[1]);
_mm256_storeu_si256(dst.add(i * TS + 5 * SO256I) as *mut __m256i, ymm1[5]);
_mm256_storeu_si256(dst.add(i * TS + 6 * SO256I) as *mut __m256i, ymm1[3]);
_mm256_storeu_si256(dst.add(i * TS + 7 * SO256I) as *mut __m256i, ymm1[7]);
_mm256_storeu_si256(dst.add(i * TS + 8 * SO256I) as *mut __m256i, ymm1[8]);
_mm256_storeu_si256(dst.add(i * TS + 9 * SO256I) as *mut __m256i, ymm1[12]);
_mm256_storeu_si256(dst.add(i * TS + 10 * SO256I) as *mut __m256i, ymm1[10]);
_mm256_storeu_si256(dst.add(i * TS + 11 * SO256I) as *mut __m256i, ymm1[14]);
_mm256_storeu_si256(dst.add(i * TS + 12 * SO256I) as *mut __m256i, ymm1[9]);
_mm256_storeu_si256(dst.add(i * TS + 13 * SO256I) as *mut __m256i, ymm1[13]);
_mm256_storeu_si256(dst.add(i * TS + 14 * SO256I) as *mut __m256i, ymm1[11]);
_mm256_storeu_si256(dst.add(i * TS + 15 * SO256I) as *mut __m256i, ymm1[15]);
}
}
}
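/// Inverse of `shuffle_tiled`: reassemble elements larger than 16 bytes one
/// 16-byte column at a time, storing each reconstructed row as two 128-bit
/// halves straddling consecutive elements.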
#[allow(clippy::needless_range_loop)] #[target_feature(enable = "avx2")]
unsafe fn unshuffle_tiled(
vectorizable_elements: usize,
total_elements: usize,
ts: usize,
src: *const u8,
dst: *mut u8,
) {
let mut ymm0: [__m256i; 16] = mem::zeroed();
let mut ymm1: [__m256i; 16] = mem::zeroed();
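// Leftover bytes when `ts` is not a multiple of a 128-bit vector; the first
// column covers them so later columns stay 16-byte strides.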
let vecs_rem = ts % SO128I;
let mut off_in_ty = 0;
while off_in_ty < ts {
for i in (0..vectorizable_elements).step_by(SO256I) {
for j in 0..16 {
let p = src.add(i + total_elements * (off_in_ty + j)) as *const __m256i;
ymm0[j] = _mm256_loadu_si256(p);
}
for j in 0..8 {
ymm1[j] = _mm256_unpacklo_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
ymm1[8 + j] = _mm256_unpackhi_epi8(ymm0[j * 2], ymm0[j * 2 + 1]);
}
for j in 0..8 {
ymm0[j] = _mm256_unpacklo_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
ymm0[8 + j] = _mm256_unpackhi_epi16(ymm1[j * 2], ymm1[j * 2 + 1]);
}
for j in 0..8 {
ymm1[j] = _mm256_unpacklo_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
ymm1[8 + j] = _mm256_unpackhi_epi32(ymm0[j * 2], ymm0[j * 2 + 1]);
}
for j in 0..8 {
ymm0[j] = _mm256_unpacklo_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
ymm0[8 + j] = _mm256_unpackhi_epi64(ymm1[j * 2], ymm1[j * 2 + 1]);
}
for j in 0..8 {
ymm1[j] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x20);
ymm1[j + 8] = _mm256_permute2x128_si256(ymm0[j], ymm0[j + 8], 0x31);
}
#[allow(clippy::identity_op)]
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 1) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 0) * ts) as *mut __m128i,
ymm1[0],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 3) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 2) * ts) as *mut __m128i,
ymm1[4],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 5) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 4) * ts) as *mut __m128i,
ymm1[2],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 7) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 6) * ts) as *mut __m128i,
ymm1[6],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 9) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 8) * ts) as *mut __m128i,
ymm1[1],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 11) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 10) * ts) as *mut __m128i,
ymm1[5],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 13) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 12) * ts) as *mut __m128i,
ymm1[3],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 15) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 14) * ts) as *mut __m128i,
ymm1[7],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 17) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 16) * ts) as *mut __m128i,
ymm1[8],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 19) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 18) * ts) as *mut __m128i,
ymm1[12],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 21) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 20) * ts) as *mut __m128i,
ymm1[10],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 23) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 22) * ts) as *mut __m128i,
ymm1[14],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 25) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 24) * ts) as *mut __m128i,
ymm1[9],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 27) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 26) * ts) as *mut __m128i,
ymm1[13],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 29) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 28) * ts) as *mut __m128i,
ymm1[11],
);
_mm256_storeu2_m128i(
dst.add(off_in_ty + (i + 31) * ts) as *mut __m128i,
dst.add(off_in_ty + (i + 30) * ts) as *mut __m128i,
ymm1[15],
);
}
off_in_ty += if off_in_ty == 0 && vecs_rem > 0 {
vecs_rem
} else {
SO128I
};
}
}
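/// Inverse of `shuffle`: reassemble `typesize`-byte elements from their byte
/// planes. Short inputs and element sizes without an AVX2 kernel fall back to
/// the generic implementation; the non-vectorizable tail is finished by
/// `generic::unshuffle_partial`.
///
/// # Safety
///
/// Same contract as [`shuffle`]: AVX2 must be available, `typesize` must be
/// nonzero, and `src` and `dst` must each point to `len` valid,
/// non-overlapping bytes.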
pub unsafe fn unshuffle(typesize: usize, len: usize, src: *const u8, dst: *mut u8) {
let vectorized_chunk_size = typesize * mem::size_of::<__m256i>();
let vectorizable_bytes = len - (len % vectorized_chunk_size);
let vectorizable_elements = vectorizable_bytes / typesize;
let total_elements = len / typesize;
if len < vectorized_chunk_size {
crate::generic::unshuffle(typesize, len, src, dst);
return;
}
if typesize == 2 {
unshuffle2(vectorizable_elements, total_elements, src, dst);
} else if typesize == 16 {
unshuffle16(vectorizable_elements, total_elements, src, dst);
} else if typesize > SO128I {
unshuffle_tiled(vectorizable_elements, total_elements, typesize, src, dst);
} else {
crate::generic::unshuffle(typesize, len, src, dst);
return;
}
if vectorizable_bytes < len {
crate::generic::unshuffle_partial(typesize, vectorizable_bytes, len, src, dst);
}
}
#[cfg(test)]
mod t {
macro_rules! require_avx2 {
() => {
if !is_x86_feature_detected!("avx2") {
eprintln!("Skipping: AVX2 unavailable.");
return;
}
};
}
mod shuffle {
use rand::Rng;
use rstest::rstest;
#[rstest]
#[case(16, 256)]
#[case(16, 4096)]
#[case(16, 4352)]
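// Mirror the unshuffle cases below to also exercise the 2-byte and tiled
// (typesize > 16) shuffle kernels.
#[case(2, 64)]
#[case(2, 4096)]
#[case(17, 272)]
#[case(17, 4096)]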
fn compare(#[case] typesize: usize, #[case] len: usize) {
require_avx2!();
let mut rng = rand::rng();
let src = (0..len).map(|_| rng.random()).collect::<Vec<u8>>();
let mut generic_dst = vec![0u8; len];
let mut avx2_dst = vec![0u8; len];
unsafe {
crate::generic::shuffle(typesize, len, src.as_ptr(), generic_dst.as_mut_ptr());
crate::avx2::shuffle(typesize, len, src.as_ptr(), avx2_dst.as_mut_ptr());
}
assert_eq!(generic_dst, avx2_dst);
}
#[rstest]
fn compare16x256() {
require_avx2!();
let typesize = 16;
let len = 256;
let src = (0..=255).collect::<Vec<u8>>();
let mut generic_dst = vec![0u8; len];
let mut avx2_dst = vec![0u8; len];
unsafe {
crate::generic::shuffle(typesize, len, src.as_ptr(), generic_dst.as_mut_ptr());
crate::avx2::shuffle(typesize, len, src.as_ptr(), avx2_dst.as_mut_ptr());
}
assert_eq!(generic_dst, avx2_dst);
}
#[rstest]
fn compare16x512() {
require_avx2!();
let typesize = 16;
let len = 512;
let src = (0..len).map(|i| (i % 256) as u8).collect::<Vec<u8>>();
let mut generic_dst = vec![0u8; len];
let mut avx2_dst = vec![0u8; len];
let srcp = src.as_ptr();
unsafe {
crate::generic::shuffle(typesize, len, srcp, generic_dst.as_mut_ptr());
crate::avx2::shuffle(typesize, len, srcp, avx2_dst.as_mut_ptr());
}
assert_eq!(generic_dst, avx2_dst);
}
}
mod unshuffle {
use rand::Rng;
use rstest::rstest;
#[rstest]
#[case(2, 64)]
#[case(2, 4096)]
#[case(16, 512)]
#[case(16, 4096)]
#[case(17, 272)]
#[case(17, 4096)]
fn compare(#[case] typesize: usize, #[case] len: usize) {
require_avx2!();
let mut rng = rand::rng();
let src = (0..len).map(|_| rng.random()).collect::<Vec<u8>>();
let mut generic_dst = vec![0u8; len];
let mut avx2_dst = vec![0u8; len];
unsafe {
crate::generic::unshuffle(typesize, len, src.as_ptr(), generic_dst.as_mut_ptr());
super::super::unshuffle(typesize, len, src.as_ptr(), avx2_dst.as_mut_ptr());
}
assert_eq!(generic_dst, avx2_dst);
}
}
}