/// NEON lane-interleaving primitives operating on pairs of 128-bit vectors.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
pub mod interleave {
    use core::arch::aarch64::{self, uint32x4_t};

    /// Interleaves two vectors at 32-bit granularity.
    ///
    /// Maps `[a0, a1, a2, a3]`, `[b0, b1, b2, b3]` to
    /// `([a0, b0, a2, b2], [a1, b1, a3, b3])`.
    #[inline]
    #[must_use]
    pub fn interleave_u32(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // SAFETY: this module is only compiled when the `neon` target feature
        // is enabled, so the NEON intrinsics are available.
        unsafe {
            let from_even_lanes = aarch64::vtrn1q_u32(v0, v1);
            let from_odd_lanes = aarch64::vtrn2q_u32(v0, v1);
            (from_even_lanes, from_odd_lanes)
        }
    }

    /// Interleaves two vectors at 64-bit granularity.
    ///
    /// Viewing each input as two 64-bit halves, maps `[a_lo, a_hi]`,
    /// `[b_lo, b_hi]` to `([a_lo, b_lo], [a_hi, b_hi])`.
    #[inline]
    #[must_use]
    pub fn interleave_u64(v0: uint32x4_t, v1: uint32x4_t) -> (uint32x4_t, uint32x4_t) {
        // SAFETY: this module is only compiled when the `neon` target feature
        // is enabled, so the NEON intrinsics are available.
        unsafe {
            // The reinterprets only change the lane width the transposes see.
            let wide0 = aarch64::vreinterpretq_u64_u32(v0);
            let wide1 = aarch64::vreinterpretq_u64_u32(v1);
            let low_halves = aarch64::vtrn1q_u64(wide0, wide1);
            let high_halves = aarch64::vtrn2q_u64(wide0, wide1);
            (
                aarch64::vreinterpretq_u32_u64(low_halves),
                aarch64::vreinterpretq_u32_u64(high_halves),
            )
        }
    }
}
/// AVX2 lane-interleaving primitives operating on pairs of 256-bit vectors.
#[cfg(all(
    target_arch = "x86_64",
    target_feature = "avx2",
    not(target_feature = "avx512f")
))]
pub mod interleave {
    use core::arch::x86_64::{self, __m256i};

    /// Interleaves two vectors at 32-bit granularity.
    ///
    /// Maps `[a0, ..., a7]`, `[b0, ..., b7]` to
    /// `([a0, b0, a2, b2, ...], [a1, b1, a3, b3, ...])`.
    #[inline]
    #[must_use]
    pub fn interleave_u32(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // SAFETY: this module is only compiled when the `avx2` target feature
        // is enabled, so the AVX2 intrinsics are available.
        unsafe {
            // Within every 64-bit lane: move `a`'s odd element down to the even
            // slot, and `b`'s even element up to the odd slot.
            let a_shifted_down = x86_64::_mm256_srli_epi64::<32>(a);
            let b_shifted_up = x86_64::_mm256_slli_epi64::<32>(b);
            // Mask 0b10101010 takes odd 32-bit lanes from the second operand.
            let first = x86_64::_mm256_blend_epi32::<0b10101010>(a, b_shifted_up);
            let second = x86_64::_mm256_blend_epi32::<0b10101010>(a_shifted_down, b);
            (first, second)
        }
    }

    /// Interleaves two vectors at 64-bit granularity.
    ///
    /// Viewing each input as four 64-bit lanes, maps `[a0, a1, a2, a3]`,
    /// `[b0, b1, b2, b3]` to `([a0, b0, a2, b2], [a1, b1, a3, b3])`.
    #[inline]
    #[must_use]
    pub fn interleave_u64(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // SAFETY: this module is only compiled when the `avx2` target feature
        // is enabled, so the AVX2 intrinsics are available.
        unsafe {
            let low_quads = x86_64::_mm256_unpacklo_epi64(a, b);
            let high_quads = x86_64::_mm256_unpackhi_epi64(a, b);
            (low_quads, high_quads)
        }
    }

    /// Interleaves two vectors at 128-bit granularity.
    ///
    /// Viewing each input as two 128-bit halves, maps `[a_lo, a_hi]`,
    /// `[b_lo, b_hi]` to `([a_lo, b_lo], [a_hi, b_hi])`.
    #[inline]
    #[must_use]
    pub fn interleave_u128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
        // SAFETY: this module is only compiled when the `avx2` target feature
        // is enabled, so the AVX2 intrinsics are available.
        unsafe {
            // Selector 0x21 builds `[a_hi, b_lo]`.
            let crossed = x86_64::_mm256_permute2x128_si256::<0x21>(a, b);
            // Mask 0b11110000 takes the upper 128 bits from the second operand.
            let first = x86_64::_mm256_blend_epi32::<0b11110000>(a, crossed);
            let second = x86_64::_mm256_blend_epi32::<0b11110000>(crossed, b);
            (first, second)
        }
    }
}
/// AVX-512 lane-interleaving primitives operating on pairs of 512-bit vectors.
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
pub mod interleave {
    use core::arch::x86_64::{self, __m512i, __mmask8, __mmask16};
    use core::mem::transmute;

    /// Blend mask with every even-indexed bit set (one bit per 32-bit lane).
    const EVENS: __mmask16 = 0b0101010101010101;
    /// Blend mask with alternating groups of four bits set, starting with the lowest group.
    const EVENS4: __mmask16 = 0x0f0f;

    /// Builds the helper vector used by [`interleave_u32`] with a single
    /// 64-bit funnel shift by 32 bits.
    ///
    /// NOTE(review): this is expected to yield the same lanes as the
    /// permute-based fallback (`[x1, y0, x3, y2, ...]`), which holds if
    /// `_mm512_shrdi_epi64(a, b)` treats `b` as the upper half of the
    /// concatenation. Confirm against the stdarch version in use.
    #[cfg(target_feature = "avx512vbmi2")]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        // SAFETY: this variant is only compiled when `avx512vbmi2` is enabled,
        // so the funnel-shift intrinsic is available.
        unsafe { x86_64::_mm512_shrdi_epi64::<32>(x, y) }
    }

    /// Builds the helper vector used by [`interleave_u32`]: 32-bit lanes
    /// `[x1, y0, x3, y2, ..., x15, y14]`.
    #[cfg(not(target_feature = "avx512vbmi2"))]
    #[inline]
    #[must_use]
    fn interleave1_antidiagonal(x: __m512i, y: __m512i) -> __m512i {
        // Index bit 4 (0x10) selects from `y`; the low four bits pick the lane.
        const LANE_PICKS: [u32; 16] = [
            0x01, 0x10, 0x03, 0x12, 0x05, 0x14, 0x07, 0x16, 0x09, 0x18, 0x0b, 0x1a, 0x0d, 0x1c,
            0x0f, 0x1e,
        ];
        // SAFETY: `[u32; 16]` and `__m512i` are both 64 bytes and every bit
        // pattern is a valid `__m512i`.
        const INTERLEAVE1_INDICES: __m512i = unsafe { transmute::<[u32; 16], _>(LANE_PICKS) };

        // SAFETY: the enclosing module is only compiled when `avx512f` is
        // enabled, so the intrinsic is available.
        unsafe { x86_64::_mm512_permutex2var_epi32(x, INTERLEAVE1_INDICES, y) }
    }

    /// Interleaves two vectors at 32-bit granularity.
    ///
    /// Maps `[x0, ..., x15]`, `[y0, ..., y15]` to
    /// `([x0, y0, x2, y2, ...], [x1, y1, x3, y3, ...])`.
    #[inline]
    #[must_use]
    pub fn interleave_u32(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        let crossed = interleave1_antidiagonal(x, y);
        // SAFETY: this module is only compiled when `avx512f` is enabled.
        unsafe {
            // A set mask bit takes the lane from the last operand.
            let first = x86_64::_mm512_mask_blend_epi32(EVENS, crossed, x);
            let second = x86_64::_mm512_mask_blend_epi32(EVENS, y, crossed);
            (first, second)
        }
    }

    /// Applies `_mm512_shuffle_pd::<MASK>` to integer vectors by casting to
    /// the double-precision vector type and back.
    #[inline]
    #[must_use]
    fn shuffle_epi64<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
        // SAFETY: this module is only compiled when `avx512f` is enabled; the
        // casts reinterpret bits without changing them.
        unsafe {
            let shuffled = x86_64::_mm512_shuffle_pd::<MASK>(
                x86_64::_mm512_castsi512_pd(a),
                x86_64::_mm512_castsi512_pd(b),
            );
            x86_64::_mm512_castpd_si512(shuffled)
        }
    }

    /// Interleaves two vectors at 64-bit granularity.
    ///
    /// Viewing each input as eight 64-bit lanes, maps `[x0, ..., x7]`,
    /// `[y0, ..., y7]` to `([x0, y0, x2, y2, ...], [x1, y1, x3, y3, ...])`.
    #[inline]
    #[must_use]
    pub fn interleave_u64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // One mask bit per 64-bit lane; the cast keeps the low eight bits.
        const EVEN_QUADS: __mmask8 = EVENS as __mmask8;

        // 64-bit lanes `[x1, y0, x3, y2, ...]`.
        let crossed = shuffle_epi64::<0b01010101>(x, y);
        // SAFETY: this module is only compiled when `avx512f` is enabled.
        unsafe {
            let first = x86_64::_mm512_mask_blend_epi64(EVEN_QUADS, crossed, x);
            let second = x86_64::_mm512_mask_blend_epi64(EVEN_QUADS, y, crossed);
            (first, second)
        }
    }

    /// Interleaves two vectors at 128-bit granularity.
    ///
    /// Viewing each input as four 128-bit lanes, maps `[x0, x1, x2, x3]`,
    /// `[y0, y1, y2, y3]` to `([x0, y0, x2, y2], [x1, y1, x3, y3])`.
    #[inline]
    #[must_use]
    pub fn interleave_u128(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // 64-bit lane picks; index bit 3 (0o10) selects from `y`.
        const QUAD_PICKS: [u64; 8] = [0o02, 0o03, 0o10, 0o11, 0o06, 0o07, 0o14, 0o15];
        // SAFETY: `[u64; 8]` and `__m512i` are both 64 bytes and every bit
        // pattern is a valid `__m512i`.
        const INTERLEAVE4_INDICES: __m512i = unsafe { transmute::<[u64; 8], _>(QUAD_PICKS) };

        // SAFETY: this module is only compiled when `avx512f` is enabled.
        unsafe {
            let crossed = x86_64::_mm512_permutex2var_epi64(x, INTERLEAVE4_INDICES, y);
            let first = x86_64::_mm512_mask_blend_epi32(EVENS4, crossed, x);
            let second = x86_64::_mm512_mask_blend_epi32(EVENS4, y, crossed);
            (first, second)
        }
    }

    /// Interleaves two vectors at 256-bit granularity.
    ///
    /// Viewing each input as two 256-bit halves, maps `[x_lo, x_hi]`,
    /// `[y_lo, y_hi]` to `([x_lo, y_lo], [x_hi, y_hi])`.
    #[inline]
    #[must_use]
    pub fn interleave_u256(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
        // One mask bit per 64-bit lane; the cast keeps the low eight bits (0x0f).
        const LOW_HALF: __mmask8 = EVENS4 as __mmask8;

        // SAFETY: this module is only compiled when `avx512f` is enabled.
        unsafe {
            // Selector 0b01_00_11_10 builds `[x_hi, y_lo]`.
            let crossed = x86_64::_mm512_shuffle_i64x2::<0b01_00_11_10>(x, y);
            let first = x86_64::_mm512_mask_blend_epi64(LOW_HALF, crossed, x);
            let second = x86_64::_mm512_mask_blend_epi64(LOW_HALF, y, crossed);
            (first, second)
        }
    }
}
/// Generates an `unsafe impl PackedFieldPow2` for a packed type whose
/// `interleave` dispatches on `block_len`.
///
/// Invocation shape accepted by the matcher:
/// `impl_packed_field_pow_2!(Type [, (Bound, Param)]; [(len, func), ...], width)`.
///
/// * Each `(len, func)` pair routes `block_len == len` to `func(v0, v1)`,
///   where `v0`/`v1` come from `to_vector()` on `self` and `other`.
/// * `block_len == width` returns both vectors unchanged.
/// * Any other `block_len` panics with `"unsupported block_len"`.
///
/// The optional `(Bound, Param)` pair makes the impl generic over `Param: Bound`.
#[macro_export]
macro_rules! impl_packed_field_pow_2 {
    (
        $packed:ty
        $(, ($bound:ty, $generic:ty))?
        ; [ $( ($len:expr, $interleaver:ident) ),* $(,)? ],
        $width:expr
    ) => {
        paste::paste! {
            unsafe impl$(<$generic: $bound>)? PackedFieldPow2 for $packed$(<$generic>)? {
                #[inline]
                fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
                    let lhs = self.to_vector();
                    let rhs = other.to_vector();
                    let (first, second) = match block_len {
                        // One arm per supported block length.
                        $( $len => $interleaver(lhs, rhs), )*
                        // Full-width blocks: nothing to swap.
                        $width => (lhs, rhs),
                        _ => panic!("unsupported block_len"),
                    };
                    // NOTE(review): `from_vector` is unsafe; presumably a lane
                    // permutation of valid vectors is itself valid - confirm
                    // against the contract of `from_vector`.
                    unsafe { (Self::from_vector(first), Self::from_vector(second)) }
                }
            }
        }
    };
}
pub use impl_packed_field_pow_2;