#![cfg_attr(docsrs, feature(doc_cfg))]
#![cfg_attr(not(feature = "std"), no_std)]
#[cfg(feature = "large_tables")]
mod large_tables;
mod tables;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(feature = "large_tables")]
use large_tables::mask128_epi8;
use tables::*;
/// Rearranges the bytes of `x` with a shuffle control looked up by `mask`.
///
/// The control is the 16 bytes read from `mask128_epi8` starting at element
/// `16 * mask`; it is applied to `x` with `_mm_shuffle_epi8`.
///
/// NOTE(review): presumably `mask` is a 16-bit set marking the bytes to drop,
/// with the surviving bytes packed toward the low end — confirm against the
/// table in `large_tables`.
///
/// # Safety
///
/// The CPU must support SSSE3. `mask` must select an entry that lies wholly
/// inside the table: only the start of the slice is bounds-checked, while the
/// load always reads 16 bytes.
#[target_feature(enable = "ssse3")]
#[cfg(feature = "large_tables")]
#[cfg_attr(docsrs, doc(cfg(feature = "large_tables")))]
#[inline]
pub unsafe fn prune_epi8(x: __m128i, mask: i32) -> __m128i {
    let first_index = 16 * mask as usize;
    let entry = &mask128_epi8[first_index..];
    let control = _mm_loadu_si128(entry.as_ptr().cast());
    _mm_shuffle_epi8(x, control)
}
/// Shifts the whole 128-bit value `x` left by `count` bytes, where `count` is
/// only known at run time.
///
/// The byte-shift intrinsics require an immediate, so the shift is assembled
/// from two per-64-bit-lane shifts: one moves each lane left, the other
/// recovers the bits that cross from the low lane into the high lane.
///
/// The result is a true 128-bit byte shift for `count` in `0..=8`. Any other
/// `count` yields an all-zero vector, because both lane shifts then receive a
/// shift amount outside `0..=63`.
///
/// # Safety
///
/// The CPU must support SSE2.
#[target_feature(enable = "sse2")]
#[inline]
unsafe fn left_shift_bytes(x: __m128i, count: i32) -> __m128i {
    let bits = i64::from(count) * 8;

    // `_mm_set_epi64x(0, v)` builds the same vector as `_mm_cvtsi64_si128(v)`
    // but, unlike the latter, is also available on 32-bit x86.
    let left_amount = _mm_set_epi64x(0, bits);
    let carry_amount = _mm_set_epi64x(0, 64 - bits);

    // Each 64-bit lane shifted left on its own; bits leaving the low lane are
    // lost here and restored by `carried` below.
    let shifted = _mm_sll_epi64(x, left_amount);

    // Place the low lane of `x` in the high lane (low lane zero) and shift it
    // right so that only the bits crossing the lane boundary remain.
    let low_in_high = _mm_unpacklo_epi64(_mm_setzero_si128(), x);
    let carried = _mm_srl_epi64(low_in_high, carry_amount);

    _mm_or_si128(shifted, carried)
}
/// Rearranges the bytes of `x` using two 8-byte controls from `thintable_epi8`,
/// one per byte of `mask`.
///
/// The control for the low byte of `mask` is used as is. The control for the
/// remaining bits (`mask >> 8`, taken as unsigned) has 8 added to every byte,
/// is moved up by `8 - popcount(mask & 0xFF)` bytes, and is then OR-ed onto
/// the first one. The merged control is applied with `_mm_shuffle_epi8`.
///
/// NOTE(review): presumably set bits in `mask` mark bytes to drop and each
/// table entry lists the surviving byte indices followed by zeros — confirm
/// against `tables`.
///
/// # Safety
///
/// The CPU must support SSSE3. Both `mask & 0xFF` and `mask as u32 >> 8` must
/// be valid indices of `thintable_epi8`: only the start of each slice is
/// bounds-checked, while each load reads 8 bytes.
#[target_feature(enable = "ssse3")]
#[inline]
pub unsafe fn thinprune_epi8(x: __m128i, mask: i32) -> __m128i {
    let low_bits = mask & 0xFF;
    let high_bits = (mask as u32) >> 8;
    // Count of clear bits in the low byte; used as the byte offset at which
    // the second control is merged in.
    let low_clear = 8 - low_bits.count_ones();

    let low_control = _mm_loadl_epi64(thintable_epi8[low_bits as usize..].as_ptr().cast());
    let high_control = _mm_loadl_epi64(thintable_epi8[high_bits as usize..].as_ptr().cast());

    let high_biased = _mm_add_epi8(high_control, _mm_set1_epi8(8));
    let high_placed = left_shift_bytes(high_biased, low_clear as i32);

    _mm_shuffle_epi8(x, _mm_or_si128(high_placed, low_control))
}
/// Rearranges the bytes of `x` in two shuffles driven by small lookup tables.
///
/// First shuffle: the low 8 control bytes come from `thintable_epi8` at index
/// `mask & 0xFF`, the high 8 from the same table at index `mask as u32 >> 8`
/// with 8 added to each byte. Second shuffle: the control is the 16 bytes of
/// `pshufb_combine_table` starting at element
/// `BitsSetTable256mul2[mask & 0xFF] * 8`.
///
/// NOTE(review): presumably the first shuffle compacts each 8-byte half
/// separately and the second closes the gap between the halves — confirm
/// against `tables`.
///
/// # Safety
///
/// The CPU must support SSSE3. `mask & 0xFF` and `mask as u32 >> 8` must be
/// valid indices of `thintable_epi8`, and the selected `pshufb_combine_table`
/// entry must lie wholly inside that table: only slice starts are
/// bounds-checked, while the loads read 8, 8 and 16 bytes respectively.
#[target_feature(enable = "ssse3")]
#[inline]
pub unsafe fn skinnyprune_epi8(x: __m128i, mask: i32) -> __m128i {
    let low_bits = mask & 0xFF;
    let high_bits = (mask as u32) >> 8;
    let low_entry = thintable_epi8[low_bits as usize..].as_ptr().cast();
    let high_entry = thintable_epi8[high_bits as usize..].as_ptr().cast();

    // Assemble one 128-bit control: low 8 bytes from the first entry, high
    // 8 bytes from the second (loaded through the f64 "load high" intrinsic).
    let low_half = _mm_castsi128_pd(_mm_loadl_epi64(low_entry));
    let both_halves = _mm_castpd_si128(_mm_loadh_pd(low_half, high_entry));

    // Add 8 to each of the upper 8 control bytes only.
    let high_bias = _mm_set_epi32(0x0808_0808, 0x0808_0808, 0, 0);
    let first_pass = _mm_shuffle_epi8(x, _mm_add_epi8(both_halves, high_bias));

    let combine_key = BitsSetTable256mul2[low_bits as usize];
    let combine_entry = pshufb_combine_table[combine_key as usize * 8..].as_ptr().cast();
    _mm_shuffle_epi8(first_pass, _mm_loadu_si128(combine_entry))
}
/// Rearranges the bytes of `x` with a shuffle control looked up by `mask`.
///
/// The control is the 16 bytes read from `mask128_epi16` starting at element
/// `16 * mask`; it is applied to `x` with `_mm_shuffle_epi8`.
///
/// NOTE(review): presumably `mask` is an 8-bit set marking the 16-bit lanes to
/// drop, with the survivors packed toward the low end — confirm against
/// `tables`.
///
/// # Safety
///
/// The CPU must support SSSE3. `mask` must select an entry that lies wholly
/// inside the table: only the start of the slice is bounds-checked, while the
/// load always reads 16 bytes.
#[target_feature(enable = "ssse3")]
#[inline]
pub unsafe fn prune_epi16(x: __m128i, mask: i32) -> __m128i {
    let first_index = 16 * mask as usize;
    let entry = &mask128_epi16[first_index..];
    let control = _mm_loadu_si128(entry.as_ptr().cast());
    _mm_shuffle_epi8(x, control)
}
/// Rearranges the bytes of `x` with a shuffle control looked up by `mask`.
///
/// The control is the 16 bytes read from `mask128_epi32` starting at element
/// `16 * mask`; it is applied to `x` with `_mm_shuffle_epi8`.
///
/// NOTE(review): presumably `mask` is a 4-bit set marking the 32-bit lanes to
/// drop, with the survivors packed toward the low end — confirm against
/// `tables`.
///
/// # Safety
///
/// The CPU must support SSSE3. `mask` must select an entry that lies wholly
/// inside the table: only the start of the slice is bounds-checked, while the
/// load always reads 16 bytes.
#[target_feature(enable = "ssse3")]
#[inline]
pub unsafe fn prune_epi32(x: __m128i, mask: i32) -> __m128i {
    let first_index = 16 * mask as usize;
    let entry = &mask128_epi32[first_index..];
    let control = _mm_loadu_si128(entry.as_ptr().cast());
    _mm_shuffle_epi8(x, control)
}
/// Single-precision float counterpart of `prune_epi32`.
///
/// `x` is reinterpreted as an integer vector, handed to `prune_epi32` together
/// with `mask`, and the result is reinterpreted back as floats. The casts do
/// not change any bits.
///
/// # Safety
///
/// Same requirements as `prune_epi32`: the CPU must support SSSE3 and `mask`
/// must be valid for that function.
#[inline]
#[target_feature(enable = "ssse3")]
pub unsafe fn prune_ps(x: __m128, mask: i32) -> __m128 {
    let as_integers = _mm_castps_si128(x);
    let pruned = prune_epi32(as_integers, mask);
    _mm_castsi128_ps(pruned)
}
/// Rearranges the 32-bit lanes of `x` with a permutation looked up by `mask`.
///
/// The permutation is the 32 bytes read from `mask256_epi32` starting at
/// element `8 * mask`; it is applied to `x` with
/// `_mm256_permutevar8x32_epi32`.
///
/// NOTE(review): presumably `mask` is an 8-bit set marking the lanes to drop,
/// with the survivors packed toward the low end — confirm against `tables`.
///
/// # Safety
///
/// The CPU must support AVX2. `mask` must select an entry that lies wholly
/// inside the table: only the start of the slice is bounds-checked, while the
/// load always reads 32 bytes.
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn prune256_epi32(x: __m256i, mask: i32) -> __m256i {
    let first_index = 8 * mask as usize;
    let entry = &mask256_epi32[first_index..];
    let lane_order = _mm256_loadu_si256(entry.as_ptr().cast());
    _mm256_permutevar8x32_epi32(x, lane_order)
}
/// Rearranges the single-precision lanes of `x` with a permutation looked up
/// by `mask`.
///
/// The permutation is the 32 bytes read from `mask256_epi32` starting at
/// element `8 * mask`; it is applied to `x` with `_mm256_permutevar8x32_ps`.
///
/// NOTE(review): presumably `mask` is an 8-bit set marking the lanes to drop,
/// with the survivors packed toward the low end — confirm against `tables`.
///
/// # Safety
///
/// The CPU must support AVX2. `mask` must select an entry that lies wholly
/// inside the table: only the start of the slice is bounds-checked, while the
/// load always reads 32 bytes.
#[inline]
#[target_feature(enable = "avx2")]
pub unsafe fn prune256_ps(x: __m256, mask: i32) -> __m256 {
    let first_index = 8 * mask as usize;
    let entry = &mask256_epi32[first_index..];
    let lane_order = _mm256_loadu_si256(entry.as_ptr().cast());
    _mm256_permutevar8x32_ps(x, lane_order)
}
/// Packs the selected 32-bit lanes of `src` into the lowest lanes of the
/// result, computing the permutation with BMI2 instead of a lookup table.
///
/// Bit `i` of `mask` selects lane `i`. Lanes whose bit is **set** are kept, in
/// their original order, starting at lane 0 of the result. Every remaining
/// result lane receives a copy of lane 0 of `src`, because its permutation
/// index is left at zero.
///
/// Only available on x86_64: `_pdep_u64`, `_pext_u64` and `_mm_cvtsi64_si128`
/// do not exist in `core::arch::x86`.
///
/// # Panics
///
/// Panics if `mask` is not below 256.
///
/// # Safety
///
/// The CPU must support AVX2 and BMI2.
#[cfg(target_arch = "x86_64")]
#[cfg_attr(docsrs, doc(cfg(target_arch = "x86_64")))]
#[target_feature(enable = "avx2,bmi2")]
#[inline]
pub unsafe fn pext_prune256_epi32(src: __m256i, mask: u64) -> __m256i {
    assert!(mask < 1 << 8);

    // Spread the 8 mask bits to the lowest bit of each byte, then widen every
    // set bit to a full 0xFF byte. The product cannot overflow: the largest
    // deposit is 0x0101_0101_0101_0101, and that times 0xFF is u64::MAX.
    let byte_select = _pdep_u64(mask, 0x0101_0101_0101_0101) * 0xFF;

    // Byte `i` holds lane index `i`; extracting the selected bytes leaves the
    // wanted indices packed at the low end, followed by zero bytes.
    let identity_indices: u64 = 0x0706_0504_0302_0100;
    let packed_indices = _pext_u64(identity_indices, byte_select);

    // Widen the eight byte-sized indices to the 32-bit lanes that the
    // cross-lane permute expects.
    let index_bytes = _mm_cvtsi64_si128(packed_indices as i64);
    let lane_order = _mm256_cvtepu8_epi32(index_bytes);
    _mm256_permutevar8x32_epi32(src, lane_order)
}