#![cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Permutation control words for left-packing four 64-bit elements.
///
/// The table is indexed by a 4-bit selection mask (bit `k` set means element
/// `k` is kept). Each row holds eight 32-bit lane indices: a kept element `k`
/// contributes the lane pair `(2k, 2k + 1)`, and the pairs are laid out
/// contiguously from the start of the row in ascending element order. Entries
/// past the last kept pair stay `0`.
pub(crate) const LEFTPACK_LUT: [[i32; 8]; 16] = {
    let mut table = [[0i32; 8]; 16];
    let mut mask = 0usize;
    // `for` loops are not usable in const evaluation, hence the `while` loops.
    while mask < 16 {
        // Walk the set bits of `mask` from lowest to highest.
        let mut pending = mask;
        let mut slot = 0usize;
        while pending != 0 {
            let elem = pending.trailing_zeros() as i32;
            table[mask][slot] = 2 * elem;
            table[mask][slot + 1] = 2 * elem + 1;
            slot += 2;
            // Clear the lowest set bit that was just handled.
            pending &= pending - 1;
        }
        mask += 1;
    }
    table
};
/// Left-packs the elements of a 4-element block selected by `mask`.
///
/// Loads 32 bytes from `src`, moves the elements whose bit is set in the low
/// four bits of `mask` to the front (in ascending element order) using the
/// control row `LEFTPACK_LUT[mask & 0xF]`, and stores the full 32-byte result
/// to `dst`. Bits of `mask` above bit 3 are ignored.
///
/// Returns the number of selected elements, i.e. the population count of
/// `mask & 0xF`. The store always writes all 32 bytes; the content after the
/// packed prefix is filler produced by the table's zero entries and should
/// not be relied upon.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be valid for an (unaligned) read of 32 bytes.
/// * `dst` must be valid for an (unaligned) write of 32 bytes.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn leftpack4(src: *const usize, mask: u32, dst: *mut usize) -> usize {
    // Only the low nibble participates; it doubles as the LUT row index.
    let selector = (mask & 0xF) as usize;
    let kept = selector.count_ones() as usize;

    // SAFETY: the caller guarantees AVX2 and that `src`/`dst` cover 32 bytes
    // each. `selector` is < 16, so the LUT row exists and is exactly 32 bytes
    // (8 x i32); all loads/stores are the unaligned variants.
    unsafe {
        let shuffle = _mm256_loadu_si256(LEFTPACK_LUT[selector].as_ptr().cast::<__m256i>());
        let input = _mm256_loadu_si256(src.cast::<__m256i>());
        _mm256_storeu_si256(
            dst.cast::<__m256i>(),
            _mm256_permutevar8x32_epi32(input, shuffle),
        );
    }

    kept
}