static BMI2_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
#[inline]
pub fn bmi2_available() -> bool {
*BMI2_AVAILABLE.get_or_init(|| {
#[cfg(target_arch = "x86_64")]
{
is_x86_feature_detected!("bmi2")
}
#[cfg(not(target_arch = "x86_64"))]
{
false
}
})
}
#[cfg(target_arch = "x86_64")]
pub fn bmi2_fast() -> bool {
if !bmi2_available() {
return false;
}
true
}
#[cfg(not(target_arch = "x86_64"))]
pub fn bmi2_fast() -> bool {
false
}
#[inline]
pub fn pext_u64(src: u64, mask: u64) -> u64 {
#[cfg(target_arch = "x86_64")]
{
if bmi2_available() {
return unsafe { pext_u64_bmi2(src, mask) };
}
}
pext_u64_scalar(src, mask)
}
#[inline]
pub fn pdep_u64(src: u64, mask: u64) -> u64 {
#[cfg(target_arch = "x86_64")]
{
if bmi2_available() {
return unsafe { pdep_u64_bmi2(src, mask) };
}
}
pdep_u64_scalar(src, mask)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn pext_u64_bmi2(src: u64, mask: u64) -> u64 {
use std::arch::x86_64::_pext_u64;
_pext_u64(src, mask)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn pdep_u64_bmi2(src: u64, mask: u64) -> u64 {
use std::arch::x86_64::_pdep_u64;
_pdep_u64(src, mask)
}
#[inline]
fn pext_u64_scalar(src: u64, mask: u64) -> u64 {
let mut result = 0u64;
let mut bb = 1u64;
let mut m = mask;
while m != 0 {
if src & m & m.wrapping_neg() != 0 {
result |= bb;
}
m &= m - 1; bb <<= 1;
}
result
}
#[inline]
fn pdep_u64_scalar(src: u64, mask: u64) -> u64 {
let mut result = 0u64;
let mut bb = 1u64;
let mut m = mask;
while m != 0 {
let bit = m & m.wrapping_neg(); if src & bb != 0 {
result |= bit;
}
m &= m - 1; bb <<= 1;
}
result
}
#[inline]
pub fn pext_u32(src: u32, mask: u32) -> u32 {
pext_u64(src as u64, mask as u64) as u32
}
#[inline]
pub fn pdep_u32(src: u32, mask: u32) -> u32 {
pdep_u64(src as u64, mask as u64) as u32
}
pub fn pack_4bit(values: &[u8]) -> Vec<u8> {
let packed_len = (values.len() + 1) / 2;
let mut packed = vec![0u8; packed_len];
for (i, chunk) in values.chunks(2).enumerate() {
let lo = chunk[0] & 0x0F;
let hi = chunk.get(1).map_or(0, |&v| v & 0x0F);
packed[i] = lo | (hi << 4);
}
packed
}
pub fn unpack_4bit(packed: &[u8], count: usize) -> Vec<u8> {
let mut values = Vec::with_capacity(count);
for &byte in packed {
if values.len() < count {
values.push(byte & 0x0F);
}
if values.len() < count {
values.push(byte >> 4);
}
}
values
}
pub fn pack_nbits(values: &[u8], bits_per_value: u8) -> Vec<u8> {
assert!(bits_per_value >= 1 && bits_per_value <= 8);
let total_bits = values.len() * bits_per_value as usize;
let packed_len = (total_bits + 7) / 8;
let mut packed = vec![0u8; packed_len];
let mask = (1u8 << bits_per_value) - 1;
let mut bit_pos = 0usize;
for &value in values {
let value = value & mask;
let byte_idx = bit_pos / 8;
let bit_offset = bit_pos % 8;
packed[byte_idx] |= value << bit_offset;
if bit_offset + bits_per_value as usize > 8 {
let overflow_bits = bit_offset + bits_per_value as usize - 8;
if byte_idx + 1 < packed.len() {
packed[byte_idx + 1] |= value >> (bits_per_value as usize - overflow_bits);
}
}
bit_pos += bits_per_value as usize;
}
packed
}
pub fn unpack_nbits(packed: &[u8], bits_per_value: u8, count: usize) -> Vec<u8> {
assert!(bits_per_value >= 1 && bits_per_value <= 8);
let mut values = Vec::with_capacity(count);
let mask = (1u16 << bits_per_value) - 1;
let mut bit_pos = 0usize;
for _ in 0..count {
let byte_idx = bit_pos / 8;
let bit_offset = bit_pos % 8;
if byte_idx >= packed.len() {
break;
}
let mut raw = packed[byte_idx] as u16;
if byte_idx + 1 < packed.len() {
raw |= (packed[byte_idx + 1] as u16) << 8;
}
let value = ((raw >> bit_offset) & mask) as u8;
values.push(value);
bit_pos += bits_per_value as usize;
}
values
}
#[inline]
pub fn extract_4bit_batch(packed: u64) -> [u8; 16] {
let mut result = [0u8; 16];
#[cfg(target_arch = "x86_64")]
if bmi2_available() {
const NIBBLE_MASK: u64 = 0x0F0F_0F0F_0F0F_0F0F;
let even = unsafe { pext_u64_bmi2(packed, NIBBLE_MASK) };
let odd = unsafe { pext_u64_bmi2(packed >> 4, NIBBLE_MASK) };
for i in 0..8 {
result[i * 2] = ((even >> (i * 4)) & 0x0F) as u8;
result[i * 2 + 1] = ((odd >> (i * 4)) & 0x0F) as u8;
}
return result;
}
for i in 0..16 {
result[i] = ((packed >> (i * 4)) & 0x0F) as u8;
}
result
}
#[inline]
pub fn deposit_4bit_batch(values: [u8; 16]) -> u64 {
#[cfg(target_arch = "x86_64")]
if bmi2_available() {
let mut even = 0u64;
let mut odd = 0u64;
for i in 0..8 {
even |= ((values[i * 2] & 0x0F) as u64) << (i * 4);
odd |= ((values[i * 2 + 1] & 0x0F) as u64) << (i * 4);
}
const NIBBLE_MASK: u64 = 0x0F0F_0F0F_0F0F_0F0F;
let packed_even = unsafe { pdep_u64_bmi2(even, NIBBLE_MASK) };
let packed_odd = unsafe { pdep_u64_bmi2(odd, NIBBLE_MASK) } << 4;
return packed_even | packed_odd;
}
let mut result = 0u64;
for i in 0..16 {
result |= ((values[i] & 0x0F) as u64) << (i * 4);
}
result
}
pub fn dispatch_info() -> &'static str {
#[cfg(target_arch = "x86_64")]
{
if bmi2_available() {
if bmi2_fast() {
return "BMI2 (fast)";
} else {
return "BMI2 (slow/microcode)";
}
}
return "Scalar (x86_64)";
}
#[cfg(target_arch = "aarch64")]
{
return "Scalar (ARM64)";
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
return "Scalar";
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pext_basic() {
let src = 0b_1010_1010u64;
let mask = 0b_0101_0101u64;
let result = pext_u64(src, mask);
assert_eq!(result, 0b_0000_0000);
let result2 = pext_u64(src, 0b_1010_1010);
assert_eq!(result2, 0b_0000_1111); }
#[test]
fn test_pdep_basic() {
let src = 0b_0000_1111u64;
let mask = 0b_0101_0101u64;
let result = pdep_u64(src, mask);
assert_eq!(result, 0b_0101_0101); }
#[test]
fn test_pext_pdep_roundtrip() {
let original = 0b_1100_1010_0011_0101u64;
let mask = 0b_1111_0000_1111_0000u64;
let extracted = pext_u64(original, mask);
let restored = pdep_u64(extracted, mask);
assert_eq!(original & mask, restored);
}
#[test]
fn test_pack_unpack_4bit() {
let values: Vec<u8> = vec![0, 15, 7, 8, 3, 12];
let packed = pack_4bit(&values);
let unpacked = unpack_4bit(&packed, values.len());
assert_eq!(values, unpacked);
}
#[test]
fn test_pack_unpack_nbits() {
let values: Vec<u8> = vec![0, 7, 3, 5, 2, 6, 1, 4];
let packed = pack_nbits(&values, 3);
let unpacked = unpack_nbits(&packed, 3, values.len());
assert_eq!(values, unpacked);
}
#[test]
fn test_extract_4bit_batch() {
let mut packed: u64 = 0;
for i in 0..16u64 {
packed |= i << (i * 4);
}
let result = extract_4bit_batch(packed);
for i in 0..16 {
assert_eq!(result[i], i as u8, "nibble {} mismatch", i);
}
}
#[test]
fn test_deposit_4bit_batch() {
let values: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
let packed = deposit_4bit_batch(values);
for i in 0..16 {
assert_eq!(((packed >> (i * 4)) & 0x0F) as u8, i as u8);
}
}
#[test]
fn test_dispatch_info() {
let info = dispatch_info();
assert!(!info.is_empty());
println!("BMI2 dispatch: {}", info);
}
#[test]
fn test_scalar_fallback_correctness() {
let src = 0xDEAD_BEEF_CAFE_BABEu64;
let mask = 0x5555_5555_5555_5555u64;
let extracted = pext_u64_scalar(src, mask);
let deposited = pdep_u64_scalar(extracted, mask);
assert_eq!(src & mask, deposited);
}
#[test]
fn test_32bit_versions() {
let src = 0xABCD_1234u32;
let mask = 0xFF00_FF00u32;
let extracted = pext_u32(src, mask);
let deposited = pdep_u32(extracted, mask);
assert_eq!(src & mask, deposited);
}
#[test]
fn test_edge_cases() {
assert_eq!(pext_u64(0xFFFF_FFFF, 0), 0);
assert_eq!(pdep_u64(0xFFFF_FFFF, 0), 0);
assert_eq!(pext_u64(0x1234, 0xFFFF), 0x1234);
assert_eq!(pdep_u64(0x1234, 0xFFFF), 0x1234);
assert_eq!(pext_u64(0, 0xFFFF), 0);
assert_eq!(pdep_u64(0, 0xFFFF), 0);
}
}