#[inline(always)]
pub fn popcount_word(word: u64) -> u32 {
#[cfg(feature = "portable-popcount")]
{
popcount_word_portable(word)
}
#[cfg(all(feature = "simd", not(feature = "portable-popcount")))]
{
word.count_ones()
}
#[cfg(not(any(feature = "portable-popcount", feature = "simd")))]
{
word.count_ones()
}
}
#[inline]
pub fn popcount_words(words: &[u64]) -> usize {
#[cfg(feature = "portable-popcount")]
{
popcount_words_portable(words)
}
#[cfg(all(
feature = "simd",
target_arch = "aarch64",
not(feature = "portable-popcount")
))]
{
popcount_words_neon(words)
}
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
not(feature = "portable-popcount")
))]
{
popcount_words_x86(words)
}
#[cfg(not(any(feature = "simd", feature = "portable-popcount")))]
{
popcount_words_default(words)
}
}
#[inline]
#[cfg(not(any(feature = "simd", feature = "portable-popcount")))]
fn popcount_words_default(words: &[u64]) -> usize {
let mut total = 0usize;
for &word in words {
total += word.count_ones() as usize;
}
total
}
#[inline(always)]
#[cfg(feature = "portable-popcount")]
pub fn popcount_word_portable(mut x: u64) -> u32 {
const M1: u64 = 0x5555_5555_5555_5555; const M2: u64 = 0x3333_3333_3333_3333; const M4: u64 = 0x0f0f_0f0f_0f0f_0f0f; const H01: u64 = 0x0101_0101_0101_0101;
x = x - ((x >> 1) & M1);
x = (x & M2) + ((x >> 2) & M2);
x = (x + (x >> 4)) & M4;
((x.wrapping_mul(H01)) >> 56) as u32
}
#[inline]
#[cfg(feature = "portable-popcount")]
fn popcount_words_portable(words: &[u64]) -> usize {
let mut total = 0usize;
for &word in words {
total += popcount_word_portable(word) as usize;
}
total
}
#[cfg(all(
feature = "simd",
target_arch = "aarch64",
not(feature = "portable-popcount")
))]
#[inline]
fn popcount_words_neon(words: &[u64]) -> usize {
if words.is_empty() {
return 0;
}
let mut total = 0usize;
let ptr = words.as_ptr() as *const u8;
let byte_len = words.len() * 8;
let mut offset = 0;
while offset + 256 <= byte_len {
unsafe {
let c0 = popcount_64bytes_neon(ptr.add(offset));
let c1 = popcount_64bytes_neon(ptr.add(offset + 64));
let c2 = popcount_64bytes_neon(ptr.add(offset + 128));
let c3 = popcount_64bytes_neon(ptr.add(offset + 192));
total += (c0 + c1 + c2 + c3) as usize;
}
offset += 256;
}
while offset + 64 <= byte_len {
let count = unsafe { popcount_64bytes_neon(ptr.add(offset)) };
total += count as usize;
offset += 64;
}
let remaining_words = (byte_len - offset) / 8;
for i in 0..remaining_words {
total += words[offset / 8 + i].count_ones() as usize;
}
total
}
#[cfg(all(
feature = "simd",
target_arch = "aarch64",
not(feature = "portable-popcount")
))]
#[inline]
unsafe fn popcount_64bytes_neon(ptr: *const u8) -> u32 {
use core::arch::aarch64::*;
unsafe {
let v0 = vld1q_u8(ptr);
let v1 = vld1q_u8(ptr.add(16));
let v2 = vld1q_u8(ptr.add(32));
let v3 = vld1q_u8(ptr.add(48));
let c0 = vcntq_u8(v0);
let c1 = vcntq_u8(v1);
let c2 = vcntq_u8(v2);
let c3 = vcntq_u8(v3);
let sum01 = vaddq_u8(c0, c1);
let sum23 = vaddq_u8(c2, c3);
let wide01 = vpaddlq_u8(sum01);
let wide23 = vpaddlq_u8(sum23);
let wide_sum = vaddq_u16(wide01, wide23);
vaddvq_u16(wide_sum) as u32
}
}
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
not(feature = "portable-popcount")
))]
#[inline]
#[target_feature(enable = "avx512f,avx512vpopcntdq")]
unsafe fn popcount_words_avx512vpopcntdq(words: &[u64]) -> usize {
use core::arch::x86_64::*;
if words.is_empty() {
return 0;
}
let mut total = 0usize;
let mut offset = 0;
while offset + 8 <= words.len() {
unsafe {
let ptr = words.as_ptr().add(offset) as *const __m512i;
let v = _mm512_loadu_si512(ptr);
let counts = _mm512_popcnt_epi64(v);
total += _mm512_reduce_add_epi64(counts) as usize;
}
offset += 8;
}
for &word in &words[offset..] {
total += word.count_ones() as usize;
}
total
}
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
not(feature = "portable-popcount")
))]
#[inline]
fn popcount_words_x86(words: &[u64]) -> usize {
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("avx512vpopcntdq") {
return unsafe { popcount_words_avx512vpopcntdq(words) };
}
}
let mut total = 0usize;
for &word in words {
total += word.count_ones() as usize;
}
total
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_popcount_word() {
assert_eq!(popcount_word(0), 0);
assert_eq!(popcount_word(1), 1);
assert_eq!(popcount_word(u64::MAX), 64);
assert_eq!(popcount_word(0xAAAA_AAAA_AAAA_AAAA), 32);
assert_eq!(popcount_word(0x5555_5555_5555_5555), 32);
}
#[test]
fn test_popcount_words() {
let empty: &[u64] = &[];
assert_eq!(popcount_words(empty), 0);
let ones = [u64::MAX; 8];
assert_eq!(popcount_words(&ones), 512);
let pattern = [0xAAAA_AAAA_AAAA_AAAA; 16];
assert_eq!(popcount_words(&pattern), 512);
}
#[test]
fn test_popcount_words_various_lengths() {
for len in 0..20 {
let words: Vec<u64> = (0..len)
.map(|i| (i as u64) | 0x8000_0000_0000_0001)
.collect();
let expected: usize = words.iter().map(|w| w.count_ones() as usize).sum();
assert_eq!(popcount_words(&words), expected, "len={}", len);
}
}
#[test]
fn test_popcount_words_chunk_boundaries() {
for len in [
0, 1, 7, 8, 9, 15, 16, 17, 24, 31, 32, 33, 39, 40, 48, 63, 64, 65, 96, 100, 128,
] {
let words: Vec<u64> = (0..len)
.map(|i| (i as u64).wrapping_mul(0xDEAD_BEEF_CAFE_BABE) | 1)
.collect();
let expected: usize = words.iter().map(|w| w.count_ones() as usize).sum();
assert_eq!(popcount_words(&words), expected, "len={}", len);
}
}
#[test]
fn test_popcount_words_all_ones() {
for len in [1, 8, 32, 33, 64, 100, 256] {
let words = vec![u64::MAX; len];
assert_eq!(popcount_words(&words), len * 64, "len={}", len);
}
}
#[test]
fn test_popcount_words_all_zeros() {
for len in [1, 8, 32, 64, 100] {
let words = vec![0u64; len];
assert_eq!(popcount_words(&words), 0, "len={}", len);
}
}
#[test]
fn test_popcount_words_exceeds_u32_max() {
let words_for_overflow: usize = (u32::MAX as usize / 64) + 1; let expected: usize = words_for_overflow * 64; assert!(expected > u32::MAX as usize);
let words = vec![u64::MAX; 1_000_000];
assert_eq!(popcount_words(&words), 64_000_000);
}
#[test]
fn test_popcount_words_single_bits() {
let words: Vec<u64> = (0..64).map(|i| 1u64 << i).collect();
assert_eq!(popcount_words(&words), 64);
}
#[test]
fn test_popcount_words_mixed_density() {
let words: Vec<u64> = (0..128)
.map(|i| if i % 2 == 0 { u64::MAX } else { 1 })
.collect();
let expected = 64 * 64 + 64; assert_eq!(popcount_words(&words), expected);
}
#[cfg(feature = "portable-popcount")]
#[test]
fn test_portable_matches_builtin() {
for i in 0u64..1000 {
let word = i.wrapping_mul(0x1234_5678_9ABC_DEF0_u64).wrapping_add(i);
assert_eq!(
popcount_word_portable(word),
word.count_ones(),
"word={:#x}",
word
);
}
}
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
not(feature = "portable-popcount")
))]
#[test]
fn test_avx512_vpopcntdq_matches_scalar() {
if !is_x86_feature_detected!("avx512vpopcntdq") {
eprintln!("Skipping AVX-512 VPOPCNTDQ test: CPU doesn't support it");
return;
}
for len in [0, 1, 7, 8, 9, 15, 16, 17, 64, 100, 1000] {
let words: Vec<u64> = (0..len)
.map(|i: u64| {
match i % 4 {
0 => u64::MAX,
1 => 0,
2 => 0xAAAA_AAAA_AAAA_AAAA,
_ => i.wrapping_mul(0x0123_4567_89AB_CDEF),
}
})
.collect();
let expected: usize = words.iter().map(|w: &u64| w.count_ones() as usize).sum();
let avx512_result = unsafe { popcount_words_avx512vpopcntdq(&words) };
assert_eq!(
avx512_result, expected,
"AVX-512 VPOPCNTDQ mismatch for {} words",
len
);
}
}
#[cfg(all(
feature = "simd",
target_arch = "x86_64",
not(feature = "portable-popcount")
))]
#[test]
fn test_avx512_edge_cases() {
if !is_x86_feature_detected!("avx512vpopcntdq") {
return;
}
let zeros = vec![0u64; 100];
assert_eq!(unsafe { popcount_words_avx512vpopcntdq(&zeros) }, 0);
let ones = vec![u64::MAX; 100];
assert_eq!(unsafe { popcount_words_avx512vpopcntdq(&ones) }, 100 * 64);
let alt = vec![0xAAAA_AAAA_AAAA_AAAA; 100];
assert_eq!(unsafe { popcount_words_avx512vpopcntdq(&alt) }, 100 * 32);
}
}