use crate::vektor::x86_64::*;
use crate::vektor::x86::*;
use crate::intrin::sum::*;
use crate::intrin::transmute::*;
use crate::intrin::popcnt::*;
use crate::arch::current::intrin::upcast::*;
use crate::intrin::sum::UpcastSum;
use crate::arch::current::vecs::*;
use crate::intrin::upcast::*;
use crate::vecs::*;
/// Counts the set bits of a 128-bit vector with a per-nibble table lookup.
///
/// This variant is compiled only when the `ssse3` target feature is enabled,
/// because it relies on the `_mm_shuffle_epi8` byte shuffle.
#[inline(always)]
#[cfg(target_feature = "ssse3")]
unsafe fn popcnt128(v: u8x16) -> usize {
    optimized!();
    // Entry `i` is the number of set bits in the 4-bit value `i`.
    let nibble_bits = i8x16::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    let bytes = v.be_u8s();
    // Split every byte into its low and high nibble; both are valid table indices.
    let low_nibbles = bytes & 0x0f;
    let high_nibbles: u8x16 = bytes >> 4;
    // The shuffle acts as sixteen parallel table lookups.
    let high_counts = _mm_shuffle_epi8(nibble_bits, high_nibbles.be_i8s()).be_u8s();
    let low_counts = _mm_shuffle_epi8(nibble_bits, low_nibbles.be_i8s()).be_u8s();
    // Per-byte counts are at most 8, so the byte-wise add cannot overflow.
    let per_byte = high_counts + low_counts;
    per_byte.sum_upcast() as usize
}
/// Counts the set bits of a 128-bit vector without SSSE3.
///
/// Used when the `ssse3` target feature is not enabled: the vector is viewed
/// through `be_u64s` and `count_ones` is accumulated over its lanes.
#[inline(always)]
#[cfg(not(target_feature = "ssse3"))]
#[allow(unused_unsafe)]
unsafe fn popcnt128(v: u8x16) -> usize {
    fallback!();
    let words = v.be_u64s();
    words.scalar_reduce(0, |total, word| total + (word.count_ones() as usize))
}
/// Counts the set bits of a 256-bit vector with a per-nibble table lookup.
///
/// This variant is compiled only when the `avx2` target feature is enabled,
/// because it relies on the `_mm256_shuffle_epi8` byte shuffle.
#[inline(always)]
#[cfg(target_feature = "avx2")]
unsafe fn popcnt256(v: u8x32) -> usize {
    optimized!();
    // Entry `i` is the number of set bits in the 4-bit value `i`. The table is
    // written out twice because `_mm256_shuffle_epi8` shuffles within each
    // 128-bit half independently.
    let nibble_bits = i8x32::new(
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    );
    let bytes = v.be_u8s();
    // Split every byte into its low and high nibble; both are valid table indices.
    let low_nibbles = bytes & 0x0f;
    let high_nibbles: u8x32 = bytes >> 4;
    let high_counts = _mm256_shuffle_epi8(nibble_bits, high_nibbles.be_i8s()).be_u8s();
    let low_counts = _mm256_shuffle_epi8(nibble_bits, low_nibbles.be_i8s()).be_u8s();
    // Per-byte counts are at most 8, so the byte-wise add cannot overflow.
    let per_byte = high_counts + low_counts;
    per_byte.sum_upcast() as usize
}
/// Counts the set bits of a 256-bit vector without AVX2.
///
/// Used when the `avx2` target feature is not enabled: the vector is viewed
/// through `be_u64s` and `count_ones` is accumulated over its lanes.
#[inline(always)]
#[cfg(not(target_feature = "avx2"))]
#[allow(unused_unsafe)]
unsafe fn popcnt256(v: u8x32) -> usize {
    fallback!();
    let words = v.be_u64s();
    words.scalar_reduce(0, |total, word| total + (word.count_ones() as usize))
}
/// Counts the set bits of a 512-bit vector.
///
/// There is no vectorised variant in this file; the vector is always viewed
/// through `be_u64s` and `count_ones` is accumulated over its lanes.
#[inline(always)]
unsafe fn popcnt512(v: u8x64) -> usize {
    fallback!();
    let words = v.be_u64s();
    words.scalar_reduce(0, |total, word| total + (word.count_ones() as usize))
}
// Wire every integer vector type to the byte-level routine of matching width:
// 512-bit types use `popcnt512`, 256-bit types `popcnt256`, 128-bit types
// `popcnt128`. One row per lane type (u8/i8, u16/i16, u32/i32, u64/i64).
// NOTE(review): `impl_popcnt!` is defined elsewhere; presumably it converts
// each vector to the `u8xN` form before calling the named function — confirm.
impl_popcnt!(u8x64, popcnt512, u8x32, popcnt256, u8x16, popcnt128);
impl_popcnt!(i8x64, popcnt512, i8x32, popcnt256, i8x16, popcnt128);
impl_popcnt!(u16x32, popcnt512, u16x16, popcnt256, u16x8, popcnt128);
impl_popcnt!(i16x32, popcnt512, i16x16, popcnt256, i16x8, popcnt128);
impl_popcnt!(u32x16, popcnt512, u32x8, popcnt256, u32x4, popcnt128);
impl_popcnt!(i32x16, popcnt512, i32x8, popcnt256, i32x4, popcnt128);
impl_popcnt!(u64x8, popcnt512, u64x4, popcnt256, u64x2, popcnt128);
impl_popcnt!(i64x8, popcnt512, i64x4, popcnt256, i64x2, popcnt128);
// Test module for the popcount impls above.
#[cfg(test)]
mod tests {
use crate::prelude::*;
use crate::arch::current::vecs::*;
// The three tuples are parallel: scalar lane type, vector type, and a name
// for that combination (`popcnt_<vector>`), covering every vector type wired
// up by `impl_popcnt!` in this file.
// NOTE(review): `test_popcnt!` is defined elsewhere; presumably it emits one
// test function per name — confirm against the macro definition.
test_popcnt!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, u64, u64, u64, i64, i64, i64),
(u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2),
(popcnt_u8x64, popcnt_u8x32, popcnt_u8x16, popcnt_i8x64, popcnt_i8x32, popcnt_i8x16, popcnt_u16x32, popcnt_u16x16, popcnt_u16x8, popcnt_i16x32, popcnt_i16x16, popcnt_i16x8, popcnt_u32x16, popcnt_u32x8, popcnt_u32x4, popcnt_i32x16, popcnt_i32x8, popcnt_i32x4, popcnt_u64x8, popcnt_u64x4, popcnt_u64x2, popcnt_i64x8, popcnt_i64x4, popcnt_i64x2));
}