#ifndef BMAVX512__H__INCLUDED__
#define BMAVX512__H__INCLUDED__
#include <emmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>
#include "bmdef.h"
#include "bmbmi2.h"
namespace bm
{
#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
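// Returns true if all 512 bits of m are zero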
inline
bool avx512_test_zero(__m512i m)
{
const __mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(0ull));
return (eq_m == m16F);
}
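// Returns true if all 512 bits of m are set to 1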
inline
bool avx512_test_one(__m512i m)
{
const __mmask16 m16F = __mmask16(~0u);
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(m, _mm512_set1_epi64(-1));
return (eq_m == m16F);
}
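// Carry-save adder: h receives the carry (majority) of a, b, c; l receives the sum a ^ b ^ c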
#define BM_CSA256(h, l, a, b, c) \
{ \
__m256i u = _mm256_xor_si256(a, b); \
h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
l = _mm256_xor_si256(u, c); \
}
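// Per-64-bit-lane popcount of v: lookup1 holds popcount(nibble)+4 and lookup2 holds 4-popcount(nibble),
// so _mm256_sad_epu8 of the two shuffle results sums popcount(lo)+popcount(hi) for every byte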
#define BM_AVX2_BIT_COUNT(ret, v) \
{ \
__m256i lo = _mm256_and_si256(v, low_mask); \
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
__m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
__m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
ret = _mm256_sad_epu8(cnt1, cnt2); \
}
#define BM_AVX2_DECL_LOOKUP1 \
__m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
#define BM_AVX2_DECL_LOOKUP2 \
__m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
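// Declares the nibble lookup tables, the low-nibble mask and the bc accumulator used by BM_AVX2_BIT_COUNT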
#define BM_AVX2_POPCNT_PROLOG \
BM_AVX2_DECL_LOOKUP1 \
BM_AVX2_DECL_LOOKUP2 \
__m256i low_mask = _mm256_set1_epi8(0x0f); \
__m256i bc;
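// Harley-Seal population count of [block, block_end); the range is processed 16 vectors per iteration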
inline
bm::id_t avx2_bit_count(const __m256i* BMRESTRICT block,
const __m256i* BMRESTRICT block_end)
{
__m256i cnt = _mm256_setzero_si256();
__m256i ones = _mm256_setzero_si256();
__m256i twos = _mm256_setzero_si256();
__m256i fours = _mm256_setzero_si256();
__m256i eights = _mm256_setzero_si256();
__m256i sixteens = _mm256_setzero_si256();
__m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
__m256i b, c;
BM_AVX2_POPCNT_PROLOG
bm::id64_t* cnt64;
do
{
b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
BM_CSA256(twosA, ones, ones, b, c);
b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
BM_CSA256(twosB, ones, ones, b, c);
BM_CSA256(foursA, twos, twos, twosA, twosB);
b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
BM_CSA256(twosA, ones, ones, b, c);
b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
BM_CSA256(twosB, ones, ones, b, c);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsA, fours, fours, foursA, foursB);
b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
BM_CSA256(twosA, ones, ones, b, c);
b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
BM_CSA256(twosB, ones, ones, b, c);
BM_CSA256(foursA, twos, twos, twosA, twosB);
b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
BM_CSA256(twosA, ones, ones, b, c);
b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
BM_CSA256(twosB, ones, ones, b, c);
BM_CSA256(foursB, twos, twos, twosA, twosB);
BM_CSA256(eightsB, fours, fours, foursA, foursB);
BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
BM_AVX2_BIT_COUNT(bc, sixteens);
cnt = _mm256_add_epi64(cnt, bc);
block += 16;
} while (block < block_end);
cnt = _mm256_slli_epi64(cnt, 4);
BM_AVX2_BIT_COUNT(bc, eights)
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
BM_AVX2_BIT_COUNT(bc, fours);
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
BM_AVX2_BIT_COUNT(bc, twos);
cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
BM_AVX2_BIT_COUNT(bc, ones);
cnt = _mm256_add_epi64(cnt, bc);
cnt64 = (bm::id64_t*) &cnt;
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
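// Population count of (block & mask_block) over [block, block_end)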
inline
bm::id_t avx2_bit_count_and(const __m256i* BMRESTRICT block,
const __m256i* BMRESTRICT block_end,
const __m256i* BMRESTRICT mask_block)
{
bm::id64_t* cnt64;
BM_AVX2_POPCNT_PROLOG;
__m256i cnt = _mm256_setzero_si256();
__m256i ymm0, ymm1;
do
{
ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
BM_AVX2_BIT_COUNT(bc, ymm0)
cnt = _mm256_add_epi64(cnt, bc);
ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
BM_AVX2_BIT_COUNT(bc, ymm0)
cnt = _mm256_add_epi64(cnt, bc);
ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
BM_AVX2_BIT_COUNT(bc, ymm0)
cnt = _mm256_add_epi64(cnt, bc);
ymm0 = _mm256_load_si256(block);
ymm1 = _mm256_load_si256(mask_block);
ymm0 = _mm256_and_si256(ymm0, ymm1);
++block; ++mask_block;
BM_AVX2_BIT_COUNT(bc, ymm0)
cnt = _mm256_add_epi64(cnt, bc);
} while (block < block_end);
cnt64 = (bm::id64_t*)&cnt;
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
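// Population count of (block | mask_block) over [block, block_end)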
inline
bm::id_t avx2_bit_count_or(const __m256i* BMRESTRICT block,
const __m256i* BMRESTRICT block_end,
const __m256i* BMRESTRICT mask_block)
{
bm::id64_t* cnt64;
BM_AVX2_POPCNT_PROLOG;
__m256i cnt = _mm256_setzero_si256();
do
{
__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);
tmp0 = _mm256_or_si256(tmp0, tmp1);
BM_AVX2_BIT_COUNT(bc, tmp0)
cnt = _mm256_add_epi64(cnt, bc);
++block; ++mask_block;
} while (block < block_end);
cnt64 = (bm::id64_t*)&cnt;
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
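// Population count of (block ^ mask_block) over [block, block_end)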
inline
bm::id_t avx2_bit_count_xor(const __m256i* BMRESTRICT block,
const __m256i* BMRESTRICT block_end,
const __m256i* BMRESTRICT mask_block)
{
bm::id64_t* cnt64;
BM_AVX2_POPCNT_PROLOG
__m256i cnt = _mm256_setzero_si256();
__m256i mA, mB, mC, mD;
do
{
mA = _mm256_xor_si256(_mm256_load_si256(block+0),
_mm256_load_si256(mask_block+0));
BM_AVX2_BIT_COUNT(bc, mA)
cnt = _mm256_add_epi64(cnt, bc);
mB = _mm256_xor_si256(_mm256_load_si256(block+1),
_mm256_load_si256(mask_block+1));
BM_AVX2_BIT_COUNT(bc, mB);
cnt = _mm256_add_epi64(cnt, bc);
mC = _mm256_xor_si256(_mm256_load_si256(block+2),
_mm256_load_si256(mask_block+2));
BM_AVX2_BIT_COUNT(bc, mC);
cnt = _mm256_add_epi64(cnt, bc);
mD = _mm256_xor_si256(_mm256_load_si256(block+3),
_mm256_load_si256(mask_block+3));
BM_AVX2_BIT_COUNT(bc, mD);
cnt = _mm256_add_epi64(cnt, bc);
block += 4; mask_block += 4;
} while (block < block_end);
cnt64 = (bm::id64_t*)&cnt;
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
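// Population count of (block & ~mask_block) over [block, block_end)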
inline
bm::id_t avx2_bit_count_sub(const __m256i* BMRESTRICT block,
const __m256i* BMRESTRICT block_end,
const __m256i* BMRESTRICT mask_block)
{
bm::id64_t* cnt64;
BM_AVX2_POPCNT_PROLOG
__m256i cnt = _mm256_setzero_si256();
do
{
__m256i tmp0 = _mm256_load_si256(block);
__m256i tmp1 = _mm256_load_si256(mask_block);
tmp0 = _mm256_andnot_si256(tmp1, tmp0);
BM_AVX2_BIT_COUNT(bc, tmp0)
cnt = _mm256_add_epi64(cnt, bc);
++block; ++mask_block;
} while (block < block_end);
cnt64 = (bm::id64_t*)&cnt;
return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
}
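// dst[i] = src[i] ^ mask for the range [src, src_end)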
inline
void avx512_xor_arr_2_mask(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src,
const __m512i* BMRESTRICT src_end,
bm::word_t mask)
{
__m512i yM = _mm512_set1_epi32(int(mask));
do
{
_mm512_store_si512(dst+0, _mm512_xor_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_xor_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_xor_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_xor_si512(_mm512_load_si512(src+3), yM));
dst += 4; src += 4;
} while (src < src_end);
}
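// dst[i] = ~src[i] & mask for the range [src, src_end)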
inline
void avx512_andnot_arr_2_mask(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src,
const __m512i* BMRESTRICT src_end,
bm::word_t mask)
{
__m512i yM = _mm512_set1_epi32(int(mask));
do
{
_mm512_store_si512(dst+0, _mm512_andnot_si512(_mm512_load_si512(src+0), yM));
_mm512_store_si512(dst+1, _mm512_andnot_si512(_mm512_load_si512(src+1), yM));
_mm512_store_si512(dst+2, _mm512_andnot_si512(_mm512_load_si512(src+2), yM));
_mm512_store_si512(dst+3, _mm512_andnot_si512(_mm512_load_si512(src+3), yM));
dst += 4; src += 4;
} while (src < src_end);
}
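// dst &= src over a full bit-block; returns 0 if the destination is all zero afterwards, non-zero otherwise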
inline
unsigned avx512_and_block(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
const __m512i* BMRESTRICT src_end =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size);
accA = accB = accC = accD = _mm512_setzero_si512();
do
{
m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_and_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_and_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
src += 4; dst += 4;
} while (src < src_end);
accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
return !avx512_test_zero(accA);
}
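// dst &= src over one digest stripe (two 512-bit words); returns true if the result is all zero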
inline
bool avx512_and_digest(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B;
m1A = _mm512_and_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_and_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
m1A = _mm512_or_si512(m1A, m1B);
return avx512_test_zero(m1A);
}
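// dst = src1 & src2 over one digest stripe; returns true if the result is all zero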
inline
bool avx512_and_digest_2way(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src1,
const __m512i* BMRESTRICT src2)
{
__m512i m1A, m1B;
m1A = _mm512_and_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_and_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
m1A = _mm512_or_si512(m1A, m1B);
return avx512_test_zero(m1A);
}
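// dst &= src (src may be unaligned); returns 0 if the destination is all zero afterwards, non-zero otherwise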
inline
unsigned avx2_and_arr_unal(__m256i* BMRESTRICT dst,
const __m256i* BMRESTRICT src,
const __m256i* BMRESTRICT src_end)
{
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i accA, accB, accC, accD;
accA = _mm256_setzero_si256();
accB = _mm256_setzero_si256();
accC = _mm256_setzero_si256();
accD = _mm256_setzero_si256();
do
{
m1A = _mm256_loadu_si256(src+0);
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_and_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);
accA = _mm256_or_si256(accA, m1A);
m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_and_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);
accB = _mm256_or_si256(accB, m1B);
m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_and_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);
accC = _mm256_or_si256(accC, m1C);
m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_and_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);
accD = _mm256_or_si256(accD, m1D);
src += 4; dst += 4;
} while (src < src_end);
accA = _mm256_or_si256(accA, accB);
accC = _mm256_or_si256(accC, accD);
accA = _mm256_or_si256(accA, accC);
return !_mm256_testz_si256(accA, accA);
}
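// dst |= src over a full bit-block (non-temporal stores); returns true if the result is all ones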
inline
bool avx512_or_block(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);
__m512i* BMRESTRICT dst2 =
(__m512i*)((bm::word_t*)(dst) + bm::set_block_size/2);
const __m512i* BMRESTRICT src2 =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size/2);
const __m512i* BMRESTRICT src_end =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size);
do
{
m1A = _mm512_or_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_or_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);
_mm512_stream_si512(dst, m1A);
_mm512_stream_si512(dst+1, m1B);
src += 2; dst += 2;
m1C = _mm512_or_si512(_mm512_load_si512(src2), _mm512_load_si512(dst2));
m1D = _mm512_or_si512(_mm512_load_si512(src2+1), _mm512_load_si512(dst2+1));
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
_mm512_stream_si512(dst2, m1C);
_mm512_stream_si512(dst2+1, m1D);
src2 += 2; dst2 += 2;
} while (src2 < src_end);
mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
return avx512_test_one(mAccF0);
}
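// dst |= src (src may be unaligned); returns true if the result is all ones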
inline
bool avx2_or_arr_unal(__m256i* BMRESTRICT dst,
const __m256i* BMRESTRICT src,
const __m256i* BMRESTRICT src_end)
{
__m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
__m256i mAccF0 = _mm256_set1_epi32(~0u);
__m256i mAccF1 = _mm256_set1_epi32(~0u);
do
{
m1A = _mm256_loadu_si256(src+0);
m2A = _mm256_load_si256(dst+0);
m1A = _mm256_or_si256(m1A, m2A);
_mm256_store_si256(dst+0, m1A);
m1B = _mm256_loadu_si256(src+1);
m2B = _mm256_load_si256(dst+1);
m1B = _mm256_or_si256(m1B, m2B);
_mm256_store_si256(dst+1, m1B);
m1C = _mm256_loadu_si256(src+2);
m2C = _mm256_load_si256(dst+2);
m1C = _mm256_or_si256(m1C, m2C);
_mm256_store_si256(dst+2, m1C);
m1D = _mm256_loadu_si256(src+3);
m2D = _mm256_load_si256(dst+3);
m1D = _mm256_or_si256(m1D, m2D);
_mm256_store_si256(dst+3, m1D);
mAccF1 = _mm256_and_si256(mAccF1, m1C);
mAccF1 = _mm256_and_si256(mAccF1, m1D);
mAccF0 = _mm256_and_si256(mAccF0, m1A);
mAccF0 = _mm256_and_si256(mAccF0, m1B);
src += 4; dst += 4;
} while (src < src_end);
__m256i maskF = _mm256_set1_epi32(~0u);
mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
__m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
return (maskA == ~0u);
}
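// dst = src1 | src2 over a full bit-block; returns true if the result is all ones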
inline
bool avx512_or_block_2way(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src1,
const __m512i* BMRESTRICT src2)
{
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);
const __m512i* BMRESTRICT src_end1 =
(const __m512i*)((bm::word_t*)(src1) + bm::set_block_size);
do
{
m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(src2+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);
src1 += 4; src2 += 4; dst += 4;
} while (src1 < src_end1);
mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
return avx512_test_one(mAccF0);
}
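// dst |= src1 | src2 over a full bit-block; returns true if the result is all ones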
inline
bool avx512_or_block_3way(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src1,
const __m512i* BMRESTRICT src2)
{
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);
const __m512i* BMRESTRICT src_end1 =
(const __m512i*)((bm::word_t*)(src1) + bm::set_block_size);
do
{
m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));
m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);
src1 += 4; src2 += 4; dst += 4;
} while (src1 < src_end1);
mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
return avx512_test_one(mAccF0);
}
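// dst |= src1 | src2 | src3 | src4 over a full bit-block; returns true if the result is all ones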
inline
bool avx512_or_block_5way(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src1,
const __m512i* BMRESTRICT src2,
const __m512i* BMRESTRICT src3,
const __m512i* BMRESTRICT src4)
{
__m512i m1A, m1B, m1C, m1D;
__m512i mAccF0, mAccF1;
mAccF0 = mAccF1 = _mm512_set1_epi32(~0u);
const __m512i* BMRESTRICT src_end1 =
(const __m512i*)((bm::word_t*)(src1) + bm::set_block_size);
do
{
m1A = _mm512_or_si512(_mm512_load_si512(src1+0), _mm512_load_si512(dst+0));
m1B = _mm512_or_si512(_mm512_load_si512(src1+1), _mm512_load_si512(dst+1));
m1C = _mm512_or_si512(_mm512_load_si512(src1+2), _mm512_load_si512(dst+2));
m1D = _mm512_or_si512(_mm512_load_si512(src1+3), _mm512_load_si512(dst+3));
m1A = _mm512_or_si512(m1A, _mm512_load_si512(src2+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src2+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src2+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src2+3));
m1A = _mm512_or_si512(m1A, _mm512_load_si512(src3+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src3+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src3+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src3+3));
m1A = _mm512_or_si512(m1A, _mm512_load_si512(src4+0));
m1B = _mm512_or_si512(m1B, _mm512_load_si512(src4+1));
m1C = _mm512_or_si512(m1C, _mm512_load_si512(src4+2));
m1D = _mm512_or_si512(m1D, _mm512_load_si512(src4+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
mAccF1 = _mm512_and_si512(mAccF1, m1C);
mAccF1 = _mm512_and_si512(mAccF1, m1D);
mAccF0 = _mm512_and_si512(mAccF0, m1A);
mAccF0 = _mm512_and_si512(mAccF0, m1B);
src1 += 4; src2 += 4;
src3 += 4; src4 += 4;
_mm_prefetch ((const char*)src3, _MM_HINT_T0);
_mm_prefetch ((const char*)src4, _MM_HINT_T0);
dst += 4;
} while (src1 < src_end1);
mAccF0 = _mm512_and_si512(mAccF0, mAccF1);
return avx512_test_one(mAccF0);
}
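// dst ^= src over a full bit-block; returns 0 if the result is all zero, non-zero otherwise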
inline
unsigned avx512_xor_block(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
const __m512i* BMRESTRICT src_end =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size);
accA = accB = accC = accD = _mm512_setzero_si512();
do
{
m1A = _mm512_xor_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_xor_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_xor_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_xor_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
src += 4; dst += 4;
} while (src < src_end);
accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
return !avx512_test_zero(accA);
}
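// dst = src1 ^ src2 over a full bit-block; returns 0 if the result is all zero, non-zero otherwise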
inline
unsigned avx512_xor_block_2way(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src1,
const __m512i* BMRESTRICT src2)
{
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
const __m512i* BMRESTRICT src1_end =
(const __m512i*)((bm::word_t*)(src1) + bm::set_block_size);
accA = accB = accC = accD = _mm512_setzero_si512();
do
{
m1A = _mm512_xor_si512(_mm512_load_si512(src1 + 0), _mm512_load_si512(src2 + 0));
m1B = _mm512_xor_si512(_mm512_load_si512(src1 + 1), _mm512_load_si512(src2 + 1));
m1C = _mm512_xor_si512(_mm512_load_si512(src1 + 2), _mm512_load_si512(src2 + 2));
m1D = _mm512_xor_si512(_mm512_load_si512(src1 + 3), _mm512_load_si512(src2 + 3));
_mm512_store_si512(dst + 0, m1A);
_mm512_store_si512(dst + 1, m1B);
_mm512_store_si512(dst + 2, m1C);
_mm512_store_si512(dst + 3, m1D);
accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
src1 += 4; src2 += 4; dst += 4;
} while (src1 < src1_end);
accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
return !avx512_test_zero(accA);
}
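// dst &= ~src (set subtraction) over a full bit-block; returns 0 if the result is all zero, non-zero otherwise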
inline
unsigned avx512_sub_block(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B, m1C, m1D;
__m512i accA, accB, accC, accD;
accA = accB = accC = accD = _mm512_setzero_si512();
const __m512i* BMRESTRICT src_end =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size);
do
{
m1A = _mm512_andnot_si512(_mm512_load_si512(src), _mm512_load_si512(dst));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
m1C = _mm512_andnot_si512(_mm512_load_si512(src+2), _mm512_load_si512(dst+2));
m1D = _mm512_andnot_si512(_mm512_load_si512(src+3), _mm512_load_si512(dst+3));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
_mm512_store_si512(dst+2, m1C);
_mm512_store_si512(dst+3, m1D);
accA = _mm512_or_si512(accA, m1A);
accB = _mm512_or_si512(accB, m1B);
accC = _mm512_or_si512(accC, m1C);
accD = _mm512_or_si512(accD, m1D);
src += 4; dst += 4;
} while (src < src_end);
accA = _mm512_or_si512(accA, accB);
accC = _mm512_or_si512(accC, accD);
accA = _mm512_or_si512(accA, accC);
return !avx512_test_zero(accA);
}
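// dst &= ~src over one digest stripe; returns true if the result is all zero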
inline
bool avx512_sub_digest(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i m1A, m1B;
m1A = _mm512_andnot_si512(_mm512_load_si512(src+0), _mm512_load_si512(dst+0));
m1B = _mm512_andnot_si512(_mm512_load_si512(src+1), _mm512_load_si512(dst+1));
_mm512_store_si512(dst+0, m1A);
_mm512_store_si512(dst+1, m1B);
m1A = _mm512_or_si512(m1A, m1B);
return avx512_test_zero(m1A);
}
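// Fills a full bit-block with the given 32-bit value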
BMFORCEINLINE
void avx512_set_block(__m512i* BMRESTRICT dst, bm::word_t value)
{
__m512i* BMRESTRICT dst_end =
(__m512i*)((bm::word_t*)(dst) + bm::set_block_size);
__m512i zmm0 = _mm512_set1_epi32(int(value));
do
{
_mm512_store_si512(dst, zmm0);
_mm512_store_si512(dst+1, zmm0);
_mm512_store_si512(dst+2, zmm0);
_mm512_store_si512(dst+3, zmm0);
dst += 4;
} while (dst < dst_end);
}
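// Copies a full bit-block from src to dst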
inline
void avx512_copy_block(__m512i* BMRESTRICT dst,
const __m512i* BMRESTRICT src)
{
__m512i ymm0, ymm1, ymm2, ymm3;
const __m512i* BMRESTRICT src_end =
(const __m512i*)((bm::word_t*)(src) + bm::set_block_size);
do
{
ymm0 = _mm512_load_si512(src+0);
ymm1 = _mm512_load_si512(src+1);
ymm2 = _mm512_load_si512(src+2);
ymm3 = _mm512_load_si512(src+3);
_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);
_mm512_store_si512(dst+2, ymm2);
_mm512_store_si512(dst+3, ymm3);
src += 4; dst += 4;
} while (src < src_end);
}
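// Inverts (bitwise NOT) a full bit-block in place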
inline
void avx512_invert_block(__m512i* BMRESTRICT dst)
{
__m512i maskFF = _mm512_set1_epi64(-1);
const __m512i* BMRESTRICT dst_end =
(const __m512i*)((bm::word_t*)(dst) + bm::set_block_size);
__m512i ymm0, ymm1;
do
{
ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+0), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+1), maskFF);
_mm512_store_si512(dst+0, ymm0);
_mm512_store_si512(dst+1, ymm1);
ymm0 = _mm512_xor_si512(_mm512_load_si512(dst+2), maskFF);
ymm1 = _mm512_xor_si512(_mm512_load_si512(dst+3), maskFF);
_mm512_store_si512(dst+2, ymm0);
_mm512_store_si512(dst+3, ymm1);
dst += 4;
} while (dst < dst_end);
}
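// Returns true if the full bit-block is all zero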
inline
bool avx512_is_all_zero(const __m512i* BMRESTRICT block)
{
const __m512i* BMRESTRICT block_end =
(const __m512i*)((bm::word_t*)(block) + bm::set_block_size);
do
{
__m512i w0 = _mm512_load_si512(block+0);
__m512i w1 = _mm512_load_si512(block+1);
__m512i wA = _mm512_or_si512(w0, w1);
__m512i w2 = _mm512_load_si512(block+2);
__m512i w3 = _mm512_load_si512(block+3);
__m512i wB = _mm512_or_si512(w2, w3);
wA = _mm512_or_si512(wA, wB);
bool z = avx512_test_zero(wA);
if (!z)
return false;
block += 4;
} while (block < block_end);
return true;
}
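// Returns true if one digest stripe (two 512-bit words) is all zero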
inline
bool avx512_is_digest_zero(const __m512i* BMRESTRICT block)
{
__m512i mA =
_mm512_or_si512(_mm512_load_si512(block+0),
_mm512_load_si512(block+1));
return avx512_test_zero(mA);
}
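// Returns true if the full bit-block is all ones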
inline
bool avx512_is_all_one(const __m512i* BMRESTRICT block)
{
const __mmask16 m16F = __mmask16(~0u);
__m512i maskF = _mm512_set1_epi64(-1);
const __m512i* BMRESTRICT block_end =
(const __m512i*)((bm::word_t*)(block) + bm::set_block_size);
do
{
__mmask16 eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block), maskF);
if (eq_m != m16F)
return false;
eq_m = _mm512_cmpeq_epi32_mask(_mm512_load_si512(block+1), maskF);
if (eq_m != m16F)
return false;
block += 2;
} while (block < block_end);
return true;
}
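// Returns true if the 256-bit wave at ptr is all zero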
BMFORCEINLINE
bool avx2_test_all_zero_wave(const void* ptr)
{
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
return _mm256_testz_si256(w0, w0);
}
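// Returns true if both 256-bit waves at ptr0 and ptr1 are all zero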
BMFORCEINLINE
bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
{
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_or_si256(w0, w1);
return _mm256_testz_si256(w0, w0);
}
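// Returns true if the 256-bit waves at ptr0 and ptr1 are equal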
BMFORCEINLINE
bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
{
__m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
__m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
w0 = _mm256_xor_si256(w0, w1);
return _mm256_testz_si256(w0, w0);
}
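// Counts the number of bit-value changes (0-1 / 1-0 transitions) across the block and
// accumulates the total population count into *bit_count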
inline
bm::id_t sse42_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
const __m128i* BMRESTRICT block_end,
unsigned* BMRESTRICT bit_count)
{
unsigned count = (unsigned)(block_end - block)*4;
bm::word_t w0, w_prev;
const int w_shift = sizeof(w0) * 8 - 1;
bool first_word = true;
*bit_count = 0;
{
bm::word_t w;
const bm::word_t* blk = (const bm::word_t*) block;
w = w0 = blk[0];
*bit_count += unsigned(_mm_popcnt_u32(w));
w ^= (w >> 1);
count += unsigned(_mm_popcnt_u32(w));
count -= (w_prev = (w0 >> w_shift));
}
do
{
__m128i b = _mm_load_si128(block);
__m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1));
__m128i tmp3 = _mm_srli_epi32(b, w_shift);
{
if (first_word)
{
first_word = false;
}
else
{
w0 = unsigned(_mm_extract_epi32(b, 0));
if (w0)
{
*bit_count += unsigned(_mm_popcnt_u32(w0));
count += unsigned(_mm_popcnt_u32((unsigned)_mm_extract_epi32(tmp2, 0)));
count -= !(w_prev ^ (w0 & 1));
count -= w_prev = unsigned(_mm_extract_epi32(tmp3, 0));
}
else
{
count -= !w_prev; w_prev ^= w_prev;
}
}
w0 = unsigned(_mm_extract_epi32(b, 1));
if (w0)
{
*bit_count += unsigned(_mm_popcnt_u32(w0));
count += unsigned(_mm_popcnt_u32((unsigned)_mm_extract_epi32(tmp2, 1)));
count -= !(w_prev ^ (w0 & 1));
count -= w_prev = unsigned(_mm_extract_epi32(tmp3, 1));
}
else
{
count -= !w_prev; w_prev ^= w_prev;
}
w0 = unsigned(_mm_extract_epi32(b, 2));
if (w0)
{
*bit_count += unsigned(_mm_popcnt_u32(w0));
count += unsigned(_mm_popcnt_u32((unsigned)_mm_extract_epi32(tmp2, 2)));
count -= !(w_prev ^ (w0 & 1));
count -= w_prev = unsigned(_mm_extract_epi32(tmp3, 2));
}
else
{
count -= !w_prev; w_prev ^= w_prev;
}
w0 = unsigned(_mm_extract_epi32(b, 3));
if (w0)
{
*bit_count += unsigned(_mm_popcnt_u32(w0));
count += unsigned(_mm_popcnt_u32((unsigned)_mm_extract_epi32(tmp2, 3)));
count -= !(w_prev ^ (w0 & 1));
count -= w_prev = unsigned(_mm_extract_epi32(tmp3, 3));
}
else
{
count -= !w_prev; w_prev ^= w_prev;
}
}
} while (++block < block_end);
return count;
}
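// Sums GAP run lengths (differences of adjacent GAP coordinates) for avx_vect_waves 32-word waves,
// adds the total to *sum and returns the advanced buffer pointer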
inline
const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* pbuf,
unsigned avx_vect_waves,
unsigned* sum)
{
__m256i xcnt = _mm256_setzero_si256();
for (unsigned i = 0; i < avx_vect_waves; ++i)
{
__m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
__m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
__m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
xcnt = _mm256_add_epi16(xcnt, ymm_s2);
pbuf += 32;
}
xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
__m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
*sum += _mm_cvtsi128_si32(xcnt2) & 0xffff;
return pbuf;
}
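// Scans idx[start..size) while the indexes still map to block nb (idx >> set_block_shift == nb);
// returns the position of the first mismatch (or size if all match)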
inline
unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size,
unsigned nb, unsigned start)
{
const unsigned unroll_factor = 16;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);
unsigned k;
idx += start;
__m256i nbM = _mm256_set1_epi32(int(nb));
for (k = 0; k < len_unr; k+=unroll_factor)
{
__m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
__m256i nbA = _mm256_srli_epi32(idxA, bm::set_block_shift);
__m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
break;
__m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
__m256i nbB = _mm256_srli_epi32(idxB, bm::set_block_shift);
__m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
break;
}
for (; k < len; ++k)
{
if (nb != unsigned(idx[k] >> bm::set_block_shift))
break;
}
return start + k;
}
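// For each index in idx[start..size) tests the corresponding bit in blk and, if set,
// ORs (1 << bit_idx) into arr at the same position (8 indexes per iteration, scalar tail)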
inline
void avx2_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
const unsigned* BMRESTRICT blk,
const unsigned* BMRESTRICT idx,
unsigned size,
unsigned start,
unsigned bit_idx)
{
const unsigned unroll_factor = 8;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);
__m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
__m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
__m256i maskFF = _mm256_set1_epi32(~0u);
__m256i mask_tmp, mask_0;
unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
unsigned k = 0, mask, w_idx;
for (; k < len_unr; k+=unroll_factor)
{
__m256i nbitA, nwordA;
const unsigned base = start + k;
__m256i* idx_ptr = (__m256i*)(idx+base);
nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask);
nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift);
mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
_mm256_store_si256((__m256i*)mword_v, nwordA);
if (mask == ~0u)
{
w_idx = mword_v[0];
mask_tmp = _mm256_set1_epi32(blk[w_idx]);
}
else
{
mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
blk[mword_v[5]], blk[mword_v[4]],
blk[mword_v[3]], blk[mword_v[2]],
blk[mword_v[1]], blk[mword_v[0]]);
}
__m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
__m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
mask_0 = _mm256_sllv_epi32(mask1, shiftA);
mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
if (!_mm256_testz_si256(mask_tmp, mask_tmp))
{
__m256i* target_ptr = (__m256i*)(arr+base);
__m256i maskZ = _mm256_xor_si256(maskFF, maskFF);
mask1 = _mm256_slli_epi32(mask1, bit_idx);
mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ);
mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF);
mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
_mm256_storeu_si256 (target_ptr, _mm256_or_si256 (mask_tmp,
_mm256_loadu_si256(target_ptr)));
}
}
for (; k < len; ++k)
{
const unsigned base = start + k;
unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
}
}
#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif
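// Bind the generic BM vector-operation macros to the AVX-512 / AVX2 implementations above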
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
avx512_xor_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)
#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
avx512_andnot_arr_2_mask((__m512i*)(dst), (__m512i*)(src), (__m512i*)(src_end), (bm::word_t)mask)
#define VECT_BITCOUNT(first, last) \
avx2_bit_count((__m256i*) (first), (__m256i*) (last))
#define VECT_BITCOUNT_AND(first, last, mask) \
avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
#define VECT_BITCOUNT_OR(first, last, mask) \
avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
#define VECT_BITCOUNT_XOR(first, last, mask) \
avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
#define VECT_BITCOUNT_SUB(first, last, mask) \
avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
#define VECT_INVERT_BLOCK(first) \
avx512_invert_block((__m512i*)first);
#define VECT_AND_BLOCK(dst, src) \
avx512_and_block((__m512i*) dst, (const __m512i*) (src))
#define VECT_AND_DIGEST(dst, src) \
avx512_and_digest((__m512i*) dst, (const __m512i*) (src))
#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
avx512_and_digest_2way((__m512i*) dst, (const __m512i*) (src1), (const __m512i*) (src2))
#define VECT_OR_BLOCK(dst, src) \
avx512_or_block((__m512i*) dst, (__m512i*) (src))
#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
avx512_or_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))
#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
avx512_or_block_3way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))
#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
avx512_or_block_5way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2), (__m512i*) (src3), (__m512i*) (src4))
#define VECT_SUB_BLOCK(dst, src) \
avx512_sub_block((__m512i*) dst, (__m512i*) (src))
#define VECT_SUB_DIGEST(dst, src) \
avx512_sub_digest((__m512i*) dst, (const __m512i*) (src))
#define VECT_XOR_BLOCK(dst, src) \
avx512_xor_block((__m512i*) dst, (__m512i*) (src))
#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
avx512_xor_block_2way((__m512i*) dst, (__m512i*) (src1), (__m512i*) (src2))
#define VECT_COPY_BLOCK(dst, src) \
avx512_copy_block((__m512i*) dst, (__m512i*) (src))
#define VECT_SET_BLOCK(dst, value) \
avx512_set_block((__m512i*) dst, (value))
#define VECT_IS_ZERO_BLOCK(dst) \
avx512_is_all_zero((__m512i*) dst)
#define VECT_IS_ONE_BLOCK(dst) \
avx512_is_all_one((__m512i*) dst)
#define VECT_IS_DIGEST_ZERO(start) \
avx512_is_digest_zero((__m512i*)start)
#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
avx2_idx_arr_block_lookup(idx, size, nb, start)
} // namespace bm
#endif