#ifndef BMSSE4__H__INCLUDED__
#define BMSSE4__H__INCLUDED__
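// SSE4.2 (and optional WASM SIMD) accelerated primitives for bit-block and
// GAP-block operations; exposed to the rest of the library through the
// VECT_* macros defined at the end of this header.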
#ifndef BMWASMSIMDOPT
#include<mmintrin.h>
#endif
#include<emmintrin.h>
#include<smmintrin.h>
#include<nmmintrin.h>
#include<immintrin.h>
#include "bmdef.h"
#include "bmsse_util.h"
#include "bmutil.h"
namespace bm
{
#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#ifdef _MSC_VER
#pragma warning( push )
#pragma warning( disable : 4146)
#endif
#ifdef BMWASMSIMDOPT
# define _mm_popcnt_u32 __builtin_popcount
# define _mm_popcnt_u64 __builtin_popcountll
# define BM_BSF32 __builtin_ctz
#else
# define BM_BSF32 bm::bsf_asm32
#endif
inline
bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end) BMNOEXCEPT
{
bm::id_t count = 0;
#ifdef BM64_SSE4
const bm::id64_t* b = (bm::id64_t*) block;
const bm::id64_t* b_end = (bm::id64_t*) block_end;
do
{
count += unsigned( _mm_popcnt_u64(b[0]) +
_mm_popcnt_u64(b[1]));
b += 2;
} while (b < b_end);
#else
do
{
const unsigned* b = (unsigned*) block;
count += _mm_popcnt_u32(b[0]) +
_mm_popcnt_u32(b[1]) +
_mm_popcnt_u32(b[2]) +
_mm_popcnt_u32(b[3]);
} while (++block < block_end);
#endif
return count;
}
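/*!
@brief Scalar 32-bit XOR / OR / AND helpers, usable as plug-in word operations
by generic bit-block algorithms.
*/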
BMFORCEINLINE
unsigned op_xor(unsigned a, unsigned b) BMNOEXCEPT
{
unsigned ret = (a ^ b);
return ret;
}
BMFORCEINLINE
unsigned op_or(unsigned a, unsigned b) BMNOEXCEPT
{
return (a | b);
}
BMFORCEINLINE
unsigned op_and(unsigned a, unsigned b) BMNOEXCEPT
{
return (a & b);
}
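/*!
@brief Bit count of (block OP mask_block) over [block, block_end), where OP is
a 128-bit binary functor (e.g. sse2_and, sse2_or, sse2_xor, sse2_sub).
*/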
template<class Func>
bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
const __m128i* BMRESTRICT block_end,
const __m128i* BMRESTRICT mask_block,
Func sse2_func) BMNOEXCEPT
{
bm::id_t count = 0;
#ifdef BM64_SSE4
do
{
__m128i tmp0 = _mm_load_si128(block);
__m128i tmp1 = _mm_load_si128(mask_block);
__m128i b = sse2_func(tmp0, tmp1);
count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
++block; ++mask_block;
} while (block < block_end);
#else
do
{
__m128i tmp0 = _mm_load_si128(block);
__m128i tmp1 = _mm_load_si128(mask_block);
__m128i b = sse2_func(tmp0, tmp1);
count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
++block; ++mask_block;
} while (block < block_end);
#endif
return count;
}
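/*!
@brief Returns true if all words of the bit-block are zero.
*/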
inline
bool sse4_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
__m128i w;
__m128i maskz = _mm_setzero_si128();
const __m128i* BMRESTRICT block_end =
(const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
do
{
w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) return false;
w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) return false;
block += 4;
} while (block < block_end);
return true;
}
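/*!
@brief Returns true if one digest wave (8 x 128 bits) is all zero.
*/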
inline
bool sse4_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
__m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
__m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
wA = _mm_or_si128(wA, wB);
bool z1 = _mm_test_all_zeros(wA, wA);
wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
wA = _mm_or_si128(wA, wB);
bool z2 = _mm_test_all_zeros(wA, wA);
return z1 & z2;
}
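/*!
@brief Fills one digest wave (8 x 128 bits) with the given 32-bit value.
*/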
inline
void sse4_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
{
__m128i mV = _mm_set1_epi32(int(value));
_mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
_mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
_mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
_mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
}
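/*!
@brief dst &= src for a full bit-block.
@return non-zero if any bit remains set in dst
*/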
inline
unsigned sse4_and_block(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
__m128i accA, accB, accC, accD;
const __m128i* BMRESTRICT src_end =
(const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
accA = accB = accC = accD = _mm_setzero_si128();
do
{
m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
accA = _mm_or_si128(accA, m1A);
accB = _mm_or_si128(accB, m1B);
accC = _mm_or_si128(accC, m1C);
accD = _mm_or_si128(accD, m1D);
src += 4; dst += 4;
} while (src < src_end);
accA = _mm_or_si128(accA, accB);
accC = _mm_or_si128(accC, accD);
accA = _mm_or_si128(accA, accC);
return !_mm_testz_si128(accA, accA);
}
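/*!
@brief dst &= src for one digest wave.
@return true if the resulting wave is all zero
*/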
inline
bool sse4_and_digest(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z1 = _mm_testz_si128(m1A, m1A);
m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z2 = _mm_testz_si128(m1A, m1A);
return z1 & z2;
}
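/*!
@brief dst = src1 & src2 for one digest wave.
@return true if the resulting wave is all zero
*/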
inline
bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src1,
const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z1 = _mm_testz_si128(m1A, m1A);
m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z2 = _mm_testz_si128(m1A, m1A);
return z1 & z2;
}
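/*!
@brief dst |= (src1 & src2) for one digest wave.
@return true if (src1 & src2) contributed no bits
*/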
inline
bool sse4_and_or_digest_2way(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src1,
const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
__m128i mACC1;
m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
bool z1 = _mm_testz_si128(mACC1, mACC1);
m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
bool z2 = _mm_testz_si128(mACC1, mACC1);
m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
return z1 & z2;
}
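/*!
@brief dst &= src1 & src2 & src3 & src4 for one digest wave.
@return true if the resulting wave is all zero
*/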
inline
bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src1,
const __m128i* BMRESTRICT src2,
const __m128i* BMRESTRICT src3,
const __m128i* BMRESTRICT src4) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
__m128i m1E, m1F, m1G, m1H;
m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
m1A = _mm_and_si128(m1A, m1E);
m1B = _mm_and_si128(m1B, m1F);
m1C = _mm_and_si128(m1C, m1G);
m1D = _mm_and_si128(m1D, m1H);
m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z1 = _mm_testz_si128(m1A, m1A);
m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
m1A = _mm_and_si128(m1A, m1E);
m1B = _mm_and_si128(m1B, m1F);
m1C = _mm_and_si128(m1C, m1G);
m1D = _mm_and_si128(m1D, m1H);
m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z2 = _mm_testz_si128(m1A, m1A);
return z1 & z2;
}
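/*!
@brief dst &= ~src (AND-NOT, i.e. set subtraction) for one digest wave.
@return true if the resulting wave is all zero
*/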
inline
bool sse4_sub_digest(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z1 = _mm_testz_si128(m1A, m1A);
m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z2 = _mm_testz_si128(m1A, m1A);
return z1 & z2;
}
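/*!
@brief dst = src1 & ~src2 for one digest wave.
@return true if the resulting wave is all zero
*/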
inline
bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
const __m128i* BMRESTRICT src1,
const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
__m128i m1A, m1B, m1C, m1D;
m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
_mm_store_si128(dst+0, m1A);
_mm_store_si128(dst+1, m1B);
_mm_store_si128(dst+2, m1C);
_mm_store_si128(dst+3, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z1 = _mm_testz_si128(m1A, m1A);
m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
_mm_store_si128(dst+4, m1A);
_mm_store_si128(dst+5, m1B);
_mm_store_si128(dst+6, m1C);
_mm_store_si128(dst+7, m1D);
m1A = _mm_or_si128(m1A, m1B);
m1C = _mm_or_si128(m1C, m1D);
m1A = _mm_or_si128(m1A, m1C);
bool z2 = _mm_testz_si128(m1A, m1A);
return z1 & z2;
}
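/*!
@brief Returns true if every bit of the bit-block is set.
*/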
inline
bool sse4_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
{
__m128i w;
const __m128i* BMRESTRICT block_end =
(const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
do
{
w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
if (!_mm_test_all_ones(w))
return false;
w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
if (!_mm_test_all_ones(w))
return false;
block+=4;
} while (block < block_end);
return true;
}
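/*!
@brief Single-wave test helpers (unaligned loads): all-ones test, all-zero
tests for one or two 128-bit waves, and a 128-bit equality test.
*/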
BMFORCEINLINE
bool sse42_test_all_one_wave(const void* ptr) BMNOEXCEPT
{
return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
}
BMFORCEINLINE
bool sse42_test_all_zero_wave(const void* ptr) BMNOEXCEPT
{
__m128i w0 = _mm_loadu_si128((__m128i*)ptr);
return _mm_testz_si128(w0, w0);
}
BMFORCEINLINE
bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
{
__m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
__m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
w0 = _mm_or_si128(w0, w1);
return _mm_testz_si128(w0, w0);
}
BMFORCEINLINE
bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1) BMNOEXCEPT
{
__m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
__m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
w0 = _mm_xor_si128(w0, w1);
return _mm_testz_si128(w0, w0);
}
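/*!
@brief Counts the number of bit transitions (GAP "change count") in a block of
`size` words; carry bits are propagated across 32-bit lanes and vectors.
*/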
inline
unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block,
unsigned size) BMNOEXCEPT
{
bm::id64_t BM_ALIGN32 tcnt[2] BM_ALIGN32ATTR;
const __m128i* block_end =
( __m128i*)((bm::word_t*)(block) + size);
__m128i m1COshft, m2COshft;
unsigned w0 = *((bm::word_t*)(block));
unsigned count = 1;
unsigned co2, co1 = 0;
for (;block < block_end; block += 2)
{
__m128i m1A = _mm_load_si128(block);
__m128i m2A = _mm_load_si128(block+1);
__m128i m1CO = _mm_srli_epi32(m1A, 31);
__m128i m2CO = _mm_srli_epi32(m2A, 31);
co2 = _mm_extract_epi32(m1CO, 3);
__m128i m1As = _mm_slli_epi32(m1A, 1);
__m128i m2As = _mm_slli_epi32(m2A, 1);
m1COshft = _mm_slli_si128 (m1CO, 4); // carry-over: shift up by one 32-bit lane
m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
co1 = co2;
co2 = _mm_extract_epi32(m2CO, 3);
m2COshft = _mm_slli_si128 (m2CO, 4);
m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
m1As = _mm_or_si128(m1As, m1COshft);
m2As = _mm_or_si128(m2As, m2COshft);
co1 = co2;
m1A = _mm_xor_si128(m1A, m1As); // XOR against the shifted copy: set bits mark transitions
m2A = _mm_xor_si128(m2A, m2As);
#ifdef BM64_SSE4
_mm_store_si128((__m128i*)tcnt, m1A);
count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
_mm_store_si128((__m128i*)tcnt, m2A);
count += unsigned(_mm_popcnt_u64(tcnt[0]) + _mm_popcnt_u64(tcnt[1]));
#else
bm::id_t m0 = _mm_extract_epi32(m1A, 0);
bm::id_t m1 = _mm_extract_epi32(m1A, 1);
bm::id_t m2 = _mm_extract_epi32(m1A, 2);
bm::id_t m3 = _mm_extract_epi32(m1A, 3);
count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
m0 = _mm_extract_epi32(m2A, 0);
m1 = _mm_extract_epi32(m2A, 1);
m2 = _mm_extract_epi32(m2A, 2);
m3 = _mm_extract_epi32(m2A, 3);
count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
#endif
}
count -= (w0 & 1u); // correct for the artificial transition at bit 0 (carry-in is zero)
return count;
}
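/*!
@brief For (block XOR xor_block) computes both the bit count (*bc) and the
change / GAP count (*gc) in one pass.
*/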
inline
void sse42_bit_block_calc_xor_change(const __m128i* BMRESTRICT block,
const __m128i* BMRESTRICT xor_block,
unsigned size,
unsigned* BMRESTRICT gc,
unsigned* BMRESTRICT bc) BMNOEXCEPT
{
#ifdef BM64_SSE4
bm::id64_t BM_ALIGN32 simd_buf[2] BM_ALIGN32ATTR;
#else
#endif
const __m128i* block_end =
( __m128i*)((bm::word_t*)(block) + size);
__m128i m1COshft, m2COshft;
unsigned w0 = *((bm::word_t*)(block));
unsigned gap_count = 1;
unsigned bit_count = 0;
unsigned co2, co1 = 0;
for (;block < block_end; block += 2, xor_block += 2)
{
__m128i m1A = _mm_load_si128(block);
__m128i m2A = _mm_load_si128(block+1);
__m128i m1B = _mm_load_si128(xor_block);
__m128i m2B = _mm_load_si128(xor_block+1);
m1A = _mm_xor_si128(m1A, m1B);
m2A = _mm_xor_si128(m2A, m2B);
{
#ifdef BM64_SSE4
_mm_store_si128 ((__m128i*)simd_buf, m1A);
bit_count += unsigned(_mm_popcnt_u64(simd_buf[0]) + _mm_popcnt_u64(simd_buf[1]));
_mm_store_si128 ((__m128i*)simd_buf, m2A);
bit_count += unsigned(_mm_popcnt_u64(simd_buf[0]) + _mm_popcnt_u64(simd_buf[1]));
#else
bm::id_t m0 = _mm_extract_epi32(m1A, 0);
bm::id_t m1 = _mm_extract_epi32(m1A, 1);
bm::id_t m2 = _mm_extract_epi32(m1A, 2);
bm::id_t m3 = _mm_extract_epi32(m1A, 3);
bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
m0 = _mm_extract_epi32(m2A, 0);
m1 = _mm_extract_epi32(m2A, 1);
m2 = _mm_extract_epi32(m2A, 2);
m3 = _mm_extract_epi32(m2A, 3);
bit_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
#endif
}
__m128i m1CO = _mm_srli_epi32(m1A, 31);
__m128i m2CO = _mm_srli_epi32(m2A, 31);
co2 = _mm_extract_epi32(m1CO, 3);
__m128i m1As = _mm_slli_epi32(m1A, 1);
__m128i m2As = _mm_slli_epi32(m2A, 1);
m1COshft = _mm_slli_si128 (m1CO, 4);
m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
co1 = co2;
co2 = _mm_extract_epi32(m2CO, 3);
m2COshft = _mm_slli_si128 (m2CO, 4);
m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
m1As = _mm_or_si128(m1As, m1COshft);
m2As = _mm_or_si128(m2As, m2COshft);
co1 = co2;
m1A = _mm_xor_si128(m1A, m1As);
m2A = _mm_xor_si128(m2A, m2As);
#ifdef BM64_SSE4
_mm_store_si128 ((__m128i*)simd_buf, m1A);
gap_count += unsigned(_mm_popcnt_u64(simd_buf[0]) + _mm_popcnt_u64(simd_buf[1]));
_mm_store_si128 ((__m128i*)simd_buf, m2A);
gap_count += unsigned(_mm_popcnt_u64(simd_buf[0]) + _mm_popcnt_u64(simd_buf[1]));
#else
bm::id_t m0 = _mm_extract_epi32(m1A, 0);
bm::id_t m1 = _mm_extract_epi32(m1A, 1);
bm::id_t m2 = _mm_extract_epi32(m1A, 2);
bm::id_t m3 = _mm_extract_epi32(m1A, 3);
gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
m0 = _mm_extract_epi32(m2A, 0);
m1 = _mm_extract_epi32(m2A, 1);
m2 = _mm_extract_epi32(m2A, 2);
m3 = _mm_extract_epi32(m2A, 3);
gap_count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
_mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
#endif
}
gap_count -= (w0 & 1u); // correct for the artificial transition at bit 0
if (!gap_count)
++gap_count; // GAP count is always at least 1
*gc = gap_count;
*bc = bit_count;
}
#ifdef BM64_SSE4
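/*!
@brief Computes bit count (*bc) and change / GAP count (*gc) of a full
bit-block (64-bit build only).
*/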
inline
void sse42_bit_block_calc_change_bc(const __m128i* BMRESTRICT block,
unsigned* gc, unsigned* bc) BMNOEXCEPT
{
const __m128i* block_end =
( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
__m128i m1COshft, m2COshft;
unsigned w0 = *((bm::word_t*)(block));
unsigned bit_count = 0;
unsigned gap_count = 1;
unsigned co2, co1 = 0;
for (;block < block_end; block += 2)
{
__m128i m1A = _mm_load_si128(block);
__m128i m2A = _mm_load_si128(block+1);
{
bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
m0 = _mm_extract_epi64(m2A, 0);
m1 = _mm_extract_epi64(m2A, 1);
bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
}
__m128i m1CO = _mm_srli_epi32(m1A, 31);
__m128i m2CO = _mm_srli_epi32(m2A, 31);
co2 = _mm_extract_epi32(m1CO, 3);
__m128i m1As = _mm_slli_epi32(m1A, 1);
__m128i m2As = _mm_slli_epi32(m2A, 1);
m1COshft = _mm_slli_si128 (m1CO, 4);
m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
co1 = co2;
co2 = _mm_extract_epi32(m2CO, 3);
m2COshft = _mm_slli_si128 (m2CO, 4);
m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
m1As = _mm_or_si128(m1As, m1COshft);
m2As = _mm_or_si128(m2As, m2COshft);
co1 = co2;
m1A = _mm_xor_si128(m1A, m1As);
m2A = _mm_xor_si128(m2A, m2As);
{
bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
}
bm::id64_t m0 = _mm_extract_epi64(m2A, 0);
bm::id64_t m1 = _mm_extract_epi64(m2A, 1);
gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
}
gap_count -= (w0 & 1u);
*gc = gap_count;
*bc = bit_count;
}
#endif
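/*!
@brief Finds the position of the first bit that differs between two blocks.
@return true and sets *pos if a difference is found, false otherwise
*/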
inline
bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1,
const __m128i* BMRESTRICT block2,
unsigned* pos) BMNOEXCEPT
{
unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
const __m128i* block1_end =
(const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
__m128i maskZ = _mm_setzero_si128();
__m128i mA, mB;
unsigned simd_lane = 0;
do
{
mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
__m128i mOR = _mm_or_si128(mA, mB);
if (!_mm_test_all_zeros(mOR, mOR)) {
if (!_mm_test_all_zeros(mA, mA))
{
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
mask = ~mask; // now set bits mark the differing 32-bit words
BM_ASSERT(mask);
int bsf = BM_BSF32(mask); // byte index of the first differing word
_mm_store_si128 ((__m128i*)simd_buf, mA);
unsigned widx = bsf >> 2; // byte index -> word index
unsigned w = simd_buf[widx];
bsf = BM_BSF32(w); // first differing bit inside the word
*pos = (simd_lane * 128) + (widx * 32) + bsf;
return true;
}
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
mask = ~mask;
BM_ASSERT(mask);
int bsf = BM_BSF32(mask);
_mm_store_si128 ((__m128i*)simd_buf, mB);
unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
bsf = BM_BSF32(w);
*pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
return true;
}
simd_lane+=2;
block1+=2; block2+=2;
} while (block1 < block1_end);
return false;
}
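/*!
@brief Finds the position of the first set bit in a block.
@return true and sets *pos if the block is not empty, false otherwise
*/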
inline
bool sse42_bit_find_first(const __m128i* BMRESTRICT block,
unsigned* pos) BMNOEXCEPT
{
unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
const __m128i* block_end =
(const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
__m128i maskZ = _mm_setzero_si128();
__m128i mA, mB;
unsigned simd_lane = 0;
do
{
mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
__m128i mOR = _mm_or_si128(mA, mB);
if (!_mm_test_all_zeros(mOR, mOR)) {
if (!_mm_test_all_zeros(mA, mA))
{
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
mask = ~mask; // set bits mark the non-zero 32-bit words
BM_ASSERT(mask);
int bsf = BM_BSF32(mask); // byte index of the first non-zero word
_mm_store_si128 ((__m128i*)simd_buf, mA);
unsigned widx = bsf >> 2; // byte index -> word index
unsigned w = simd_buf[widx];
bsf = BM_BSF32(w); // first set bit inside the word
*pos = (simd_lane * 128) + (widx * 32) + bsf;
return true;
}
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
mask = ~mask;
BM_ASSERT(mask);
int bsf = BM_BSF32(mask);
_mm_store_si128 ((__m128i*)simd_buf, mB);
unsigned widx = bsf >> 2;
unsigned w = simd_buf[widx];
bsf = BM_BSF32(w);
*pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
return true;
}
simd_lane+=2;
block+=2;
} while (block < block_end);
return false;
}
#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
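/*!
@brief SIMD lower-bound scan in a short (<= 16 elements) sorted gap_word_t buffer.
@return index of the first element >= pos
*/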
inline
unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
const bm::gap_word_t pos, const unsigned size) BMNOEXCEPT
{
BM_ASSERT(size <= 16);
BM_ASSERT(size);
const unsigned unroll_factor = 8;
if (size < 4) {
unsigned j;
for (j = 0; j < size; ++j)
{
if (pbuf[j] >= pos)
break;
}
return j;
}
__m128i m1, mz, maskF, maskFL;
mz = _mm_setzero_si128();
m1 = _mm_loadu_si128((__m128i*)(pbuf));
maskF = _mm_cmpeq_epi64(mz, mz); // all ones
maskFL = _mm_slli_si128(maskF, 4 * 2); // byte-shift: mask covering the upper half
int shiftL = (64 - (unroll_factor - size) * 16);
maskFL = _mm_slli_epi64(maskFL, shiftL); // mask of the tail elements beyond 'size'
m1 = _mm_andnot_si128(maskFL, m1);
m1 = _mm_or_si128(m1, maskFL); // pad tail elements with 0xFFFF
__m128i mp = _mm_set1_epi16(pos);
__m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1[i] >= pos (unsigned)
__m128i c_mask = _mm_slli_epi16(mge_mask, 15); // keep one flag bit per element
int mi = _mm_movemask_epi8(c_mask);
if (mi)
{
unsigned bc = _mm_popcnt_u32(mi); // number of elements >= pos
return unroll_factor - bc; // index of the first element >= pos
}
const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
BM_ASSERT(pbuf2 > pbuf || size == 8);
m1 = _mm_loadu_si128((__m128i*)(pbuf2));
mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
unsigned bc = _mm_popcnt_u32(mi);
return size - bc;
}
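/*!
@brief Hybrid binary / SIMD search inside a GAP block.
@return index of the first GAP element >= pos; *is_set receives the bit value at pos

Usage sketch (illustrative only; `gap_blk` is an assumed GAP block buffer):

    unsigned is_set;
    unsigned idx = bm::sse42_gap_bfind(gap_blk, bit_pos, &is_set);
*/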
inline
unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf,
unsigned pos, unsigned* BMRESTRICT is_set) BMNOEXCEPT
{
unsigned start = 1;
unsigned end = 1 + ((*buf) >> 3);
unsigned dsize = end - start;
if (dsize < 17)
{
start = bm::sse4_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
*is_set = ((*buf) & 1) ^ (start & 1);
BM_ASSERT(buf[start+1] >= pos);
BM_ASSERT(buf[start] < pos || (start==0));
return start+1;
}
unsigned arr_end = end;
while (start != end)
{
unsigned curr = (start + end) >> 1;
if (buf[curr] < pos)
start = curr + 1;
else
end = curr;
unsigned size = end - start;
if (size < 16)
{
size += (end != arr_end);
unsigned idx =
bm::sse4_gap_find(buf + start, (bm::gap_word_t)pos, size);
start += idx;
BM_ASSERT(buf[start] >= pos);
BM_ASSERT(buf[start - 1] < pos || (start == 1));
break;
}
}
*is_set = ((*buf) & 1) ^ ((start-1) & 1);
return start;
}
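/*!
@brief Tests a single bit in a GAP block.
@return the bit value at pos (0 or 1)
*/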
inline
unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos) BMNOEXCEPT
{
unsigned is_set;
bm::sse42_gap_bfind(buf, pos, &is_set);
return is_set;
}
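/*!
@brief Compares four packed 32-bit values against `value` (unsigned >=).
@return index (0..3) of the first element >= value, or -1 if none
*/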
inline
int sse42_cmpge_u32(__m128i vect4, unsigned value) BMNOEXCEPT
{
__m128i mask0x8 = _mm_set1_epi32(0x80000000);
__m128i mm_val = _mm_set1_epi32(value);
__m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // bias so signed compare acts as unsigned
__m128i norm_val = _mm_sub_epi32(mm_val, mask0x8);
__m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
__m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
__m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
int mask = _mm_movemask_epi8(cmp_mask_ge);
if (mask)
{
int bsf = BM_BSF32(mask); return bsf / 4;
}
return -1;
}
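/*!
@brief Unrolled SIMD lower-bound scan in a sorted u32 array over [from, to].
@return index of the first element >= target, or to+1 if not found
*/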
inline
unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
unsigned target,
unsigned from,
unsigned to) BMNOEXCEPT
{
const unsigned* BMRESTRICT arr_base = &arr[from];
unsigned unroll_factor = 8;
unsigned len = to - from + 1;
unsigned len_unr = len - (len % unroll_factor);
__m128i mask0x8 = _mm_set1_epi32(0x80000000);
__m128i vect_target = _mm_set1_epi32(target);
__m128i norm_target = _mm_sub_epi32(vect_target, mask0x8);
int mask;
__m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
unsigned k = 0;
for (; k < len_unr; k+=unroll_factor)
{
vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k]));
norm_vect40 = _mm_sub_epi32(vect40, mask0x8);
cmp_mask_ge = _mm_or_si128( _mm_cmpgt_epi32 (norm_vect40, norm_target),
_mm_cmpeq_epi32 (vect40, vect_target)
);
mask = _mm_movemask_epi8(cmp_mask_ge);
if (mask)
{
int bsf = BM_BSF32(mask); return from + k + (bsf / 4);
}
vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
cmp_mask_ge = _mm_or_si128(
_mm_cmpgt_epi32 (norm_vect41, norm_target),
_mm_cmpeq_epi32 (vect41, vect_target)
);
mask = _mm_movemask_epi8(cmp_mask_ge);
if (mask)
{
int bsf = BM_BSF32(mask); return 4 + from + k + (bsf / 4);
}
}
for (; k < len; ++k)
{
if (arr_base[k] >= target)
return from + k;
}
return to + 1;
}
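/*!
@brief Scans the index array from `start` while indexes still belong to block `nb`.
@return position of the first index outside block nb (or `size` if all belong)
*/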
inline
unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
unsigned nb, unsigned start) BMNOEXCEPT
{
const unsigned unroll_factor = 8;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);
unsigned k;
idx += start;
__m128i nbM = _mm_set1_epi32(nb);
for (k = 0; k < len_unr; k+=unroll_factor)
{
__m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
__m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
__m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift);
__m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
!_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
break;
}
for (; k < len; ++k)
{
if (nb != unsigned(idx[k] >> bm::set_block_shift))
break;
}
return start + k;
}
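/*!
@brief Sets bits in the bit-block at the positions given by idx[start..stop).
*/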
inline
void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
const unsigned* BMRESTRICT idx,
unsigned start, unsigned stop ) BMNOEXCEPT
{
const unsigned unroll_factor = 4;
const unsigned len = (stop - start);
const unsigned len_unr = len - (len % unroll_factor);
idx += start;
unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
__m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
__m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
unsigned k = 0;
for (; k < len_unr; k+=unroll_factor)
{
__m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
__m128i nbitA = _mm_and_si128 (idxA, sb_mask); // bit position within the block
__m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // word index within the block
nbitA = _mm_and_si128 (nbitA, sw_mask);
_mm_store_si128 ((__m128i*)mshift_v, nbitA);
__m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // broadcast the first word index
__m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA);
if (_mm_test_all_ones(cmpA)) // all 4 bits land in the same word
{
unsigned nword = _mm_extract_epi32(nwordA, 0);
block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
|(1u << mshift_v[2]) | (1u << mshift_v[3]);
}
else {
_mm_store_si128 ((__m128i*)mword_v, nwordA);
block[mword_v[0]] |= (1u << mshift_v[0]);
block[mword_v[1]] |= (1u << mshift_v[1]);
block[mword_v[2]] |= (1u << mshift_v[2]);
block[mword_v[3]] |= (1u << mshift_v[3]);
}
}
for (; k < len; ++k)
{
unsigned n = idx[k];
unsigned nbit = unsigned(n & bm::set_block_mask);
unsigned nword = nbit >> bm::set_word_shift;
nbit &= bm::set_word_mask;
block[nword] |= (1u << nbit);
}
}
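/*!
@brief Gather-scatter: for each i in [start..size) reads the bit at index
idx[i] from blk and ORs it, shifted to bit_idx, into arr[i].
*/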
inline
void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
const unsigned* BMRESTRICT blk,
const unsigned* BMRESTRICT idx,
unsigned size,
unsigned start,
unsigned bit_idx) BMNOEXCEPT
{
const unsigned unroll_factor = 4;
const unsigned len = (size - start);
const unsigned len_unr = len - (len % unroll_factor);
__m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
__m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
__m128i maskFF = _mm_set1_epi32(~0u);
__m128i maskZ = _mm_xor_si128(maskFF, maskFF);
__m128i mask_tmp, mask_0;
unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
unsigned k = 0;
unsigned base = start + k;
__m128i* idx_ptr = (__m128i*)(idx + base);
__m128i* target_ptr = (__m128i*)(arr + base);
for (; k < len_unr; k+=unroll_factor)
{
__m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // bit positions within the block
__m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // word indexes
_mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
_mm_store_si128 ((__m128i*)mword_v, nwordA);
mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
blk[mword_v[1]], blk[mword_v[0]]),
mask_0);
mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // 0xFFFFFFFF where the target bit is 0
mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert: 0xFFFFFFFF where the bit is 1
mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // reduce to 0 / 1
mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx);
_mm_storeu_si128 (target_ptr, _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
++idx_ptr; ++target_ptr;
_mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
}
for (; k < len; ++k)
{
base = start + k;
unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
}
}
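/*!
@brief Shifts the bit-block one position toward lower bit indexes
(bit i moves to i-1); co1 is the carry-in placed at the block's top bit.
*empty_acc receives non-zero if any bits remain set.
@return carry-out (the original bit 0 of the block)
*/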
inline
bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
{
__m128i* block_end =
( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
__m128i mAcc = _mm_set1_epi32(0);
__m128i mMask1 = _mm_set1_epi32(1);
unsigned co2;
for (--block_end; block_end >= block; block_end -= 2)
{
__m128i m1A = _mm_load_si128(block_end);
__m128i m2A = _mm_load_si128(block_end-1);
__m128i m1CO = _mm_and_si128(m1A, mMask1);
__m128i m2CO = _mm_and_si128(m2A, mMask1);
co2 = _mm_extract_epi32(m1CO, 0);
m1A = _mm_srli_epi32(m1A, 1);
m2A = _mm_srli_epi32(m2A, 1);
__m128i m1COshft = _mm_srli_si128 (m1CO, 4); // carry-over: shift down by one 32-bit lane
__m128i m2COshft = _mm_srli_si128 (m2CO, 4);
m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
m1COshft = _mm_slli_epi32(m1COshft, 31);
m2COshft = _mm_slli_epi32(m2COshft, 31);
m1A = _mm_or_si128(m1A, m1COshft);
m2A = _mm_or_si128(m2A, m2COshft);
co1 = _mm_extract_epi32(m2CO, 0);
_mm_store_si128(block_end, m1A);
_mm_store_si128(block_end-1, m2A);
mAcc = _mm_or_si128(mAcc, m1A);
mAcc = _mm_or_si128(mAcc, m2A);
}
*empty_acc = !_mm_testz_si128(mAcc, mAcc);
return co1;
}
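/*!
@brief Shifts the bit-block one position toward higher bit indexes
(bit i moves to i+1); co1 is the carry-in placed at bit 0.
*empty_acc receives non-zero if any bits remain set.
@return carry-out (the original top bit of the block)
*/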
inline
bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
{
__m128i* block_end =
( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
__m128i m1COshft, m2COshft;
__m128i mAcc = _mm_set1_epi32(0);
unsigned co2;
for (;block < block_end; block += 2)
{
__m128i m1A = _mm_load_si128(block);
__m128i m2A = _mm_load_si128(block+1);
__m128i m1CO = _mm_srli_epi32(m1A, 31);
__m128i m2CO = _mm_srli_epi32(m2A, 31);
co2 = _mm_extract_epi32(m1CO, 3);
m1A = _mm_slli_epi32(m1A, 1);
m2A = _mm_slli_epi32(m2A, 1);
m1COshft = _mm_slli_si128 (m1CO, 4); // carry-over: shift up by one 32-bit lane
m2COshft = _mm_slli_si128 (m2CO, 4);
m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
m1A = _mm_or_si128(m1A, m1COshft);
m2A = _mm_or_si128(m2A, m2COshft);
co1 = _mm_extract_epi32(m2CO, 3);
_mm_store_si128(block, m1A);
_mm_store_si128(block+1, m2A);
mAcc = _mm_or_si128(mAcc, m1A);
mAcc = _mm_or_si128(mAcc, m2A);
}
*empty_acc = !_mm_testz_si128(mAcc, mAcc);
return co1;
}
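/*!
@brief Combined shift (as in sse42_shift_r1) and AND with mask_block, processed
only for the waves selected by *digest; the digest is updated as waves become
empty or receive a carried-in bit.
@return carry-out bit
*/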
inline
bool sse42_shift_r1_and(__m128i* block,
bm::word_t co1,
const __m128i* BMRESTRICT mask_block,
bm::id64_t* digest) BMNOEXCEPT
{
bm::word_t* wblock = (bm::word_t*) block;
const bm::word_t* mblock = (const bm::word_t*) mask_block;
__m128i m1COshft, m2COshft;
__m128i mAcc = _mm_set1_epi32(0);
unsigned co2;
bm::id64_t d, wd;
wd = d = *digest;
unsigned di = 0;
if (!co1)
{
bm::id64_t t = d & -d;
#ifdef BM64_SSE4
di = unsigned(_mm_popcnt_u64(t - 1)); // index of the lowest set digest bit
#else
bm::id_t t32 = t & bm::id_max;
if (t32 == 0) {
di = 32;
t32 = t >> 32;
}
di += unsigned(_mm_popcnt_u32(t32 - 1));
#endif
}
for (; di < 64 ; ++di)
{
const unsigned d_base = di * bm::set_block_digest_wave_size;
bm::id64_t dmask = (1ull << di);
if (d & dmask) {
block = (__m128i*) &wblock[d_base];
mask_block = (__m128i*) &mblock[d_base];
mAcc = _mm_xor_si128(mAcc, mAcc); // reset the accumulator
for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
{
__m128i m1A = _mm_load_si128(block);
__m128i m2A = _mm_load_si128(block+1);
__m128i m1CO = _mm_srli_epi32(m1A, 31);
__m128i m2CO = _mm_srli_epi32(m2A, 31);
co2 = _mm_extract_epi32(m1CO, 3);
m1A = _mm_slli_epi32(m1A, 1);
m2A = _mm_slli_epi32(m2A, 1);
m1COshft = _mm_slli_si128 (m1CO, 4);
m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
co1 = co2;
co2 = _mm_extract_epi32(m2CO, 3);
m2COshft = _mm_slli_si128 (m2CO, 4);
m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
m1A = _mm_or_si128(m1A, m1COshft);
m2A = _mm_or_si128(m2A, m2COshft);
m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // shifted block AND mask
m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1));
mAcc = _mm_or_si128(mAcc, m1A);
mAcc = _mm_or_si128(mAcc, m2A);
_mm_store_si128(block, m1A);
_mm_store_si128(block+1, m2A);
co1 = co2;
}
if (_mm_testz_si128(mAcc, mAcc)) // wave result is all zero
d &= ~dmask; // clear the digest bit
wd &= wd - 1; // drop the lowest set bit from the work copy
}
else
{
if (co1)
{
BM_ASSERT(co1 == 1);
BM_ASSERT(wblock[d_base] == 0);
bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
d |= (dmask & (w0 << di)); // set the digest bit if the carried-in bit survived the mask
co1 = 0;
}
if (!wd) break;
}
}
*digest = d;
return co1;
}
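/*!
@brief target = block XOR xor_block for the waves selected by digest;
waves with a zero digest bit are copied from block unchanged.
*/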
inline
void sse42_bit_block_xor(bm::word_t* target_block,
const bm::word_t* block, const bm::word_t* xor_block,
bm::id64_t digest) BMNOEXCEPT
{
for (unsigned i = 0; i < bm::block_waves; ++i)
{
const bm::id64_t mask = (1ull << i);
unsigned off = (i * bm::set_block_digest_wave_size);
const __m128i* sub_block = (__m128i*) (block + off);
__m128i* t_sub_block = (__m128i*)(target_block + off);
if (digest & mask) {
const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
__m128i mA, mB, mC, mD;
mA = _mm_xor_si128(_mm_load_si128(sub_block),
_mm_load_si128(xor_sub_block));
mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
_mm_load_si128(xor_sub_block+1));
mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
_mm_load_si128(xor_sub_block+2));
mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
_mm_load_si128(xor_sub_block+3));
_mm_store_si128(t_sub_block, mA);
_mm_store_si128(t_sub_block+1, mB);
_mm_store_si128(t_sub_block+2, mC);
_mm_store_si128(t_sub_block+3, mD);
mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
_mm_load_si128(xor_sub_block+4));
mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
_mm_load_si128(xor_sub_block+5));
mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
_mm_load_si128(xor_sub_block+6));
mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
_mm_load_si128(xor_sub_block+7));
_mm_store_si128(t_sub_block+4, mA);
_mm_store_si128(t_sub_block+5, mB);
_mm_store_si128(t_sub_block+6, mC);
_mm_store_si128(t_sub_block+7, mD);
}
else {
_mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
_mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
_mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
_mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
_mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
_mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
_mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
_mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
}
}
}
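/*!
@brief In-place target ^= xor_block, restricted to the waves selected by digest.
*/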
inline
void sse42_bit_block_xor_2way(bm::word_t* target_block,
const bm::word_t* xor_block,
bm::id64_t digest) BMNOEXCEPT
{
while (digest)
{
bm::id64_t t = bm::bmi_blsi_u64(digest); // isolate the lowest set bit
unsigned wave = unsigned(_mm_popcnt_u64(t - 1)); // bit index -> wave number
unsigned off = wave * bm::set_block_digest_wave_size;
const __m128i* sub_block = (const __m128i*) (xor_block + off);
__m128i* t_sub_block = (__m128i*)(target_block + off);
__m128i mA, mB, mC, mD;
mA = _mm_xor_si128(_mm_load_si128(sub_block),
_mm_load_si128(t_sub_block));
mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
_mm_load_si128(t_sub_block+1));
mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
_mm_load_si128(t_sub_block+2));
mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
_mm_load_si128(t_sub_block+3));
_mm_store_si128(t_sub_block, mA);
_mm_store_si128(t_sub_block+1, mB);
_mm_store_si128(t_sub_block+2, mC);
_mm_store_si128(t_sub_block+3, mD);
mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
_mm_load_si128(t_sub_block+4));
mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
_mm_load_si128(t_sub_block+5));
mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
_mm_load_si128(t_sub_block+6));
mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
_mm_load_si128(t_sub_block+7));
_mm_store_si128(t_sub_block+4, mA);
_mm_store_si128(t_sub_block+5, mB);
_mm_store_si128(t_sub_block+6, mC);
_mm_store_si128(t_sub_block+7, mD);
digest = bm::bmi_bslr_u64(digest); // reset the lowest set bit
}
}
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
#define VECT_BITCOUNT(first, last) \
sse4_bit_count((__m128i*) (first), (__m128i*) (last))
#define VECT_BITCOUNT_AND(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
#define VECT_BITCOUNT_OR(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
#define VECT_BITCOUNT_XOR(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
#define VECT_BITCOUNT_SUB(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
#define VECT_INVERT_BLOCK(first) \
sse2_invert_block((__m128i*)first);
#define VECT_AND_BLOCK(dst, src) \
sse4_and_block((__m128i*) dst, (__m128i*) (src))
#define VECT_AND_DIGEST(dst, src) \
sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
sse4_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_OR_BLOCK(dst, src) \
sse2_or_block((__m128i*) dst, (__m128i*) (src))
#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
#define VECT_SUB_BLOCK(dst, src) \
sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
#define VECT_SUB_DIGEST(dst, src) \
sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_XOR_BLOCK(dst, src) \
sse2_xor_block((__m128i*) dst, (__m128i*) (src))
#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
#define VECT_COPY_BLOCK(dst, src) \
sse2_copy_block((__m128i*) dst, (__m128i*) (src))
#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
#define VECT_STREAM_BLOCK(dst, src) \
sse2_stream_block((__m128i*) dst, (__m128i*) (src))
#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
#define VECT_SET_BLOCK(dst, value) \
sse2_set_block((__m128i*) dst, value)
#define VECT_IS_ZERO_BLOCK(dst) \
sse4_is_all_zero((__m128i*) dst)
#define VECT_IS_ONE_BLOCK(dst) \
sse4_is_all_one((__m128i*) dst)
#define VECT_IS_DIGEST_ZERO(start) \
sse4_is_digest_zero((__m128i*)start)
#define VECT_BLOCK_SET_DIGEST(dst, val) \
sse4_block_set_digest((__m128i*)dst, val)
#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
sse4_lower_bound_scan_u32(arr, target, from, to)
#define VECT_SHIFT_L1(b, acc, co) \
sse42_shift_l1((__m128i*)b, acc, co)
#define VECT_SHIFT_R1(b, acc, co) \
sse42_shift_r1((__m128i*)b, acc, co)
#define VECT_SHIFT_R1_AND(b, co, m, digest) \
sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
sse42_idx_arr_block_lookup(idx, size, nb, start)
#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
sse42_set_block_bits(block, idx, start, stop)
#define VECT_BLOCK_CHANGE(block, size) \
sse42_bit_block_calc_change((__m128i*)block, size)
#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size, gc, bc)
#ifdef BM64_SSE4
#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
#endif
#define VECT_BIT_FIND_FIRST(src, pos) \
sse42_bit_find_first((__m128i*) src, pos)
#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
sse42_bit_block_xor(t, src, src_xor, d)
#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
sse42_bit_block_xor_2way(t, src_xor, d)
#define VECT_GAP_BFIND(buf, pos, is_set) \
sse42_gap_bfind(buf, pos, is_set)
#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif
#undef BM_BSF32
#ifdef _MSC_VER
#pragma warning( pop )
#endif
} // namespace bm

#endif // BMSSE4__H__INCLUDED__