#if defined(_MSC_VER)
#if (defined(_M_IX86) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include <simde/x86/sse4.1.h>
#endif
#include <iso646.h>
#include <stdint.h>
#define __restrict__ __restrict
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <x86intrin.h>
#elif defined(__aarch64__)
#include <simde/x86/sse4.1.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
#include <spe.h>
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
/* ALIGNED(x): declaration prefix requesting x-byte alignment.
 * Defined for MSVC and for GCC-compatible compilers only; on any other
 * compiler the macro stays undefined. */
#if defined(_MSC_VER)
#define ALIGNED(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED(x) __attribute__((aligned(x)))
#endif
#endif
/* Short aliases for the 128-bit (and, when AVX2 is enabled, 256-bit)
 * integer vector types, plus cast helpers between the two widths. */
typedef __m128i xmm_t;
#ifdef __AVX2__
typedef __m256i ymm_t;
#define YMM(x) _mm256_castsi128_si256(x)
#define XMM(y) _mm256_castsi256_si128(y)
#endif
/* When IACA is defined the marker macros come from iacaMarks.h; otherwise
 * IACA_START / IACA_END are (re)defined to expand to nothing. */
#ifdef IACA
#include </opt/intel/iaca/include/iacaMarks.h>
#else
#undef IACA_START
#define IACA_START
#undef IACA_END
#define IACA_END
#endif
/* Fallback for MSVC (non-clang): provides __builtin_ctz on top of
 * _BitScanForward and returns the bit index it reports.
 * NOTE(review): the return status of _BitScanForward is ignored, so the
 * result for x == 0 is whatever `ret` happens to hold - callers are
 * presumably expected to pass a non-zero value; confirm. */
#if defined(_MSC_VER) && !defined(__clang__)
static inline int __builtin_ctz(uint32_t x) {
unsigned long ret;
_BitScanForward(&ret, x);
return (int)ret;
}
#endif
/* Number of bytes (1..4) needed to store `val` without its leading
 * zero bytes: one byte, plus one more for every 8-bit boundary crossed. */
static inline uint8_t _encoded_length(uint32_t val) {
  const unsigned needs_second = val > 0xFFu;
  const unsigned needs_third = val > 0xFFFFu;
  const unsigned needs_fourth = val > 0xFFFFFFu;
  return (uint8_t)(1u + needs_second + needs_third + needs_fourth);
}
/* Writes `val` at *dataPtrPtr using the fewest bytes (1..4) that hold it,
 * advances *dataPtrPtr past the bytes written and returns the 2-bit length
 * code (bytes written minus one).
 *
 * Multi-byte values are stored in native byte order. The stores go through
 * memcpy because the destination is an arbitrary byte position: writing
 * through a casted uint16_t / uint32_t pointer would be a misaligned,
 * type-punned access. */
static inline uint8_t _encode_data(uint32_t val,
                                   uint8_t *__restrict__ *dataPtrPtr) {
  uint8_t *dataPtr = *dataPtrPtr;
  uint8_t code;
  if (val < (1 << 8)) {
    *dataPtr = (uint8_t)(val);
    code = 0;
  } else if (val < (1 << 16)) {
    const uint16_t low16 = (uint16_t)(val);
    memcpy(dataPtr, &low16, sizeof low16);
    code = 1;
  } else if (val < (1 << 24)) {
    // 16 low bits first, then the third byte on its own.
    const uint16_t low16 = (uint16_t)(val);
    memcpy(dataPtr, &low16, sizeof low16);
    dataPtr[2] = (uint8_t)(val >> 16);
    code = 2;
  } else {
    memcpy(dataPtr, &val, sizeof val);
    code = 3;
  }
  *dataPtrPtr += code + 1;
  return code;
}
/* Delta-encodes `count` values from `in`, the first one relative to `prev`.
 *
 * Each difference in[i] - previous is written to the data stream with
 * _encode_data; its 2-bit length code is packed into the key stream, four
 * codes per key byte, lowest bits first.
 *
 * Returns the data pointer just past the last byte written (dataPtr
 * unchanged when count is 0, in which case no key byte is written). */
uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
                                   uint8_t *__restrict__ keyPtr,
                                   uint8_t *__restrict__ dataPtr,
                                   uint32_t count, uint32_t prev) {
  if (count == 0) {
    return dataPtr;
  }

  uint8_t pending = 0; // key byte currently being assembled
  for (uint32_t i = 0; i < count; i++) {
    const uint32_t slot = i & 3;
    if (i != 0 && slot == 0) {
      // The previous key byte is full: flush it and start a new one.
      *keyPtr++ = pending;
      pending = 0;
    }
    const uint32_t current = in[i];
    const uint8_t code = _encode_data(current - prev, &dataPtr);
    prev = current;
    pending = (uint8_t)(pending | (code << (slot * 2)));
  }

  // The last (possibly partial) key byte is always written.
  *keyPtr = pending;
  return dataPtr;
}
/* Delta-encodes `count` values with an initial predecessor of 0; forwards
 * to svb_encode_scalar_d1_init and returns its result. */
uint8_t *svb_encode_scalar_d1(const uint32_t *in, uint8_t *__restrict__ keyPtr,
                              uint8_t *__restrict__ dataPtr, uint32_t count) {
  const uint32_t initial_prev = 0;
  return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, initial_prev);
}
/* Encodes `count` values from `in` without delta coding.
 *
 * Each value is written to the data stream with _encode_data; its 2-bit
 * length code is packed into the key stream, four codes per key byte,
 * lowest bits first.
 *
 * Returns the data pointer just past the last byte written (dataPtr
 * unchanged when count is 0, in which case no key byte is written). */
uint8_t *svb_encode_scalar(const uint32_t *in, uint8_t *__restrict__ keyPtr,
                           uint8_t *__restrict__ dataPtr, uint32_t count) {
  if (count == 0) {
    return dataPtr;
  }

  uint8_t pending = 0; // key byte currently being assembled
  for (uint32_t i = 0; i < count; i++) {
    const uint32_t slot = i & 3;
    if (i != 0 && slot == 0) {
      // The previous key byte is full: flush it and start a new one.
      *keyPtr++ = pending;
      pending = 0;
    }
    const uint8_t code = _encode_data(in[i], &dataPtr);
    pending = (uint8_t)(pending | (code << (slot * 2)));
  }

  // The last (possibly partial) key byte is always written.
  *keyPtr = pending;
  return dataPtr;
}
/* Reads one value of (code + 1) bytes from *dataPtrPtr, advances
 * *dataPtrPtr past it and returns the value. Any code other than 0, 1 or 2
 * is treated as a 4-byte value.
 *
 * Multi-byte values are read in native byte order. The loads go through
 * memcpy because the source is an arbitrary byte position: reading through
 * a casted uint16_t / uint32_t pointer would be a misaligned, type-punned
 * access. */
static inline uint32_t _decode_data(uint8_t **dataPtrPtr, uint8_t code) {
  const uint8_t *dataPtr = *dataPtrPtr;
  uint32_t val;
  unsigned len;
  if (code == 0) {
    val = (uint32_t)dataPtr[0];
    len = 1;
  } else if (code == 1) {
    uint16_t low16;
    memcpy(&low16, dataPtr, sizeof low16);
    val = (uint32_t)low16;
    len = 2;
  } else if (code == 2) {
    // 16 low bits first, then the third byte on its own.
    uint16_t low16;
    memcpy(&low16, dataPtr, sizeof low16);
    val = (uint32_t)low16 | ((uint32_t)dataPtr[2] << 16);
    len = 3;
  } else {
    memcpy(&val, dataPtr, sizeof val);
    len = 4;
  }
  *dataPtrPtr += len;
  return val;
}
/* Appends one already-computed delta to an existing encoded block that
 * holds `count` values.
 *
 * When every existing key byte is full (count >= 4 * key bytes), the data
 * is moved up by one byte and the freed byte at the old data start becomes
 * a new, zeroed key byte. The delta is then written at
 * dataPtr + (sizebytes - key bytes) and its length code is OR-ed into the
 * key byte at keyPtr + count / 4.
 *
 * NOTE(review): the arithmetic implies that `sizebytes` counts key bytes
 * plus data bytes and that the data directly follows the keys - confirm
 * against callers.
 *
 * Returns the data pointer just past the newly written bytes. */
uint8_t *svb_append_scalar_d1(uint8_t *keyPtr, uint8_t *dataPtr,
                              size_t sizebytes, size_t count, uint32_t delta) {
  uint32_t key_bytes = (uint32_t)(count + 3) / 4;

  const int keys_are_full = count >= key_bytes * 4;
  if (keys_are_full) {
    // Open a one-byte hole in front of the data for the new key byte.
    memmove(dataPtr + 1, dataPtr, sizebytes - key_bytes);
    dataPtr[0] = 0;
    ++dataPtr;
    ++sizebytes;
    ++key_bytes;
  }

  uint8_t *key_slot = keyPtr + count / 4;
  uint8_t *write_pos = dataPtr + (sizebytes - key_bytes);
  const int shift = (count % 4) * 2;

  const uint8_t code = _encode_data(delta, &write_pos);
  *key_slot |= code << shift;
  return write_pos;
}
/* Inserts `new_key` into a delta-encoded block in front of the first
 * stored value that is >= new_key.
 *
 * keyPtr / dataPtr: start of the block's key and data streams.
 * dataSize:         number of data bytes currently in use.
 * count:            number of values currently stored.
 * prev:             value the first stored delta is relative to.
 * position:         receives the index at which new_key was inserted.
 *
 * Returns a pointer just past the end of the data after the insertion.
 *
 * NOTE(review): the key-shifting loop runs up to dataPtrBegin and both
 * paths may advance keyPtr by one byte, so the key area presumably ends at
 * dataPtr and has room for one more code - confirm against callers. */
uint8_t *svb_insert_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr,
size_t dataSize, uint32_t count,
uint32_t prev, uint32_t new_key,
uint32_t *position) {
// Empty block: simply encode new_key relative to prev.
if (count == 0) {
*position = 0;
return svb_encode_scalar_d1_init(&new_key, keyPtr, dataPtr, 1, prev);
}
uint8_t shift = 0;
uint32_t key = *keyPtr;
uint8_t *dataPtrBegin = dataPtr;
// Walk the stored values, rebuilding each absolute value from its delta.
for (uint32_t c = 0; c < count; c++) {
uint8_t *dataPtrPrev = dataPtr;
if (shift == 8) {
shift = 0;
key = *(++keyPtr);
}
uint8_t current_key_code = (key >> shift) & 0x3;
uint32_t current_key = prev + _decode_data(&dataPtr, current_key_code);
if (current_key >= new_key) {
// Insertion point found: rewind to the start of the current value.
dataPtr = dataPtrPrev;
// Open a 2-bit slot at `shift`: codes at and above it move up by two bits.
uint32_t mask_hi = key & (~0u << shift);
uint32_t mask_lo = key & ((1 << shift) - 1);
key = (mask_hi << 2) | mask_lo;
// The two bits pushed out of this key byte ripple through all following
// key bytes up to dataPtrBegin.
uint32_t carry_bits, prev_carry_bits = (key & (3 << 8)) >> 8;
for (uint8_t *p = keyPtr + 1; p < dataPtrBegin; p++) {
carry_bits = (*p & (3 << 6)) >> 6;
*p <<= 2;
*p |= prev_carry_bits;
prev_carry_bits = carry_bits;
}
// Extra data bytes needed: the two new deltas replace the one old delta.
int gap = _encoded_length(new_key - prev) +
_encoded_length(current_key - new_key) - (current_key_code + 1);
assert(gap >= 0);
if (gap > 0)
memmove(dataPtr + gap, dataPtr, dataSize - (dataPtr - dataPtrBegin));
// Write the new value's delta and store its code in the opened slot.
uint8_t code = _encode_data(new_key - prev, &dataPtr);
*keyPtr = (uint8_t)(key | (code << shift));
shift += 2;
if (shift == 8) {
shift = 0;
keyPtr++;
}
// Re-encode the successor relative to new_key and patch its code.
code = _encode_data(current_key - new_key, &dataPtr);
*keyPtr &= ~(3 << shift);
*keyPtr |= (code << shift);
*position = c;
return dataPtrBegin + dataSize + gap;
}
prev = current_key;
shift += 2;
}
// No stored value is >= new_key: append it after the last value.
if (shift == 8) {
shift = 0;
keyPtr++;
// NOTE(review): `key` still holds the previous key byte here, so its upper
// six bits are copied into the new key byte below - confirm this is harmless.
}
uint8_t code = _encode_data(new_key - prev, &dataPtr);
key &= ~(3 << shift);
key |= code << shift;
*keyPtr = (uint8_t)(key);
*position = count;
return dataPtrBegin + dataSize + code + 1;
}
/* Decodes `count` delta-coded values into outPtr.
 *
 * Each decoded delta is added to the running value, which starts at
 * `prev`; the sum is both stored and used as the base for the next delta.
 * Length codes are taken from the key stream, four 2-bit codes per key
 * byte, lowest bits first.
 *
 * Returns the data pointer just past the last byte consumed (dataPtr
 * unchanged when count is 0). */
uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
                                   uint8_t *dataPtr, uint32_t count,
                                   uint32_t prev) {
  if (count == 0) {
    return dataPtr;
  }

  uint32_t key = 0;
  for (uint32_t i = 0; i < count; i++) {
    const uint32_t slot = i & 3;
    if (slot == 0) {
      // Every fourth value starts a new key byte.
      key = keyPtr[i >> 2];
    }
    const uint8_t code = (uint8_t)((key >> (slot * 2)) & 0x3);
    prev += _decode_data(&dataPtr, code);
    outPtr[i] = prev;
  }
  return dataPtr;
}
/* Decodes `count` delta-coded values with an initial predecessor of 0;
 * forwards to svb_decode_scalar_d1_init and returns its result. */
uint8_t *svb_decode_scalar_d1(uint32_t *outPtr, const uint8_t *keyPtr,
                              uint8_t *dataPtr, uint32_t count) {
  const uint32_t initial_prev = 0;
  return svb_decode_scalar_d1_init(outPtr, keyPtr, dataPtr, count,
                                   initial_prev);
}
/* Decodes `count` values (no delta coding) into outPtr.
 *
 * Length codes are taken from the key stream, four 2-bit codes per key
 * byte, lowest bits first; the values themselves come from the data
 * stream via _decode_data.
 *
 * Returns the data pointer just past the last byte consumed (dataPtr
 * unchanged when count is 0). */
uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
                           uint8_t *dataPtr, uint32_t count) {
  if (count == 0) {
    return dataPtr;
  }

  uint32_t key = 0;
  for (uint32_t i = 0; i < count; i++) {
    const uint32_t slot = i & 3;
    if (slot == 0) {
      // Every fourth value starts a new key byte.
      key = keyPtr[i >> 2];
    }
    const uint8_t code = (uint8_t)((key >> (slot * 2)) & 0x3);
    outPtr[i] = _decode_data(&dataPtr, code);
  }
  return dataPtr;
}
/* lengthTable[key]: total number of data bytes occupied by the four values
 * described by one key byte, i.e. the sum of (code + 1) over its four
 * 2-bit codes. Ranges from 4 (key 0) to 16 (key 255). */
static const uint8_t lengthTable[256] = {
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6, 7,
8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9,
10, 11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11,
12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9,
10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7,
8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11,
9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13,
14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8,
9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12,
10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13,
14, 12, 13, 14, 15, 13, 14, 15, 16};
static const int8_t shuffleTable[256][16] = {
{0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1}, {0, 1, 
2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1}, {0, 
1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1}, 
{0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1}, {0, 1, 2, 3, 4, 
5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 
10, 11, 12, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}, {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6}, {0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7}, {0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8}, {0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9}, {0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7}, {0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8}, {0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9}, {0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10}, {0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8}, {0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9}, {0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10}, {0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11}, {0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9}, {0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10}, {0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11}, {0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8}, {0, 1, 
2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14}, {0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11}, {0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12}, {0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 
9, 10, 11}, {0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12}, {0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13}, {0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11}, {0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12}, {0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13}, {0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14}, {0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, {0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} };
/* Byte-shuffle mask for _mm_shuffle_epi8: copies source bytes 8-9, 10-11,
 * 12-13 and 14-15 into the low halves of the four 32-bit lanes; the -1
 * (0xFF) entries fill the remaining bytes. Both initializer forms spell
 * the same 16 bytes - two 64-bit halves where that initializer syntax is
 * accepted, sixteen separate bytes for plain MSVC on x86/x64. */
#if !defined(_MSC_VER) || defined(__clang__) || (defined(_MSC_VER) && defined(_M_ARM64))
static const xmm_t High16To32 = { (long long)0xFFFF0B0AFFFF0908, (long long)0xFFFF0F0EFFFF0D0C};
#else
static const xmm_t High16To32 = {8, 9, -1, -1, 10, 11, -1, -1,
12, 13, -1, -1, 14, 15, -1, -1};
#endif
/* Stores the four 32-bit lanes of Vec at `out` using an unaligned store. */
static inline void _write_avx(uint32_t *out, xmm_t Vec) {
  xmm_t *destination = (xmm_t *)out;
  _mm_storeu_si128(destination, Vec);
}
/* Immediate for _mm_shuffle_epi32 that copies lane 3 into all four lanes. */
#define BroadcastLastXMM 0xFF
/* Turns eight 16-bit deltas in Vec into eight running 32-bit values and
 * writes them to out[0..7].
 *
 * The running sums are built with shift-and-add steps, the last lane of
 * Prev is added as the base, and the upper four sums are widened with the
 * High16To32 shuffle. Returns the vector holding out[4..7], whose last
 * lane is the base for the next call. */
static inline xmm_t _write_16bit_avx_d1(uint32_t *out, xmm_t Vec, xmm_t Prev) {
  const xmm_t shifted_by_one = _mm_slli_si128(Vec, 2);
  const xmm_t base = _mm_shuffle_epi32(Prev, BroadcastLastXMM);

  xmm_t sums = _mm_add_epi32(Vec, shifted_by_one);
  sums = _mm_add_epi32(sums, _mm_slli_si128(sums, 4));

  // Lower four sums widened to 32 bits, plus the carried-in base.
  const xmm_t low = _mm_add_epi32(_mm_cvtepu16_epi32(sums), base);
  // Upper four partial sums widened, then completed with the lower results.
  const xmm_t high = _mm_add_epi32(low, _mm_shuffle_epi8(sums, High16To32));

  _write_avx(out, low);
  _write_avx(out + 4, high);
  return high;
}
/* Turns four 32-bit deltas in Vec into running values (prefix sum) based
 * on the last lane of Prev, stores them at out[0..3] and returns them. */
static inline xmm_t _write_avx_d1(uint32_t *out, xmm_t Vec, xmm_t Prev) {
  const xmm_t shifted_by_one = _mm_slli_si128(Vec, 4);
  const xmm_t base = _mm_shuffle_epi32(Prev, BroadcastLastXMM);

  const xmm_t pair_sums = _mm_add_epi32(Vec, shifted_by_one);
  const xmm_t shifted_by_two = _mm_slli_si128(pair_sums, 8);

  xmm_t result = _mm_add_epi32(pair_sums, base);
  result = _mm_add_epi32(result, shifted_by_two);

  _write_avx(out, result);
  return result;
}
static inline xmm_t _decode_avx(uint32_t key,
uint8_t *__restrict__ *dataPtrPtr) {
uint8_t len = lengthTable[key];
xmm_t Data = _mm_loadu_si128((xmm_t *)*dataPtrPtr);
xmm_t Shuf = *(xmm_t *)&shuffleTable[key];
Data = _mm_shuffle_epi8(Data, Shuf);
*dataPtrPtr += len;
return Data;
}
/* Decodes `count` delta-coded values into `out`, starting from `prev`.
 *
 * Key bytes are consumed eight at a time (32 values per step) with the
 * SIMD helpers; whatever is left (count & 31 values) is handed to
 * svb_decode_scalar_d1_init. Returns the data pointer just past the last
 * byte consumed. */
uint8_t *svb_decode_avx_d1_init(uint32_t *out, uint8_t *__restrict__ keyPtr,
uint8_t *__restrict__ dataPtr, uint64_t count,
uint32_t prev) {
// keybytes: number of complete key bytes (four values each).
uint64_t keybytes = count / 4; if (keybytes >= 8) {
xmm_t Prev = _mm_set1_epi32(prev);
xmm_t Data;
// Offset counts up from -(groups - 1) to 0; keyPtr64 is biased so that
// keyPtr64[Offset] starts at the first 8-byte key group.
int64_t Offset = -(int64_t)keybytes / 8 + 1;
const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset;
uint64_t nextkeys = keyPtr64[Offset];
// All groups except the last; the next group's keys are fetched one
// iteration ahead.
for (; Offset != 0; ++Offset) {
uint64_t keys = nextkeys;
nextkeys = keyPtr64[Offset + 1];
// All 32 codes are zero: every value is one byte, so widen the bytes to
// 16 bits and run the 16-bit prefix sum, eight values at a time.
if (!keys) {
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr)));
Prev = _write_16bit_avx_d1(out, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8)));
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16)));
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 24)));
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
out += 32;
dataPtr += 32;
continue;
}
// General case: one key byte (four values) per _decode_avx call.
Data = _decode_avx(keys & 0x00FF, &dataPtr);
Prev = _write_avx_d1(out, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 4, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 8, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 12, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 16, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 20, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 24, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 28, Data, Prev);
out += 32;
}
// Last 8-byte key group, handled outside the loop so that no key word
// beyond it is fetched.
{
uint64_t keys = nextkeys;
// Same all-one-byte fast path; the final load reads only 8 bytes
// (presumably to avoid reading past the end of the data - confirm).
if (!keys) { Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr)));
Prev = _write_16bit_avx_d1(out, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 8)));
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((xmm_t *)(dataPtr + 16)));
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
Data = _mm_cvtepu8_epi16(_mm_loadl_epi64((xmm_t *)(dataPtr + 24)));
_write_16bit_avx_d1(out + 24, Data, Prev);
out += 32;
dataPtr += 32;
} else {
Data = _decode_avx(keys & 0x00FF, &dataPtr);
Prev = _write_avx_d1(out, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 4, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 8, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 12, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 16, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 20, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 24, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx_d1(out + 28, Data, Prev);
out += 32;
}
}
// The scalar tail continues from the last value written by the SIMD part.
prev = out[-1];
}
// Key bytes consumed above: keybytes rounded down to a multiple of 8.
uint64_t consumedkeys = keybytes - (keybytes & 7);
return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
count & 31, prev);
}
/* Decodes `count` delta-coded values with an initial predecessor of 0;
 * forwards to svb_decode_avx_d1_init and returns its result. */
uint8_t *svb_decode_avx_d1_simple(uint32_t *out, uint8_t *__restrict__ keyPtr,
                                  uint8_t *__restrict__ dataPtr,
                                  uint64_t count) {
  const uint32_t initial_prev = 0;
  return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, initial_prev);
}
/* Decodes `count` values (no delta coding) into `out`.
 *
 * Key bytes are consumed eight at a time (32 values per step) with
 * _decode_avx; whatever is left (count & 31 values) is handed to
 * svb_decode_scalar. Returns the data pointer just past the last byte
 * consumed. */
uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t *__restrict__ keyPtr,
uint8_t *__restrict__ dataPtr, uint64_t count) {
// keybytes: number of complete key bytes (four values each).
uint64_t keybytes = count / 4; xmm_t Data;
if (keybytes >= 8) {
// Offset counts up from -(groups - 1) to 0; keyPtr64 is biased so that
// keyPtr64[Offset] starts at the first 8-byte key group.
int64_t Offset = -(int64_t)keybytes / 8 + 1;
const uint64_t *keyPtr64 = (const uint64_t *)keyPtr - Offset;
uint64_t nextkeys = keyPtr64[Offset];
// All groups except the last; the next group's keys are fetched one
// iteration ahead.
for (; Offset != 0; ++Offset) {
uint64_t keys = nextkeys;
nextkeys = keyPtr64[Offset + 1];
// One key byte (four values) per _decode_avx call.
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 4, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 8, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 12, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 16, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 20, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 24, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 28, Data);
out += 32;
}
// Last 8-byte key group, handled outside the loop so that no key word
// beyond it is fetched.
{
uint64_t keys = nextkeys;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 4, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 8, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 12, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 16, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 20, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 24, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 28, Data);
out += 32;
}
}
// Key bytes consumed above: keybytes rounded down to a multiple of 8.
uint64_t consumedkeys = keybytes - (keybytes & 7);
return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31);
}
/* Encodes `count` values from `in` into `out`.
 *
 * Layout written: a 32-bit count (native byte order), then
 * (count + 3) / 4 key bytes, then the data bytes.
 *
 * delta: 0 for plain encoding, 1 for delta encoding.
 * type:  only 1 (scalar) is supported.
 *
 * Returns the number of bytes written to `out`. An unsupported
 * delta/type combination prints a message and aborts.
 *
 * The count is stored with memcpy: `out` is a byte buffer with no
 * alignment guarantee, so a store through a casted uint32_t pointer would
 * be a misaligned, type-punned access. */
size_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta,
                  int type) {
  memcpy(out, &count, sizeof count);
  uint8_t *keyPtr = out + sizeof count;
  uint32_t keyLen = (count + 3) / 4;
  uint8_t *dataPtr = keyPtr + keyLen;

  if (delta == 0 && type == 1) {
    return svb_encode_scalar(in, keyPtr, dataPtr, count) - out;
  }
  if (delta == 1 && type == 1) {
    return svb_encode_scalar_d1(in, keyPtr, dataPtr, count) - out;
  }

  printf("Unknown delta (%d) type (%d) combination.\n", delta, type);
  abort();
}
/* Decodes a buffer produced by svb_encode into `out`.
 *
 * Layout read: a 32-bit count (native byte order), then (count + 3) / 4
 * key bytes, then the data bytes.
 *
 * delta: 0 for plain decoding, 1 for delta decoding.
 * type:  1 selects the scalar decoder, 5 the SIMD decoder.
 *
 * Returns the number of input bytes consumed, or 0 when the stored count
 * is 0. An unsupported delta/type combination prints a message and
 * aborts.
 *
 * The count is read with memcpy: `in` is a byte buffer with no alignment
 * guarantee, so a load through a casted uint32_t pointer would be a
 * misaligned, type-punned access. */
size_t svb_decode(uint32_t *out, uint8_t *in, int delta, int type) {
  uint32_t count;
  memcpy(&count, in, sizeof count);
  if (count == 0) {
    return 0;
  }

  uint8_t *keyPtr = in + sizeof count;
  uint32_t keyLen = ((count + 3) / 4);
  uint8_t *dataPtr = keyPtr + keyLen;

  if (delta == 0 && type == 1) {
    return svb_decode_scalar(out, keyPtr, dataPtr, count) - in;
  }
  if (delta == 1 && type == 1) {
    return svb_decode_scalar_d1(out, keyPtr, dataPtr, count) - in;
  }
  if (delta == 0 && type == 5) {
    return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in;
  }
  if (delta == 1 && type == 5) {
    return svb_decode_avx_d1_simple(out, keyPtr, dataPtr, count) - in;
  }

  printf("Unknown delta (%d) type (%d) combination.\n", delta, type);
  abort();
}
/* Binary search over A[imin .. imax-1].
 *
 * Returns the index of the first element that is >= key; when no element
 * qualifies, the last index (imax - 1) is returned instead. */
static int lower_bound(uint32_t *A, uint32_t key, int imin, int imax) {
  int lo = imin;
  int hi = imax - 1;

  // Narrow [lo, hi] until the two bounds are adjacent (or equal).
  while (lo + 1 < hi) {
    const int mid = lo + ((hi - lo) / 2);
    if (A[mid] < key) {
      lo = mid;
    } else {
      hi = mid;
    }
  }

  return (A[lo] >= key) ? lo : hi;
}
/* Sixteen 16-byte shuffle masks, indexed by a 4-bit lane mask. In each
 * non-zero entry the first four bytes select the 32-bit lane at the lowest
 * set bit of the index, so that lane ends up in lane 0 after
 * _mm_shuffle_epi8; entry 0 is the identity shuffle. */
static ALIGNED(16) int8_t shuffle_mask_bytes[16 * 16] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14,
15, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10,
11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13,
14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0,
1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15,
};
/* The table above viewed as an array of sixteen 128-bit shuffle masks. */
static const __m128i *streamvbyte_shuffle_mask = (__m128i *)shuffle_mask_bytes;
/* Finds the first of eight 32-bit values (lanes of *PrevLow followed by
 * lanes of *PrevHi) that is not less than `key`.
 *
 * Stores that value in *presult and returns its position 0..7. The caller
 * must guarantee that such a value exists in one of the two vectors (this
 * is asserted).
 *
 * The values are compared as unsigned numbers: subtracting 2^31 from both
 * sides lets the signed compare instruction produce the unsigned ordering.
 *
 * The stray "#if 0#endif" line that used to sit between the local
 * declarations has been removed; it was not a valid preprocessor
 * conditional and left the #if unterminated. */
static inline int find_lower_bound(__m128i *PrevHi, __m128i *PrevLow,
                                   uint32_t key, uint32_t *presult) {
  int offset = 0;
  uint32_t mask;
  __m128i key4 = _mm_set1_epi32(key - 2147483648U);
  __m128i conversion = _mm_set1_epi32(2147483648U);

  // mask: one bit per lane of *PrevLow that is < key.
  __m128i tmp = _mm_sub_epi32(*PrevLow, conversion);
  mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmp, key4)));
  __m128i *out = PrevLow;

  if (mask == 15) {
    // Every low lane is below key: the answer must be in *PrevHi.
    tmp = _mm_sub_epi32(*PrevHi, conversion);
    mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmplt_epi32(tmp, key4)));
    out = PrevHi;
    offset = 4;
  }
  assert(mask != 15);

  // mask ^ 15 marks the lanes that are >= key; the shuffle brings the
  // first of them into lane 0 and ctz gives its index.
  const __m128i p = _mm_shuffle_epi8(*out, streamvbyte_shuffle_mask[mask ^ 15]);
  const int s = __builtin_ctz(mask ^ 15);
  *presult = _mm_cvtsi128_si32(p);
  return (offset + s);
}
/*
 * Delta-decodes eight small values held in the 16-bit lanes of Vec.
 *
 * On entry *PrevHi supplies the running base (its last 32-bit lane). On exit
 * *PrevLow holds the running totals for elements 0..3 and *PrevHi those for
 * elements 4..7, as 32-bit lanes. *PrevLow is written only, never read as an
 * input.
 *
 * The additions are 32-bit wide but applied to 16-bit lanes, so the result is
 * only a per-lane sum while no 16-bit partial sum overflows; the callers in
 * this file pass bytes zero-extended to 16 bits.
 * NOTE(review): BroadcastLastXMM and High16To32 are defined elsewhere;
 * High16To32 presumably widens the upper four 16-bit lanes to 32 bits.
 */
static inline void _scan_16bit_avx_d1(xmm_t Vec, xmm_t *PrevHi,
                                      xmm_t *PrevLow) {
  /* Running base taken from the previous high half. */
  const xmm_t base = _mm_shuffle_epi32(*PrevHi, BroadcastLastXMM);

  /* Two shift-and-add rounds: afterwards every 16-bit lane holds the sum of
   * itself and up to three preceding lanes. */
  const xmm_t pairSums = _mm_add_epi32(Vec, _mm_slli_si128(Vec, 2));
  const xmm_t quadSums = _mm_add_epi32(pairSums, _mm_slli_si128(pairSums, 4));

  /* Lanes 0..3 are complete prefix sums once the base is added. */
  const xmm_t low = _mm_add_epi32(_mm_cvtepu16_epi32(quadSums), base);

  /* Lanes 4..7 each hold a four-element window; adding the matching low
   * total completes their prefix sums. */
  const xmm_t high = _mm_add_epi32(_mm_shuffle_epi8(quadSums, High16To32), low);

  *PrevLow = low;
  *PrevHi = high;
}
/*
 * Delta-decodes four 32-bit values: returns the inclusive prefix sum of the
 * lanes of Vec, with each lane additionally offset by a lane of Prev picked
 * through BroadcastLastXMM (defined elsewhere; presumably the last lane).
 */
static inline xmm_t _scan_avx_d1(xmm_t Vec, xmm_t Prev) {
  const xmm_t base = _mm_shuffle_epi32(Prev, BroadcastLastXMM);

  /* lane i += lane i-1 */
  const xmm_t pairSums = _mm_add_epi32(Vec, _mm_slli_si128(Vec, 4));
  /* contribution of lanes 0/1 to lanes 2/3 */
  const xmm_t carried = _mm_slli_si128(pairSums, 8);

  return _mm_add_epi32(_mm_add_epi32(pairSums, base), carried);
}
/*
 * Shuffle controls used by _extract_from_xmm(). Entry j (j = 0..3, 16 bytes
 * each) starts with bytes 4j .. 4j+3, so shuffling with it places 32-bit
 * lane j into lane 0 of the result. Only the first 64 of the 256 bytes are
 * listed; the remaining bytes are zero-initialized.
 */
ALIGNED(16)
int8_t streamvbyte_shuffle_mask_bytes[256] = {
0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
 * The table above viewed as an array of __m128i entries.
 * NOTE(review): the names are crossed - shuffle_mask refers to
 * streamvbyte_shuffle_mask_bytes, while streamvbyte_shuffle_mask refers to
 * shuffle_mask_bytes.
 */
static const __m128i *shuffle_mask = (__m128i *)streamvbyte_shuffle_mask_bytes;
/*
 * Returns element i (0..7) of an eight-value group stored as two vectors:
 * *PrevLow carries elements 0..3 and *PrevHi elements 4..7.
 */
static inline uint32_t _extract_from_xmm(xmm_t *PrevHi, xmm_t *PrevLow, int i) {
  /* Elements 0..3 live in the low vector, 4..7 in the high one. */
  const xmm_t source = (i < 4) ? *PrevLow : *PrevHi;
  /* Shuffle control that brings lane (i & 3) down to lane 0. */
  const xmm_t laneToFront = shuffle_mask[i & 3];

  return _mm_cvtsi128_si32(_mm_shuffle_epi8(source, laneToFront));
}
/*
 * Searches a delta-coded sequence for the first value that is >= key.
 *
 * keyPtr   control bytes; each byte carries four 2-bit length codes
 * dataPtr  packed data bytes
 * count    number of encoded integers
 * prev     base value the first delta is added to
 * key      search key
 * presult  receives the value that was found, or key + 1 when no value in
 *          the sequence is >= key
 *
 * Returns the index of the value that was found, or (int)count when there is
 * no such value.
 *
 * Full groups of 32 integers (eight control bytes) are handled with SIMD;
 * the remaining count % 32 integers are decoded with the scalar decoder and
 * searched with lower_bound().
 */
int svb_find_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count,
                         uint32_t prev, uint32_t key, uint32_t *presult) {
  uint64_t keybytes = count / 4;
  int consumedInts = 0;

  if (keybytes >= 8) {
    const uint64_t keyWords = keybytes / 8; /* groups of 8 control bytes */
    xmm_t PrevHi = _mm_set1_epi32(prev);
    xmm_t PrevLow;
    xmm_t Data;

    for (uint64_t word = 0; word < keyWords; ++word) {
      /* keyPtr carries no alignment guarantee, so the eight control bytes
       * are fetched with memcpy instead of through a uint64_t pointer. */
      uint64_t keys;
      memcpy(&keys, keyPtr + 8 * word, sizeof keys);

      if (!keys) {
        /* All 32 codes are zero: every integer occupies exactly one byte,
         * so the data can be widened directly, eight bytes at a time. */
        for (int group = 0; group < 32; group += 8) {
          const xmm_t *src = (const xmm_t *)(dataPtr + group);
          /* Only the low 8 bytes of each load are used. The very last group
           * of the last word loads just those 8 bytes so that nothing past
           * this word's 32 data bytes is read. */
          const int finalLoad = (word + 1 == keyWords) && (group == 24);
          Data = _mm_cvtepu8_epi16(finalLoad ? _mm_loadl_epi64(src)
                                             : _mm_lddqu_si128(src));
          _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow);
          if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) {
            int s = find_lower_bound(&PrevHi, &PrevLow, key, presult);
            return (consumedInts + group + s);
          }
        }
        dataPtr += 32;
      } else {
        /* General case: two control bytes (eight integers) per step;
         * _decode_avx advances dataPtr itself. */
        for (int group = 0; group < 32; group += 8) {
          Data = _decode_avx(keys & 0x00FF, &dataPtr);
          PrevLow = _scan_avx_d1(Data, PrevHi);
          Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
          PrevHi = _scan_avx_d1(Data, PrevLow);
          /* The last lane of PrevHi is the largest running total so far. */
          if (key <= (uint32_t)_mm_extract_epi32(PrevHi, 3)) {
            int s = find_lower_bound(&PrevHi, &PrevLow, key, presult);
            return (consumedInts + group + s);
          }
          keys >>= 16;
        }
      }
      consumedInts += 32;
    }
    /* Carry the running total into the scalar tail. */
    prev = (uint32_t)_mm_extract_epi32(PrevHi, 3);
  }

  uint32_t keysleft = count & 31;
  if (keysleft > 0) {
    /* Control bytes already consumed by the SIMD part (multiple of 8). */
    uint64_t consumedkeys = keybytes - (keybytes & 7);
    uint32_t out[32];
    svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr, keysleft,
                              prev);
    if (key <= out[keysleft - 1]) {
      int s = lower_bound(out, key, 0, keysleft);
      assert(s >= 0 && s < (int)keysleft);
      *presult = out[s];
      return (consumedInts + s);
    }
  }

  /* Not found. */
  *presult = key + 1;
  return (int)count;
}
/*
 * Scalar search of a delta-coded sequence for the first value >= searchkey.
 *
 * keyPtr     control bytes; each byte carries four 2-bit length codes,
 *            lowest bits first
 * dataPtr    packed data bytes
 * count      number of encoded integers
 * prev       base value the first delta is added to
 * searchkey  search key
 * presult    receives the value that was found, or searchkey + 1 when no
 *            value in the sequence is >= searchkey
 *
 * Returns the index of the value that was found, or (int)count when there is
 * no such value. With count == 0 neither input buffer is touched.
 */
int svb_find_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count,
                            uint32_t prev, uint32_t searchkey,
                            uint32_t *presult) {
  uint32_t key = 0;

  /* 64-bit counter so the comparison with count cannot wrap. */
  for (uint64_t c = 0; c < count; c++) {
    const unsigned shift = (unsigned)(c & 3) * 2;
    if (shift == 0) {
      /* A fresh control byte is needed for every fourth integer; loading it
       * here (not up front) avoids reading keyPtr for an empty sequence. */
      key = *keyPtr++;
    }
    prev += _decode_data(&dataPtr, (key >> shift) & 0x3);
    if (prev >= searchkey) {
      *presult = prev;
      return (int)c;
    }
  }

  /* Not found. */
  *presult = searchkey + 1;
  return (int)count;
}
/*
 * Scalar select on a delta-coded sequence: decodes the deltas at indices
 * 0..slot, accumulates them onto prev and returns the total, i.e. the value
 * stored at index `slot`.
 *
 * `count` is accepted for signature symmetry and ignored; the loop is
 * bounded by `slot` alone.
 */
uint32_t svb_select_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr,
                                   uint64_t count, uint32_t prev, int slot) {
  (void)count;

  uint32_t controlByte = *keyPtr++;
  uint8_t bitPos = 0; /* position of the current 2-bit code in controlByte */
  int idx = 0;

  while (idx <= slot) {
    if (bitPos == 8) {
      /* All four codes of this control byte are used up. */
      controlByte = *keyPtr++;
      bitPos = 0;
    }
    prev += _decode_data(&dataPtr, (controlByte >> bitPos) & 0x3);
    bitPos += 2;
    ++idx;
  }
  return prev;
}
/*
 * Returns the value at index `slot` of a delta-coded sequence.
 *
 * keyPtr   control bytes; each byte carries four 2-bit length codes
 * dataPtr  packed data bytes
 * count    number of encoded integers
 * prev     base value the first delta is added to
 * slot     index of the requested value
 *          (NOTE(review): presumably 0 <= slot < count; not checked here)
 *
 * Full groups of 32 integers (eight control bytes) are handled with SIMD; if
 * the slot lies beyond them, the remainder is delegated to
 * svb_select_scalar_d1_init().
 */
uint32_t svb_select_avx_d1_init(uint8_t *keyPtr, uint8_t *dataPtr,
                                uint64_t count, uint32_t prev, int slot) {
  uint64_t keybytes = count / 4;
  int consumedInts = 0;

  if (keybytes >= 8) {
    const uint64_t keyWords = keybytes / 8; /* groups of 8 control bytes */
    xmm_t PrevHi = _mm_set1_epi32(prev);
    xmm_t PrevLow;
    xmm_t Data;

    for (uint64_t word = 0; word < keyWords; ++word) {
      /* keyPtr carries no alignment guarantee, so the eight control bytes
       * are fetched with memcpy instead of through a uint64_t pointer. */
      uint64_t keys;
      memcpy(&keys, keyPtr + 8 * word, sizeof keys);

      if (!keys) {
        /* All 32 codes are zero: every integer occupies exactly one byte,
         * so the data can be widened directly, eight bytes at a time. */
        for (int group = 0; group < 32; group += 8) {
          const xmm_t *src = (const xmm_t *)(dataPtr + group);
          /* Only the low 8 bytes of each load are used. The very last group
           * of the last word loads just those 8 bytes so that nothing past
           * this word's 32 data bytes is read. */
          const int finalLoad = (word + 1 == keyWords) && (group == 24);
          Data = _mm_cvtepu8_epi16(finalLoad ? _mm_loadl_epi64(src)
                                             : _mm_lddqu_si128(src));
          _scan_16bit_avx_d1(Data, &PrevHi, &PrevLow);
          if (slot < consumedInts + group + 8)
            return _extract_from_xmm(&PrevHi, &PrevLow,
                                     slot - (consumedInts + group));
        }
        dataPtr += 32;
      } else {
        /* General case: two control bytes (eight integers) per step;
         * _decode_avx advances dataPtr itself. */
        for (int group = 0; group < 32; group += 8) {
          Data = _decode_avx(keys & 0x00FF, &dataPtr);
          PrevLow = _scan_avx_d1(Data, PrevHi);
          Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
          PrevHi = _scan_avx_d1(Data, PrevLow);
          if (slot < consumedInts + group + 8)
            return _extract_from_xmm(&PrevHi, &PrevLow,
                                     slot - (consumedInts + group));
          keys >>= 16;
        }
      }
      consumedInts += 32;
    }
    /* Carry the running total into the scalar tail. */
    prev = (uint32_t)_mm_extract_epi32(PrevHi, 3);
  }

  /* Control bytes already consumed by the SIMD part (multiple of 8). */
  uint64_t consumedkeys = keybytes - (keybytes & 7);
  uint32_t keysleft = count & 31;
  return svb_select_scalar_d1_init(keyPtr + consumedkeys, dataPtr, keysleft,
                                   prev, slot - consumedInts);
}