#ifndef SIMDFASTPFOR_H_
#define SIMDFASTPFOR_H_
#include "common.h"
#include "codecs.h"
#include "simdbitpacking.h"
#include "usimdbitpacking.h"
#include "memutil.h"
#include "util.h"
#include "simple8b.h"
#include "usimdbitpacking.h"
namespace FastPForLib {
template <uint32_t BlockSizeInUnitsOfPackSize =
8> class SIMDFastPFor : public IntegerCODEC {
public:
using IntegerCODEC::encodeArray;
using IntegerCODEC::decodeArray;
SIMDFastPFor(uint32_t ps = 65536)
: PageSize(ps), bitsPageSize(gccbits(PageSize)), datatobepacked(33),
bytescontainer(PageSize + 3 * PageSize / BlockSize) {
assert(ps / BlockSize * BlockSize == ps);
assert(gccbits(static_cast<uint32_t>(BlockSizeInUnitsOfPackSize * PACKSIZE - 1)) <= 8);
}
enum {
PACKSIZE = 32,
overheadofeachexcept = 8,
overheadduetobits = 8,
overheadduetonmbrexcept = 8,
BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE
};
static uint32_t *packblockupsimd(const uint32_t *source, uint32_t *out,
const uint32_t bit) {
for (int k = 0; k < BlockSize; k += 128) {
SIMD_fastpack_32(source, reinterpret_cast<__m128i *>(out), bit);
out += 4 * bit;
source += 128;
}
return out;
}
static const uint32_t *unpackblocksimd(const uint32_t *source, uint32_t *out,
const uint32_t bit) {
for (int k = 0; k < BlockSize; k += 128) {
SIMD_fastunpack_32(reinterpret_cast<const __m128i *>(source), out, bit);
source += 4 * bit;
out += 128;
}
return source;
}
template <class STLContainer>
static const uint32_t *unpackmesimd(const uint32_t *in, STLContainer &out,
const uint32_t bit) {
const uint32_t size = *in;
++in;
out.resize((size + 32 - 1) / 32 * 32);
uint32_t j = 0;
for (; j + 128 <= size; j += 128) {
usimdunpack(reinterpret_cast<const __m128i *>(in), &out[j], bit);
in += 4 * bit;
}
for (; j + 31 < size; j += 32) {
fastunpack(in, &out[j], bit);
in += bit;
}
uint32_t buffer[PACKSIZE];
uint32_t remaining = size - j;
memcpy(buffer, in, (remaining * bit + 31) / 32 * sizeof(uint32_t));
uint32_t *bpointer = buffer;
in += (out.size() - j) / 32 * bit;
for (; j < size; j += 32) {
fastunpack(bpointer, &out[j], bit);
bpointer += bit;
}
in -= (j - size) * bit / 32;
out.resize(size);
return in;
}
template <class STLContainer>
static uint32_t *packmeupwithoutmasksimd(STLContainer &source, uint32_t *out,
const uint32_t bit) {
const uint32_t size = static_cast<uint32_t>(source.size());
*out = size;
out++;
if (source.size() == 0)
return out;
source.resize((source.size() + 32 - 1) / 32 * 32);
uint32_t j = 0;
for (; j + 128 <= size; j += 128) {
usimdpackwithoutmask(&source[j], reinterpret_cast<__m128i *>(out), bit);
out += 4 * bit;
}
for (; j < size; j += 32) {
fastpackwithoutmask(&source[j], out, bit);
out += bit;
}
out -= (j - size) * bit / 32;
source.resize(size);
return out;
}
void resetBuffer() {
for (size_t i = 0; i < datatobepacked.size(); ++i) {
std::vector<uint32_t, cacheallocator>().swap(datatobepacked[i]);
}
}
const uint32_t PageSize;
const uint32_t bitsPageSize;
std::vector<std::vector<uint32_t, cacheallocator>> datatobepacked;
std::vector<uint8_t> bytescontainer;
#ifndef NDEBUG
const uint32_t *decodeArray(const uint32_t *in, const size_t length,
#else
const uint32_t *decodeArray(const uint32_t *in, const size_t,
#endif
uint32_t *out, size_t &nvalue) override {
#ifndef NDEBUG
const uint32_t *const initin(in);
#endif
const size_t mynvalue = *in;
++in;
if (mynvalue > nvalue)
throw NotEnoughStorage(mynvalue);
nvalue = mynvalue;
const uint32_t *const finalout(out + nvalue);
while (out != finalout) {
size_t thisnvalue(0);
size_t thissize = static_cast<size_t>(
finalout > PageSize + out ? PageSize : (finalout - out));
__decodeArray(in, thisnvalue, out, thissize);
in += thisnvalue;
out += thissize;
}
assert(initin + length >= in);
resetBuffer(); return in;
}
void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) override {
checkifdivisibleby(length, BlockSize);
#ifndef NDEBUG
const uint32_t *const initout(out);
#endif
const uint32_t *const finalin(in + length);
*out++ = static_cast<uint32_t>(length);
const size_t oldnvalue = nvalue;
nvalue = 1;
while (in != finalin) {
size_t thissize = static_cast<size_t>(
finalin > PageSize + in ? PageSize : (finalin - in));
size_t thisnvalue(0);
__encodeArray(in, thissize, out, thisnvalue);
nvalue += thisnvalue;
out += thisnvalue;
in += thissize;
}
assert(out == nvalue + initout);
if (oldnvalue < nvalue)
fprintf(stderr,
"It is possible we have a buffer overrun. You reported having allocated "
"%zu bytes for the compressed data but we needed "
"%zu bytes. Please increase the available memory "
"for compressed data or check the value of the last parameter provided "
"to the encodeArray method.\n",
oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t));
resetBuffer(); }
void getBestBFromData(const uint32_t *in, uint8_t &bestb,
uint8_t &bestcexcept, uint8_t &maxb) {
uint32_t freqs[33];
for (uint32_t k = 0; k <= 32; ++k)
freqs[k] = 0;
for (uint32_t k = 0; k < BlockSize; ++k) {
freqs[asmbits(in[k])]++;
}
bestb = 32;
while (freqs[bestb] == 0)
bestb--;
maxb = bestb;
uint32_t bestcost = bestb * BlockSize;
uint32_t cexcept = 0;
bestcexcept = static_cast<uint8_t>(cexcept);
for (uint32_t b = bestb - 1; b < 32; --b) {
cexcept += freqs[b + 1];
uint32_t thiscost = cexcept * overheadofeachexcept +
cexcept * (maxb - b) + b * BlockSize +
8; if (thiscost < bestcost) {
bestcost = thiscost;
bestb = static_cast<uint8_t>(b);
bestcexcept = static_cast<uint8_t>(cexcept);
}
}
}
void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) {
uint32_t *const initout = out; checkifdivisibleby(length, BlockSize);
uint32_t *const headerout = out++; for (uint32_t k = 0; k < 32 + 1; ++k)
datatobepacked[k].clear();
uint8_t *bc = &bytescontainer[0];
for (const uint32_t *const final = in + length; (in + BlockSize <= final);
in += BlockSize) {
uint8_t bestb, bestcexcept, maxb;
getBestBFromData(in, bestb, bestcexcept, maxb);
*bc++ = bestb;
*bc++ = bestcexcept;
if (bestcexcept > 0) {
*bc++ = maxb;
std::vector<uint32_t, cacheallocator> &thisexceptioncontainer =
datatobepacked[maxb - bestb];
const uint32_t maxval = 1U << bestb;
for (uint32_t k = 0; k < BlockSize; ++k) {
if (in[k] >= maxval) {
thisexceptioncontainer.push_back(in[k] >> bestb);
*bc++ = static_cast<uint8_t>(k);
}
}
}
out = packblockupsimd(in, out, bestb);
}
headerout[0] = static_cast<uint32_t>(out - headerout);
const uint32_t bytescontainersize =
static_cast<uint32_t>(bc - &bytescontainer[0]);
*(out++) = bytescontainersize;
memcpy(out, &bytescontainer[0], bytescontainersize);
uint8_t* pad8 = (uint8_t*)out + bytescontainersize;
out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
while (pad8 < (uint8_t*)out)
*pad8++ = 0;
uint32_t bitmap = 0;
for (uint32_t k = 2; k <= 32; ++k) {
if (datatobepacked[k].size() != 0)
bitmap |= (1U << (k - 1));
}
*(out++) = bitmap;
for (uint32_t k = 2; k <= 32; ++k) {
if (datatobepacked[k].size() > 0)
out = packmeupwithoutmasksimd(datatobepacked[k], out, k);
}
nvalue = out - initout;
}
void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out,
const size_t nvalue) {
const uint32_t *const initin = in;
const uint32_t *const headerin = in++;
const uint32_t wheremeta = headerin[0];
const uint32_t *inexcept = headerin + wheremeta;
const uint32_t bytesize = *inexcept++;
const uint8_t *bytep = reinterpret_cast<const uint8_t *>(inexcept);
inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
const uint32_t bitmap = *(inexcept++);
for (uint32_t k = 2; k <= 32; ++k) {
if ((bitmap & (1U << (k - 1))) != 0) {
inexcept = unpackmesimd(inexcept, datatobepacked[k], k);
}
}
length = inexcept - initin;
std::vector<uint32_t, cacheallocator>::const_iterator
unpackpointers[32 + 1];
for (uint32_t k = 1; k <= 32; ++k) {
unpackpointers[k] = datatobepacked[k].begin();
}
for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out += BlockSize) {
const uint8_t b = *bytep++;
const uint8_t cexcept = *bytep++;
in = unpackblocksimd(in, out, b);
if (cexcept > 0) {
const uint8_t maxbits = *bytep++;
if (maxbits - b == 1) {
for (uint32_t k = 0; k < cexcept; ++k) {
const uint8_t pos = *(bytep++);
out[pos] |= static_cast<uint32_t>(1) << b;
}
} else {
std::vector<uint32_t, cacheallocator>::const_iterator &exceptionsptr =
unpackpointers[maxbits - b];
for (uint32_t k = 0; k < cexcept; ++k) {
const uint8_t pos = *(bytep++);
out[pos] |= (*(exceptionsptr++)) << b;
}
}
}
}
assert(in == headerin + wheremeta);
}
std::string name() const override {
return std::string("SIMDFastPFor") + std::to_string(BlockSize);
}
};
template <class EXCEPTIONCODER = Simple8b<true>>
class SIMDSimplePFor : public IntegerCODEC {
public:
using IntegerCODEC::encodeArray;
using IntegerCODEC::decodeArray;
EXCEPTIONCODER ecoder;
SIMDSimplePFor(uint32_t ps = 65536)
: ecoder(), PageSize(ps), bitsPageSize(gccbits(PageSize)),
datatobepacked(PageSize),
bytescontainer(PageSize + 3 * PageSize / BlockSize) {
assert(ps / BlockSize * BlockSize == ps);
assert(gccbits(static_cast<uint32_t>(BlockSizeInUnitsOfPackSize * PACKSIZE - 1)) <= 8);
}
enum {
BlockSizeInUnitsOfPackSize = 4,
PACKSIZE = 32,
overheadofeachexcept = 8,
overheadduetobits = 8,
overheadduetonmbrexcept = 8,
BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE
};
const uint32_t PageSize;
const uint32_t bitsPageSize;
std::vector<uint32_t> datatobepacked;
std::vector<uint8_t> bytescontainer;
const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) override {
const uint32_t *const initin(in);
const size_t mynvalue = *in;
++in;
if (mynvalue > nvalue)
throw NotEnoughStorage(mynvalue);
nvalue = mynvalue;
const uint32_t *const finalout(out + nvalue);
while (out != finalout) {
size_t thisnvalue = length - (in - initin);
size_t thissize = static_cast<size_t>(
finalout > PageSize + out ? PageSize : (finalout - out));
__decodeArray(in, thisnvalue, out, thissize);
in += thisnvalue;
out += thissize;
}
assert(initin + length >= in);
return in;
}
void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) override {
checkifdivisibleby(length, BlockSize);
const uint32_t *const initout(out);
const uint32_t *const finalin(in + length);
*out++ = static_cast<uint32_t>(length);
const size_t oldnvalue = nvalue;
nvalue = 1;
while (in != finalin) {
size_t thissize = static_cast<size_t>(
finalin > PageSize + in ? PageSize : (finalin - in));
size_t thisnvalue = oldnvalue - (out - initout);
__encodeArray(in, thissize, out, thisnvalue);
nvalue += thisnvalue;
out += thisnvalue;
in += thissize;
}
assert(out == nvalue + initout);
if (oldnvalue < nvalue)
fprintf(stderr,
"It is possible we have a buffer overrun. You reported having allocated "
"%zu bytes for the compressed data but we needed "
"%zu bytes. Please increase the available memory "
"for compressed data or check the value of the last parameter provided "
"to the encodeArray method.\n",
oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t));
}
void getBestBFromData(const uint32_t *in, uint8_t &bestb,
uint8_t &bestcexcept, uint8_t &maxb) {
uint32_t freqs[33];
for (uint32_t k = 0; k <= 32; ++k)
freqs[k] = 0;
for (uint32_t k = 0; k < BlockSize; ++k) {
freqs[asmbits(in[k])]++;
}
bestb = 32;
while (freqs[bestb] == 0)
bestb--;
maxb = bestb;
uint32_t bestcost = bestb * BlockSize;
uint32_t cexcept = 0;
bestcexcept = static_cast<uint8_t>(cexcept);
for (uint32_t b = bestb - 1; b < 32; --b) {
cexcept += freqs[b + 1];
uint32_t thiscost =
cexcept * overheadofeachexcept + cexcept * (maxb - b) + b * BlockSize;
if (thiscost < bestcost) {
bestcost = thiscost;
bestb = static_cast<uint8_t>(b);
bestcexcept = static_cast<uint8_t>(cexcept);
}
}
}
void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) {
uint32_t *const initout = out; checkifdivisibleby(length, BlockSize);
uint32_t *const headerout = out++; datatobepacked.clear();
uint8_t *bc = &bytescontainer[0];
for (const uint32_t *const final = in + length; (in + BlockSize <= final);
in += BlockSize) {
uint8_t bestb, bestcexcept, maxb;
getBestBFromData(in, bestb, bestcexcept, maxb);
*bc++ = bestb;
*bc++ = bestcexcept;
if (bestcexcept > 0) {
assert(bestb < 32);
const uint32_t maxval = 1U << bestb;
for (uint32_t k = 0; k < BlockSize; ++k) {
if (in[k] >= maxval) {
datatobepacked.push_back(in[k] >> bestb);
*bc++ = static_cast<uint8_t>(k);
}
}
}
assert(BlockSize == 128);
usimdpack(in, reinterpret_cast<__m128i *>(out), bestb);
out += BlockSizeInUnitsOfPackSize * bestb;
}
headerout[0] = static_cast<uint32_t>(out - headerout);
const uint32_t bytescontainersize =
static_cast<uint32_t>(bc - &bytescontainer[0]);
*(out++) = bytescontainersize;
memcpy(out, &bytescontainer[0], bytescontainersize);
uint8_t* pad8 = (uint8_t*)out + bytescontainersize;
out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
while (pad8 < (uint8_t*)out)
*pad8++ = 0;
size_t outcap = 0;
ecoder.encodeArray(datatobepacked.data(), datatobepacked.size(), out,
outcap);
out += outcap;
nvalue = out - initout;
}
void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out,
const size_t nvalue) {
const uint32_t *const initin = in;
const uint32_t *const headerin = in++;
const uint32_t wheremeta = headerin[0];
const uint32_t *inexcept = headerin + wheremeta;
const uint32_t bytesize = *inexcept++;
const uint8_t *bytep = reinterpret_cast<const uint8_t *>(inexcept);
inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t);
datatobepacked.resize(datatobepacked.capacity());
size_t cap = datatobepacked.size();
size_t le = initin + length - inexcept;
inexcept = ecoder.decodeArray(inexcept, le, &datatobepacked[0], cap);
length = inexcept - initin;
auto exceptionsptr = datatobepacked.begin();
for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out += BlockSize) {
const uint8_t b = *bytep++;
const uint8_t cexcept = *bytep++;
assert(BlockSize == 128);
usimdunpack(reinterpret_cast<const __m128i *>(in), out, b);
in += BlockSizeInUnitsOfPackSize * b;
for (uint32_t k = 0; k < cexcept; ++k) {
const uint8_t pos = *(bytep++);
out[pos] |= (*(exceptionsptr++)) << b;
}
}
assert(in == headerin + wheremeta);
}
std::string name() const override { return "SIMDSimplePFor"; }
};
}
#endif