#ifndef SIMDPFOR_H_
#define SIMDPFOR_H_
#include "common.h"
#include "codecs.h"
#include "usimdbitpacking.h"
#include "util.h"
namespace FastPFor {
class SIMDPFor: public IntegerCODEC {
public:
enum {
BlockSizeInUnitsOfPackSize = 4,
PACKSIZE = 32,
BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE,
blocksizeinbits = 7 };
std::vector<uint32_t> codedcopy;
std::vector<uint32_t> miss;
typedef uint32_t DATATYPE;
SIMDPFor() :
codedcopy(BlockSize), miss(BlockSize) {
}
static uint32_t determineBestBase(
const DATATYPE * in, size_t size) {
if (size == 0) return 0;
const size_t defaultsamplesize = 64 * 1024;
size_t samplesize = size > defaultsamplesize ? defaultsamplesize : size;
uint32_t freqs[33];
for (uint32_t k = 0; k <= 32; ++k)
freqs[k] = 0;
uint32_t rstart = size > samplesize ? ( rand() % (static_cast<uint32_t>(size - samplesize)) ) : 0U;
for (uint32_t k = rstart; k < rstart + samplesize; ++k) {
freqs[asmbits(in[k])]++;
}
uint32_t bestb = 32;
uint32_t numberofexceptions = 0;
double Erate = 0;
double bestcost = 32;
for (uint32_t b = bestb - 1; b < 32; --b) {
numberofexceptions += freqs[b + 1];
Erate = static_cast<double>(numberofexceptions) / static_cast<double>(samplesize);
if(numberofexceptions > 0) {
double altErate = (Erate * 128 - 1) / (Erate * (1U<<b));
if(altErate > Erate) Erate = altErate;
}
const double thiscost = b + Erate * 32;
if (thiscost <= bestcost) {
bestcost = thiscost;
bestb = b;
}
}
return bestb;
}
uint32_t compressblockPFOR(const DATATYPE * __restrict__ in,
uint32_t * __restrict__ outputbegin, const uint32_t b,
DATATYPE * __restrict__ & exceptions) {
if (b == 32) {
for (size_t k = 0; k < BlockSize; ++k)
*(outputbegin++) = *(in++) ;
return BlockSize;
}
size_t exceptcounter = 0;
const uint32_t maxgap = 1U << b;
{
std::vector<uint32_t>::iterator cci = codedcopy.begin();
for (uint32_t k = 0; k < BlockSize; ++k, ++cci) {
miss[exceptcounter] = k;
exceptcounter += (in[k] >= maxgap);
}
}
if (exceptcounter == 0) {
packblock(in, outputbegin, b);
return BlockSize;
}
codedcopy.assign(in,in+BlockSize);
uint32_t firstexcept = miss[0];
uint32_t prev = 0;
*(exceptions++) = codedcopy[firstexcept];
prev = firstexcept;
if (maxgap < BlockSize) {
for (uint32_t i = 1; i < exceptcounter; ++i) {
uint32_t cur = miss[i];
while (cur > maxgap + prev) {
uint32_t compulcur = prev + maxgap;
*(exceptions++) = codedcopy[compulcur];
codedcopy[prev] = maxgap - 1;
prev = compulcur;
}
*(exceptions++) = codedcopy[cur];
codedcopy[prev] = cur - prev - 1;
prev = cur;
}
} else {
for (uint32_t i = 1; i < exceptcounter; ++i) {
uint32_t cur = miss[i];
*(exceptions++) = codedcopy[cur];
codedcopy[prev] = cur - prev - 1;
prev = cur;
}
}
packblock(&codedcopy[0], outputbegin, b);
return firstexcept;
}
void packblock(const uint32_t * source, uint32_t * out, const uint32_t bit) {
usimdpack(source,reinterpret_cast<__m128i *>(out), bit);
}
void unpackblock(const uint32_t * source, uint32_t * out,
const uint32_t bit) {
usimdunpack(reinterpret_cast<const __m128i *>(source),out, bit);
}
void encodeArray(const uint32_t *in, const size_t len, uint32_t *out,
size_t &nvalue) {
*out++ = static_cast<uint32_t>(len);
#ifndef NDEBUG
const uint32_t * const finalin(in + len);
#endif
const uint32_t maxsize = (1U << (32 - blocksizeinbits - 1));
size_t totalnvalue(1);
for (size_t j = 0; j < (len+maxsize-1U)/maxsize; ++j) {
size_t i = j << (32 - blocksizeinbits - 1);
size_t l = maxsize;
if (i + maxsize > len) {
l = len - i;
assert(l <= maxsize);
}
size_t thisnvalue = nvalue - totalnvalue;
assert(in + i + l <= finalin);
__encodeArray(&in[i], l, out, thisnvalue);
totalnvalue += thisnvalue;
assert(totalnvalue <= nvalue);
out += thisnvalue;
}
nvalue = totalnvalue;
}
const uint32_t * decodeArray(const uint32_t *in, const size_t len,
uint32_t *out, size_t &nvalue) {
nvalue = *in++;
if (nvalue == 0)
return in;
#ifndef NDEBUG
const uint32_t * const initin = in;
#endif
const uint32_t * const finalin = in + len;
size_t totalnvalue(0);
while (totalnvalue < nvalue) {
size_t thisnvalue = nvalue - totalnvalue;
#ifndef NDEBUG
const uint32_t * const befin(in);
#endif
assert(finalin <= len + in);
in = __decodeArray(in, finalin - in, out, thisnvalue);
assert(in > befin);
assert(in <= finalin);
out += thisnvalue;
totalnvalue += thisnvalue;
assert(totalnvalue <= nvalue);
}
assert(in <= len + initin);
assert(in <= finalin);
nvalue = totalnvalue;
return in;
}
void __encodeArray(const uint32_t *in, const size_t len, uint32_t *out,
size_t &nvalue) {
checkifdivisibleby(len, BlockSize);
const uint32_t * const initout(out);
std::vector<DATATYPE> exceptions;
exceptions.resize(len);
DATATYPE * __restrict__ i = &exceptions[0];
const uint32_t b = determineBestBase(in,len);
*out++ = static_cast<uint32_t>(len);
*out++ = b;
for (size_t k = 0; k < len / BlockSize; ++k) {
uint32_t * const headerout(out);
++out;
uint32_t firstexcept = compressblockPFOR(in, out, b, i);
out += (BlockSize * b) / 32;
in += BlockSize;
const uint32_t bitsforfirstexcept = blocksizeinbits;
const uint32_t firstexceptmask = (1U << blocksizeinbits) - 1;
const uint32_t exceptindex = static_cast<uint32_t>(i - &exceptions[0]);
*headerout = (firstexcept & firstexceptmask) | (exceptindex
<< bitsforfirstexcept);
}
const size_t howmanyexcept = i - &exceptions[0];
for (uint32_t t = 0; t < howmanyexcept; ++t)
*out++ = exceptions[t];
nvalue = out - initout;
}
#ifndef NDEBUG
const uint32_t * __decodeArray(const uint32_t *in, const size_t len,
#else
const uint32_t * __decodeArray(const uint32_t *in, const size_t ,
#endif
uint32_t *out, size_t &nvalue) {
#ifndef NDEBUG
const uint32_t * const initin(in);
#endif
nvalue = *in++;
checkifdivisibleby(nvalue, BlockSize);
const uint32_t b = *in++;
const DATATYPE * __restrict__ except = in + nvalue * b / 32 + nvalue
/ BlockSize;
const uint32_t bitsforfirstexcept = blocksizeinbits;
const uint32_t firstexceptmask = (1U << blocksizeinbits) - 1;
const DATATYPE * endexceptpointer = except;
const DATATYPE * const initexcept(except);
for (size_t k = 0; k < nvalue / BlockSize; ++k) {
const uint32_t * const headerin(in);
++in;
const uint32_t firstexcept = *headerin & firstexceptmask;
const uint32_t exceptindex = *headerin >> bitsforfirstexcept;
endexceptpointer = initexcept + exceptindex;
uncompressblockPFOR(in, out, b, except, endexceptpointer,
firstexcept);
in += (BlockSize * b) / 32;
out += BlockSize;
}
assert(initin + len >= in);
assert(initin + len >= endexceptpointer);
return endexceptpointer;
}
void uncompressblockPFOR(
const uint32_t * __restrict__ inputbegin, DATATYPE * __restrict__ outputbegin, const uint32_t b,
const DATATYPE *__restrict__ & i, const DATATYPE * __restrict__ end_exception, size_t next_exception ) {
unpackblock(inputbegin, reinterpret_cast<uint32_t *> (outputbegin), b);
for (size_t cur = next_exception; i != end_exception; cur
= next_exception) {
next_exception = cur + static_cast<size_t> (outputbegin[cur]) + 1;
outputbegin[cur] = *(i++);
}
}
virtual std::string name() const {
std::ostringstream convert;
convert << "SIMDPFor";
return convert.str();
}
};
}
#endif