1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/**
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
* (c) Daniel Lemire, http://lemire.me/en/
*/
#ifndef CODECS_H_
#define CODECS_H_
#include "common.h"
#include "util.h"
#include "bitpackinghelpers.h"
namespace FastPForLib {
class NotEnoughStorage : public std::runtime_error {
public:
size_t required; // number of 32-bit symbols required
NotEnoughStorage(const size_t req)
: runtime_error(""), required(req) {}
};
class IntegerCODEC {
public:
/**
* You specify input and input length, as well as
* output and output length. nvalue gets modified to
* reflect how much was used. If the new value of
* nvalue is more than the original value, we can
* consider this a buffer overrun.
*
* You are responsible for allocating the memory (length
* for *in and nvalue for *out).
*/
virtual void encodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) = 0;
virtual void encodeArray(const uint64_t *, const size_t ,
uint32_t *, size_t &) {
throw std::logic_error("Not implemented!");
}
/**
* Usage is similar to decodeArray except that it returns a pointer
* incremented from in. In theory it should be in+length. If the
* returned pointer is less than in+length, then this generally means
* that the decompression is not finished (some scheme compress
* the bulk of the data one way, and they then they compress remaining
* integers using another scheme).
*
* As with encodeArray, you need to have length element allocated
* for *in and at least nvalue elements allocated for out. The value
* of the variable nvalue gets updated with the number actually use
* (if nvalue exceeds the original value, there might be a buffer
* overrun).
*
* NOTE: Decoding can be performed with an unknown input length. This
* case is indicated by a length of 0; however, nvalue must be provided
* in order for the decoder knows how many values to decode.
*/
virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) = 0;
virtual const uint32_t *decodeArray(const uint32_t *, const size_t ,
uint64_t *, size_t &) {
throw std::logic_error("Not implemented!");
}
virtual ~IntegerCODEC() {}
/**
* Will compress the content of a vector into
* another vector.
*
* This is offered for convenience. It might be slow.
*/
virtual std::vector<uint32_t> compress(const std::vector<uint32_t> &data) {
std::vector<uint32_t> compresseddata(data.size() * 2 +
1024); // allocate plenty of memory
size_t memavailable = compresseddata.size();
encodeArray(&data[0], data.size(), &compresseddata[0], memavailable);
compresseddata.resize(memavailable);
return compresseddata;
}
/**
* Will uncompress the content of a vector into
* another vector. Some CODECs know exactly how much data to uncompress,
* others need to uncompress it all to know how data there is to uncompress...
* So it useful to have a hint (expected_uncompressed_size) that tells how
* much data there will be to uncompress. Otherwise, the code will
* try to guess, but the result is uncertain and inefficient. You really
* ought to keep track of how many symbols you had compressed.
*
* For convenience. Might be slow.
*/
virtual std::vector<uint32_t>
uncompress(const std::vector<uint32_t> &compresseddata,
size_t expected_uncompressed_size = 0) {
std::vector<uint32_t> data(
expected_uncompressed_size); // allocate plenty of memory
size_t memavailable = data.size();
try {
decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
memavailable);
} catch (NotEnoughStorage &nes) {
data.resize(nes.required + 1024);
decodeArray(&compresseddata[0], compresseddata.size(), &data[0],
memavailable);
}
data.resize(memavailable);
return data;
}
virtual std::string name() const = 0;
};
/******************
* This just copies the data, no compression.
*/
class JustCopy : public IntegerCODEC {
public:
void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) {
memcpy(out, in, sizeof(uint32_t) * length);
nvalue = length;
}
// like encodeArray, but we don't actually copy
void fakeencodeArray(const uint32_t * /*in*/, const size_t length,
size_t &nvalue) {
nvalue = length;
}
const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) {
memcpy(out, in, sizeof(uint32_t) * length);
nvalue = length;
return in + length;
}
std::string name() const { return "JustCopy"; }
};
/********
* This uses a single bit width for the whole array.
* It has fast decompression and random access, but
* relatively poor compression. Included as an example.
*/
class PackedCODEC : public IntegerCODEC {
public:
enum { BlockSize = 32 };
void encodeArray(const uint32_t *in, const size_t length, uint32_t *out,
size_t &nvalue) {
checkifdivisibleby(length, 32);
const uint32_t b = maxbits(in, in + length);
out[0] = static_cast<uint32_t>(length);
out[1] = b;
out += 2;
for (uint32_t run = 0; run < length / 32; ++run, in += 32, out += b) {
fastpackwithoutmask(in, out, b);
}
nvalue = 2 + length * b / 32;
}
#ifndef NDEBUG
const uint32_t *decodeArray(const uint32_t *in, const size_t length,
#else
const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/,
#endif
uint32_t *out, size_t &nvalue) {
nvalue = in[0];
const uint32_t b = in[1];
assert(length >= nvalue * b / 32);
in += 2;
for (uint32_t run = 0; run < nvalue / 32; ++run, in += b, out += 32) {
fastunpack(in, out, b);
}
return in;
}
std::string name() const { return "PackedCODEC"; }
};
} // namespace FastPForLib
#endif /* CODECS_H_ */