1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
/*!  # Fast Bitpacking algorithms

This crate is a **Rust port of [Daniel Lemire's simdcomp C library](https://github.com/lemire/simdcomp)**.
It contains different flavor of integers compression via bitpacking :  `BitPacker1x`, `BitPacker4x`, and `BitPacker8x`.

Each produces different formats, and are incompatible one with another,
and requires integers to be encoded in block of different size..

`BitPacker4x` and `BitPacker8x` are designed specifically to leverage `SSE3`
and `AVX2` instructions respectively.

The library will fallback to a scalar implementation if these instruction
sets are not available. For instance :
- because your compilation target architecture is not `x86_64`
- because the CPU you use is from an older generation

I recommend using `BitPacker4x` if you are in doubt.

See the [`BitPacker` trait](./trait.BitPacker.html) for example usage.

*/

#![allow(unused_unsafe)]
#![warn(missing_docs)]

use std::marker::Sized;

#[cfg(test)]
#[macro_use]
pub(crate) mod tests;

#[macro_use]
mod macros;
#[macro_use]
mod macros_simple;

trait Available {
    fn available() -> bool;
}

trait UnsafeBitPacker {
    const BLOCK_LEN: usize;
    unsafe fn compress(decompressed: &[u32], compressed: &mut [u8], num_bits: u8) -> usize;
    unsafe fn compress_sorted(
        initial: u32,
        decompressed: &[u32],
        compressed: &mut [u8],
        num_bits: u8,
    ) -> usize;
    unsafe fn compress_strictly_sorted(
        initial: Option<u32>,
        decompressed: &[u32],
        compressed: &mut [u8],
        num_bits: u8,
    ) -> usize;
    unsafe fn decompress(compressed: &[u8], decompressed: &mut [u32], num_bits: u8) -> usize;
    unsafe fn decompress_sorted(
        initial: u32,
        compressed: &[u8],
        decompressed: &mut [u32],
        num_bits: u8,
    ) -> usize;
    unsafe fn decompress_strictly_sorted(
        initial: Option<u32>,
        compressed: &[u8],
        decompressed: &mut [u32],
        num_bits: u8,
    ) -> usize;
    unsafe fn num_bits(decompressed: &[u32]) -> u8;
    unsafe fn num_bits_sorted(initial: u32, decompressed: &[u32]) -> u8;
    unsafe fn num_bits_strictly_sorted(initial: Option<u32>, decompressed: &[u32]) -> u8;
}

/// # Examples without delta-encoding
/// ```
/// extern crate bitpacking;
///
/// use bitpacking::{BitPacker4x, BitPacker};
///
/// # fn main() {
/// # let my_data: Vec<u32> = vec![7, 7, 7, 7, 11, 10, 15, 13, 6, 5, 3, 14, 5, 7,
/// #    15, 12, 1, 10, 8, 10, 12, 14, 13, 1, 10, 1, 1, 10, 4, 15, 12,
/// #    1, 2, 0, 8, 5, 14, 5, 2, 4, 1, 6, 14, 13, 5, 10, 10, 1, 6, 4,
/// #    1, 12, 1, 1, 5, 15, 15, 2, 8, 6, 4, 3, 10, 8, 8, 9, 2, 6, 10,
/// #    5, 7, 9, 0, 13, 15, 5, 13, 10, 0, 2, 10, 14, 5, 9, 12, 8, 5, 10,
/// #    8, 8, 10, 5, 13, 8, 11, 14, 7, 14, 4, 2, 9, 12, 14, 5, 15, 12, 0,
/// #    12, 13, 3, 13, 5, 4, 15, 9, 8, 9, 3, 3, 3, 1, 12, 0, 6, 11, 11, 12, 4];
///
/// let bitpacker = BitPacker4x::new();
///
/// let num_bits: u8 = bitpacker.num_bits(&my_data);
///
/// // A block will be take at most 4 bytes per-integers.
/// let mut compressed = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
///
/// # assert_eq!(num_bits, 4);
/// let compressed_len = bitpacker.compress(&my_data, &mut compressed[..], num_bits);
///
/// assert_eq!((num_bits as usize) *  BitPacker4x::BLOCK_LEN / 8, compressed_len);
///
/// // Decompressing
/// let mut decompressed = vec![0u32; BitPacker4x::BLOCK_LEN];
/// bitpacker.decompress(&compressed[..compressed_len], &mut decompressed[..], num_bits);
///
/// assert_eq!(&my_data, &decompressed);
/// # }
/// ```
///
///
///
/// # Examples with delta-encoding
///
/// Delta-encoding makes it possible to store sorted integers in an efficient manner.
/// Rather than encoding the integers directly, the interval (or deltas) between each of them
/// are computed and then encoded.
///
/// Decoding then requires to first decode the deltas and then operate a cumulative sum (also called
/// integration or prefix sum) on them.
///
/// ```
/// extern crate bitpacking;
///
/// use bitpacking::{BitPacker4x, BitPacker};
///
/// # fn main() {
/// # let my_data: Vec<u32> = vec![0, 5, 6, 8, 12, 21, 30, 38,
/// # 46, 52, 59, 61, 62, 62, 71, 71, 73, 74, 76,
/// # 77, 80, 87, 96, 99, 105, 114, 119, 121, 128,
/// # 133, 138, 145, 152, 161, 161, 166, 175, 176,
/// # 180, 186, 189, 193, 202, 211, 220, 224, 229,
/// # 238, 247, 255, 261, 267, 267, 268, 269, 269,
/// # 270, 271, 279, 283, 285, 291, 297, 303, 305,
/// # 309, 310, 315, 316, 316, 321, 324, 329, 337,
/// # 339, 342, 350, 355, 364, 373, 382, 386, 392,
/// # 400, 408, 414, 423, 431, 433, 436, 441, 444,
/// # 445, 454, 463, 463, 465, 472, 474, 477, 480,
/// # 488, 493, 496, 501, 503, 509, 515, 519, 526,
/// # 526, 532, 539, 542, 542, 542, 549, 557, 565,
/// # 566, 573, 578, 580, 581, 585, 588, 588, 591];
///
///
/// // The initial value is used to compute the first delta.
/// // In most use cases, you will be compressing long increasing
/// // integer sequences.
/// //
/// // You should probably pass an initial value of `0u32` to the
/// // first block if you do not have any information.
/// //
/// // When encoding the second block however, you will want to pass the last
/// // value of the first block.
/// let initial_value = 0u32;
///
/// let bitpacker = BitPacker4x::new();
///
/// let num_bits: u8 = bitpacker.num_bits_sorted(initial_value, &my_data);
///
/// // A block will be take at most 4 bytes per-integers.
/// let mut compressed = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
///
/// # assert_eq!(num_bits, 4);
///
/// let compressed_len = bitpacker.compress_sorted(initial_value, &my_data, &mut compressed[..], num_bits);
///
/// assert_eq!((num_bits as usize) *  BitPacker4x::BLOCK_LEN / 8, compressed_len);
///
/// // Decompressing
/// let mut decompressed = vec![0u32; BitPacker4x::BLOCK_LEN];
///
/// // The initial value must be the same as the one passed
/// // when compressing the block.
/// bitpacker.decompress_sorted(initial_value, &compressed[..compressed_len], &mut decompressed[..], num_bits);
///
/// assert_eq!(&my_data, &decompressed);
/// # }
pub trait BitPacker: Sized + Clone + Copy {
    /// Number of `u32` per compressed block
    const BLOCK_LEN: usize;

    /// Checks the available instructions set on the current
    /// CPU and returns the best available implementation.
    ///
    /// Calling `.new()` is extremely cheap, and does not
    /// require any heap allocation. It is *not* required to cache
    /// its result too aggressively.
    fn new() -> Self;

    /// Compress a block of `u32`.
    ///
    /// Assumes that the integers are all lower than `2^num_bits`.
    /// The result is undefined if they are larger.
    ///
    /// Returns the amount of bytes of the compressed block.
    ///
    /// # Panics
    ///
    /// - Panics if the compressed destination array is too small
    /// - Panics if `decompressed` length is not exactly the `BLOCK_LEN`.
    fn compress(&self, decompressed: &[u32], compressed: &mut [u8], num_bits: u8) -> usize;

    /// Delta encode and compressed the `decompressed` array.
    ///
    /// Assumes that the elements in the `decompressed` array are sorted.
    /// `initial` will be used to compute the first `delta`.
    ///
    /// # Panics
    ///
    /// - Panics if `initial` is greater than `decompressed[0]`
    /// - Panics if `decompressed` is not sorted
    /// - Panics if `decompressed`'s length is not exactly `BLOCK_LEN`
    /// - Panics if `compressed` is not large enough to receive the compressed data
    /// - Panics if the compressed destination array is too small.
    ///
    /// Returns the amount of bytes of the compressed block.
    ///
    /// # Panics
    ///
    /// - Panics if the compressed array is too short.
    /// - Panics if the decompressed array is not exactly the `BLOCK_LEN`.
    fn compress_sorted(
        &self,
        initial: u32,
        decompressed: &[u32],
        compressed: &mut [u8],
        num_bits: u8,
    ) -> usize;

    /// Delta encode and compress the `decompressed` array.
    ///
    /// Assumes that the elements in the `decompressed` array are strictly
    /// monotonous, that is, each element is strictly greater than the previous.
    ///
    /// This codec can be more efficient that the simply sorted compressor by up to one bit per
    /// integer. This has an important impact on saturated or nearly saturated datasets (almost
    /// every number appears in sequence), but isn't very different from the sorted compressor
    /// on more sparse datasets.
    ///
    /// # Panics
    ///
    /// - Panics if `initial` is greater or equal to `decompressed[0]`
    /// - Panics if `decompressed` isn't strictly monotonic
    /// - Panics if `decompressed`'s length is not exactly `BLOCK_LEN`
    /// - Panics if `compressed` is not large enough to receive the compressed data
    ///
    /// Returns the amount of bytes in the compressed block.
    fn compress_strictly_sorted(
        &self,
        initial: Option<u32>,
        decompressed: &[u32],
        compressed: &mut [u8],
        num_bits: u8,
    ) -> usize;

    /// Decompress the `compress` array to the `decompressed` array.
    ///
    /// Returns the amount of bytes that were consumed.
    ///
    /// # Panics
    ///
    /// Panics if the compressed array is too short, or the decompressed array is too short.
    fn decompress(&self, compressed: &[u8], decompressed: &mut [u32], num_bits: u8) -> usize;

    /// Decompress the`compress`array to the `decompressed` array.
    /// The `compressed` array is assumed to have been delta-encoded and compressed.
    ///
    /// `initial` must be the value that was passed as the `initial` argument compressing
    /// the block.
    ///
    /// Returns the amount of bytes that have been read.
    ///
    /// # Panics
    ///
    /// - Panics if the compressed array is too short to contain `BLOCK_LEN` elements
    /// - Panics if the decompressed array is too short.
    fn decompress_sorted(
        &self,
        initial: u32,
        compressed: &[u8],
        decompressed: &mut [u32],
        num_bits: u8,
    ) -> usize;

    /// Decompress the`compress`array to the `decompressed` array.
    /// The `compressed` array is assumed to have been strict-delta-encoded and compressed.
    ///
    /// `initial` must be the value that was passed as the `initial` argument compressing
    /// the block.
    ///
    /// Returns the amount of bytes that have been read.
    ///
    /// # Panics
    ///
    /// - Panics if the compressed array is too short to contain `BLOCK_LEN` elements
    /// - Panics if the decompressed array is too short.

    fn decompress_strictly_sorted(
        &self,
        initial: Option<u32>,
        compressed: &[u8],
        decompressed: &mut [u32],
        num_bits: u8,
    ) -> usize;

    /// Returns the minimum number of bits used to represent the largest integer in the
    /// `decompressed` block.
    ///
    /// # Panics
    ///
    /// Panics if `decompressed`'s len is not exactly `BLOCK_LEN`.
    fn num_bits(&self, decompressed: &[u32]) -> u8;

    /// Returns the minimum number of bits used to represent the largest `delta` in the deltas in the
    /// `decompressed` block.
    ///
    /// # Panics
    ///
    /// Panics if `decompressed`'s len is not exactly `BLOCK_LEN`.
    fn num_bits_sorted(&self, initial: u32, decompressed: &[u32]) -> u8;

    /// Returns the minimum number of bits used to represent the largest `delta-1` in the deltas in the
    /// `decompressed` block.
    ///
    /// # Panics
    ///
    /// Panics if `decompressed`'s len is not exactly `BLOCK_LEN`.
    fn num_bits_strictly_sorted(&self, initial: Option<u32>, decompressed: &[u32]) -> u8;

    /// Returns the size of a compressed block.
    fn compressed_block_size(num_bits: u8) -> usize {
        Self::BLOCK_LEN * (num_bits as usize) / 8
    }
}

/// Returns the most significant bit.&self,
fn most_significant_bit(v: u32) -> u8 {
    if v == 0 {
        0
    } else {
        32u8 - (v.leading_zeros() as u8)
    }
}

#[cfg(all(feature = "bitpacker1x", any(test, not(debug_assertions))))]
mod bitpacker1x;
#[cfg(all(feature = "bitpacker4x", any(test, not(debug_assertions))))]
mod bitpacker4x;

#[cfg(all(feature = "bitpacker1x", debug_assertions))]
mod bitpacker1x_simple;
#[cfg(all(feature = "bitpacker4x", debug_assertions))]
mod bitpacker4x_simple;

#[cfg(feature = "bitpacker8x")]
mod bitpacker8x;

#[cfg(all(feature = "bitpacker1x", not(debug_assertions)))]
pub use bitpacker1x::BitPacker1x;
#[cfg(all(feature = "bitpacker1x", debug_assertions))]
pub use bitpacker1x_simple::BitPacker1x;

#[cfg(all(feature = "bitpacker4x", not(debug_assertions)))]
pub use bitpacker4x::BitPacker4x;
#[cfg(all(feature = "bitpacker4x", debug_assertions))]
pub use bitpacker4x_simple::BitPacker4x;

#[cfg(feature = "bitpacker8x")]
pub use bitpacker8x::BitPacker8x;

#[cfg(test)]
mod tests_unit {
    use super::*;

    #[test]
    #[should_panic(expected = "`decompressed`'s len is not `BLOCK_LEN=128`")]
    fn test_num_bits_block_too_long() {
        let bit_packer = BitPacker4x::new();
        let v = vec![0u32; BitPacker4x::BLOCK_LEN + 1];
        bit_packer.num_bits(&v[..]);
    }

    #[test]
    #[should_panic(expected = "`decompressed`'s len is not `BLOCK_LEN=128`")]
    fn test_num_bits_block_too_short() {
        let bit_packer = BitPacker4x::new();
        let v = vec![0u32; BitPacker4x::BLOCK_LEN - 1];
        bit_packer.num_bits(&v[..]);
    }
}

#[cfg(test)]
mod functional_tests {
    use crate::{BitPacker, BitPacker4x};
    use proptest::prelude::*;

    proptest! {
        #[test]
        #[ignore]
        fn check_block(
            values in prop::collection::vec(u32::arbitrary(), BitPacker4x::BLOCK_LEN),
        ) {
            let bit_packer = BitPacker4x::new();
            let bit_width = bit_packer.num_bits(&*values);

            let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
            bit_packer.compress(&values, &mut block, bit_width);

            let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
            bit_packer.decompress(&block, &mut decoded_values, bit_width);

            prop_assert_eq!(values, decoded_values);
        }

        #[ignore]
        #[test]
        fn check_sorted_block(
            (values, init_value) in prop::collection::vec(u32::arbitrary(), BitPacker4x::BLOCK_LEN).prop_flat_map(|mut values| {
                values.sort();
                let min_value = values[0];
                (Just(values), (0..=min_value))
            }),
        ) {
            let bit_packer = BitPacker4x::new();
            let bit_width = bit_packer.num_bits_sorted(init_value, &*values);

            let mut block = vec![0u8; BitPacker4x::compressed_block_size(bit_width)];
            bit_packer.compress_sorted(init_value, &values, &mut block, bit_width);

            let mut decoded_values = vec![0x10101010; BitPacker4x::BLOCK_LEN];
            bit_packer.decompress_sorted(init_value, &block, &mut decoded_values, bit_width);

            prop_assert_eq!(values, decoded_values);
        }
    }
}