group_varint_offset_encoding/
lib.rs

1mod decoder;
2use decoder::decode_block;
3mod util;
4use util::*;
5
6#[cfg(test)]
7mod tests {
8    use super::*;
9
10    #[test]
11    fn tst1() {
12        let data = compress([0u32, 0, 0]);
13        assert_eq!(data, [0, 0, 0, 0]);
14    }
15
16    #[test]
17    fn tst2() {
18        let data = compress([128, 0, 0]);
19        assert_eq!(data, [0, 128, 0, 0]);
20    }
21
22    #[test]
23    fn tst3() {
24        let data = compress([255, 255, 255]);
25        assert_eq!(data, [1 << 6, 0, 0, 0]);
26    }
27
28    #[test]
29    fn tst4() {
30        let data = compress([255 * 2, 255, 255]);
31        assert_eq!(data, [1 << 6, 255, 0, 0]);
32    }
33
34    #[test]
35    fn test_lots_of_numbers() {
36        // create lots of input data, data can be grouped by three
37        let mut data: Vec<u32> = (0..256).map(|i| i * i * i * i).collect();
38        data.extend(0..256);
39        data.extend((0..256).map(|i| i << 12));
40
41        let cmpr = compress(data.iter().cloned());
42        let result = decompress(&cmpr);
43
44        assert_eq!(
45            result, data,
46            "expect data to be the same before and after compression/decompression"
47        );
48    }
49}
50
51struct DataBlockIter<'a> {
52    data: &'a [u8],
53}
54
55impl<'a> DataBlockIter<'a> {
56    fn to_vec(self) -> Vec<u32> {
57        let mut v = Vec::new();
58
59        for [a, b, c] in self {
60            v.push(a);
61            v.push(b);
62            v.push(c);
63        }
64
65        v
66    }
67}
68
69fn get_offset(index: u8) -> u32 {
70    match index {
71        0b00 => 0,
72        0b01 => 0xff,
73        0b10 => 0xffff,
74        0b11 => 0xffffff,
75        _ => panic!("expect number ranging from 0..=3"),
76    }
77}
78
79impl<'a> Iterator for DataBlockIter<'a> {
80    type Item = [u32; 3];
81
82    fn next(&mut self) -> Option<Self::Item> {
83        if self.data.is_empty() {
84            return None;
85        }
86
87        let v = self.data[0];
88        let offset = get_offset(v >> 6);
89        let data = &self.data[1..];
90
91        let (mut a, mut b, mut c, bytes_consumed) = decode_block(v, data);
92
93        a += offset;
94        b += offset;
95        c += offset;
96
97        self.data = &data[bytes_consumed..];
98
99        Some([a, b, c])
100    }
101}
102
103pub fn decompress(data: &[u8]) -> Vec<u32> {
104    DataBlockIter { data }.to_vec()
105}
106
107pub fn compress(iter: impl IntoIterator<Item = u32>) -> Vec<u8> {
108    let mut buffer = Vec::new();
109    let iter = iter.into_iter();
110    for mut chunk in (Chunk { iter }) {
111        // We append zeros instead of storing the length extra.
112        while chunk.len() < 3 {
113            chunk.push(0);
114        }
115
116        compress_block(&mut buffer, to_block(chunk));
117    }
118
119    buffer.shrink_to_fit();
120
121    buffer
122}
123
124fn to_block(v: Vec<u32>) -> [u32; 3] {
125    if v.len() != 3 {
126        unreachable!("length of vector must be 3");
127    }
128
129    [v[0], v[1], v[2]]
130}
131
132/// Computes how many bytes can be stripped
133/// from each numbers, without them under flowing
134fn max_viable_offset(chunk: [u32; 3]) -> u8 {
135    for i in [3, 2, 1] {
136        let offset = get_offset(i);
137
138        let [a, b, c] = chunk.map(|elem| elem >= offset);
139
140        if a & b & c {
141            return i;
142        }
143    }
144
145    0
146}
147
148fn compress_block(buffer: &mut Vec<u8>, chunk: [u32; 3]) {
149    // first, apply offset to chunk
150    let offset_index = max_viable_offset(chunk);
151    let offset = get_offset(offset_index);
152
153    // subtract offset from number.
154    let chunk = chunk.map(|elem| elem - offset);
155
156    let mut mask = offset_index << 6; //bits0 | bits1 << 2 | bits2 << 4 | offset_index << 6;
157    let maskidx = buffer.len();
158    buffer.push(0);
159
160    // loop over every integer in the chunk
161    for i in 0..3u8 {
162        let elem = chunk[i as usize];
163
164        let bits = var_bits(elem);
165        mask |= bits << (i << 1);
166
167        // the first byte uses less instructions to encode.
168        buffer.push((elem & 0xff) as u8);
169        for byte_index in 1..=bits {
170            let byte_index = byte_index * 8;
171            let byte = (elem >> byte_index) & 0xff;
172            buffer.push(byte as u8);
173        }
174    }
175
176    // apply mask
177    buffer[maskidx] = mask;
178}
179
180use smallvec::SmallVec;
181
182/// Compressed list of u32 integers.
183pub struct ListUInt32 {
184    data: Vec<u8>,
185    head: SmallVec<[u32; 3]>,
186}
187
188impl ListUInt32 {
189    pub fn new() -> Self {
190        ListUInt32 {
191            data: Vec::new(),
192            head: SmallVec::new(),
193        }
194    }
195
196    pub fn push(&mut self, value: u32) {
197        if self.head.len() == 2 {
198            let chunk = [self.head[0], self.head[1], self.head[2]];
199            compress_block(&mut self.data, chunk);
200        } else {
201            self.head.push(value);
202        }
203    }
204
205    pub fn to_vec(&self) -> Vec<u32> {
206        let i = DataBlockIter { data: &self.data };
207        let mut v = i.to_vec();
208        for i in &self.head {
209            v.push(*i);
210        }
211        v
212    }
213}
214
215impl Default for ListUInt32 {
216    fn default() -> Self {
217        Self::new()
218    }
219}