lz4_compression/
compress.rs

//! The compression algorithm.
//!
//! We make use of hash tables to find duplicates. This gives a reasonable compression ratio at
//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it
//! less memory hungry.

use std::convert::TryInto;

/// Duplication dictionary size.
///
/// Every four-byte batch is assigned an entry. When this number is lower, fewer entries exist,
/// and thus collisions are more likely, hurting the compression ratio.
const DICTIONARY_SIZE: usize = 4096;

/// An LZ4 block.
///
/// This defines a single compression "unit", consisting of two parts: a number of raw literals,
/// and possibly a pointer into the already encoded buffer from which to copy.
#[derive(Debug)]
struct Block {
    /// The length (in bytes) of the literals section.
    lit_len: usize,

    /// The duplicates section, if any.
    ///
    /// Only the last block in a stream can lack the duplicates section.
    dup: Option<Duplicate>,
}

/// A consecutive sequence of bytes found in the already encoded part of the input.
#[derive(Copy, Clone, Debug)]
struct Duplicate {
    /// The number of bytes before our cursor, where the duplicate starts.
    offset: u16,

    /// The length beyond the four first bytes.
    ///
    /// Adding four to this number yields the actual length.
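    ///
    /// For example, a ten-byte duplicate would be stored with `extra_bytes: 6`.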
    extra_bytes: usize,
}

/// An LZ4 encoder.
struct Encoder<'a> {
    /// The raw uncompressed input.
    input: &'a [u8],

    /// The compressed output.
    output: &'a mut Vec<u8>,

    /// The number of input bytes that have been encoded so far, i.e. the cursor.
    cur: usize,

    /// The dictionary of previously encoded sequences.
    ///
    /// This is used to find duplicates in the stream so they are not written multiple times.
    ///
    /// Every four-byte batch is hashed, and its position in the input buffer is placed in the
    /// resulting slot. This way we can easily look up a candidate for a back reference.
    dict: [usize; DICTIONARY_SIZE],
}

impl<'a> Encoder<'a> {
    /// Go forward by some number of bytes.
    ///
    /// This will update the cursor and dictionary to reflect the now processed bytes.
    ///
    /// This returns `false` once the cursor has moved past the end of the input.
    fn go_forward(&mut self, steps: usize) -> bool {
        // Go over all the bytes we are skipping and update the cursor and dictionary.
        for _ in 0..steps {
            // Insert the cursor position into the dictionary.
            self.insert_cursor();

            // Increment the cursor.
            self.cur += 1;
        }

        // Return `true` while the cursor has not moved past the end of the input.
        self.cur <= self.input.len()
    }

    /// Insert the batch under the cursor into the dictionary.
    fn insert_cursor(&mut self) {
        // Make sure that there is at least one batch remaining.
        if self.remaining_batch() {
            // Insert the cursor into the table.
            self.dict[self.get_cur_hash()] = self.cur;
        }
    }

    /// Check if at least one full batch remains at the cursor.
    fn remaining_batch(&self) -> bool {
        self.cur + 4 < self.input.len()
    }

    /// Get the hash of the four-byte batch below the cursor.
    ///
    /// This is guaranteed to be below `DICTIONARY_SIZE`.
    fn get_cur_hash(&self) -> usize {
        // Use a PCG-style transform to generate a relatively good hash of the four-byte batch at
        // the cursor.
        let mut x = self.get_batch_at_cursor().wrapping_mul(0xa4d94a4f);
        let a = x >> 16;
        let b = x >> 30;
        x ^= a >> b;
        x = x.wrapping_mul(0xa4d94a4f);

        x as usize % DICTIONARY_SIZE
    }

    /// Read a four-byte "batch" from some position.
    ///
    /// This will read a native-endian four-byte integer from some position.
    fn get_batch(&self, n: usize) -> u32 {
        debug_assert!(self.remaining_batch(), "Reading a partial batch.");

        u32::from_ne_bytes(self.input[n..n + 4].try_into().unwrap())
    }

    /// Read the batch at the cursor.
    fn get_batch_at_cursor(&self) -> u32 {
        self.get_batch(self.cur)
    }

    /// Find a duplicate of the current batch.
    ///
    /// If any duplicate is found, a `Duplicate` (holding the offset and the length minus four) is
    /// returned.
    fn find_duplicate(&self) -> Option<Duplicate> {
        // If there is no remaining batch, we return none.
        if !self.remaining_batch() {
            return None;
        }

        // Find a candidate in the dictionary by hashing the current four bytes.
        let candidate = self.dict[self.get_cur_hash()];

        // Three requirements must hold for the candidate:
        // - The candidate is not the trap value (`!0`), which represents an empty bucket.
        // - We should not return a position which is merely a hash collision, so we check that
        //   the candidate actually matches what we search for.
        // - The offset field is 16 bits wide, hence we are only able to address the candidate if
        //   its offset is less than or equal to 0xFFFF.
        if candidate != !0
            && self.get_batch(candidate) == self.get_batch_at_cursor()
            && self.cur - candidate <= 0xFFFF {

            // Calculate the "extension bytes", i.e. the duplicate bytes beyond the batch. This is
            // the number of bytes shared between the tails of the match and the needle, following
            // their first four bytes.
            let ext = self.input[self.cur + 4..]
                .iter()
                .zip(&self.input[candidate + 4..])
                .take_while(|&(a, b)| a == b)
                .count();

            Some(Duplicate {
                offset: (self.cur - candidate) as u16,
                extra_bytes: ext,
            })
        } else {
            None
        }
    }

    /// Write an integer to the output in LSIC format.
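    ///
    /// LSIC ("linear small-integer code") emits `n / 255` bytes of `0xFF`, followed by a final
    /// byte holding the remainder. For example, `n = 270` is written as the bytes `0xFF, 0x0F`.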
    fn write_integer(&mut self, mut n: usize) {
        // Write 0xFF bytes as long as the integer is at least 0xFF.
        while n >= 0xFF {
            n -= 0xFF;
            self.output.push(0xFF);
        }

        // Write the remaining byte.
        self.output.push(n as u8);
    }

    /// Read the block at the top of the stream.
    fn pop_block(&mut self) -> Block {
        // The length of the literals section.
        let mut lit = 0;

        loop {
            // Search for a duplicate.
            if let Some(dup) = self.find_duplicate() {
                // We found a duplicate, so the literals section is over...

                // Move forward. Note that `dup.extra_bytes` is the match length minus four (the
                // minimum match length), so we need to add four to get the actual number of steps.
                self.go_forward(dup.extra_bytes + 4);

                return Block {
                    lit_len: lit,
                    dup: Some(dup),
                };
            }

            // Try to move forward.
            if !self.go_forward(1) {
                // We reached the end of the stream, and no duplicates section follows.
                return Block {
                    lit_len: lit,
                    dup: None,
                };
            }

            // No duplicates found yet, so extend the literals section.
            lit += 1;
        }
    }

    /// Complete the encoding into `self.output`.
    fn complete(&mut self) {
        // Construct one block at a time.
        loop {
            // The start of the literals section.
            let start = self.cur;

            // Read the next block into two sections, the literals and the duplicates.
            let block = self.pop_block();

            // Generate the higher half of the token.
            let mut token = if block.lit_len < 0xF {
                // Since we can fit the literals length into it, there is no need for saturation.
                (block.lit_len as u8) << 4
            } else {
                // We were unable to fit the literals length into it, so we saturate to 0xF. We
                // will later write the extensional value through LSIC encoding.
                0xF0
            };

            // Generate the lower half of the token, the duplicates length.
            let dup_extra_len = block.dup.map_or(0, |x| x.extra_bytes);
            token |= if dup_extra_len < 0xF {
                // We could fit it in.
                dup_extra_len as u8
            } else {
                // We were unable to fit it in, so we default to 0xF, which will later be extended
                // by LSIC encoding.
                0xF
            };
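
            // For example, three literal bytes followed by an eight-byte duplicate (four implied
            // plus four extra) would yield the token 0x34.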

            // Push the token to the output stream.
            self.output.push(token);

            // If we were unable to fit the literals length into the token, write the extensional
            // part through LSIC.
            if block.lit_len >= 0xF {
                self.write_integer(block.lit_len - 0xF);
            }

            // Now, write the actual literals.
            self.output.extend_from_slice(&self.input[start..start + block.lit_len]);

            if let Some(Duplicate { offset, .. }) = block.dup {
                // Wait! There's more. Now, we encode the duplicates section.

                // Push the offset in little endian.
                self.output.push(offset as u8);
                self.output.push((offset >> 8) as u8);
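                // (An offset of 0x0102, for instance, is written as the bytes 0x02, 0x01.)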

                // If we were unable to fit the duplicates length into the token, write the
                // extensional part through LSIC.
                if dup_extra_len >= 0xF {
                    self.write_integer(dup_extra_len - 0xF);
                }
            } else {
                break;
            }
        }
    }
}

/// Compress all bytes of `input` into `output`.
pub fn compress_into(input: &[u8], output: &mut Vec<u8>) {
    Encoder {
        input,
        output,
        cur: 0,
        dict: [!0; DICTIONARY_SIZE],
    }.complete();
}

/// Compress all bytes of `input`.
pub fn compress(input: &[u8]) -> Vec<u8> {
    // In most cases, the compression won't expand the size, so we set the input size as capacity.
    let mut vec = Vec::with_capacity(input.len());

    compress_into(input, &mut vec);

    vec
}
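
// A small usage sketch of `compress` (the test names are illustrative, not from any spec). The
// assertions only rely on properties that follow from the block format above: a long run
// collapses into one literal plus one back reference, and an empty input emits a lone zero token.
#[cfg(test)]
mod tests {
    use super::compress;

    #[test]
    fn repetitive_input_shrinks() {
        // 100 identical bytes: one token, one literal, a two-byte offset, and a short LSIC
        // extension, which is far fewer bytes than the input.
        let input = [b'a'; 100];
        assert!(compress(&input).len() < input.len());
    }

    #[test]
    fn empty_input_emits_terminating_token() {
        // The final block always carries a token; with no literals and no duplicate it is zero.
        assert_eq!(compress(&[]), vec![0x00]);
    }
}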