Skip to main content

tdb_succinct/
vbyte.rs

1//! A variable-byte encoding implementation for `u64`.
2//!
3//! The canonical reference for this variable-byte encoding technique appears to be:
4//!
5//! Hugh E. Williams and Justin Zobel.
6//! Compressing integers for fast file access.
7//! The Computer Journal, 42:193–201, 1999.
8//!
9//! There are a number of different implementations for variable-byte encoding. This particular
10//! implementation follows the [reference Java implementation] for the RDF HDT project.
11//! Another popular implementation is the one for Google's [Protocol Buffers]; however, that
12//! differs on where the most significant bit (msb) is set and cleared.
13//!
14//! [reference Java implementation]: https://github.com/rdfhdt/hdt-java/blob/master/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java
15//! [Protocol Buffers]: https://developers.google.com/protocol-buffers/docs/encoding
16
17use futures::io;
18use thiserror::Error;
19use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
20
21use bytes::Buf;
22
23/// The maximum number of bytes required for any `u64` in a variable-byte encoding.
24pub const MAX_ENCODING_LEN: usize = 10;
25
26/// Returns the number of bytes required for a `u64` in its variable-byte encoding.
27pub fn encoding_len(num: u64) -> usize {
28    match num {
29        0 => 1,
30        num => ((64 + 6) - num.leading_zeros() as usize) / 7,
31    }
32}
33
34#[derive(Debug, PartialEq, Error)]
35/// An error returned by `decode`.
36pub enum DecodeError {
37    /// `decode` cannot fit the encoded value into a `u64`.
38    #[error("cannot fit the encoded value into a u64")]
39    EncodedValueTooLarge,
40    /// `decode` did not find the last encoded byte before reaching the end of the buffer.
41    #[error("could not find the last encoded byte before reaching the end of the buffer")]
42    UnexpectedEndOfBuffer,
43    /// `decode` did not find the last encoded byte before reaching the maximum encoding length.
44    #[error("could not find the last encoded byte before reaching the maximum encoding length")]
45    UnexpectedEncodingLen,
46}
47
48#[derive(Debug, Error)]
49pub enum DecodeReaderError {
50    #[error(transparent)]
51    Io(io::Error),
52    #[error(transparent)]
53    DecodeError(#[from] DecodeError),
54}
55
56impl From<io::Error> for DecodeReaderError {
57    fn from(value: io::Error) -> Self {
58        match value.kind() {
59            io::ErrorKind::UnexpectedEof => {
60                DecodeReaderError::DecodeError(DecodeError::UnexpectedEndOfBuffer)
61            }
62            _ => DecodeReaderError::Io(value),
63        }
64    }
65}
66
67/// Returns `true` if the most significant bit (msb) of the byte is set. This indicates the byte is
68/// the last of the encoding.
69#[inline]
70const fn is_last_encoded_byte(byte: u8) -> bool {
71    byte >= 0x80
72}
73
74/// Mask the byte to ignore the most significant bit (msb).
75#[inline]
76const fn clear_msb(byte: u8) -> u8 {
77    byte & 0x7f
78}
79
80/// Set the most significant bit (msb) of the byte.
81#[inline]
82const fn set_msb(byte: u8) -> u8 {
83    byte | 0x80
84}
85
86/// Returns `true` if `shift` indicates we're at the maximum possible byte of the encoding and if
87/// that byte value is too large for the result to fit into the unsigned 64-bit value.
88#[inline]
89fn max_byte_too_large(shift: u32, byte: u8) -> bool {
90    shift == 63 && byte > 0x81
91}
92
93/// Decodes a `u64` from a variable-byte-encoded slice.
94///
95/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
96/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
97/// giving the reason.
98///
99/// This function expects the encoded value to start at the beginning of the slice; and the slice
100/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
101/// end of the encoded value, so it doesn't matter if the slice is longer.
102pub fn decode(mut buf: &[u8]) -> Result<(u64, usize), DecodeError> {
103    decode_buf(&mut buf)
104}
105
106/// Decodes a `u64` from a variable-byte-encoded slice.
107///
108/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
109/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
110/// giving the reason.
111///
112/// This function expects the encoded value to start at the beginning of the slice; and the slice
113/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
114/// end of the encoded value, so it doesn't matter if the slice is longer.
115pub fn decode_buf<B: Buf>(buf: &mut B) -> Result<(u64, usize), DecodeError> {
116    // This will be the decoded result.
117    let mut num: u64 = 0;
118    // This is how many bits we shift `num` by on each iteration in increments of 7.
119    let mut shift: u32 = 0;
120    // Loop through each 8-bit byte value with its index.
121    let mut count = 0;
122    loop {
123        if !buf.has_remaining() {
124            return Err(DecodeError::UnexpectedEndOfBuffer);
125        }
126
127        let b = buf.get_u8();
128        count += 1;
129
130        if is_last_encoded_byte(b) {
131            return if max_byte_too_large(shift, b) {
132                Err(DecodeError::EncodedValueTooLarge)
133            } else {
134                // Return the result (clearing the msb) and the encoding length.
135                Ok((num | ((clear_msb(b) as u64) << shift), count))
136            };
137        }
138        // This is not the last byte. Update the result.
139        num |= (b as u64) << shift;
140        // Increment the shift amount for the next 7 bits.
141        shift += 7;
142        // Stop if we are about to exceed the maximum encoding length.
143        if shift > 64 {
144            // We have reached the maximum encoding length without encountering the last encoded
145            // byte.
146            return Err(DecodeError::UnexpectedEncodingLen);
147        }
148    }
149}
150
151/// Decodes a `u64` from a variable-byte-encoded slice.
152///
153/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
154/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
155/// giving the reason.
156///
157/// This function expects the encoded value to start at the beginning of the slice; and the slice
158/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
159/// end of the encoded value, so it doesn't matter if the slice is longer.
160pub async fn decode_reader<R: AsyncRead + Unpin>(
161    mut reader: R,
162) -> Result<(u64, usize), DecodeReaderError> {
163    // This will be the decoded result.
164    let mut num: u64 = 0;
165    // This is how many bits we shift `num` by on each iteration in increments of 7.
166    let mut shift: u32 = 0;
167    // Loop through each 8-bit byte value with its index.
168    let mut count = 0;
169    loop {
170        let b = reader.read_u8().await?;
171        count += 1;
172
173        if is_last_encoded_byte(b) {
174            return if max_byte_too_large(shift, b) {
175                Err(DecodeError::EncodedValueTooLarge.into())
176            } else {
177                // Return the result (clearing the msb) and the encoding length.
178                Ok((num | ((clear_msb(b) as u64) << shift), count))
179            };
180        }
181        // This is not the last byte. Update the result.
182        num |= (b as u64) << shift;
183        // Increment the shift amount for the next 7 bits.
184        shift += 7;
185        // Stop if we are about to exceed the maximum encoding length.
186        if shift > 64 {
187            // We have reached the maximum encoding length without encountering the last encoded
188            // byte.
189            return Err(DecodeError::UnexpectedEncodingLen.into());
190        }
191    }
192}
193
194/// Returns `true` if more than 7 bits remain to be encoded.
195#[inline]
196const fn more_than_7bits_remain(num: u64) -> bool {
197    num >= 0x80
198}
199
200/// Encodes a `u64` by writing its variable-byte encoding to a slice.
201///
202/// Returns the encoding length.
203///
204/// This function does not ensure that `buf` is large enough to include the encoding length of the
205/// number. In particular, there are no bounds checks on indexing. The caller of this function must
206/// ensure that `buf` is large enough for the encoded `num`. This can be done, for example, by
207/// using `MAX_ENCODING_LEN` to create the `buf` or by using `encoding_len` to validate the length
208/// of `buf`.
209unsafe fn encode_unchecked(buf: &mut [u8], mut num: u64) -> usize {
210    // Initialize the buffer index. This will be used for the encoding length at the end.
211    let mut i = 0;
212    // Loop through all 7-bit strings of the number.
213    while more_than_7bits_remain(num) {
214        // This is not the last encoded byte.
215        *buf.get_unchecked_mut(i) = clear_msb(num as u8);
216        // Get the next 7 bits.
217        num >>= 7;
218        // Increment the index.
219        i += 1;
220    }
221    // This is the last encoded byte.
222    *buf.get_unchecked_mut(i) = set_msb(num as u8);
223    // Return the encoding length.
224    i + 1
225}
226
227/// Encodes a `u64` by writing its variable-byte encoding to a slice.
228///
229/// On success, this function returns `Some` encoding length. Otherwise, the target slice is not
230/// large enough, and the function returns `None`.
231pub fn encode_slice(buf: &mut [u8], num: u64) -> Option<usize> {
232    // Validate the length of the buffer.
233    if encoding_len(num) > buf.len() {
234        None
235    } else {
236        // Safety: We have verified that `buf.len()` is large enough to hold the encoded bytes of
237        // `num`.
238        unsafe { Some(encode_unchecked(buf, num)) }
239    }
240}
241
242/// Encodes a `u64` with a variable-byte encoding in a `Vec`.
243///
244/// The length of the resultant `Vec` is the encoding length of `num`.
245pub fn encode_vec(num: u64) -> Vec<u8> {
246    // Allocate a `Vec` of the right size.
247    let mut vec = vec![0; encoding_len(num)];
248    // Safety: We have created `vec` with the length of the encoded bytes of `num`.
249    unsafe { encode_unchecked(&mut vec, num) };
250    vec
251}
252
253/// Encodes a `u64` with a variable-byte encoding in an array.
254///
255/// The array is always length 10. Additinally, the actual size of the vbyte is returned.
256pub fn encode_array(num: u64) -> ([u8; 10], usize) {
257    // Allocate a `Vec` of the right size.
258    let mut buf = [0; 10];
259    // Safety: We have created `vec` with the length of the encoded bytes of `num`.
260    let size = unsafe { encode_unchecked(&mut buf, num) };
261    (buf, size)
262}
263
264/*
265pub fn encode_into_writer<W:Write>(writer: &mut W, mut num: u64) -> std::io::Result<usize> {
266    let mut i = 0;
267    // Loop through all 7-bit strings of the number.
268    while more_than_7bits_remain(num) {
269        // This is not the last encoded byte.
270        let b = clear_msb(num as u8);
271        writer.write_u8(b)?;
272        // Get the next 7 bits.
273        num >>= 7;
274        i+=1;
275    }
276    // This is the last encoded byte.
277    let b = set_msb(num as u8);
278    // Return the encoding length.
279    writer.write_u8(b)?;
280    Ok(i + 1)
281}
282*/
283
284/// Encodes a `u64` with a variable-byte encoding in a `Vec` and writes that `Vec` to the
285/// destination `dest` in a future.
286pub async fn write_async<A>(dest: &mut A, num: u64) -> io::Result<usize>
287where
288    A: 'static + AsyncWrite + Unpin + Send,
289{
290    let vec = encode_vec(num);
291    let len = vec.len();
292    dest.write_all(&vec).await?;
293
294    Ok(len)
295}
296
297#[cfg(test)]
298mod tests {
299    use super::*;
300
301    fn encode_decode_success(buf: &mut [u8], expected: &[u8], num: u64) {
302        assert_eq!(Some(expected.len()), encode_slice(buf, num));
303        assert_eq!(expected, buf);
304        let (n, len) = decode(&buf).unwrap();
305        assert_eq!(num, n);
306        assert_eq!(expected.len(), len);
307    }
308
309    #[test]
310    fn encode_decode_1_byte() {
311        let mut buf = [0; 1];
312        encode_decode_success(&mut buf, &[set_msb(0)], 0);
313        encode_decode_success(&mut buf, &[set_msb(1)], 1);
314        encode_decode_success(&mut buf, &[set_msb(42)], 42);
315    }
316
317    #[test]
318    fn encode_decode_2_bytes() {
319        let mut buf = [0; 2];
320        encode_decode_success(&mut buf, &[0b0101000, set_msb(0b1)], 0b1_0101000);
321        let expected = [0b0000000, set_msb(0b1111111)];
322        encode_decode_success(&mut buf, &expected, 0b1111111_0000000);
323    }
324
325    #[test]
326    fn encode_decode_4_bytes() {
327        let mut buf = [0; 4];
328        let expected = [0b0110010, 0b0101111, 0b0011101, set_msb(0b10100)];
329        encode_decode_success(&mut buf, &expected, 0b10100_0011101_0101111_0110010);
330    }
331
332    #[test]
333    fn encode_decode_7_bytes() {
334        let mut buf = [0; 7];
335        let expected = [
336            0b0100000,
337            0b0010010,
338            0b1010101,
339            0b1001010,
340            0b1001000,
341            0b1000110,
342            set_msb(0b1100001),
343        ];
344        let num = 0b1100001_1000110_1001000_1001010_1010101_0010010_0100000;
345        encode_decode_success(&mut buf, &expected, num);
346    }
347
348    #[test]
349    fn encode_decode_max_bytes() {
350        let mut buf = [0; MAX_ENCODING_LEN];
351        let mut expected = [0x7f; 10];
352        assert_eq!(Err(DecodeError::UnexpectedEncodingLen), decode(&buf));
353        expected[9] = set_msb(0x01);
354        encode_decode_success(&mut buf, &expected, u64::max_value());
355        expected[0] -= 1;
356        encode_decode_success(&mut buf, &expected, u64::max_value() - 1);
357    }
358
359    #[test]
360    fn encode_decode_4_bytes_in_20_bytes() {
361        let mut buf = [0; 20];
362        assert_eq!(Some(4), encode_slice(&mut buf, 194984659));
363        let (n, len) = decode(&buf).unwrap();
364        assert_eq!(194984659, n);
365        assert_eq!(4, len);
366    }
367
368    #[test]
369    fn encode_0_bytes_fails() {
370        let mut buf = [];
371        assert_eq!(None, encode_slice(&mut buf, 0));
372    }
373
374    #[test]
375    fn encode_4_bytes_to_3_bytes_fails() {
376        let mut buf = [0; 3];
377        let num = 0b1011100_1111100_1110101_1010011;
378        assert_eq!(None, encode_slice(&mut buf, num));
379    }
380
381    #[test]
382    fn decode_0_bytes_fails() {
383        let buf = [];
384        assert_eq!(Err(DecodeError::UnexpectedEndOfBuffer), decode(&buf));
385    }
386
387    #[test]
388    fn decode_1_byte_without_msb_fails() {
389        let buf = [0b0001000];
390        assert_eq!(Err(DecodeError::UnexpectedEndOfBuffer), decode(&buf));
391    }
392
393    #[test]
394    fn encoded_value_too_large_fails() {
395        let mut buf = [0; 10];
396        buf[9] = set_msb(0x02);
397        assert_eq!(Err(DecodeError::EncodedValueTooLarge), decode(&buf));
398    }
399
400    #[test]
401    fn decode_11_bytes_fails() {
402        let mut buf = [0; 11];
403        buf[10] = set_msb(0x01);
404        assert_eq!(Err(DecodeError::UnexpectedEncodingLen), decode(&buf));
405    }
406
407    #[test]
408    fn encoded_len_tests() {
409        for &(len, num) in &[
410            (1, 0),
411            (1, 1),
412            (2, 0b11_0101000),
413            (7, 0b1100001_1000110_1001000_1001010_1010101_0010010_0100000),
414            (10, u64::max_value() - 1),
415            (MAX_ENCODING_LEN, u64::max_value()),
416        ] {
417            assert_eq!(len, encoding_len(num));
418        }
419    }
420}