tdb_succinct/vbyte.rs
1//! A variable-byte encoding implementation for `u64`.
2//!
3//! The canonical reference for this variable-byte encoding technique appears to be:
4//!
5//! Hugh E. Williams and Justin Zobel.
6//! Compressing integers for fast file access.
7//! The Computer Journal, 42:193–201, 1999.
8//!
9//! There are a number of different implementations for variable-byte encoding. This particular
10//! implementation follows the [reference Java implementation] for the RDF HDT project.
11//! Another popular implementation is the one for Google's [Protocol Buffers]; however, that
12//! differs on where the most significant bit (msb) is set and cleared.
13//!
14//! [reference Java implementation]: https://github.com/rdfhdt/hdt-java/blob/master/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java
15//! [Protocol Buffers]: https://developers.google.com/protocol-buffers/docs/encoding
16
17use futures::io;
18use thiserror::Error;
19use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
20
21use bytes::Buf;
22
23/// The maximum number of bytes required for any `u64` in a variable-byte encoding.
24pub const MAX_ENCODING_LEN: usize = 10;
25
26/// Returns the number of bytes required for a `u64` in its variable-byte encoding.
27pub fn encoding_len(num: u64) -> usize {
28 match num {
29 0 => 1,
30 num => ((64 + 6) - num.leading_zeros() as usize) / 7,
31 }
32}
33
34#[derive(Debug, PartialEq, Error)]
35/// An error returned by `decode`.
36pub enum DecodeError {
37 /// `decode` cannot fit the encoded value into a `u64`.
38 #[error("cannot fit the encoded value into a u64")]
39 EncodedValueTooLarge,
40 /// `decode` did not find the last encoded byte before reaching the end of the buffer.
41 #[error("could not find the last encoded byte before reaching the end of the buffer")]
42 UnexpectedEndOfBuffer,
43 /// `decode` did not find the last encoded byte before reaching the maximum encoding length.
44 #[error("could not find the last encoded byte before reaching the maximum encoding length")]
45 UnexpectedEncodingLen,
46}
47
48#[derive(Debug, Error)]
49pub enum DecodeReaderError {
50 #[error(transparent)]
51 Io(io::Error),
52 #[error(transparent)]
53 DecodeError(#[from] DecodeError),
54}
55
56impl From<io::Error> for DecodeReaderError {
57 fn from(value: io::Error) -> Self {
58 match value.kind() {
59 io::ErrorKind::UnexpectedEof => {
60 DecodeReaderError::DecodeError(DecodeError::UnexpectedEndOfBuffer)
61 }
62 _ => DecodeReaderError::Io(value),
63 }
64 }
65}
66
67/// Returns `true` if the most significant bit (msb) of the byte is set. This indicates the byte is
68/// the last of the encoding.
69#[inline]
70const fn is_last_encoded_byte(byte: u8) -> bool {
71 byte >= 0x80
72}
73
74/// Mask the byte to ignore the most significant bit (msb).
75#[inline]
76const fn clear_msb(byte: u8) -> u8 {
77 byte & 0x7f
78}
79
80/// Set the most significant bit (msb) of the byte.
81#[inline]
82const fn set_msb(byte: u8) -> u8 {
83 byte | 0x80
84}
85
86/// Returns `true` if `shift` indicates we're at the maximum possible byte of the encoding and if
87/// that byte value is too large for the result to fit into the unsigned 64-bit value.
88#[inline]
89fn max_byte_too_large(shift: u32, byte: u8) -> bool {
90 shift == 63 && byte > 0x81
91}
92
93/// Decodes a `u64` from a variable-byte-encoded slice.
94///
95/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
96/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
97/// giving the reason.
98///
99/// This function expects the encoded value to start at the beginning of the slice; and the slice
100/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
101/// end of the encoded value, so it doesn't matter if the slice is longer.
102pub fn decode(mut buf: &[u8]) -> Result<(u64, usize), DecodeError> {
103 decode_buf(&mut buf)
104}
105
106/// Decodes a `u64` from a variable-byte-encoded slice.
107///
108/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
109/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
110/// giving the reason.
111///
112/// This function expects the encoded value to start at the beginning of the slice; and the slice
113/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
114/// end of the encoded value, so it doesn't matter if the slice is longer.
115pub fn decode_buf<B: Buf>(buf: &mut B) -> Result<(u64, usize), DecodeError> {
116 // This will be the decoded result.
117 let mut num: u64 = 0;
118 // This is how many bits we shift `num` by on each iteration in increments of 7.
119 let mut shift: u32 = 0;
120 // Loop through each 8-bit byte value with its index.
121 let mut count = 0;
122 loop {
123 if !buf.has_remaining() {
124 return Err(DecodeError::UnexpectedEndOfBuffer);
125 }
126
127 let b = buf.get_u8();
128 count += 1;
129
130 if is_last_encoded_byte(b) {
131 return if max_byte_too_large(shift, b) {
132 Err(DecodeError::EncodedValueTooLarge)
133 } else {
134 // Return the result (clearing the msb) and the encoding length.
135 Ok((num | ((clear_msb(b) as u64) << shift), count))
136 };
137 }
138 // This is not the last byte. Update the result.
139 num |= (b as u64) << shift;
140 // Increment the shift amount for the next 7 bits.
141 shift += 7;
142 // Stop if we are about to exceed the maximum encoding length.
143 if shift > 64 {
144 // We have reached the maximum encoding length without encountering the last encoded
145 // byte.
146 return Err(DecodeError::UnexpectedEncodingLen);
147 }
148 }
149}
150
151/// Decodes a `u64` from a variable-byte-encoded slice.
152///
153/// On success, this function returns `Ok` with the decoded value and encoding length. Otherwise,
154/// the slice data is invalid, and the function returns `Err` with the corresponding `DecodeError`
155/// giving the reason.
156///
157/// This function expects the encoded value to start at the beginning of the slice; and the slice
158/// must be large enough to include all of the encoded bytes of one value. Decoding stops at the
159/// end of the encoded value, so it doesn't matter if the slice is longer.
160pub async fn decode_reader<R: AsyncRead + Unpin>(
161 mut reader: R,
162) -> Result<(u64, usize), DecodeReaderError> {
163 // This will be the decoded result.
164 let mut num: u64 = 0;
165 // This is how many bits we shift `num` by on each iteration in increments of 7.
166 let mut shift: u32 = 0;
167 // Loop through each 8-bit byte value with its index.
168 let mut count = 0;
169 loop {
170 let b = reader.read_u8().await?;
171 count += 1;
172
173 if is_last_encoded_byte(b) {
174 return if max_byte_too_large(shift, b) {
175 Err(DecodeError::EncodedValueTooLarge.into())
176 } else {
177 // Return the result (clearing the msb) and the encoding length.
178 Ok((num | ((clear_msb(b) as u64) << shift), count))
179 };
180 }
181 // This is not the last byte. Update the result.
182 num |= (b as u64) << shift;
183 // Increment the shift amount for the next 7 bits.
184 shift += 7;
185 // Stop if we are about to exceed the maximum encoding length.
186 if shift > 64 {
187 // We have reached the maximum encoding length without encountering the last encoded
188 // byte.
189 return Err(DecodeError::UnexpectedEncodingLen.into());
190 }
191 }
192}
193
194/// Returns `true` if more than 7 bits remain to be encoded.
195#[inline]
196const fn more_than_7bits_remain(num: u64) -> bool {
197 num >= 0x80
198}
199
200/// Encodes a `u64` by writing its variable-byte encoding to a slice.
201///
202/// Returns the encoding length.
203///
204/// This function does not ensure that `buf` is large enough to include the encoding length of the
205/// number. In particular, there are no bounds checks on indexing. The caller of this function must
206/// ensure that `buf` is large enough for the encoded `num`. This can be done, for example, by
207/// using `MAX_ENCODING_LEN` to create the `buf` or by using `encoding_len` to validate the length
208/// of `buf`.
209unsafe fn encode_unchecked(buf: &mut [u8], mut num: u64) -> usize {
210 // Initialize the buffer index. This will be used for the encoding length at the end.
211 let mut i = 0;
212 // Loop through all 7-bit strings of the number.
213 while more_than_7bits_remain(num) {
214 // This is not the last encoded byte.
215 *buf.get_unchecked_mut(i) = clear_msb(num as u8);
216 // Get the next 7 bits.
217 num >>= 7;
218 // Increment the index.
219 i += 1;
220 }
221 // This is the last encoded byte.
222 *buf.get_unchecked_mut(i) = set_msb(num as u8);
223 // Return the encoding length.
224 i + 1
225}
226
227/// Encodes a `u64` by writing its variable-byte encoding to a slice.
228///
229/// On success, this function returns `Some` encoding length. Otherwise, the target slice is not
230/// large enough, and the function returns `None`.
231pub fn encode_slice(buf: &mut [u8], num: u64) -> Option<usize> {
232 // Validate the length of the buffer.
233 if encoding_len(num) > buf.len() {
234 None
235 } else {
236 // Safety: We have verified that `buf.len()` is large enough to hold the encoded bytes of
237 // `num`.
238 unsafe { Some(encode_unchecked(buf, num)) }
239 }
240}
241
242/// Encodes a `u64` with a variable-byte encoding in a `Vec`.
243///
244/// The length of the resultant `Vec` is the encoding length of `num`.
245pub fn encode_vec(num: u64) -> Vec<u8> {
246 // Allocate a `Vec` of the right size.
247 let mut vec = vec![0; encoding_len(num)];
248 // Safety: We have created `vec` with the length of the encoded bytes of `num`.
249 unsafe { encode_unchecked(&mut vec, num) };
250 vec
251}
252
253/// Encodes a `u64` with a variable-byte encoding in an array.
254///
255/// The array is always length 10. Additinally, the actual size of the vbyte is returned.
256pub fn encode_array(num: u64) -> ([u8; 10], usize) {
257 // Allocate a `Vec` of the right size.
258 let mut buf = [0; 10];
259 // Safety: We have created `vec` with the length of the encoded bytes of `num`.
260 let size = unsafe { encode_unchecked(&mut buf, num) };
261 (buf, size)
262}
263
264/*
265pub fn encode_into_writer<W:Write>(writer: &mut W, mut num: u64) -> std::io::Result<usize> {
266 let mut i = 0;
267 // Loop through all 7-bit strings of the number.
268 while more_than_7bits_remain(num) {
269 // This is not the last encoded byte.
270 let b = clear_msb(num as u8);
271 writer.write_u8(b)?;
272 // Get the next 7 bits.
273 num >>= 7;
274 i+=1;
275 }
276 // This is the last encoded byte.
277 let b = set_msb(num as u8);
278 // Return the encoding length.
279 writer.write_u8(b)?;
280 Ok(i + 1)
281}
282*/
283
284/// Encodes a `u64` with a variable-byte encoding in a `Vec` and writes that `Vec` to the
285/// destination `dest` in a future.
286pub async fn write_async<A>(dest: &mut A, num: u64) -> io::Result<usize>
287where
288 A: 'static + AsyncWrite + Unpin + Send,
289{
290 let vec = encode_vec(num);
291 let len = vec.len();
292 dest.write_all(&vec).await?;
293
294 Ok(len)
295}
296
297#[cfg(test)]
298mod tests {
299 use super::*;
300
301 fn encode_decode_success(buf: &mut [u8], expected: &[u8], num: u64) {
302 assert_eq!(Some(expected.len()), encode_slice(buf, num));
303 assert_eq!(expected, buf);
304 let (n, len) = decode(&buf).unwrap();
305 assert_eq!(num, n);
306 assert_eq!(expected.len(), len);
307 }
308
309 #[test]
310 fn encode_decode_1_byte() {
311 let mut buf = [0; 1];
312 encode_decode_success(&mut buf, &[set_msb(0)], 0);
313 encode_decode_success(&mut buf, &[set_msb(1)], 1);
314 encode_decode_success(&mut buf, &[set_msb(42)], 42);
315 }
316
317 #[test]
318 fn encode_decode_2_bytes() {
319 let mut buf = [0; 2];
320 encode_decode_success(&mut buf, &[0b0101000, set_msb(0b1)], 0b1_0101000);
321 let expected = [0b0000000, set_msb(0b1111111)];
322 encode_decode_success(&mut buf, &expected, 0b1111111_0000000);
323 }
324
325 #[test]
326 fn encode_decode_4_bytes() {
327 let mut buf = [0; 4];
328 let expected = [0b0110010, 0b0101111, 0b0011101, set_msb(0b10100)];
329 encode_decode_success(&mut buf, &expected, 0b10100_0011101_0101111_0110010);
330 }
331
332 #[test]
333 fn encode_decode_7_bytes() {
334 let mut buf = [0; 7];
335 let expected = [
336 0b0100000,
337 0b0010010,
338 0b1010101,
339 0b1001010,
340 0b1001000,
341 0b1000110,
342 set_msb(0b1100001),
343 ];
344 let num = 0b1100001_1000110_1001000_1001010_1010101_0010010_0100000;
345 encode_decode_success(&mut buf, &expected, num);
346 }
347
348 #[test]
349 fn encode_decode_max_bytes() {
350 let mut buf = [0; MAX_ENCODING_LEN];
351 let mut expected = [0x7f; 10];
352 assert_eq!(Err(DecodeError::UnexpectedEncodingLen), decode(&buf));
353 expected[9] = set_msb(0x01);
354 encode_decode_success(&mut buf, &expected, u64::max_value());
355 expected[0] -= 1;
356 encode_decode_success(&mut buf, &expected, u64::max_value() - 1);
357 }
358
359 #[test]
360 fn encode_decode_4_bytes_in_20_bytes() {
361 let mut buf = [0; 20];
362 assert_eq!(Some(4), encode_slice(&mut buf, 194984659));
363 let (n, len) = decode(&buf).unwrap();
364 assert_eq!(194984659, n);
365 assert_eq!(4, len);
366 }
367
368 #[test]
369 fn encode_0_bytes_fails() {
370 let mut buf = [];
371 assert_eq!(None, encode_slice(&mut buf, 0));
372 }
373
374 #[test]
375 fn encode_4_bytes_to_3_bytes_fails() {
376 let mut buf = [0; 3];
377 let num = 0b1011100_1111100_1110101_1010011;
378 assert_eq!(None, encode_slice(&mut buf, num));
379 }
380
381 #[test]
382 fn decode_0_bytes_fails() {
383 let buf = [];
384 assert_eq!(Err(DecodeError::UnexpectedEndOfBuffer), decode(&buf));
385 }
386
387 #[test]
388 fn decode_1_byte_without_msb_fails() {
389 let buf = [0b0001000];
390 assert_eq!(Err(DecodeError::UnexpectedEndOfBuffer), decode(&buf));
391 }
392
393 #[test]
394 fn encoded_value_too_large_fails() {
395 let mut buf = [0; 10];
396 buf[9] = set_msb(0x02);
397 assert_eq!(Err(DecodeError::EncodedValueTooLarge), decode(&buf));
398 }
399
400 #[test]
401 fn decode_11_bytes_fails() {
402 let mut buf = [0; 11];
403 buf[10] = set_msb(0x01);
404 assert_eq!(Err(DecodeError::UnexpectedEncodingLen), decode(&buf));
405 }
406
407 #[test]
408 fn encoded_len_tests() {
409 for &(len, num) in &[
410 (1, 0),
411 (1, 1),
412 (2, 0b11_0101000),
413 (7, 0b1100001_1000110_1001000_1001010_1010101_0010010_0100000),
414 (10, u64::max_value() - 1),
415 (MAX_ENCODING_LEN, u64::max_value()),
416 ] {
417 assert_eq!(len, encoding_len(num));
418 }
419 }
420}