dsi_bitstream/codes/vbyte.rs
1/*
2 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
3 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8//! Variable-length byte codes.
9//!
10//! These codes represent a natural number as a sequence of bytes, the length of
11//! the sequence depends on the magnitude of the number. They are used in many
12//! contexts, and they are known under a plethora of different names such as
13//! “vbyte”, “varint”, “[variable-length
14//! quantity](https://en.wikipedia.org/wiki/Variable-length_quantity)”, “LEB”,
15//! and so on.
16//!
17//! There are several variants of their definition, but their implied
18//! distribution is always ≈ 1/*x*<sup>8/7</sup>
19//!
20//! The supported range is [0 . . 2⁶⁴).
21//!
22//! # Definition
23//!
24//! Since there are a few slightly different variants used in production code
25//! and in the literature, before going into the details of this implementation
26//! we will try to define a clear taxonomy by explaining in detail the
27//! three properties that define such variants.
28//!
29//! The simplest variable-length byte code encodes a number with a binary
30//! representation of *k* bits using ⌈*k* / 7⌉ bytes. The binary representation
31//! is left-padded with zeros so to obtain exactly ⌈*k* / 7⌉ blocks of 7 bits.
32//! Each block is stored in a byte, prefixed with a continuation bit which is
33//! one for all blocks except for the last one.
34//!
35//! ## Endianness
36//!
37//! The first property is the endianness of the bytes: in big-endian codes, the
38//! first byte contains the highest (most significant) bits, whereas in
39//! little-endian codes, the first byte contains the lowest (least significant)
40//! bits.
41//!
42//! The advantage of the big-endian variant is that is lexicographical, that is,
43//! comparing lexicographically a stream of encoded natural numbers will give
44//! the same results as comparing lexicographically the encoded numbers, much
45//! like it happens for [UTF-8 encoding](https://en.wikipedia.org/wiki/UTF-8).
46//!
47//! ## Completeness
48//!
49//! This basic representation discussed above is not *complete*, as there are
50//! sequences that are not used. For example, zero can be written in many ways
51//! (e.g., `0x00` or `0x80 0x00`), but we are using only the single-byte
52//! representation. Incompleteness leads to a (small) loss in compression.
53//!
54//! To have completeness, one can offset the representation in *k* bits by the
55//! maximum number representable using *k* − 1 bits. That is, we represent the
56//! interval [0..2⁷) with one byte, then the interval [2⁷..2⁷ + 2¹⁴) with two
57//! bytes, the interval [2⁷ + 2¹⁴..2⁷ + 2¹⁴ + 2²¹) with three bytes, and so on.
58//!
59//! ## Grouping
60//!
61//! In the basic representation, the continuation bit is the most significant
62//! bit of each byte. However, one can gather all continuation bits in the first
63//! byte ([as UTF-8 does](https://en.wikipedia.org/wiki/UTF-8)). This approach
64//! makes it possible to compute the length of the code using a call to
65//! [`u8::leading_ones`] on the first negated byte, which usually maps to a
66//! negation and a call to a fast instruction for the detection of the most
67//! significant bit, improving branch prediction.
68//!
69//! Note that if the code is grouped, choosing a code with the same endianness
70//! as your hardware can lead to a performance improvement, as after the first
71//! byte the rest of the code can be read with a
72//.
73//!
74//! ## Sign
75//!
76//! It is possible to extend the codes to represent signed integers. Two
77//! possible approaches are using a [bijective mapping](crate::codes::ToInt)
78//! between the integers and the natural numbers, or defining a specialized
79//! code.
80//!
81//! # Implementations
82//!
83//! We provide two unsigned, ungrouped, complete representations, one
84//! [big-endian](VByteBeRead) and one [little-endian](VByteLeRead). We recommend
85//! in general the big-endian version as it is lexicographical. However, the
86//! big-endian version needs a small buffer when writing, so on some hardware
87//! the little-endian version might be faster.
88//!
89//! Note that the endianness of the code is independent from the endianness of
90//! the underlying bit stream, as it just depends on the order in which bytes are
91//! written.
92//!
93//! Since this code is byte-aligned, we provide also convenient, fast methods
94//! [`vbyte_write`] and [`vbyte_read`] that can be used on types implementing
95//! [`std::io::Read`] and [`std::io::Write`]. There are also endianness-specific
96//! methods [`vbyte_write_be`], [`vbyte_write_le`], [`vbyte_read_be`], and
97//! [`vbyte_read_le`].
98//!
99//! # Examples
100//!
101//! - The [LEB128](https://en.wikipedia.org/wiki/LEB128) code used by LLVM is a
102//! little-endian incomplete ungrouped representation. There is both a signed
103//! and an unsigned version; the signed version represents negative numbers
104//! using two's complement.
105//!
106//! - The [code used by
107//! git](https://github.com/git/git/blob/7fb6aefd2aaffe66e614f7f7b83e5b7ab16d4806/varint.c)
108//! is a big-endian complete ungrouped representation.
109//!
110//! - [This implementation in
111//! folly](https://github.com/facebook/folly/blob/dd4a5eb057afbc3c7c7da71801df2ee3c61c47d1/folly/Varint.h#L109)
112//! is a little-endian incomplete ungrouped representation.
113
114use crate::traits::*;
115
116/// Returns the length of the variable-length byte code for `n` in bytes.
117#[must_use]
118#[inline(always)]
119pub const fn byte_len_vbyte(mut n: u64) -> usize {
120 let mut len = 1;
121 loop {
122 n >>= 7;
123 if n == 0 {
124 return len;
125 }
126 n -= 1;
127 len += 1;
128 }
129}
130
131/// Returns the length of the variable-length byte code for `n` in bits.
132#[must_use]
133#[inline(always)]
134pub const fn bit_len_vbyte(n: u64) -> usize {
135 8 * byte_len_vbyte(n)
136}
137
138/// Trait for reading big-endian variable-length byte codes.
139///
140/// Note that the endianness of the code is independent
141/// from the endianness of the underlying bit stream.
142pub trait VByteBeRead<E: Endianness>: BitRead<E> {
143 fn read_vbyte_be(&mut self) -> Result<u64, Self::Error>;
144}
145
146/// Trait for reading little-endian variable-length byte codes.
147///
148/// Note that the endianness of the code is independent
149/// from the endianness of the underlying bit stream.
150pub trait VByteLeRead<E: Endianness>: BitRead<E> {
151 fn read_vbyte_le(&mut self) -> Result<u64, Self::Error>;
152}
153
154impl<E: Endianness, B: BitRead<E>> VByteBeRead<E> for B {
155 #[inline(always)]
156 fn read_vbyte_be(&mut self) -> Result<u64, Self::Error> {
157 let mut byte = self.read_bits(8)?;
158 let mut value = byte & 0x7F;
159 while (byte >> 7) != 0 {
160 value += 1;
161 byte = self.read_bits(8)?;
162 value = (value << 7) | (byte & 0x7F);
163 }
164 Ok(value)
165 }
166}
167
168impl<E: Endianness, B: BitRead<E>> VByteLeRead<E> for B {
169 #[inline(always)]
170 fn read_vbyte_le(&mut self) -> Result<u64, Self::Error> {
171 let mut result = 0;
172 let mut shift = 0;
173 loop {
174 let byte = self.read_bits(8)?;
175 result += (byte & 0x7F) << shift;
176 if (byte >> 7) == 0 {
177 break;
178 }
179 shift += 7;
180 result += 1 << shift;
181 }
182 Ok(result)
183 }
184}
185
186/// Trait for writing big-endian variable-length byte codes.
187///
188/// Note that the endianness of the code is independent
189/// from the endianness of the underlying bit stream.
190pub trait VByteBeWrite<E: Endianness>: BitWrite<E> {
191 fn write_vbyte_be(&mut self, n: u64) -> Result<usize, Self::Error>;
192}
193/// Trait for writing little-endian variable-length byte codes.
194///
195/// Note that the endianness of the code is independent
196/// from the endianness of the underlying bit stream.
197pub trait VByteLeWrite<E: Endianness>: BitWrite<E> {
198 fn write_vbyte_le(&mut self, n: u64) -> Result<usize, Self::Error>;
199}
200
201impl<E: Endianness, B: BitWrite<E>> VByteBeWrite<E> for B {
202 #[inline(always)]
203 fn write_vbyte_be(&mut self, mut n: u64) -> Result<usize, Self::Error> {
204 let mut buf = [0u8; 10];
205 let mut pos = buf.len() - 1;
206 buf[pos] = (n & 0x7F) as u8;
207 n >>= 7;
208 while n != 0 {
209 n -= 1;
210 pos -= 1;
211 buf[pos] = 0x80 | (n & 0x7F) as u8;
212 n >>= 7;
213 }
214 let bytes_to_write = buf.len() - pos;
215 for &byte in &buf[pos..] {
216 self.write_bits(byte as u64, 8)?;
217 }
218 Ok(bytes_to_write * 8)
219 }
220}
221
222impl<E: Endianness, B: BitWrite<E>> VByteLeWrite<E> for B {
223 #[inline(always)]
224 fn write_vbyte_le(&mut self, mut n: u64) -> Result<usize, Self::Error> {
225 let mut len = 1;
226 loop {
227 let byte = (n & 0x7F) as u8;
228 n >>= 7;
229 if n != 0 {
230 self.write_bits((byte | 0x80) as u64, 8)?;
231 } else {
232 self.write_bits(byte as u64, 8)?;
233 break;
234 }
235 n -= 1;
236 len += 1;
237 }
238 Ok(len * 8)
239 }
240}
241
242/// Writes a natural number to a byte stream using variable-length byte codes and
243/// returns the number of bytes written.
244///
245/// This method just delegates to the correct endianness-specific method.
246#[inline(always)]
247#[cfg(feature = "std")]
248pub fn vbyte_write<E: Endianness, W: std::io::Write>(
249 n: u64,
250 writer: &mut W,
251) -> std::io::Result<usize> {
252 if core::any::TypeId::of::<E>() == core::any::TypeId::of::<BigEndian>() {
253 vbyte_write_be(n, writer)
254 } else {
255 vbyte_write_le(n, writer)
256 }
257}
258
259/// Encode a natural number to a big-endian byte stream using variable-length
260/// byte codes and return the number of bytes written.
261#[inline(always)]
262#[cfg(feature = "std")]
263pub fn vbyte_write_be<W: std::io::Write>(mut n: u64, w: &mut W) -> std::io::Result<usize> {
264 let mut buf = [0u8; 10];
265 let mut pos = buf.len() - 1;
266 buf[pos] = (n & 0x7F) as u8;
267 n >>= 7;
268 while n != 0 {
269 n -= 1;
270 pos -= 1;
271 buf[pos] = 0x80 | (n & 0x7F) as u8;
272 n >>= 7;
273 }
274 let bytes_to_write = buf.len() - pos;
275 w.write_all(&buf[pos..])?;
276 Ok(bytes_to_write)
277}
278
279/// Encode a natural number to a little-endian byte stream using variable-length
280/// byte codes and return the number of bytes written.
281#[inline(always)]
282#[cfg(feature = "std")]
283pub fn vbyte_write_le<W: std::io::Write>(mut n: u64, writer: &mut W) -> std::io::Result<usize> {
284 let mut len = 1;
285 loop {
286 let byte = (n & 0x7F) as u8;
287 n >>= 7;
288 if n != 0 {
289 writer.write_all(&[byte | 0x80])?;
290 } else {
291 writer.write_all(&[byte])?;
292 break;
293 }
294 n -= 1;
295 len += 1;
296 }
297 Ok(len)
298}
299
300/// Decode a natural number from a byte stream using variable-length byte codes.
301///
302/// This method just delegates to the correct endianness-specific method.
303#[inline(always)]
304#[cfg(feature = "std")]
305pub fn vbyte_read<E: Endianness, R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
306 if core::any::TypeId::of::<E>() == core::any::TypeId::of::<BigEndian>() {
307 vbyte_read_be(reader)
308 } else {
309 vbyte_read_le(reader)
310 }
311}
312
313/// Decode a natural number from a big-endian byte stream using variable-length
314/// byte codes.
315#[inline(always)]
316#[cfg(feature = "std")]
317pub fn vbyte_read_be<R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
318 let mut buf = [0u8; 1];
319 let mut value: u64;
320 reader.read_exact(&mut buf)?;
321 value = (buf[0] & 0x7F) as u64;
322 while (buf[0] >> 7) != 0 {
323 value += 1;
324 reader.read_exact(&mut buf)?;
325 value = (value << 7) | ((buf[0] & 0x7F) as u64);
326 }
327 Ok(value)
328}
329
330/// Decode a natural number from a little-endian byte stream using
331/// variable-length byte codes.
332#[inline(always)]
333#[cfg(feature = "std")]
334pub fn vbyte_read_le<R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
335 let mut result = 0;
336 let mut shift = 0;
337 let mut buffer = [0; 1];
338 loop {
339 reader.read_exact(&mut buffer)?;
340 let byte = buffer[0];
341 result += ((byte & 0x7F) as u64) << shift;
342 if (byte >> 7) == 0 {
343 break;
344 }
345 shift += 7;
346 result += 1 << shift;
347 }
348 Ok(result)
349}
350
351#[cfg(test)]
352#[cfg(feature = "std")]
353mod tests {
354 #[allow(unused_imports)]
355 use super::*;
356
357 const UPPER_BOUND_1: u64 = 128;
358 const UPPER_BOUND_2: u64 = 128_u64.pow(2) + UPPER_BOUND_1;
359 const UPPER_BOUND_3: u64 = 128_u64.pow(3) + UPPER_BOUND_2;
360 const UPPER_BOUND_4: u64 = 128_u64.pow(4) + UPPER_BOUND_3;
361 const UPPER_BOUND_5: u64 = 128_u64.pow(5) + UPPER_BOUND_4;
362 const UPPER_BOUND_6: u64 = 128_u64.pow(6) + UPPER_BOUND_5;
363 const UPPER_BOUND_7: u64 = 128_u64.pow(7) + UPPER_BOUND_6;
364 const UPPER_BOUND_8: u64 = 128_u64.pow(8) + UPPER_BOUND_7;
365
366 macro_rules! impl_tests {
367 ($test_name:ident, $E:ty) => {
368 #[test]
369 fn $test_name() -> std::io::Result<()> {
370 const MAX: usize = 1 << 20;
371 const MIN: usize = 0;
372 let mut buffer = std::io::Cursor::new(Vec::with_capacity(128));
373 let mut lens = Vec::new();
374
375 for i in MIN..MAX {
376 lens.push(vbyte_write::<$E, _>(i as _, &mut buffer)?);
377 }
378 buffer.set_position(0);
379 for (i, l) in (MIN..MAX).zip(lens.iter()) {
380 let j = vbyte_read::<$E, _>(&mut buffer)?;
381 assert_eq!(byte_len_vbyte(i as _), *l);
382 assert_eq!(j, i as u64);
383 }
384
385 let values = [
386 0,
387 UPPER_BOUND_1 - 1,
388 UPPER_BOUND_1 + 1,
389 UPPER_BOUND_2 - 1,
390 UPPER_BOUND_2 + 1,
391 UPPER_BOUND_3 - 1,
392 UPPER_BOUND_3 + 1,
393 UPPER_BOUND_4 - 1,
394 UPPER_BOUND_4 + 1,
395 UPPER_BOUND_5 - 1,
396 UPPER_BOUND_5 + 1,
397 UPPER_BOUND_6 - 1,
398 UPPER_BOUND_6 + 1,
399 UPPER_BOUND_7 - 1,
400 UPPER_BOUND_7 + 1,
401 UPPER_BOUND_8 - 1,
402 UPPER_BOUND_8 + 1,
403 u64::MAX,
404 ];
405
406 let tell: u64 = buffer.position();
407 for &i in values.iter() {
408 vbyte_write::<$E, _>(i, &mut buffer)?;
409 }
410 buffer.set_position(tell);
411 for &i in values.iter() {
412 assert_eq!(i, vbyte_read::<$E, _>(&mut buffer)?);
413 }
414 Ok(())
415 }
416 };
417 }
418
419 impl_tests!(test_vbytes_be, BE);
420 impl_tests!(test_vbytes_le, LE);
421}