dsi_bitstream/codes/vbyte.rs
1/*
2 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
3 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4 *
5 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6 */
7
8//! Variable-length byte codes.
9//!
10//! These codes represent a natural number as a sequence of bytes, the length of
11//! the sequence depends on the magnitude of the number. They are used in many
12//! contexts, and they are known under a plethora of different names such
13//! “vbyte”, “varint”, “[variable-length
14//! quantity](https://en.wikipedia.org/wiki/Variable-length_quantity)”, “LEB”,
15//! and so on.
16//!
17//! There are several variants of their definition, but their implied
18//! distribution is always ≈ 1/*x*<sup>8/7</sup>
19//!
20//! # Definition
21//!
22//! Since there are a few slightly different variants used in production code
23//! and in the literature, before going into the details of this implementation
24//! we will try to define a clear taxonomy by explaining in detail the four
25//! three properties that define such variants.
26//!
27//! The simplest variable-length byte code encodes a number with a binary
28//! representation of *k* bits using ⌈*k* / 7⌉ bytes. The binary representation
29//! is left-padded with zeros so to obtain exactly ⌈*k* / 7⌉ blocks of 7 bits.
30//! Each block is stored in a byte, prefixed with a continuation bit which is
31//! one for all blocks except for the last one.
32//!
33//! ## Endianness
34//!
35//! The first property is the endianness of the bytes: in big-endian codes, the
36//! first byte contains the highest (most significant) bits, whereas in
37//! little-endian codes, the first byte contains the lowest (less significant)
38//! bits.
39//!
40//! The advantage of the big-endian variant is that is lexicographical, that is,
41//! comparing lexicographically a stream of encoded natural numbers will give
42//! the same results as comparing lexicographically the encoded numbers, much
43//! like it happens for [UTF-8 encoding](https://en.wikipedia.org/wiki/UTF-8).
44//!
45//! ## Completeness
46//!
47//! This basic representation discussed above is not *complete*, as there are
48//! sequences that are not used. For example, zero can be written in many ways
49//! (e.g., `0x00` or `0x80 0x00` ), but we are using only the single-byte
50//! representation. Uncompleteness leads to a (small) loss in compression.
51//!
52//! To have completeness, one can offset the representation in *k* bits by the
53//! maximum number representable using *k* − 1 bits. That is, we represent the
54//! interval [0..2⁷) with one byte, then the interval [2⁷..2⁷ + 2¹⁴] with two
55//! bytes, the interval [2⁷ + 2¹⁴..2⁷ + 2¹⁴ + 2²¹] with three bytes, and so on.
56//!
57//! ## Grouping
58//!
59//! In the basic representation, the continuation bit is the most significant
60//! bit of each byte. However, one can gather all continuation bits in the first
61//! byte ([as UTF-8 does](https://en.wikipedia.org/wiki/UTF-8)). This approach
62//! makes it possible to compute the length of the code using a call to
63//! [`usize::leading_ones`] on the first negated byte, which usually maps to a
64//! negation and a call to a fast instruction for the detection of the most
65//! significant bit, improving branch prediction.
66//!
67//! Note that if the code is grouped, choosing a code with the same endianness
68//! as your hardare can lead to a performance improvement, as after the first
69//! byte the rest of the code can be read with a
70//.
71//!
72//! ## Sign
73//!
74//! It is possible to extend the codes to represent signed integers. Two
75//! possible approaches are using a [bijective mapping](crate::codes::ToInt)
76//! between the integers and the natural numbers, or defining a specialized
77//! code.
78//!
79//! # Implementations
80//!
81//! We provide two unsigned, ungrouped, complete representations, one
82//! [big-endian](VByteBeRead) and one [little-endian](VByteLeRead). We recommend
83//! in general big-endian version as it is lexicographical. However, the
84//! big-endian version needs a small buffer when writing, so on some hardware
85//! the little-endian version might be faster.
86//!
87//! Note that the endianness of the code is independent from the endianness of
88//! the underlying bit stream, as it just depend on the order in which bytes are
89//! written.
90//!
91//! Since this code is byte-aligned, we provide also convenient, fast methods
92//! [`vbyte_write`] and [`vbyte_read`] that can be used on types implementing
93//! [`std::io::Read`] and [`std::io::Write`]. There are also endianness-specific
94//! methods [`vbyte_write_be`], [`vbyte_write_le`], [`vbyte_read_be`], and
95//! [`vbyte_read_le`].
96//!
97//! [`LEB128`]: https://en.wikipedia.org/wiki/LEB128
98//! [`VLQ`]: https://en.wikipedia.org/wiki/Variable-length_quantity
99//!
100//! # Examples
101//!
102//! - The [LEB128](https://en.wikipedia.org/wiki/LEB128) code used by LLVM is a
103//! little-endian incomplete ungrouped representation. There is both a signed
104//! and an unsigned version; the signed version represent negative numbers
105//! using two's complement.
106//!
107//! - The [code used by
108//! git](https://github.com/git/git/blob/7fb6aefd2aaffe66e614f7f7b83e5b7ab16d4806/varint.c)
109//! is a big-endian complete ungrouped representation.
110//!
111//! - [This implementation in
112//! folly](https://github.com/facebook/folly/blob/dd4a5eb057afbc3c7c7da71801df2ee3c61c47d1/folly/Varint.h#L109)
113//! is a little-endian incomplete ungrouped representation.
114
115use crate::traits::*;
116
117/// Return the length of the variable-length byte code for `value` in bytes.
118#[must_use]
119#[inline(always)]
120pub fn byte_len_vbyte(mut value: u64) -> usize {
121 let mut len = 1;
122 loop {
123 value >>= 7;
124 if value == 0 {
125 return len;
126 }
127 value -= 1;
128 len += 1;
129 }
130}
131
132/// Return the length of the variable-length byte code for `value` in bits.
133#[must_use]
134#[inline(always)]
135pub fn bit_len_vbyte(value: u64) -> usize {
136 8 * byte_len_vbyte(value)
137}
138
139/// Trait for reading big-endian variable-length byte codes.
140///
141/// Note that the endianess of the code is independent
142/// from the endianness of the underlying bit stream.
143pub trait VByteBeRead<E: Endianness>: BitRead<E> {
144 fn read_vbyte_be(&mut self) -> Result<u64, Self::Error>;
145}
146
147/// Trait for reading little-endian variable-length byte codes.
148///
149/// Note that the endianess of the code is independent
150/// from the endianness of the underlying bit stream.
151pub trait VByteLeRead<E: Endianness>: BitRead<E> {
152 fn read_vbyte_le(&mut self) -> Result<u64, Self::Error>;
153}
154
155impl<E: Endianness, B: BitRead<E>> VByteBeRead<E> for B {
156 #[inline(always)]
157 fn read_vbyte_be(&mut self) -> Result<u64, Self::Error> {
158 let mut byte = self.read_bits(8)?;
159 let mut value = byte & 0x7F;
160 while (byte >> 7) != 0 {
161 value += 1;
162 byte = self.read_bits(8)?;
163 value = (value << 7) | (byte & 0x7F);
164 }
165 Ok(value)
166 }
167}
168
169impl<E: Endianness, B: BitRead<E>> VByteLeRead<E> for B {
170 #[inline(always)]
171 fn read_vbyte_le(&mut self) -> Result<u64, Self::Error> {
172 let mut result = 0;
173 let mut shift = 0;
174 loop {
175 let byte = self.read_bits(8)?;
176 result += (byte & 0x7F) << shift;
177 if (byte >> 7) == 0 {
178 break;
179 }
180 shift += 7;
181 result += 1 << shift;
182 }
183 Ok(result)
184 }
185}
186
187/// Trait for write big-endian variable-length byte codes.
188///
189/// Note that the endianess of the code is independent
190/// from the endianness of the underlying bit stream.
191pub trait VByteBeWrite<E: Endianness>: BitWrite<E> {
192 fn write_vbyte_be(&mut self, value: u64) -> Result<usize, Self::Error>;
193}
194/// Trait for write little-endian variable-length byte codes.
195///
196/// Note that the endianess of the code is independent
197/// from the endianness of the underlying bit stream.
198pub trait VByteLeWrite<E: Endianness>: BitWrite<E> {
199 fn write_vbyte_le(&mut self, value: u64) -> Result<usize, Self::Error>;
200}
201
202impl<E: Endianness, B: BitWrite<E>> VByteBeWrite<E> for B {
203 #[inline(always)]
204 fn write_vbyte_be(&mut self, mut value: u64) -> Result<usize, Self::Error> {
205 let mut buf = [0u8; 10];
206 let mut pos = buf.len() - 1;
207 buf[pos] = (value & 0x7F) as u8;
208 value >>= 7;
209 while value != 0 {
210 value -= 1;
211 pos -= 1;
212 buf[pos] = 0x80 | (value & 0x7F) as u8;
213 value >>= 7;
214 }
215 let bytes_to_write = buf.len() - pos;
216 for &byte in &buf[pos..] {
217 self.write_bits(byte as u64, 8)?;
218 }
219 Ok(bytes_to_write * 8)
220 }
221}
222
223impl<E: Endianness, B: BitWrite<E>> VByteLeWrite<E> for B {
224 #[inline(always)]
225 fn write_vbyte_le(&mut self, mut value: u64) -> Result<usize, Self::Error> {
226 let mut len = 1;
227 loop {
228 let byte = (value & 0x7F) as u8;
229 value >>= 7;
230 if value != 0 {
231 self.write_bits((byte | 0x80) as u64, 8)?;
232 } else {
233 self.write_bits(byte as u64, 8)?;
234 break;
235 }
236 value -= 1;
237 len += 1;
238 }
239 Ok(len * 8)
240 }
241}
242
243/// Write a natural number to a byte stream using variable-length byte codes and
244/// return the number of bytes written.
245///
246/// This method just delegates to the correct endianness-specific method.
247#[inline(always)]
248pub fn vbyte_write<E: Endianness, W: std::io::Write>(
249 value: u64,
250 writer: &mut W,
251) -> std::io::Result<usize> {
252 if core::any::TypeId::of::<E>() == core::any::TypeId::of::<BigEndian>() {
253 vbyte_write_be(value, writer)
254 } else {
255 vbyte_write_le(value, writer)
256 }
257}
258
259/// Encode a natural number to a big-endian byte stream using variable-length
260/// byte codes and return the number of bytes written.
261#[inline(always)]
262pub fn vbyte_write_be<W: std::io::Write>(mut value: u64, w: &mut W) -> std::io::Result<usize> {
263 let mut buf = [0u8; 10];
264 let mut pos = buf.len() - 1;
265 buf[pos] = (value & 0x7F) as u8;
266 value >>= 7;
267 while value != 0 {
268 value -= 1;
269 pos -= 1;
270 buf[pos] = 0x80 | (value & 0x7F) as u8;
271 value >>= 7;
272 }
273 let bytes_to_write = buf.len() - pos;
274 w.write_all(&buf[pos..])?;
275 Ok(bytes_to_write)
276}
277
278/// Encode a natural number to a little-endian byte stream using variable-length
279/// byte codes and return the number of bytes written.
280#[inline(always)]
281pub fn vbyte_write_le<W: std::io::Write>(mut value: u64, writer: &mut W) -> std::io::Result<usize> {
282 let mut len = 1;
283 loop {
284 let byte = (value & 0x7F) as u8;
285 value >>= 7;
286 if value != 0 {
287 writer.write_all(&[byte | 0x80])?;
288 } else {
289 writer.write_all(&[byte])?;
290 break;
291 }
292 value -= 1;
293 len += 1;
294 }
295 Ok(len)
296}
297
298#[inline(always)]
299/// Decode a natural number from a byte stream using variable-length byte codes.
300///
301/// This method just delegates to the correct endianness-specific method.
302pub fn vbyte_read<E: Endianness, R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
303 if core::any::TypeId::of::<E>() == core::any::TypeId::of::<BigEndian>() {
304 vbyte_read_be(reader)
305 } else {
306 vbyte_read_le(reader)
307 }
308}
309
310/// Decode a natural number from a big-endian byte stream using variable-length
311/// byte codes.
312#[inline(always)]
313pub fn vbyte_read_be<R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
314 let mut buf = [0u8; 1];
315 let mut value: u64;
316 reader.read_exact(&mut buf)?;
317 value = (buf[0] & 0x7F) as u64;
318 while (buf[0] >> 7) != 0 {
319 value += 1;
320 reader.read_exact(&mut buf)?;
321 value = (value << 7) | ((buf[0] & 0x7F) as u64);
322 }
323 Ok(value)
324}
325
326/// Decode a natural number from a little-endian byte stream using
327/// variable-length byte codes.
328#[inline(always)]
329pub fn vbyte_read_le<R: std::io::Read>(reader: &mut R) -> std::io::Result<u64> {
330 let mut result = 0;
331 let mut shift = 0;
332 let mut buffer = [0; 1];
333 loop {
334 reader.read_exact(&mut buffer)?;
335 let byte = buffer[0];
336 result += ((byte & 0x7F) as u64) << shift;
337 if (byte >> 7) == 0 {
338 break;
339 }
340 shift += 7;
341 result += 1 << shift;
342 }
343 Ok(result)
344}
345
346#[cfg(test)]
347mod test {
348 use super::*;
349
350 const UPPER_BOUND_1: u64 = 128;
351 const UPPER_BOUND_2: u64 = 128_u64.pow(2) + UPPER_BOUND_1;
352 const UPPER_BOUND_3: u64 = 128_u64.pow(3) + UPPER_BOUND_2;
353 const UPPER_BOUND_4: u64 = 128_u64.pow(4) + UPPER_BOUND_3;
354 const UPPER_BOUND_5: u64 = 128_u64.pow(5) + UPPER_BOUND_4;
355 const UPPER_BOUND_6: u64 = 128_u64.pow(6) + UPPER_BOUND_5;
356 const UPPER_BOUND_7: u64 = 128_u64.pow(7) + UPPER_BOUND_6;
357 const UPPER_BOUND_8: u64 = 128_u64.pow(8) + UPPER_BOUND_7;
358
359 macro_rules! impl_tests {
360 ($test_name:ident, $E:ty) => {
361 #[test]
362 fn $test_name() {
363 const MAX: usize = 1 << 20;
364 const MIN: usize = 0;
365 let mut buffer = std::io::Cursor::new(Vec::with_capacity(128));
366 let mut lens = Vec::new();
367
368 for i in MIN..MAX {
369 lens.push(vbyte_write::<$E, _>(i as _, &mut buffer).unwrap());
370 }
371 buffer.set_position(0);
372 for (i, l) in (MIN..MAX).zip(lens.iter()) {
373 let j = vbyte_read::<$E, _>(&mut buffer).unwrap();
374 assert_eq!(byte_len_vbyte(i as _), *l);
375 assert_eq!(j, i as u64);
376 }
377
378 let values = [
379 0,
380 UPPER_BOUND_1 - 1,
381 UPPER_BOUND_1 + 1,
382 UPPER_BOUND_2 - 1,
383 UPPER_BOUND_2 + 1,
384 UPPER_BOUND_3 - 1,
385 UPPER_BOUND_3 + 1,
386 UPPER_BOUND_4 - 1,
387 UPPER_BOUND_4 + 1,
388 UPPER_BOUND_5 - 1,
389 UPPER_BOUND_5 + 1,
390 UPPER_BOUND_6 - 1,
391 UPPER_BOUND_6 + 1,
392 UPPER_BOUND_7 - 1,
393 UPPER_BOUND_7 + 1,
394 UPPER_BOUND_8 - 1,
395 UPPER_BOUND_8 + 1,
396 u64::MAX,
397 ];
398
399 let tell: u64 = buffer.position();
400 for &i in values.iter() {
401 vbyte_write::<$E, _>(i, &mut buffer).unwrap();
402 }
403 buffer.set_position(tell);
404 for &i in values.iter() {
405 assert_eq!(i, vbyte_read::<$E, _>(&mut buffer).unwrap());
406 }
407 }
408 };
409 }
410
411 impl_tests!(test_vbytes_be, BE);
412 impl_tests!(test_vbytes_le, LE);
413}