parquet_format_safe/thrift/varint/
decode.rs

1use std::io;
2use std::io::Result;
3use std::mem::size_of;
4
5/// Most-significant byte, == 0x80
6pub const MSB: u8 = 0b1000_0000;
7/// All bits except for the most significant. Can be used as bitmask to drop the most-signficant
8/// bit using `&` (binary-and).
9const DROP_MSB: u8 = 0b0111_1111;
10
11/// How many bytes an integer uses when being encoded as a VarInt.
12#[inline]
13fn required_encoded_space_unsigned(mut v: u64) -> usize {
14    if v == 0 {
15        return 1;
16    }
17
18    let mut logcounter = 0;
19    while v > 0 {
20        logcounter += 1;
21        v >>= 7;
22    }
23    logcounter
24}
25
26/// How many bytes an integer uses when being encoded as a VarInt.
27#[inline]
28fn required_encoded_space_signed(v: i64) -> usize {
29    required_encoded_space_unsigned(zigzag_encode(v))
30}
31
32/// Varint (variable length integer) encoding, as described in
33/// https://developers.google.com/protocol-buffers/docs/encoding.
34///
35/// Uses zigzag encoding (also described there) for signed integer representation.
36pub trait VarInt: Sized + Copy {
37    /// Returns the number of bytes this number needs in its encoded form. Note: This varies
38    /// depending on the actual number you want to encode.
39    fn required_space(self) -> usize;
40    /// Decode a value from the slice. Returns the value and the number of bytes read from the
41    /// slice (can be used to read several consecutive values from a big slice)
42    /// return None if all bytes has MSB set.
43    fn decode_var(src: &[u8]) -> Option<(Self, usize)>;
44    /// Encode a value into the slice. The slice must be at least `required_space()` bytes long.
45    /// The number of bytes taken by the encoded integer is returned.
46    fn encode_var(self, src: &mut [u8]) -> usize;
47
48    /// Helper: Encode a value and return the encoded form as Vec. The Vec must be at least
49    /// `required_space()` bytes long.
50    fn encode_var_vec(self) -> Vec<u8> {
51        let mut v = Vec::new();
52        v.resize(self.required_space(), 0);
53        self.encode_var(&mut v);
54        v
55    }
56}
57
58#[inline]
59fn zigzag_encode(from: i64) -> u64 {
60    ((from << 1) ^ (from >> 63)) as u64
61}
62
63// see: http://stackoverflow.com/a/2211086/56332
64// casting required because operations like unary negation
65// cannot be performed on unsigned integers
66#[inline]
67fn zigzag_decode(from: u64) -> i64 {
68    ((from >> 1) ^ (-((from & 1) as i64)) as u64) as i64
69}
70
71pub(crate) trait VarIntMaxSize {
72    fn varint_max_size() -> usize;
73}
74
75impl<VI: VarInt> VarIntMaxSize for VI {
76    fn varint_max_size() -> usize {
77        (size_of::<VI>() * 8 + 7) / 7
78    }
79}
80
81macro_rules! impl_varint {
82    ($t:ty, unsigned) => {
83        impl VarInt for $t {
84            fn required_space(self) -> usize {
85                required_encoded_space_unsigned(self as u64)
86            }
87
88            fn decode_var(src: &[u8]) -> Option<(Self, usize)> {
89                let (n, s) = u64::decode_var(src)?;
90                Some((n as Self, s))
91            }
92
93            fn encode_var(self, dst: &mut [u8]) -> usize {
94                (self as u64).encode_var(dst)
95            }
96        }
97    };
98    ($t:ty, signed) => {
99        impl VarInt for $t {
100            fn required_space(self) -> usize {
101                required_encoded_space_signed(self as i64)
102            }
103
104            fn decode_var(src: &[u8]) -> Option<(Self, usize)> {
105                let (n, s) = i64::decode_var(src)?;
106                Some((n as Self, s))
107            }
108
109            fn encode_var(self, dst: &mut [u8]) -> usize {
110                (self as i64).encode_var(dst)
111            }
112        }
113    };
114}
115
116impl_varint!(u32, unsigned);
117impl_varint!(u16, unsigned);
118impl_varint!(u8, unsigned);
119
120impl_varint!(i32, signed);
121impl_varint!(i16, signed);
122impl_varint!(i8, signed);
123
124// Below are the "base implementations" doing the actual encodings; all other integer types are
125// first cast to these biggest types before being encoded.
126
127impl VarInt for u64 {
128    fn required_space(self) -> usize {
129        required_encoded_space_unsigned(self)
130    }
131
132    #[inline]
133    fn decode_var(src: &[u8]) -> Option<(Self, usize)> {
134        let mut result: u64 = 0;
135        let mut shift = 0;
136
137        let mut success = false;
138        for b in src.iter() {
139            let msb_dropped = b & DROP_MSB;
140            result |= (msb_dropped as u64) << shift;
141            shift += 7;
142
143            if b & MSB == 0 || shift > (9 * 7) {
144                success = b & MSB == 0;
145                break;
146            }
147        }
148
149        if success {
150            Some((result, shift / 7))
151        } else {
152            None
153        }
154    }
155
156    #[inline]
157    fn encode_var(self, dst: &mut [u8]) -> usize {
158        assert!(dst.len() >= self.required_space());
159        let mut n = self;
160        let mut i = 0;
161
162        while n >= 0x80 {
163            dst[i] = MSB | (n as u8);
164            i += 1;
165            n >>= 7;
166        }
167
168        dst[i] = n as u8;
169        i + 1
170    }
171}
172
173impl VarInt for i64 {
174    fn required_space(self) -> usize {
175        required_encoded_space_signed(self)
176    }
177
178    #[inline]
179    fn decode_var(src: &[u8]) -> Option<(Self, usize)> {
180        if let Some((result, size)) = u64::decode_var(src) {
181            Some((zigzag_decode(result) as Self, size))
182        } else {
183            None
184        }
185    }
186
187    #[inline]
188    fn encode_var(self, dst: &mut [u8]) -> usize {
189        assert!(dst.len() >= self.required_space());
190        let mut n: u64 = zigzag_encode(self as i64);
191        let mut i = 0;
192
193        while n >= 0x80 {
194            dst[i] = MSB | (n as u8);
195            i += 1;
196            n >>= 7;
197        }
198
199        dst[i] = n as u8;
200        i + 1
201    }
202}
203
204/// A trait for reading VarInts from any other `Reader`.
205///
206/// It's recommended to use a buffered reader, as many small reads will happen.
207pub trait VarIntReader {
208    /// Returns either the decoded integer, or an error.
209    ///
210    /// In general, this always reads a whole varint. If the encoded varint's value is bigger
211    /// than the valid value range of `VI`, then the value is truncated.
212    ///
213    /// On EOF, an io::Error with io::ErrorKind::UnexpectedEof is returned.
214    fn read_varint<VI: VarInt>(&mut self) -> Result<VI>;
215}
216
217/// VarIntProcessor encapsulates the logic for decoding a VarInt byte-by-byte.
218#[derive(Default)]
219pub(crate) struct VarIntProcessor {
220    buf: [u8; 10],
221    maxsize: usize,
222    pub i: usize,
223}
224
225impl VarIntProcessor {
226    pub fn new<VI: VarIntMaxSize>() -> VarIntProcessor {
227        VarIntProcessor {
228            maxsize: VI::varint_max_size(),
229            ..VarIntProcessor::default()
230        }
231    }
232
233    pub fn push(&mut self, b: u8) -> Result<()> {
234        if self.i >= self.maxsize {
235            return Err(io::Error::new(
236                io::ErrorKind::InvalidData,
237                "Unterminated varint",
238            ));
239        }
240        self.buf[self.i] = b;
241        self.i += 1;
242        Ok(())
243    }
244
245    pub fn finished(&self) -> bool {
246        self.i > 0 && (self.buf[self.i - 1] & MSB == 0)
247    }
248
249    pub fn decode<VI: VarInt>(&self) -> Option<VI> {
250        Some(VI::decode_var(&self.buf[0..self.i])?.0)
251    }
252}
253
254impl<R: io::Read> VarIntReader for R {
255    fn read_varint<VI: VarInt>(&mut self) -> Result<VI> {
256        let mut buf = [0_u8; 1];
257        let mut p = VarIntProcessor::new::<VI>();
258
259        while !p.finished() {
260            let read = self.read(&mut buf)?;
261
262            // EOF
263            if read == 0 && p.i == 0 {
264                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "Reached EOF"));
265            }
266            if read == 0 {
267                break;
268            }
269
270            p.push(buf[0])?;
271        }
272
273        p.decode()
274            .ok_or_else(|| io::Error::new(io::ErrorKind::UnexpectedEof, "Reached EOF"))
275    }
276}