// dotnet_binary_io/reader.rs

#[cfg(feature = "std")]
use std::borrow::Cow;

use zerocopy::byteorder::{LE, U16};
use zerocopy::FromBytes;
6
7pub type Result<T> = core::result::Result<T, BinaryReaderError>;
8
/// Reads values from a slice of bytes. The values are encoded using the rules defined by .NET's
/// `System.IO.BinaryWriter`.
///
/// Most simple fixed-size types are simply encoded using the in-memory byte representation of
/// the type, using little-endian memory ordering if applicable.
///
/// Variable-length types, such as strings and variable-length integers, have different encodings.
/// Each of the methods that decodes such a type describes its representation.
///
/// This type only supports reading values from a slice of bytes. If you need to read values from
/// a file or `Read` implementation, then you should copy the data into an in-memory buffer first.
///
/// Another option is to use "restartable" decoding. Before calling any function that decodes a
/// value, save the `data` slice (or simply its length). Then, call a function to decode a value
/// (potentially multiple calls to decode multiple values). If any function fails with
/// `Err(BinaryReaderError::NeedsMoreData)`, then go read more data from the source and reset
/// `data` to point to the original location, plus any new data. Then repeat the calls that
/// decode data.
///
/// This is feasible and it may be necessary for some designs. However, simply reading data into
/// `Vec<u8>` or another in-memory container is likely to be simpler, less bug-prone, and
/// probably faster, too.
pub struct BinaryReader<'a> {
    /// The input data being parsed. Each time a value is parsed from `data`, `data` is reassigned
    /// to the remaining data.
    pub data: &'a [u8],
}
35
36impl<'a> BinaryReader<'a> {
37    /// Constructor
38    pub fn new(data: &'a [u8]) -> Self {
39        Self { data }
40    }
41
42    /// Reads a single `u8` value.
43    #[inline(always)]
44    pub fn read_u8(&mut self) -> Result<u8> {
45        if !self.data.is_empty() {
46            let value = self.data[0];
47            self.data = &self.data[1..];
48            Ok(value)
49        } else {
50            Err(BinaryReaderError::NeedsMoreData)
51        }
52    }
53
54    /// Reads a single `bool` value.
55    #[inline(always)]
56    pub fn read_bool(&mut self) -> Result<bool> {
57        if !self.data.is_empty() {
58            let value = self.data[0];
59            self.data = &self.data[1..];
60            Ok(value != 0)
61        } else {
62            Err(BinaryReaderError::NeedsMoreData)
63        }
64    }
65
66    /// Reads a slice of bytes whose length is `len`. This function returns a slice reference
67    /// to the bytes; it does not copy them.
68    #[inline(always)]
69    pub fn read_bytes(&mut self, len: usize) -> Result<&'a [u8]> {
70        if self.data.len() < len {
71            Err(BinaryReaderError::NeedsMoreData)
72        } else {
73            let (lo, hi) = self.data.split_at(len);
74            self.data = hi;
75            Ok(lo)
76        }
77    }
78
79    /// Reads a small array of bytes, with a constant length.
80    #[inline(always)]
81    pub fn read_cbytes<const N: usize>(&mut self) -> Result<[u8; N]> {
82        if self.data.len() < N {
83            Err(BinaryReaderError::NeedsMoreData)
84        } else {
85            let (lo, hi) = self.data.split_at(N);
86            self.data = hi;
87            // This unwrap() call will get optimized out.
88            Ok(*<&[u8; N]>::try_from(lo).unwrap())
89        }
90    }
91
92    /// Reads a small array of bytes, with a constant length.
93    #[inline(always)]
94    pub fn read_cbytes_ref<const N: usize>(&mut self) -> Result<&[u8; N]> {
95        if self.data.len() < N {
96            Err(BinaryReaderError::NeedsMoreData)
97        } else {
98            let (lo, hi) = self.data.split_at(N);
99            self.data = hi;
100            // This unwrap() call will get optimized out.
101            Ok(<&[u8; N]>::try_from(lo).unwrap())
102        }
103    }
104
105    /// Reads a `u16` in little-endian byte order.
106    #[inline(always)]
107    pub fn read_u16(&mut self) -> Result<u16> {
108        Ok(u16::from_le_bytes(self.read_cbytes()?))
109    }
110
111    /// Reads a `u32` in little-endian byte order.
112    #[inline(always)]
113    pub fn read_u32(&mut self) -> Result<u32> {
114        Ok(u32::from_le_bytes(self.read_cbytes()?))
115    }
116
117    /// Reads a `u64` in little-endian byte order.
118    #[inline(always)]
119    pub fn read_u64(&mut self) -> Result<u64> {
120        Ok(u64::from_le_bytes(self.read_cbytes()?))
121    }
122
123    /// Reads a `i16` in little-endian byte order.
124    #[inline(always)]
125    pub fn read_i16(&mut self) -> Result<i16> {
126        Ok(i16::from_le_bytes(self.read_cbytes()?))
127    }
128
129    /// Reads a `i32` in little-endian byte order.
130    #[inline(always)]
131    pub fn read_i32(&mut self) -> Result<i32> {
132        Ok(i32::from_le_bytes(self.read_cbytes()?))
133    }
134
135    /// Reads a `i64` in little-endian byte order.
136    #[inline(always)]
137    pub fn read_i64(&mut self) -> Result<i64> {
138        Ok(i64::from_le_bytes(self.read_cbytes()?))
139    }
140
141    /// Reads a variable-length integer and returns the value in `i32`.
142    pub fn read_7bit_encoded_i32(&mut self) -> Result<i32> {
143        // Each byte encodes 7 bits of the integer and 1 bit indicating whether there are
144        // more bytes following this one. Because 32 is not evenly divisible by 7, the last
145        // byte has some meaningless bits in them. We could validate those bits (rejecting
146        // input where the last byte contains non-zero meaningless bits), but that would be
147        // stricter than the .NET implementation, so we do not.
148
149        const MORE: u8 = 0x80;
150
151        let mut shift: u32 = 0;
152        let mut n: u32 = 0;
153
154        loop {
155            let b = self.read_u8()?;
156            n |= ((b & 0x7f) as u32) << shift;
157
158            if (b & MORE) == 0 {
159                break;
160            }
161
162            shift += 7;
163            if shift >= 32 {
164                return Err(BinaryReaderError::Invalid);
165            }
166        }
167
168        Ok(n as i32)
169    }
170
171    /// Reads a variable-length integer and returns the value in `i64`.
172    pub fn read_7bit_encoded_i64(&mut self) -> Result<i64> {
173        const MORE: u8 = 0x80;
174
175        let mut shift: u32 = 0;
176        let mut n: u64 = 0;
177
178        loop {
179            let b = self.read_u8()?;
180            n |= ((b & 0x7f) as u64) << shift;
181
182            if (b & MORE) == 0 {
183                break;
184            }
185
186            shift += 7;
187            if shift >= 64 {
188                return Err(BinaryReaderError::Invalid);
189            }
190        }
191
192        Ok(n as i64)
193    }
194
195    /// Reads a length-prefixed UTF-8 string.
196    ///
197    /// This does not copy any data. It reads the prefixed length, locates the contents of the
198    /// string, then returns the string data as a `&[u8]`.
199    ///
200    /// The caller must handle validating that the string is well-formed UTF-8, if necessary.
201    pub fn read_utf8_bytes(&mut self) -> Result<&'a [u8]> {
202        let len_i32 = self.read_7bit_encoded_i32()?;
203        let Ok(len_usize) = usize::try_from(len_i32) else {
204            return Err(BinaryReaderError::Invalid);
205        };
206
207        self.read_bytes(len_usize)
208    }
209
210    /// Reads a length-prefixed UTF-8 string.
211    ///
212    /// This does not copy any data. It reads the prefixed length, locates the contents of the
213    /// string, then returns the string data as a `bstr::BStr`.
214    ///
215    /// The caller must handle validating that the string is well-formed UTF-8, if necessary.
216    ///
217    /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
218    /// UTF-16 strings, so applications will need to make sure that they call the correct
219    /// `read_utf8_*` or `read_utf16_*` function.
220    #[cfg(feature = "bstr")]
221    pub fn read_utf8_bstr(&mut self) -> Result<&'a bstr::BStr> {
222        Ok(bstr::BStr::new(self.read_utf8_bytes()?))
223    }
224
225    /// Reads a length-prefixed UTF-8 string and returns it as `&str`.
226    ///
227    /// This does not copy any data. It reads the prefixed length, locates the contents of the
228    /// string, validates that the contents are well-formed UTF-8 and returns the string slice.
229    ///
230    /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
231    /// UTF-16 strings, so applications will need to make sure that they call the correct
232    /// `read_utf8_*` or `read_utf16_*` function.
233    pub fn read_utf8_str(&mut self) -> Result<&'a str> {
234        let bytes = self.read_utf8_bytes()?;
235        if let Ok(s) = core::str::from_utf8(bytes) {
236            Ok(s)
237        } else {
238            Err(BinaryReaderError::NeedsMoreData)
239        }
240    }
241
242    /// Reads a length-prefixed UTF-8 string and returns it as `Cow<str>`.
243    ///
244    /// The input string is expected to be valid UTF-8. However, if the input contains byte
245    /// sequences that do not code for valid UTF-8, then those sequences will be replaced with
246    /// the Unicore replacement character and the rest of the string will be processed.
247    ///
248    /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
249    /// UTF-16 strings, so applications will need to make sure that they call the correct
250    /// `read_utf8_*` or `read_utf16_*` function.
251    #[cfg(feature = "std")]
252    pub fn read_utf8_string_lossy(&mut self) -> Result<Cow<'a, str>> {
253        let bytes = self.read_utf8_bytes()?;
254        Ok(String::from_utf8_lossy(bytes))
255    }
256
257    /// Reads a length-prefixed UTF-16 string and returns it as `&[U16<LE>]`.
258    ///
259    /// This does not copy any data. It reads the prefixed length, locates the contents of the
260    /// string, validates that the contents are the right size for UTF-16 (meaning: the length in
261    /// bytes is a multiple of 2) and returns the string slice.
262    ///
263    /// The caller is responsible for converting the returned slice to a different, more usable
264    /// form.
265    pub fn read_utf16_wchars(&mut self) -> Result<&'a [U16<LE>]> {
266        let bytes_len_i32 = self.read_7bit_encoded_i32()?;
267        let Ok(bytes_len_usize) = usize::try_from(bytes_len_i32) else {
268            return Err(BinaryReaderError::Invalid);
269        };
270
271        let bytes = self.read_bytes(bytes_len_usize)?;
272
273        let Ok(wchars) = <[U16<LE>]>::ref_from_bytes(bytes) else {
274            return Err(BinaryReaderError::Invalid);
275        };
276
277        Ok(wchars)
278    }
279
280    /// Reads a length-prefixed UTF-16 string and returns it as `String`.
281    ///
282    /// The input string is required to be well-formed UTF-16; if it contains illegal UTF-16 code
283    /// points or illegal surrogate sequences, then this function will return
284    /// `Err(ReaderError::Invalid)`.
285    ///
286    /// The length in bytes of the string is required to be a multiple of 2. If it is not, then
287    /// this function will return `Err(ReaderError::Invalid)`.
288    ///
289    /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
290    /// UTF-16 strings, so applications will need to make sure that they call the correct
291    /// `read_utf8_*` or `read_utf16_*` function.
292    #[cfg(feature = "std")]
293    pub fn read_utf16_string(&mut self) -> Result<String> {
294        let wchars = self.read_utf16_wchars()?;
295        let wchars_u16: Vec<u16> = wchars.iter().map(|c| c.get()).collect();
296        String::from_utf16(&wchars_u16).map_err(|_| BinaryReaderError::Invalid)
297    }
298
299    /// Reads a length-prefixed UTF-16 string and returns it as `String`.
300    ///
301    /// If the input sequence contains illegal UTF-16 code points or illegal surrogate sequences,
302    /// then this function will replace the illegal code units with the Unicode replacement
303    /// character.
304    ///
305    /// The length in bytes of the string is required to be a multiple of 2. If it is not, then
306    /// this function will return `Err(ReaderError::Invalid)`.
307    #[cfg(feature = "std")]
308    pub fn read_utf16_string_lossy(&mut self) -> Result<String> {
309        let wchars = self.read_utf16_wchars()?;
310        let wchars_u16: Vec<u16> = wchars.iter().map(|c| c.get()).collect();
311        Ok(String::from_utf16_lossy(&wchars_u16))
312    }
313}
314
/// Error type for `BinaryReader`
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum BinaryReaderError {
    /// A `read_*` method reached the end of the input data, but requires more data to finish
    /// reading the input.
    ///
    /// If a function returns this error value, then the encoded value may still be well-formed,
    /// if the rest of the data can be read. However, most of the `read_*` functions _do not_
    /// guarantee that they don't advance the read position, even if they return `NeedsMoreData`.
    NeedsMoreData,

    /// The `read_*` request found invalid data in the input. The input is malformed.
    Invalid,
}
329
330impl core::error::Error for BinaryReaderError {}
331
332impl core::fmt::Display for BinaryReaderError {
333    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
334        match self {
335            Self::NeedsMoreData => f.write_str(
336                "The value could not be decoded because the input data was not complete.",
337            ),
338            Self::Invalid => {
339                f.write_str("The value could not be decoded because the input data is malformed.")
340            }
341        }
342    }
343}
// dotnet_binary_io/reader.rs