dotnet_binary_io/reader.rs
1#[cfg(feature = "std")]
2use std::borrow::Cow;
3
4use zerocopy::byteorder::{LE, U16};
5use zerocopy::FromBytes;
6
/// Shorthand for `core::result::Result` with the error type fixed to [`BinaryReaderError`].
pub type Result<T> = core::result::Result<T, BinaryReaderError>;
8
/// Reads values from a slice of bytes. The values are encoded using the rules defined by .NET's
/// `System.IO.BinaryWriter`.
///
/// Most simple fixed-size types are simply encoded using the in-memory byte representation of the
/// type, using little-endian memory ordering if applicable.
///
/// Variable-length types, such as strings and variable-length integers, have different encodings.
/// Each of the methods that decodes such a type describes its representation.
///
/// This type only supports reading values from a slice of bytes. If you need to read values from
/// a file or `Read` implementation, then you should copy the data into an in-memory buffer first.
///
/// Another option is to use "restartable" decoding. Before calling any function that decodes a
/// value, save the `data` slice (or simply its length); cloning the reader also works. Then, call
/// a function to decode a value (potentially multiple calls to decode multiple values). If any
/// function fails with `Err(BinaryReaderError::NeedsMoreData)`, then go read more data from the
/// source and reset `data` to point to the original location, plus any new data. Then repeat the
/// calls that decode data.
///
/// This is feasible and it may be necessary for some designs. However, simply reading data into
/// `Vec<u8>` or another in-memory container is likely to be simpler, less bug-prone, and
/// probably faster, too.
#[derive(Clone, Debug)]
pub struct BinaryReader<'a> {
    /// The input data being parsed. Each time a value is parsed from `data`, `data` is reassigned
    /// to the remaining data.
    pub data: &'a [u8],
}
35
36impl<'a> BinaryReader<'a> {
37 /// Constructor
38 pub fn new(data: &'a [u8]) -> Self {
39 Self { data }
40 }
41
42 /// Reads a single `u8` value.
43 #[inline(always)]
44 pub fn read_u8(&mut self) -> Result<u8> {
45 if !self.data.is_empty() {
46 let value = self.data[0];
47 self.data = &self.data[1..];
48 Ok(value)
49 } else {
50 Err(BinaryReaderError::NeedsMoreData)
51 }
52 }
53
54 /// Reads a single `bool` value.
55 #[inline(always)]
56 pub fn read_bool(&mut self) -> Result<bool> {
57 if !self.data.is_empty() {
58 let value = self.data[0];
59 self.data = &self.data[1..];
60 Ok(value != 0)
61 } else {
62 Err(BinaryReaderError::NeedsMoreData)
63 }
64 }
65
66 /// Reads a slice of bytes whose length is `len`. This function returns a slice reference
67 /// to the bytes; it does not copy them.
68 #[inline(always)]
69 pub fn read_bytes(&mut self, len: usize) -> Result<&'a [u8]> {
70 if self.data.len() < len {
71 Err(BinaryReaderError::NeedsMoreData)
72 } else {
73 let (lo, hi) = self.data.split_at(len);
74 self.data = hi;
75 Ok(lo)
76 }
77 }
78
79 /// Reads a small array of bytes, with a constant length.
80 #[inline(always)]
81 pub fn read_cbytes<const N: usize>(&mut self) -> Result<[u8; N]> {
82 if self.data.len() < N {
83 Err(BinaryReaderError::NeedsMoreData)
84 } else {
85 let (lo, hi) = self.data.split_at(N);
86 self.data = hi;
87 // This unwrap() call will get optimized out.
88 Ok(*<&[u8; N]>::try_from(lo).unwrap())
89 }
90 }
91
92 /// Reads a small array of bytes, with a constant length.
93 #[inline(always)]
94 pub fn read_cbytes_ref<const N: usize>(&mut self) -> Result<&[u8; N]> {
95 if self.data.len() < N {
96 Err(BinaryReaderError::NeedsMoreData)
97 } else {
98 let (lo, hi) = self.data.split_at(N);
99 self.data = hi;
100 // This unwrap() call will get optimized out.
101 Ok(<&[u8; N]>::try_from(lo).unwrap())
102 }
103 }
104
105 /// Reads a `u16` in little-endian byte order.
106 #[inline(always)]
107 pub fn read_u16(&mut self) -> Result<u16> {
108 Ok(u16::from_le_bytes(self.read_cbytes()?))
109 }
110
111 /// Reads a `u32` in little-endian byte order.
112 #[inline(always)]
113 pub fn read_u32(&mut self) -> Result<u32> {
114 Ok(u32::from_le_bytes(self.read_cbytes()?))
115 }
116
117 /// Reads a `u64` in little-endian byte order.
118 #[inline(always)]
119 pub fn read_u64(&mut self) -> Result<u64> {
120 Ok(u64::from_le_bytes(self.read_cbytes()?))
121 }
122
123 /// Reads a `i16` in little-endian byte order.
124 #[inline(always)]
125 pub fn read_i16(&mut self) -> Result<i16> {
126 Ok(i16::from_le_bytes(self.read_cbytes()?))
127 }
128
129 /// Reads a `i32` in little-endian byte order.
130 #[inline(always)]
131 pub fn read_i32(&mut self) -> Result<i32> {
132 Ok(i32::from_le_bytes(self.read_cbytes()?))
133 }
134
135 /// Reads a `i64` in little-endian byte order.
136 #[inline(always)]
137 pub fn read_i64(&mut self) -> Result<i64> {
138 Ok(i64::from_le_bytes(self.read_cbytes()?))
139 }
140
141 /// Reads a variable-length integer and returns the value in `i32`.
142 pub fn read_7bit_encoded_i32(&mut self) -> Result<i32> {
143 // Each byte encodes 7 bits of the integer and 1 bit indicating whether there are
144 // more bytes following this one. Because 32 is not evenly divisible by 7, the last
145 // byte has some meaningless bits in them. We could validate those bits (rejecting
146 // input where the last byte contains non-zero meaningless bits), but that would be
147 // stricter than the .NET implementation, so we do not.
148
149 const MORE: u8 = 0x80;
150
151 let mut shift: u32 = 0;
152 let mut n: u32 = 0;
153
154 loop {
155 let b = self.read_u8()?;
156 n |= ((b & 0x7f) as u32) << shift;
157
158 if (b & MORE) == 0 {
159 break;
160 }
161
162 shift += 7;
163 if shift >= 32 {
164 return Err(BinaryReaderError::Invalid);
165 }
166 }
167
168 Ok(n as i32)
169 }
170
171 /// Reads a variable-length integer and returns the value in `i64`.
172 pub fn read_7bit_encoded_i64(&mut self) -> Result<i64> {
173 const MORE: u8 = 0x80;
174
175 let mut shift: u32 = 0;
176 let mut n: u64 = 0;
177
178 loop {
179 let b = self.read_u8()?;
180 n |= ((b & 0x7f) as u64) << shift;
181
182 if (b & MORE) == 0 {
183 break;
184 }
185
186 shift += 7;
187 if shift >= 64 {
188 return Err(BinaryReaderError::Invalid);
189 }
190 }
191
192 Ok(n as i64)
193 }
194
195 /// Reads a length-prefixed UTF-8 string.
196 ///
197 /// This does not copy any data. It reads the prefixed length, locates the contents of the
198 /// string, then returns the string data as a `&[u8]`.
199 ///
200 /// The caller must handle validating that the string is well-formed UTF-8, if necessary.
201 pub fn read_utf8_bytes(&mut self) -> Result<&'a [u8]> {
202 let len_i32 = self.read_7bit_encoded_i32()?;
203 let Ok(len_usize) = usize::try_from(len_i32) else {
204 return Err(BinaryReaderError::Invalid);
205 };
206
207 self.read_bytes(len_usize)
208 }
209
210 /// Reads a length-prefixed UTF-8 string.
211 ///
212 /// This does not copy any data. It reads the prefixed length, locates the contents of the
213 /// string, then returns the string data as a `bstr::BStr`.
214 ///
215 /// The caller must handle validating that the string is well-formed UTF-8, if necessary.
216 ///
217 /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
218 /// UTF-16 strings, so applications will need to make sure that they call the correct
219 /// `read_utf8_*` or `read_utf16_*` function.
220 #[cfg(feature = "bstr")]
221 pub fn read_utf8_bstr(&mut self) -> Result<&'a bstr::BStr> {
222 Ok(bstr::BStr::new(self.read_utf8_bytes()?))
223 }
224
225 /// Reads a length-prefixed UTF-8 string and returns it as `&str`.
226 ///
227 /// This does not copy any data. It reads the prefixed length, locates the contents of the
228 /// string, validates that the contents are well-formed UTF-8 and returns the string slice.
229 ///
230 /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
231 /// UTF-16 strings, so applications will need to make sure that they call the correct
232 /// `read_utf8_*` or `read_utf16_*` function.
233 pub fn read_utf8_str(&mut self) -> Result<&'a str> {
234 let bytes = self.read_utf8_bytes()?;
235 if let Ok(s) = core::str::from_utf8(bytes) {
236 Ok(s)
237 } else {
238 Err(BinaryReaderError::NeedsMoreData)
239 }
240 }
241
242 /// Reads a length-prefixed UTF-8 string and returns it as `Cow<str>`.
243 ///
244 /// The input string is expected to be valid UTF-8. However, if the input contains byte
245 /// sequences that do not code for valid UTF-8, then those sequences will be replaced with
246 /// the Unicore replacement character and the rest of the string will be processed.
247 ///
248 /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
249 /// UTF-16 strings, so applications will need to make sure that they call the correct
250 /// `read_utf8_*` or `read_utf16_*` function.
251 #[cfg(feature = "std")]
252 pub fn read_utf8_string_lossy(&mut self) -> Result<Cow<'a, str>> {
253 let bytes = self.read_utf8_bytes()?;
254 Ok(String::from_utf8_lossy(bytes))
255 }
256
257 /// Reads a length-prefixed UTF-16 string and returns it as `&[U16<LE>]`.
258 ///
259 /// This does not copy any data. It reads the prefixed length, locates the contents of the
260 /// string, validates that the contents are the right size for UTF-16 (meaning: the length in
261 /// bytes is a multiple of 2) and returns the string slice.
262 ///
263 /// The caller is responsible for converting the returned slice to a different, more usable
264 /// form.
265 pub fn read_utf16_wchars(&mut self) -> Result<&'a [U16<LE>]> {
266 let bytes_len_i32 = self.read_7bit_encoded_i32()?;
267 let Ok(bytes_len_usize) = usize::try_from(bytes_len_i32) else {
268 return Err(BinaryReaderError::Invalid);
269 };
270
271 let bytes = self.read_bytes(bytes_len_usize)?;
272
273 let Ok(wchars) = <[U16<LE>]>::ref_from_bytes(bytes) else {
274 return Err(BinaryReaderError::Invalid);
275 };
276
277 Ok(wchars)
278 }
279
280 /// Reads a length-prefixed UTF-16 string and returns it as `String`.
281 ///
282 /// The input string is required to be well-formed UTF-16; if it contains illegal UTF-16 code
283 /// points or illegal surrogate sequences, then this function will return
284 /// `Err(ReaderError::Invalid)`.
285 ///
286 /// The length in bytes of the string is required to be a multiple of 2. If it is not, then
287 /// this function will return `Err(ReaderError::Invalid)`.
288 ///
289 /// The encoded stream does not contain any information that distinguishes UTF-8 strings and
290 /// UTF-16 strings, so applications will need to make sure that they call the correct
291 /// `read_utf8_*` or `read_utf16_*` function.
292 #[cfg(feature = "std")]
293 pub fn read_utf16_string(&mut self) -> Result<String> {
294 let wchars = self.read_utf16_wchars()?;
295 let wchars_u16: Vec<u16> = wchars.iter().map(|c| c.get()).collect();
296 String::from_utf16(&wchars_u16).map_err(|_| BinaryReaderError::Invalid)
297 }
298
299 /// Reads a length-prefixed UTF-16 string and returns it as `String`.
300 ///
301 /// If the input sequence contains illegal UTF-16 code points or illegal surrogate sequences,
302 /// then this function will replace the illegal code units with the Unicode replacement
303 /// character.
304 ///
305 /// The length in bytes of the string is required to be a multiple of 2. If it is not, then
306 /// this function will return `Err(ReaderError::Invalid)`.
307 #[cfg(feature = "std")]
308 pub fn read_utf16_string_lossy(&mut self) -> Result<String> {
309 let wchars = self.read_utf16_wchars()?;
310 let wchars_u16: Vec<u16> = wchars.iter().map(|c| c.get()).collect();
311 Ok(String::from_utf16_lossy(&wchars_u16))
312 }
313}
314
/// Error type for `BinaryReader`
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum BinaryReaderError {
    /// A `read_*` method reached the end of the input data, but requires more data to finish
    /// reading the input.
    ///
    /// If a function returns this error value, then the encoded value may still be well-formed,
    /// if the rest of the data can be read. However, most of the `read_*` functions _do not_
    /// guarantee that they don't advance the read position, even if they return `NeedsMoreData`.
    NeedsMoreData,

    /// The `read_*` request found invalid data in the input. The input is malformed.
    Invalid,
}

// Marker impl: every `Error` method keeps its default, so no underlying source error is reported.
impl core::error::Error for BinaryReaderError {}
331
332impl core::fmt::Display for BinaryReaderError {
333 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
334 match self {
335 Self::NeedsMoreData => f.write_str(
336 "The value could not be decoded because the input data was not complete.",
337 ),
338 Self::Invalid => {
339 f.write_str("The value could not be decoded because the input data is malformed.")
340 }
341 }
342 }
343}