// rw_utils/string_read.rs

use std::char::decode_utf16;
use std::io;
use std::io::{Error, ErrorKind, Read};
use std::mem::size_of;

5///
6/// Trait that provides various methods to read strings.
7/// Automatically implemented for all implementations of io::Read.
8/// This trait is sealed and cannot be implemented manually.
9///
10pub trait StringRead : private::Sealed {
11
12    ///
13    /// Reads an u32 in little endian length prefix followed by that amount of bytes.
14    /// The bytes are then parsed as utf-8.
15    ///
16    fn read_string_u16_le_len_utf8(&mut self) -> io::Result<String>;
17
18    ///
19    /// Reads an u32 in big endian length prefix followed by that amount of bytes.
20    /// The bytes are then parsed as utf-8.
21    ///
22    fn read_string_u16_be_len_utf8(&mut self) -> io::Result<String>;
23
24    ///
25    /// Reads an u32 in little endian length prefix followed by that amount of bytes.
26    /// The bytes are then parsed as utf-8.
27    ///
28    fn read_string_u32_le_len_utf8(&mut self) -> io::Result<String>;
29
30    ///
31    /// Reads an u32 in big endian length prefix followed by that amount of bytes.
32    /// The bytes are then parsed as utf-8.
33    ///
34    fn read_string_u32_be_len_utf8(&mut self) -> io::Result<String>;
35
36    ///
37    /// Reads until zero byte and treats all bytes read as utf-8 string.
38    ///
39    fn read_string_zero_terminated_utf8(&mut self) -> io::Result<String>;
40
41    ///
42    /// Read given amount of bytes and treat them as UTF-8 string.
43    ///
44    fn read_string_utf8(&mut self, size: usize) -> io::Result<String>;
45
46    ///
47    /// Read given amount of characters of an utf-16-be string.
48    ///
49    fn read_string_utf16_be(&mut self, size_in_characters: usize) -> io::Result<String>;
50
51    ///
52    /// Read given amount of characters of an utf-16-le string.
53    ///
54    fn read_string_utf16_le(&mut self, size_in_characters: usize) -> io::Result<String>;
55
56    ///
57    /// Read given amount of characters of an utf-32-be string.
58    ///
59    fn read_string_utf32_be(&mut self, size_in_characters: usize) -> io::Result<String>;
60
61    ///
62    /// Read given amount of characters of an utf-32-le string.
63    ///
64    fn read_string_utf32_le(&mut self, size_in_characters: usize) -> io::Result<String>;
65
66    ///
67    /// Reads a string that was produced by a java program using the java.io.DataOutput#writeUTF facility.
68    /// In general, it reads an u16 in big endian to indicate how many further bytes are needed.
69    /// It then parses the data to create an utf-16 u16 array.
70    /// Each u16 consists of 1, 2 or 3 bytes encoded in a custom java specific encoding.
71    /// After parsing all the data the u16 array contains utf-16
72    /// data that will be turned into a String using String::from_utf16.
73    ///
74    fn read_java_data_input_utf(&mut self) -> io::Result<String>;
75}
76
77impl <T> StringRead for T where T: Read {
78    fn read_string_u16_le_len_utf8(&mut self) -> io::Result<String> {
79        let mut len_bytes = [0u8; 2];
80        self.read_exact(len_bytes.as_mut_slice())?;
81        let len = u16::from_le_bytes(len_bytes);
82        return self.read_string_utf8(len as usize);
83    }
84
85    fn read_string_u16_be_len_utf8(&mut self) -> io::Result<String> {
86        let mut len_bytes = [0u8; 2];
87        self.read_exact(len_bytes.as_mut_slice())?;
88        let len = u16::from_be_bytes(len_bytes);
89        return self.read_string_utf8(len as usize);
90    }
91
92    fn read_string_u32_le_len_utf8(&mut self) -> io::Result<String> {
93        let mut len_bytes = [0u8; 4];
94        self.read_exact(len_bytes.as_mut_slice())?;
95        let len = u32::from_le_bytes(len_bytes);
96        return self.read_string_utf8(len as usize);
97    }
98
99    fn read_string_u32_be_len_utf8(&mut self) -> io::Result<String> {
100        let mut len_bytes = [0u8; 4];
101        self.read_exact(len_bytes.as_mut_slice())?;
102        let len = u32::from_be_bytes(len_bytes);
103        return self.read_string_utf8(len as usize);
104    }
105
106    fn read_string_zero_terminated_utf8(&mut self) -> io::Result<String> {
107        let mut data = Vec::with_capacity(64);
108        let mut buf = [0u8];
109        let sl = buf.as_mut_slice();
110        loop {
111            self.read_exact(sl)?;
112            if sl[0] == 0 {
113                break;
114            }
115
116            data.push(sl[0]);
117        }
118
119        return String::from_utf8(data).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-8 data"));
120    }
121
122    fn read_string_utf8(&mut self, size: usize) -> io::Result<String> {
123        let mut data = vec![0u8; size];
124        self.read_exact(data.as_mut_slice())?;
125        return String::from_utf8(data).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-8 data"));
126    }
127
128    #[cfg(target_endian = "little")]
129    fn read_string_utf16_be(&mut self, size_in_characters: usize) -> io::Result<String> {
130        if size_in_characters == 0 {
131            return Ok("".to_string());
132        }
133
134        let mut data = vec![0u8; size_in_characters<<1];
135        self.read_exact(data.as_mut_slice())?;
136
137        let sl :&mut [u16] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
138        for i in 0 .. sl.len() {
139            sl[i] = sl[i].to_be();
140        }
141
142        if sl[0] == 0xFFFE {
143            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
144        }
145
146        return String::from_utf16(sl).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-16 data"));
147    }
148
149    #[cfg(target_endian = "big")]
150    fn read_string_utf16_be(&mut self, size_in_characters: usize) -> io::Result<String> {
151        if size_in_characters == 0 {
152            return Ok("".to_string());
153        }
154
155        let mut data = vec![0u8; size_in_characters<<1];
156        self.read_exact(data.as_mut_slice())?;
157
158        let sl :&[u16] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
159
160        if sl[0] == 0xFFFE {
161            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
162        }
163
164        return String::from_utf16(sl).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-16 data"));
165    }
166
167    #[cfg(target_endian = "little")]
168    fn read_string_utf16_le(&mut self, size_in_characters: usize) -> io::Result<String> {
169        if size_in_characters == 0 {
170            return Ok("".to_string());
171        }
172
173        let mut data = vec![0u8; size_in_characters<<1];
174        self.read_exact(data.as_mut_slice())?;
175
176        let sl :&[u16] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
177
178        if sl[0] == 0xFFFE {
179            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
180        }
181
182        return String::from_utf16(sl).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-16 data"));
183    }
184
185    #[cfg(target_endian = "big")]
186    fn read_string_utf16_le(&mut self, size_in_characters: usize) -> io::Result<String> {
187        if size_in_characters == 0 {
188            return Ok("".to_string());
189        }
190
191        let mut data = vec![0u8; size_in_characters<<1];
192        self.read_exact(data.as_mut_slice())?;
193
194        let sl :&mut [u16] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
195        for i in 0 .. sl.len() {
196            sl[i] = sl[i].to_le();
197        }
198
199        if sl[0] == 0xFFFE {
200            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
201        }
202
203        return String::from_utf16(sl).map_err(|_e| Error::new(ErrorKind::InvalidData, "invalid utf-16 data"));
204    }
205
206    #[cfg(target_endian = "big")]
207    fn read_string_utf32_be(&mut self, size_in_characters: usize) -> io::Result<String> {
208        if size_in_characters == 0 {
209            return Ok("".to_string());
210        }
211        let mut data = vec![0u32; size_in_characters];
212        let sl : &mut [u8] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters * size_of::<char>()) };
213        self.read_exact(sl)?;
214
215        if data[0] == 0xFFFE0000u32 {
216            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
217        }
218
219        for i in 0 .. data.len() {
220            let cur = data[i];
221
222            if char::from_u32(cur).is_none() {
223                return Err(Error::new(ErrorKind::InvalidData, format!("{} is not a valid unicode codepoint.", cur)));
224            }
225        }
226
227        let sl : &mut [char] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
228        let str : String = sl.iter().collect();
229        return Ok(str);
230    }
231
232    #[cfg(target_endian = "big")]
233    fn read_string_utf32_le(&mut self, size_in_characters: usize) -> io::Result<String> {
234        if size_in_characters == 0 {
235            return Ok("".to_string());
236        }
237        let mut data = vec![0u32; size_in_characters];
238        let sl : &mut [u8] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters * 4) };
239        self.read_exact(sl)?;
240
241        if data[0].to_le() == 0xFFFE0000u32 {
242            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
243        }
244
245        for i in 0 .. data.len() {
246            let cur = data[i].to_le();
247            data[i] = cur;
248
249            if char::from_u32(cur).is_none() {
250                return Err(Error::new(ErrorKind::InvalidData, format!("{} is not a valid unicode codepoint.", cur)));
251            }
252        }
253
254        let sl : &mut [char] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
255        let str : String = sl.iter().collect();
256        return Ok(str);
257    }
258
259    #[cfg(target_endian = "little")]
260    fn read_string_utf32_be(&mut self, size_in_characters: usize) -> io::Result<String> {
261        if size_in_characters == 0 {
262            return Ok("".to_string());
263        }
264        let mut data = vec![0u32; size_in_characters];
265        let sl : &mut [u8] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters * 4) };
266        self.read_exact(sl)?;
267
268        if data[0].to_be() == 0xFFFE0000u32 {
269            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
270        }
271
272        for i in 0 .. data.len() {
273            let cur = data[i].to_be();
274            data[i] = cur;
275
276            if char::from_u32(cur).is_none() {
277                return Err(Error::new(ErrorKind::InvalidData, format!("{} is not a valid unicode codepoint.", cur)));
278            }
279        }
280
281        let sl : &mut [char] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
282        let str : String = sl.iter().collect();
283        return Ok(str);
284    }
285
286    #[cfg(target_endian = "little")]
287    fn read_string_utf32_le(&mut self, size_in_characters: usize) -> io::Result<String> {
288        if size_in_characters == 0 {
289            return Ok("".to_string());
290        }
291        let mut data = vec![0u32; size_in_characters];
292        let sl : &mut [u8] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters * size_of::<char>()) };
293        self.read_exact(sl)?;
294
295        if data[0] == 0xFFFE0000u32 {
296            return Err(Error::new(ErrorKind::InvalidData, "Encountered byte order mark 0xFFFE. This indicates a wrong byte order.".to_string()));
297        }
298
299        for i in 0 .. data.len() {
300            let cur = data[i];
301
302            if char::from_u32(cur).is_none() {
303                return Err(Error::new(ErrorKind::InvalidData, format!("{} is not a valid unicode codepoint.", cur)));
304            }
305        }
306
307        let sl : &mut [char] = unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr().cast(), size_in_characters) };
308        let str : String = sl.iter().collect();
309        return Ok(str);
310    }
311
312    fn read_java_data_input_utf(&mut self) -> io::Result<String> {
313        let mut buf = [0u8; 2];
314        self.read_exact(buf.as_mut_slice())?;
315        //this is always big endian in java
316        let byte_count = (buf[0] as u16 >> 8 | buf[1] as u16) as usize;
317
318        let mut buf = vec![0u8; byte_count];
319        self.read_exact(buf.as_mut_slice())?;
320
321        //This is optimistic alloc and works if we only send ascii;
322        let mut characters: Vec<u16> = Vec::with_capacity(byte_count);
323
324        let mut index = 0usize;
325        while index < buf.len() {
326            let c = buf[index] as u32;
327
328            match c >> 4 {
329                0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 => {
330                    characters.push(c as u16);
331                    index += 1;
332                }
333                12 | 13 => {
334                    if index + 2 > buf.len() {
335                        return Err(Error::new(ErrorKind::InvalidData, "Invalid input"));
336                    }
337                    let c2 = buf[index + 1] as u32;
338                    index += 2;
339                    if (c2 & 0xC0) != 0x80 {
340                        return Err(Error::new(ErrorKind::InvalidData, "Invalid input"));
341                    }
342
343                    let v = ((c & 0x1F) << 6) | (c2 & 0x3F);
344                    characters.push(v as u16)
345                }
346                14 => {
347                    if index + 3 > buf.len() {
348                        return Err(Error::new(ErrorKind::InvalidData, "Invalid input"));
349                    }
350                    let c2 = buf[index + 1] as u32;
351                    let c3 = buf[index + 2] as u32;
352                    index += 3;
353                    if ((c2 & 0xC0) != 0x80) || ((c3 & 0xC0) != 0x80) {
354                        return Err(Error::new(ErrorKind::InvalidData, "Invalid input"));
355                    }
356                    let v = ((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | ((c3 & 0x3F) << 0);
357                    characters.push(v as u16)
358                }
359                _ => {
360                    return Err(Error::new(ErrorKind::InvalidData, "Invalid input"));
361                }
362            }
363        }
364
365        let result = String::from_utf16(&characters).map_err(|_| Error::new(ErrorKind::InvalidData, "Invalid input"))?;
366        Ok(result)
367    }
368}
369
// Implementation detail of the sealed trait pattern used by `StringRead`.
// The module itself is private, so code outside of this file cannot name `Sealed`
// and therefore cannot implement any trait that requires it as a supertrait.
mod private {
    use std::io::Read;

    /// Marker trait without any methods.
    pub trait Sealed {}

    // Every reader is marked, mirroring the blanket implementation of the public trait.
    impl<T> Sealed for T where T: Read {}
}