utf8_chars/
lib.rs

1#![cfg_attr(feature="bench", feature(test))]
2
3#![deny(warnings)]
4#![allow(clippy::needless_doctest_main)]
5#![allow(clippy::needless_lifetimes)]
6#![doc(test(attr(deny(warnings))))]
7#![doc(test(attr(allow(dead_code))))]
8#![doc(test(attr(allow(unused_variables))))]
9
10#[cfg(all(feature="bench", test))]
11extern crate test;
12
// Attaches the README text as documentation of a throwaway alias.
// NOTE(review): presumably this exists so that code examples in the README
// are picked up as doctests — confirm against the crate's test setup.
#[doc=include_str!("../README.md")]
type _DocTestReadme = ();
15
16use std::fmt::{self};
17use std::char::{self};
18use std::error::{Error};
19use std::io::{self, BufRead};
20use arrayvec::{ArrayVec};
21
/// A structure, containing read bytes, and an [`io::Error`].
///
/// The `io::Error` is an actual I/O error if some occurred,
/// or a synthetic error with either the [`UnexpectedEof`](std::io::ErrorKind::UnexpectedEof)
/// kind if a multi-byte char was unexpectedly terminated,
/// or the [`InvalidData`](std::io::ErrorKind::InvalidData)
/// kind if no actual I/O error occurred, but the read byte sequence was not recognised as valid UTF-8.
#[derive(Debug)]
pub struct ReadCharError {
    // Bytes collected for the char that failed to decode; the capacity is
    // bounded by `CHAR_MAX_LEN`, so no heap allocation is involved.
    bytes: ArrayVec<u8, { CHAR_MAX_LEN as usize }>,
    // Either the real I/O error, or the synthetic one describing the failure.
    io_error: io::Error,
}
34
35impl ReadCharError {
36    /// A byte sequence, representing an invalid or incomplete UTF-8-encoded char.
37    pub fn as_bytes(&self) -> &[u8] { &self.bytes }
38    /// Returns a reference to the I/O error.
39    pub fn as_io_error(&self) -> &io::Error { &self.io_error }
40    /// Consumes the `ReadCharError`, returning the I/O error.
41    pub fn into_io_error(self) -> io::Error { self.io_error }
42}
43
44impl Error for ReadCharError {
45    fn source(&self) -> Option<&(dyn Error + 'static)> { Some(&self.io_error) }
46}
47
48impl From<ReadCharError> for io::Error {
49    fn from(e: ReadCharError) -> io::Error { e.into_io_error() }
50}
51
52impl fmt::Display for ReadCharError {
53    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54        write!(f, "invalid UTF-8 byte sequence")?;
55        for b in self.as_bytes() {
56            write!(f, " {b:02X}")?;
57        }
58        write!(f, " read")?;
59        match self.as_io_error().kind() {
60            io::ErrorKind::InvalidData => { },
61            io::ErrorKind::UnexpectedEof => { write!(f, " (unexpected EOF)")?; }
62            _ => { write!(f, " ({})", self.as_io_error())?; }
63        }
64        Ok(())
65    }
66}
67
/// An iterator over the chars of an instance of [`BufRead`].
///
/// In contrast to [`CharsRaw`], the error type is
/// [`io::Error`], and therefore more likely to be drop-in
/// compatible, at the price of losing the UTF-8 context bytes in the error
/// message.
///
/// This struct is generally created by calling
/// [`chars`](BufReadCharsExt::chars) on a [`BufRead`].
// Holds a mutable borrow of the reader rather than owning it.
#[derive(Debug)]
pub struct Chars<'a, T: BufRead + ?Sized>(&'a mut T);
79
80impl<'a, T: BufRead + ?Sized> Iterator for Chars<'a, T> {
81    type Item = io::Result<char>;
82
83    fn next(&mut self) -> Option<Self::Item> {
84        self.0.read_char_raw().map_err(|x| x.into_io_error()).transpose()
85    }
86}
87
/// An iterator over the chars of an instance of [`BufRead`].
///
/// This struct is generally created by calling [`chars_raw`](BufReadCharsExt::chars_raw)
/// on a [`BufRead`].
// Holds a mutable borrow of the reader rather than owning it.
#[derive(Debug)]
pub struct CharsRaw<'a, T: BufRead + ?Sized>(&'a mut T);
94
95impl<'a, T: BufRead + ?Sized> Iterator for CharsRaw<'a, T> {
96    type Item = Result<char, ReadCharError>;
97
98    fn next(&mut self) -> Option<Self::Item> {
99        self.0.read_char_raw().transpose()
100    }
101}
102
/// Maximum number of bytes in one encoded char; also the capacity of the
/// byte buffer stored in `ReadCharError`.
const CHAR_MAX_LEN: u8 = 4;
/// Mask selecting the payload bits of a lead byte, indexed by the number of
/// tail bytes that follow it.
const LEAD_BYTE_MASK: [u8; CHAR_MAX_LEN as usize] = [0x7F, 0x1F, 0x0F, 0x07];
/// Mask selecting the payload bits of a tail byte.
const TAIL_BYTE_MASK: u8 = 0x3F;
/// Value the non-payload bits of a tail byte must have (`10xxxxxx`).
const TAIL_BYTE_SIGNATURE: u8 = 0x80;
/// Number of payload bits contributed by each tail byte.
const TAIL_BYTE_BITS_COUNT: u8 = 6;
/// Smallest code point accepted for a sequence, indexed by its number of
/// tail bytes; smaller decoded values are rejected as invalid data.
const CHAR_MIN_VALUE: [u32; CHAR_MAX_LEN as usize] = [0, 0x80, 0x800, 0x10000];
109
/// Peeks at the next byte of `reader` without consuming it.
///
/// Returns `Ok(None)` when the reader's buffer comes back empty, i.e. at EOF.
/// Errors of kind [`io::ErrorKind::Interrupted`] are retried; any other error
/// is handed back to the caller unchanged.
fn read_byte_and_ignore_interrupts(reader: &mut (impl BufRead + ?Sized)) -> io::Result<Option<u8>> {
    loop {
        match reader.fill_buf() {
            Ok(available) => return Ok(available.first().copied()),
            // An interrupted read is not a failure: just ask again.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}
122
123/// Extends [`BufRead`] with methods for reading chars.
124pub trait BufReadCharsExt : BufRead {
125    /// Returns an iterator over the chars of this reader.
126    ///
127    /// In contrast to [`chars_raw`](BufReadCharsExt::chars_raw), the error type is
128    /// [`io::Error`], and therefore more likely to be drop-in
129    /// compatible, at the price of losing the UTF-8 context bytes in the error
130    /// message.
131    ///
132    /// The iterator returned from this function will yield instances of
133    /// [`io::Result`]`<char>`.
134    fn chars(&mut self) -> Chars<'_, Self> { Chars(self) }
135
136    /// Returns an iterator over the chars of this reader.
137    ///
138    /// The iterator returned from this function will yield instances of
139    /// [`Result`]`<char, `[`ReadCharError`]`>`.
140    fn chars_raw(&mut self) -> CharsRaw<'_, Self> { CharsRaw(self) }
141
142    /// Reads a char from the underlying reader.
143    ///
144    /// In contrast to [`read_char_raw`](BufReadCharsExt::read_char_raw), the error type is
145    /// [`io::Error`], and therefore more likely to be drop-in
146    /// compatible, at the price of losing the UTF-8 context bytes in the error
147    /// message.
148    ///
149    /// Returns
150    /// - `Ok(Some(char))` if a char has successfully read,
151    /// - `Ok(None)` if the stream has reached EOF before any byte was read,
152    /// - `Err(err)` if an I/O error occurred, or read byte sequence was not recognised as a valid UTF-8.
153    ///
154    /// If this function encounters an error of the kind
155    /// [`io::ErrorKind::Interrupted`]
156    /// then the error is ignored and the operation will continue.
157    fn read_char(&mut self) -> io::Result<Option<char>> {
158        self.read_char_raw().map_err(|x| x.into_io_error())
159    }
160
161    /// Reads a char from the underlying reader.
162    ///
163    /// Returns
164    /// - `Ok(Some(char))` if a char has successfully read,
165    /// - `Ok(None)` if the stream has reached EOF before any byte was read,
166    /// - `Err(err)` if an I/O error occurred, or read byte sequence was not recognised as a valid UTF-8.
167    ///
168    /// If this function encounters an error of the kind
169    /// [`io::ErrorKind::Interrupted`]
170    /// then the error is ignored and the operation will continue.
171    fn read_char_raw(&mut self) -> Result<Option<char>, ReadCharError> {
172        match read_byte_and_ignore_interrupts(self) {
173            Err(e) => Err(ReadCharError { bytes: ArrayVec::new(), io_error: e }),
174            Ok(None) => Ok(None),
175            Ok(Some(lead_byte)) => {
176                self.consume(1);
177                let leading_ones = lead_byte.leading_ones();
178                if leading_ones == 0 { return Ok(Some(char::from(lead_byte))); }
179                if leading_ones == 1 || leading_ones > 4 {
180                    let mut bytes = ArrayVec::new();
181                    bytes.push(lead_byte);
182                    return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
183                }
184                let mut bytes = ArrayVec::new();
185                bytes.push(lead_byte);
186                let tail_bytes_count = (leading_ones - 1) as u8;
187                let mut item =
188                    ((lead_byte & LEAD_BYTE_MASK[tail_bytes_count as usize]) as u32)
189                    << (TAIL_BYTE_BITS_COUNT * tail_bytes_count)
190                ;
191                for tail_byte_index in (0 .. tail_bytes_count).rev() {
192                    match read_byte_and_ignore_interrupts(self) {
193                        Err(e) => return Err(ReadCharError { bytes, io_error: e }),
194                        Ok(None) => return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::UnexpectedEof) }),
195                        Ok(Some(tail_byte)) => {
196                            if tail_byte & !TAIL_BYTE_MASK != TAIL_BYTE_SIGNATURE {
197                                return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
198                            }
199                            bytes.push(tail_byte);
200                            item |=
201                                ((tail_byte & TAIL_BYTE_MASK) as u32)
202                                << (tail_byte_index * TAIL_BYTE_BITS_COUNT)
203                            ;
204                            self.consume(1);
205                        }
206                    }
207                }
208                if item < CHAR_MIN_VALUE[tail_bytes_count as usize] {
209                    return Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) });
210                }
211                match char::from_u32(item) {
212                    None => Err(ReadCharError { bytes, io_error: io::Error::from(io::ErrorKind::InvalidData) }),
213                    Some(item) => Ok(Some(item))
214                }
215            }
216        }
217    }
218}
219
// Blanket implementation: every `BufRead`, including unsized ones such as
// `dyn BufRead`, gets the char-reading methods from the trait's defaults.
impl<T: BufRead + ?Sized> BufReadCharsExt for T { }
221
#[cfg(test)]
mod tests {
    use quickcheck_macros::quickcheck;
    use std::io::{BufRead, BufReader, ErrorKind};
    use crate::{BufReadCharsExt, ReadCharError};

    /// Mixed ASCII / Cyrillic sample ending with a NUL char.
    const SAMPLE_TEXT: &str = "ABcd АБвгд U\0";
    /// The chars of [`SAMPLE_TEXT`], in order.
    const SAMPLE_CHARS: [char; 13] = ['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', '\0'];

    /// Runs `chars_raw` over `input` and collects every yielded item.
    fn decode_raw(input: &[u8]) -> Vec<Result<char, ReadCharError>> {
        BufReader::new(input).chars_raw().collect()
    }

    /// Runs `chars_raw` over `input`, panicking on the first error.
    fn decode_ok(input: &[u8]) -> Vec<char> {
        decode_raw(input).into_iter().map(|x| x.unwrap()).collect()
    }

    /// Asserts that `chars_raw` yields exactly one item for `input`, and that
    /// it is an error; returns that error.
    fn sole_error(input: &[u8]) -> ReadCharError {
        let mut res = decode_raw(input);
        assert_eq!(1, res.len());
        res.pop().unwrap().err().unwrap()
    }

    /// Asserts that the whole `input` is reported as a single error of the
    /// given kind, carrying all of the input bytes.
    fn assert_rejected_whole(input: &[u8], kind: ErrorKind) {
        let err = sole_error(input);
        assert_eq!(input, err.as_bytes());
        assert_eq!(kind, err.as_io_error().kind());
    }

    /// Asserts that `chars` yields exactly one item for `input`, and that it
    /// is an error; returns the kind of that error.
    fn sole_io_error_kind(input: &[u8]) -> ErrorKind {
        let res = BufReader::new(input).chars().collect::<Vec<_>>();
        assert_eq!(1, res.len());
        res[0].as_ref().err().unwrap().kind()
    }

    #[test]
    fn read_valid_unicode() {
        assert_eq!(SAMPLE_CHARS.to_vec(), decode_ok(SAMPLE_TEXT.as_bytes()));
    }

    #[test]
    fn edgecase_one_two_bytes() {
        assert_eq!(vec!['\x7F'], decode_ok(&[0x7F]));
        assert_eq!(vec!['\u{0080}'], decode_ok(&[0xC2, 0x80]));

        assert_rejected_whole(&[0xC2], ErrorKind::UnexpectedEof);
        assert_rejected_whole(&[0xC1, 0xBF], ErrorKind::InvalidData);
    }

    #[test]
    fn edgecase_two_three_bytes() {
        assert_eq!(vec!['\u{07FF}'], decode_ok(&[0xDF, 0xBF]));
        assert_eq!(vec!['\u{0800}'], decode_ok(&[0xE0, 0xA0, 0x80]));

        assert_rejected_whole(&[0xE0, 0xA0], ErrorKind::UnexpectedEof);
        assert_rejected_whole(&[0xE0, 0x9F, 0xBF], ErrorKind::InvalidData);
    }

    #[test]
    fn edgecase_three_four_bytes() {
        assert_eq!(vec!['\u{00FFFF}'], decode_ok(&[0xEF, 0xBF, 0xBF]));
        assert_eq!(vec!['\u{010000}'], decode_ok(&[0xF0, 0x90, 0x80, 0x80]));

        assert_rejected_whole(&[0xF0, 0x90, 0x80], ErrorKind::UnexpectedEof);
        assert_rejected_whole(&[0xF0, 0x8F, 0xBF, 0xBF], ErrorKind::InvalidData);
    }

    #[test]
    fn edgecase_four_bytes_max() {
        assert_eq!(vec!['\u{10FFFF}'], decode_ok(&[0xF4, 0x8F, 0xBF, 0xBF]));

        // An unsupported lead byte is reported alone; decoding then resumes
        // with the byte that follows it.
        let res = decode_raw(&[0xF8, 0x41]);
        assert_eq!(2, res.len());
        let err = res[0].as_ref().err().unwrap();
        assert_eq!(&[0xF8][..], err.as_bytes());
        assert_eq!(ErrorKind::InvalidData, err.as_io_error().kind());
        assert_eq!(&'A', res[1].as_ref().unwrap());

        // Now we want to force `read_char` to make this call:
        assert_eq!(None, std::char::from_u32(0x00110000));
        // Sadly, there is no more specific way to test this.
        assert_rejected_whole(&[0xF4, 0x90, 0x80, 0x80], ErrorKind::InvalidData);
    }

    #[test]
    fn read_io_valid_unicode() {
        let decoded = BufReader::new(SAMPLE_TEXT.as_bytes())
            .chars()
            .map(|x| x.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(SAMPLE_CHARS.to_vec(), decoded);
    }

    #[test]
    fn read_valid_unicode_from_dyn_read() {
        let bytes: &mut dyn BufRead = &mut BufReader::new("ABcd АБвгд UV".as_bytes());
        let decoded = bytes.chars_raw().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(
            vec!['A', 'B', 'c', 'd', ' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'],
            decoded
        );
    }

    #[test]
    fn do_not_take_extra_bytes() {
        let mut bytes = BufReader::new("ABcd АБвгд UV".as_bytes());

        let head = bytes.chars_raw().take(4).map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(vec!['A', 'B', 'c', 'd'], head);

        // A second iterator over the same reader continues where the first stopped.
        let rest = bytes.chars_raw().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(vec![' ', 'А', 'Б', 'в', 'г', 'д', ' ', 'U', 'V'], rest);
    }

    #[test]
    fn read_value_out_of_range() {
        let input: &[u8] = &[0xF5, 0x8F, 0xBF, 0xBF];
        let err = sole_error(input);
        assert_eq!(input, err.as_bytes());
    }

    #[test]
    fn read_io_value_out_of_range() {
        assert_eq!(ErrorKind::InvalidData, sole_io_error_kind(&[0xF5, 0x8F, 0xBF, 0xBF]));
    }

    #[test]
    fn read_io_incomplete_twobyte() {
        // 0xC3 0xA4 = 'ä'
        assert_eq!(ErrorKind::UnexpectedEof, sole_io_error_kind(&[0xC3]));
    }

    #[test]
    fn read_io_incomplete_threebyte() {
        // 0xE1 0xBA 0xB9 = 'ẹ'
        assert_eq!(ErrorKind::UnexpectedEof, sole_io_error_kind(&[0xE1, 0xBA]));
    }

    #[test]
    fn read_surrogate() {
        let input: &[u8] = &[0xED, 0xA0, 0x80];
        let err = sole_error(input);
        assert_eq!(input, err.as_bytes());
    }

    #[test]
    fn read_invalid_sequences() {
        let input: &[u8] = &[0x81, 0x82, 0xC1, 0x07, 0xC1, 0x87, 0xC2, 0xC2, 0x82, 0xF7, 0x88, 0x89, 0x07];
        // `Ok` is a decoded char, `Err` the bytes carried by the reported error.
        let expected: [Result<char, &[u8]>; 9] = [
            Err(&[0x81][..]),
            Err(&[0x82][..]),
            Err(&[0xC1][..]),
            Ok('\x07'),
            Err(&[0xC1, 0x87][..]),
            Err(&[0xC2][..]),
            Ok('\u{82}'),
            Err(&[0xF7, 0x88, 0x89][..]),
            Ok('\x07'),
        ];

        let res = decode_raw(input);
        assert_eq!(expected.len(), res.len());
        for (want, got) in expected.iter().zip(&res) {
            match want {
                Ok(c) => assert_eq!(c, got.as_ref().unwrap()),
                Err(b) => assert_eq!(*b, got.as_ref().err().unwrap().as_bytes()),
            }
        }
    }

    #[quickcheck]
    fn read_string(s: String) -> bool {
        let decoded: String = BufReader::new(s.as_bytes())
            .chars_raw()
            .map(|c| c.unwrap())
            .collect();
        s == decoded
    }

    #[quickcheck]
    fn read_array(b: Vec<u8>) -> bool {
        // Re-encoding decoded chars and splicing in the bytes carried by
        // errors must reproduce the input exactly.
        let mut rebuilt = Vec::new();
        for item in BufReader::new(&b[..]).chars_raw() {
            match item {
                Ok(c) => rebuilt.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()),
                Err(e) => rebuilt.extend_from_slice(e.as_bytes()),
            }
        }
        b == rebuilt
    }
}
426
#[cfg(all(feature="bench", test))]
mod benchs {
    use rand::distributions::{Distribution, Uniform};
    use rand::thread_rng;
    use std::hint::black_box;
    use std::io::BufReader;
    use std::vec::{Vec};
    use test::Bencher;
    use crate::{BufReadCharsExt};

    /// Measures decoding of 10000 uniformly random bytes with `chars_raw`.
    #[bench]
    fn read_array_bench(b: &mut Bencher) {
        let mut rng = thread_rng();
        let byte_range = Uniform::new_inclusive(0u8, 255u8);
        let mut payload: Vec<u8> = byte_range
            .sample_iter(&mut rng)
            .take(10000)
            .collect();
        b.iter(move || {
            // Keep the optimizer from treating the input as a known constant.
            black_box(&mut payload);
            let mut reader = BufReader::new(&payload[..]);
            black_box(reader.chars_raw().last());
        });
    }
}