unicode_reader/
codepoints.rs

1// Copyright (c) 2016-2021 William R. Fraser
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::error::Error;
10use std::fmt;
11use std::io;
12use std::str;
13
14use smallvec::SmallVec;
15
/// Wraps a byte-oriented reader and yields the UTF-8 data one code point at a time.
///
/// Construct one with `CodePoints::from(bytes)`, where `bytes` is any iterator over
/// `io::Result<u8>`.
///
/// UTF-8 parsing errors are raised as `io::Error`: `ErrorKind::InvalidData` for bytes that do not
/// form a valid code point, and `ErrorKind::UnexpectedEof` when the input ends while undecoded
/// bytes are still buffered. I/O errors produced by the underlying iterator are passed through.
pub struct CodePoints<R: Iterator<Item = io::Result<u8>>> {
    /// Source of raw bytes.
    input: R,
    /// Bytes pulled from `input` that have not yet been yielded as a `char` or reported in an
    /// error. Presumably given four inline slots because that is the longest UTF-8 encoding of a
    /// single code point.
    buffer: SmallVec<[u8; 4]>,
}
22
23impl<R: Iterator<Item = io::Result<u8>>> Iterator for CodePoints<R> {
24    /// The type of the elements being iterated over: a `io::Result` with one Unicode code point
25    /// (as a `char`), or any I/O error raised by the underlying reader, or any error encountered
26    /// while trying to parse the byte stream as UTF-8.
27    type Item = io::Result<char>;
28
29    /// Get the next Unicode code point from the stream. Any malformed UTF-8 data will be returned
30    /// as an `io::Error` with `ErrorKind::InvalidData`, including if the stream reaches EOF before
31    /// a complete code point is read (which is returned as `ErrorKind::UnexpectedEof`). Any I/O
32    /// error raised by the underlying stream will be returned as well.
33    fn next(&mut self) -> Option<Self::Item> {
34        loop {
35            if !self.buffer.is_empty() {
36                // See if we have a valid codepoint.
37                match str::from_utf8(&self.buffer) {
38                    Ok(s) => {
39                        let mut chars = s.chars();
40                        let c = chars.next().unwrap();
41                        if c.len_utf8() < self.buffer.len() {
42                            self.buffer = SmallVec::from_slice(&self.buffer[c.len_utf8()..]);
43                        } else {
44                            self.buffer.clear();
45                        }
46                        return Some(Ok(c));
47                    }
48                    Err(e) => {
49                        if self.buffer.len() - e.valid_up_to() >= 4 {
50                            // If we have 4 bytes that still don't make up a valid code point, then
51                            // we have garbage.
52                            // Remove leading bytes until either the buffer is empty, or we have a
53                            // valid code point.
54                            let mut split_point = 1;
55                            let mut badbytes = vec![];
56                            loop {
57                                let (bad, rest) = self.buffer.split_at(split_point);
58                                if rest.is_empty() || str::from_utf8(rest).is_ok() {
59                                    badbytes.extend_from_slice(bad);
60                                    self.buffer = SmallVec::from_slice(rest);
61                                    break;
62                                }
63                                split_point += 1;
64                            }
65
66                            // Raise the error. If we still have data in the buffer, it will be
67                            // returned on the next loop.
68                            return Some(Err(io::Error::new(io::ErrorKind::InvalidData,
69                                                           BadUtf8Error { bytes: badbytes })));
70                        }
71                        // else: We probably have a partial code point. Keep reading bytes to find
72                        // out.
73                    }
74                }
75            }
76            match self.input.next() {
77                Some(Ok(byte)) => {
78                    self.buffer.push(byte);
79                }
80                None => {
81                    if self.buffer.is_empty() {
82                        return None;
83                    } else {
84                        // Invalid utf-8 at end of stream.
85                        let bytes = self.buffer.to_vec();
86                        self.buffer = SmallVec::new();
87                        return Some(Err(io::Error::new(io::ErrorKind::UnexpectedEof,
88                                                       BadUtf8Error { bytes })));
89                    }
90                }
91                Some(Err(e)) => {
92                    return Some(Err(e));
93                }
94            }
95        }
96    }
97}
98
99impl<R: Iterator<Item = io::Result<u8>>> From<R> for CodePoints<R> {
100    fn from(input: R) -> CodePoints<R> {
101        CodePoints {
102            input,
103            buffer: SmallVec::new(),
104        }
105    }
106}
107
/// An error raised when parsing a UTF-8 byte stream fails.
///
/// `CodePoints` does not return this type directly: it wraps it in an `io::Error` whose kind is
/// `InvalidData` or `UnexpectedEof`. Callers that need the offending bytes can recover it with
/// `io::Error::get_ref` followed by `downcast_ref::<BadUtf8Error>()`.
#[derive(Debug)]
pub struct BadUtf8Error {
    /// The bytes that could not be parsed as a code point, in the order they were read.
    pub bytes: Vec<u8>,
}
114
impl Error for BadUtf8Error {
    /// Returns the fixed string `"BadUtf8Error"`.
    ///
    /// `Error::description` is deprecated in the standard library in favor of `Display`; the
    /// `Display` impl for this type is the one that includes the offending bytes.
    fn description(&self) -> &str {
        "BadUtf8Error"
    }
}
120
121impl fmt::Display for BadUtf8Error {
122    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
123        write!(f, "Bad UTF-8: {:?}", self.bytes)
124    }
125}