unicode_reader/codepoints.rs
1// Copyright (c) 2016-2021 William R. Fraser
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::error::Error;
10use std::fmt;
11use std::io;
12use std::str;
13
14use smallvec::SmallVec;
15
16/// Wraps a byte-oriented reader and yields the UTF-8 data one code point at a time.
17/// Any UTF-8 parsing errors are raised as `io::Error` with `ErrorKind::InvalidData`.
18pub struct CodePoints<R: Iterator<Item = io::Result<u8>>> {
19 input: R,
20 buffer: SmallVec<[u8; 4]>,
21}
22
23impl<R: Iterator<Item = io::Result<u8>>> Iterator for CodePoints<R> {
24 /// The type of the elements being iterated over: a `io::Result` with one Unicode code point
25 /// (as a `char`), or any I/O error raised by the underlying reader, or any error encountered
26 /// while trying to parse the byte stream as UTF-8.
27 type Item = io::Result<char>;
28
29 /// Get the next Unicode code point from the stream. Any malformed UTF-8 data will be returned
30 /// as an `io::Error` with `ErrorKind::InvalidData`, including if the stream reaches EOF before
31 /// a complete code point is read (which is returned as `ErrorKind::UnexpectedEof`). Any I/O
32 /// error raised by the underlying stream will be returned as well.
33 fn next(&mut self) -> Option<Self::Item> {
34 loop {
35 if !self.buffer.is_empty() {
36 // See if we have a valid codepoint.
37 match str::from_utf8(&self.buffer) {
38 Ok(s) => {
39 let mut chars = s.chars();
40 let c = chars.next().unwrap();
41 if c.len_utf8() < self.buffer.len() {
42 self.buffer = SmallVec::from_slice(&self.buffer[c.len_utf8()..]);
43 } else {
44 self.buffer.clear();
45 }
46 return Some(Ok(c));
47 }
48 Err(e) => {
49 if self.buffer.len() - e.valid_up_to() >= 4 {
50 // If we have 4 bytes that still don't make up a valid code point, then
51 // we have garbage.
52 // Remove leading bytes until either the buffer is empty, or we have a
53 // valid code point.
54 let mut split_point = 1;
55 let mut badbytes = vec![];
56 loop {
57 let (bad, rest) = self.buffer.split_at(split_point);
58 if rest.is_empty() || str::from_utf8(rest).is_ok() {
59 badbytes.extend_from_slice(bad);
60 self.buffer = SmallVec::from_slice(rest);
61 break;
62 }
63 split_point += 1;
64 }
65
66 // Raise the error. If we still have data in the buffer, it will be
67 // returned on the next loop.
68 return Some(Err(io::Error::new(io::ErrorKind::InvalidData,
69 BadUtf8Error { bytes: badbytes })));
70 }
71 // else: We probably have a partial code point. Keep reading bytes to find
72 // out.
73 }
74 }
75 }
76 match self.input.next() {
77 Some(Ok(byte)) => {
78 self.buffer.push(byte);
79 }
80 None => {
81 if self.buffer.is_empty() {
82 return None;
83 } else {
84 // Invalid utf-8 at end of stream.
85 let bytes = self.buffer.to_vec();
86 self.buffer = SmallVec::new();
87 return Some(Err(io::Error::new(io::ErrorKind::UnexpectedEof,
88 BadUtf8Error { bytes })));
89 }
90 }
91 Some(Err(e)) => {
92 return Some(Err(e));
93 }
94 }
95 }
96 }
97}
98
99impl<R: Iterator<Item = io::Result<u8>>> From<R> for CodePoints<R> {
100 fn from(input: R) -> CodePoints<R> {
101 CodePoints {
102 input,
103 buffer: SmallVec::new(),
104 }
105 }
106}
107
/// Error payload describing input bytes that could not be decoded as UTF-8.
///
/// `CodePoints` wraps this in an `io::Error` when decoding fails.
#[derive(Debug)]
pub struct BadUtf8Error {
    /// The raw bytes that failed to decode as a code point.
    pub bytes: Vec<u8>,
}

impl fmt::Display for BadUtf8Error {
    /// Writes `Bad UTF-8: ` followed by the `Debug` rendering of the offending bytes.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!("Bad UTF-8: {:?}", self.bytes))
    }
}

impl Error for BadUtf8Error {
    /// Returns the fixed name of this error type.
    fn description(&self) -> &str {
        "BadUtf8Error"
    }
}