utf8_zero/read.rs
1use super::*;
2use std::error::Error;
3use std::fmt;
4use std::io::{self, BufRead};
5use std::str;
6use std::string::String;
7
8/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8.
9///
10/// # Examples
11///
12/// Lossy decoding of an in-memory byte stream:
13///
14/// ```
15/// use std::io::BufReader;
16/// use utf8_zero::BufReadDecoder;
17///
18/// let input = b"Hello \xF0\x9F\x8C\x8D\xC0world";
19/// let reader = BufReader::new(&input[..]);
20/// let output = BufReadDecoder::read_to_string_lossy(reader).unwrap();
21/// assert_eq!(output, "Hello \u{1F30D}\u{FFFD}world");
22/// ```
23///
24/// Strict chunk-by-chunk decoding:
25///
26/// ```
27/// use std::io::BufReader;
28/// use utf8_zero::{BufReadDecoder, BufReadDecoderError};
29///
30/// let input = b"ok\xFFend";
31/// let mut decoder = BufReadDecoder::new(BufReader::new(&input[..]));
32/// let mut parts = Vec::new();
33/// while let Some(result) = decoder.next_strict() {
34/// match result {
35/// Ok(s) => parts.push(format!("str:{s}")),
36/// Err(BufReadDecoderError::InvalidByteSequence(b)) => {
37/// parts.push(format!("err:{b:02x?}"));
38/// }
39/// Err(BufReadDecoderError::Io(e)) => panic!("io error: {e}"),
40/// }
41/// }
42/// assert_eq!(parts, vec!["str:ok", "err:[ff]", "str:end"]);
43/// ```
44pub struct BufReadDecoder<B: BufRead> {
45 buf_read: B,
46 bytes_consumed: usize,
47 incomplete: Incomplete,
48}
49
/// The error type produced by [`BufReadDecoder::next_strict()`].
#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
    /// A single UTF-8 error in the byte stream; the payload is the offending bytes.
    ///
    /// Lossy decoding substitutes U+FFFD for each such error
    /// (see [`BufReadDecoder::next_lossy`] and [`BufReadDecoderError::lossy`]).
    InvalidByteSequence(&'a [u8]),

    /// The underlying byte stream reported an I/O error.
    Io(io::Error),
}
62
63impl<'a> BufReadDecoderError<'a> {
64 /// Replace UTF-8 errors with U+FFFD
65 pub fn lossy(self) -> Result<&'static str, io::Error> {
66 match self {
67 BufReadDecoderError::Io(error) => Err(error),
68 BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
69 }
70 }
71}
72
73impl<'a> fmt::Display for BufReadDecoderError<'a> {
74 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
75 match *self {
76 BufReadDecoderError::InvalidByteSequence(bytes) => {
77 write!(f, "invalid byte sequence: {:02x?}", bytes)
78 }
79 BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
80 }
81 }
82}
83
84impl<'a> Error for BufReadDecoderError<'a> {
85 fn source(&self) -> Option<&(dyn Error + 'static)> {
86 match *self {
87 BufReadDecoderError::InvalidByteSequence(_) => None,
88 BufReadDecoderError::Io(ref err) => Some(err),
89 }
90 }
91}
92
93impl<B: BufRead> BufReadDecoder<B> {
94 /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
95 pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
96 let mut decoder = Self::new(buf_read);
97 let mut string = String::new();
98 while let Some(result) = decoder.next_lossy() {
99 string.push_str(result?)
100 }
101 Ok(string)
102 }
103
104 /// Wrap a buffered byte stream for UTF-8 decoding.
105 pub fn new(buf_read: B) -> Self {
106 Self {
107 buf_read,
108 bytes_consumed: 0,
109 incomplete: Incomplete::empty(),
110 }
111 }
112
113 /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
114 pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
115 self.next_strict()
116 .map(|result| result.or_else(|e| e.lossy()))
117 }
118
119 /// Decode and consume the next chunk of UTF-8 input.
120 ///
121 /// This method is intended to be called repeatedly until it returns `None`,
122 /// which represents EOF from the underlying byte stream.
123 /// This is similar to `Iterator::next`,
124 /// except that decoded chunks borrow the decoder (~iterator)
125 /// so they need to be handled or copied before the next chunk can start decoding.
126 pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError<'_>>> {
127 enum BytesSource {
128 BufRead(usize),
129 Incomplete,
130 }
131 macro_rules! try_io {
132 ($io_result: expr) => {
133 match $io_result {
134 Ok(value) => value,
135 Err(error) => return Some(Err(BufReadDecoderError::Io(error))),
136 }
137 };
138 }
139 let (source, result) = loop {
140 if self.bytes_consumed > 0 {
141 self.buf_read.consume(self.bytes_consumed);
142 self.bytes_consumed = 0;
143 }
144 let buf = try_io!(self.buf_read.fill_buf());
145
146 // Force loop iteration to go through an explicit `continue`
147 enum Unreachable {}
148 let _: Unreachable = if self.incomplete.is_empty() {
149 if buf.is_empty() {
150 return None; // EOF
151 }
152 match str::from_utf8(buf) {
153 Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())),
154 Err(error) => {
155 let valid_up_to = error.valid_up_to();
156 if valid_up_to > 0 {
157 break (BytesSource::BufRead(valid_up_to), Ok(()));
158 }
159 match error.error_len() {
160 Some(invalid_sequence_length) => {
161 break (BytesSource::BufRead(invalid_sequence_length), Err(()))
162 }
163 None => {
164 self.bytes_consumed = buf.len();
165 self.incomplete = Incomplete::new(buf);
166 // need more input bytes
167 continue;
168 }
169 }
170 }
171 }
172 } else {
173 if buf.is_empty() {
174 break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point
175 }
176 let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
177 self.bytes_consumed = consumed;
178 match opt_result {
179 None => {
180 // need more input bytes
181 continue;
182 }
183 Some(result) => break (BytesSource::Incomplete, result),
184 }
185 };
186 };
187 let bytes = match source {
188 BytesSource::BufRead(byte_count) => {
189 self.bytes_consumed = byte_count;
190 let buf = try_io!(self.buf_read.fill_buf());
191 &buf[..byte_count]
192 }
193 BytesSource::Incomplete => self.incomplete.take_buffer(),
194 };
195 match result {
196 Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
197 Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
198 }
199 }
200}