Skip to main content

utf8_zero/
read.rs

1use super::*;
2use std::error::Error;
3use std::fmt;
4use std::io::{self, BufRead};
5use std::str;
6use std::string::String;
7
8/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8.
9///
10/// # Examples
11///
12/// Lossy decoding of an in-memory byte stream:
13///
14/// ```
15/// use std::io::BufReader;
16/// use utf8_zero::BufReadDecoder;
17///
18/// let input = b"Hello \xF0\x9F\x8C\x8D\xC0world";
19/// let reader = BufReader::new(&input[..]);
20/// let output = BufReadDecoder::read_to_string_lossy(reader).unwrap();
21/// assert_eq!(output, "Hello \u{1F30D}\u{FFFD}world");
22/// ```
23///
24/// Strict chunk-by-chunk decoding:
25///
26/// ```
27/// use std::io::BufReader;
28/// use utf8_zero::{BufReadDecoder, BufReadDecoderError};
29///
30/// let input = b"ok\xFFend";
31/// let mut decoder = BufReadDecoder::new(BufReader::new(&input[..]));
32/// let mut parts = Vec::new();
33/// while let Some(result) = decoder.next_strict() {
34///     match result {
35///         Ok(s) => parts.push(format!("str:{s}")),
36///         Err(BufReadDecoderError::InvalidByteSequence(b)) => {
37///             parts.push(format!("err:{b:02x?}"));
38///         }
39///         Err(BufReadDecoderError::Io(e)) => panic!("io error: {e}"),
40///     }
41/// }
42/// assert_eq!(parts, vec!["str:ok", "err:[ff]", "str:end"]);
43/// ```
44pub struct BufReadDecoder<B: BufRead> {
45    buf_read: B,
46    bytes_consumed: usize,
47    incomplete: Incomplete,
48}
49
/// Failure modes reported by [`BufReadDecoder::next_strict()`].
#[derive(Debug)]
pub enum BufReadDecoderError<'bytes> {
    /// A single UTF-8 error found in the byte stream; the payload is the offending bytes.
    ///
    /// Lossy decoding substitutes U+FFFD for each such error.
    /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
    InvalidByteSequence(&'bytes [u8]),

    /// The underlying byte stream reported an I/O error.
    Io(io::Error),
}
62
63impl<'a> BufReadDecoderError<'a> {
64    /// Replace UTF-8 errors with U+FFFD
65    pub fn lossy(self) -> Result<&'static str, io::Error> {
66        match self {
67            BufReadDecoderError::Io(error) => Err(error),
68            BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
69        }
70    }
71}
72
73impl<'a> fmt::Display for BufReadDecoderError<'a> {
74    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
75        match *self {
76            BufReadDecoderError::InvalidByteSequence(bytes) => {
77                write!(f, "invalid byte sequence: {:02x?}", bytes)
78            }
79            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
80        }
81    }
82}
83
84impl<'a> Error for BufReadDecoderError<'a> {
85    fn source(&self) -> Option<&(dyn Error + 'static)> {
86        match *self {
87            BufReadDecoderError::InvalidByteSequence(_) => None,
88            BufReadDecoderError::Io(ref err) => Some(err),
89        }
90    }
91}
92
93impl<B: BufRead> BufReadDecoder<B> {
94    /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
95    pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
96        let mut decoder = Self::new(buf_read);
97        let mut string = String::new();
98        while let Some(result) = decoder.next_lossy() {
99            string.push_str(result?)
100        }
101        Ok(string)
102    }
103
104    /// Wrap a buffered byte stream for UTF-8 decoding.
105    pub fn new(buf_read: B) -> Self {
106        Self {
107            buf_read,
108            bytes_consumed: 0,
109            incomplete: Incomplete::empty(),
110        }
111    }
112
113    /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
114    pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
115        self.next_strict()
116            .map(|result| result.or_else(|e| e.lossy()))
117    }
118
119    /// Decode and consume the next chunk of UTF-8 input.
120    ///
121    /// This method is intended to be called repeatedly until it returns `None`,
122    /// which represents EOF from the underlying byte stream.
123    /// This is similar to `Iterator::next`,
124    /// except that decoded chunks borrow the decoder (~iterator)
125    /// so they need to be handled or copied before the next chunk can start decoding.
126    pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError<'_>>> {
127        enum BytesSource {
128            BufRead(usize),
129            Incomplete,
130        }
131        macro_rules! try_io {
132            ($io_result: expr) => {
133                match $io_result {
134                    Ok(value) => value,
135                    Err(error) => return Some(Err(BufReadDecoderError::Io(error))),
136                }
137            };
138        }
139        let (source, result) = loop {
140            if self.bytes_consumed > 0 {
141                self.buf_read.consume(self.bytes_consumed);
142                self.bytes_consumed = 0;
143            }
144            let buf = try_io!(self.buf_read.fill_buf());
145
146            // Force loop iteration to go through an explicit `continue`
147            enum Unreachable {}
148            let _: Unreachable = if self.incomplete.is_empty() {
149                if buf.is_empty() {
150                    return None; // EOF
151                }
152                match str::from_utf8(buf) {
153                    Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())),
154                    Err(error) => {
155                        let valid_up_to = error.valid_up_to();
156                        if valid_up_to > 0 {
157                            break (BytesSource::BufRead(valid_up_to), Ok(()));
158                        }
159                        match error.error_len() {
160                            Some(invalid_sequence_length) => {
161                                break (BytesSource::BufRead(invalid_sequence_length), Err(()))
162                            }
163                            None => {
164                                self.bytes_consumed = buf.len();
165                                self.incomplete = Incomplete::new(buf);
166                                // need more input bytes
167                                continue;
168                            }
169                        }
170                    }
171                }
172            } else {
173                if buf.is_empty() {
174                    break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point
175                }
176                let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
177                self.bytes_consumed = consumed;
178                match opt_result {
179                    None => {
180                        // need more input bytes
181                        continue;
182                    }
183                    Some(result) => break (BytesSource::Incomplete, result),
184                }
185            };
186        };
187        let bytes = match source {
188            BytesSource::BufRead(byte_count) => {
189                self.bytes_consumed = byte_count;
190                let buf = try_io!(self.buf_read.fill_buf());
191                &buf[..byte_count]
192            }
193            BytesSource::Incomplete => self.incomplete.take_buffer(),
194        };
195        match result {
196            Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
197            Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
198        }
199    }
200}