1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
use std::io::{self, Error, ErrorKind};
use std::str::{from_utf8, from_utf8_unchecked};

/// Extension of [`std::io::BufRead`] that adds UTF-8 aware, bounded-memory reading.
///
/// The trait is implemented for every type implementing [`std::io::BufRead`]; bring it into
/// scope to call [`read_utf8`](BufRead::read_utf8) on any buffered reader.
pub trait BufRead: io::BufRead {
    /// Reads at most one buffer's worth of bytes and returns them decoded as an owned
    /// [`String`].
    ///
    /// The bytes come from a single call to [`fill_buf`]. The longest prefix of the buffer that
    /// is valid UTF-8 is copied into the returned [`String`] and consumed from the reader; any
    /// invalid or incomplete codepoint following it is left for the next call. For a
    /// [`BufReader`], this means no more than its [`capacity`] is decoded per call.
    ///
    /// A codepoint that is cut in two by the end of the buffer is not an error: on the call
    /// that finds its first bytes at the front of the buffer, the missing bytes are pulled from
    /// the following refill(s) and the completed codepoint is returned on its own.
    ///
    /// If this function returns [`Ok`] with an empty string, the stream has reached EOF.
    ///
    /// This function avoids the usual issue of using [`read_line`] or [`lines`] on a big text
    /// file without newline delimiters: it will not load the whole file in memory.
    ///
    /// [`BufReader`]: std::io::BufReader
    /// [`capacity`]: std::io::BufReader::capacity
    /// [`read_line`]: std::io::BufRead::read_line
    /// [`lines`]: std::io::BufRead::lines
    ///
    /// # Errors
    ///
    /// Any error returned by [`fill_buf`] is returned immediately and unchanged
    /// ([`ErrorKind::Interrupted`] is not retried).
    ///
    /// If the buffer starts with valid codepoints followed by invalid bytes, the valid
    /// codepoints are returned and no error is reported, so that no valid data is lost. The
    /// error is reported by the next call.
    ///
    /// If the buffer starts with bytes that can never form a codepoint, an
    /// [`ErrorKind::InvalidData`] error wrapping the [`Utf8Error`] is returned and nothing is
    /// consumed: further calls keep returning the same error until the offending bytes are
    /// read manually.
    ///
    /// If the buffer starts with the first bytes of a codepoint that the stream fails to
    /// complete (EOF is reached, or the next byte is not a valid continuation), an
    /// [`ErrorKind::InvalidData`] error wrapping the [`Utf8Error`] is returned as well. In this
    /// case the bytes of the partial codepoint have already been consumed; the byte that broke
    /// the sequence, if any, is left in the reader. [`Utf8Error::error_len`] on the wrapped
    /// error tells a truncated stream (`None`) from an invalid sequence (`Some`).
    ///
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`Utf8Error::error_len`]: std::str::Utf8Error::error_len
    /// [`ErrorKind::Interrupted`]: std::io::ErrorKind::Interrupted
    /// [`ErrorKind::InvalidData`]: std::io::ErrorKind::InvalidData
    ///
    /// # Examples
    ///
    /// [`std::io::BufReader`] implements `BufRead`. In this example, we use it to read valid
    /// text followed by invalid bytes:
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We give the buffer more than enough capacity to be able to read all the bytes in one
    /// // call
    /// let mut reader = BufReader::with_capacity(
    ///     16,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96].as_ref(),
    /// );
    ///
    /// // On the first read_utf8() call, we will read up to the first byte of the invalid
    /// // codepoint (ie "foo\nbar")
    /// let read_str = reader
    ///     .read_utf8()
    ///     .expect("We will get all the valid bytes without error");
    /// assert_eq!("foo\nbar", read_str);
    ///
    /// // Then on the second call we will get the InvalidData error caused by the Utf8Error,
    /// // as there are no bytes forming valid codepoints left
    /// let read_err = reader.read_utf8().expect_err("We will get an error");
    /// assert_eq!(ErrorKind::InvalidData, read_err.kind());
    /// ```
    ///
    /// A codepoint split by the end of the buffer is reassembled:
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::BufReader;
    ///
    /// // With 4 bytes of capacity, the first refill ends in the middle of the heart
    /// let mut reader = BufReader::with_capacity(4, "ab💖cd".as_bytes());
    /// assert_eq!("ab", reader.read_utf8().unwrap());
    /// assert_eq!("💖", reader.read_utf8().unwrap());
    /// assert_eq!("cd", reader.read_utf8().unwrap());
    /// assert_eq!("", reader.read_utf8().unwrap());
    /// ```
    fn read_utf8(&mut self) -> io::Result<String> {
        // `?` forwards any `fill_buf` error as is; `ErrorKind::Interrupted` is not retried.
        let buffered = self.fill_buf()?;
        let (text, used) = match from_utf8(buffered) {
            Ok(text) => (text.to_owned(), buffered.len()),
            Err(e) => {
                let valid_len = e.valid_up_to();
                if valid_len > 0 {
                    // Return the valid prefix now; whatever follows it stays at the front of
                    // the buffer and is dealt with by the next call, so no valid data is lost.
                    // SAFETY: `Utf8Error::valid_up_to` guarantees that
                    // `from_utf8(&buffered[..valid_len])` would return `Ok`.
                    let text = unsafe { from_utf8_unchecked(&buffered[..valid_len]) };
                    (text.to_owned(), valid_len)
                } else if e.error_len().is_some() {
                    // The buffer starts with bytes that can never form a codepoint. They are
                    // left in place so the caller may inspect them manually.
                    return Err(Error::new(ErrorKind::InvalidData, e));
                } else {
                    // The buffer holds nothing but the beginning of a codepoint. A buffered
                    // reader may only refill once its buffer is drained, so calling `fill_buf`
                    // again would hand back the very same bytes: stash them, consume them and
                    // fetch the missing bytes from the following refill(s).
                    // An incomplete sequence reported by `from_utf8` is 1 to 3 bytes long (see
                    // `Utf8Error::error_len`), so it always fits in `head`.
                    let mut head = [0u8; 4];
                    let head_len = buffered.len();
                    head[..head_len].copy_from_slice(buffered);
                    self.consume(head_len);
                    return finish_split_codepoint(self, head, head_len);
                }
            }
        };
        self.consume(used);
        Ok(text)
    }
}

/// Completes a codepoint whose first `len` bytes, stored at the front of `bytes`, have already
/// been consumed from `reader`.
///
/// Continuation bytes are taken from `reader` one at a time, and each one is only consumed once
/// it is known to extend the sequence. A byte that breaks the sequence is therefore left in the
/// reader and an [`ErrorKind::InvalidData`] error is returned; the same error kind is returned
/// if EOF is reached before the codepoint is complete.
fn finish_split_codepoint<R>(
    reader: &mut R,
    mut bytes: [u8; 4],
    mut len: usize,
) -> io::Result<String>
where
    R: io::BufRead + ?Sized,
{
    loop {
        let next = match reader.fill_buf()?.first() {
            Some(&byte) => byte,
            None => {
                // EOF in the middle of a codepoint: decoding the stashed bytes on their own
                // yields the `Utf8Error` (with `error_len() == None`) describing the truncation.
                return from_utf8(&bytes[..len])
                    .map(str::to_owned)
                    .map_err(|e| Error::new(ErrorKind::InvalidData, e));
            }
        };
        // `len` never exceeds 3 here: a sequence still reported as incomplete is at most 3
        // bytes long, and any other outcome leaves the loop.
        bytes[len] = next;
        match from_utf8(&bytes[..=len]) {
            Ok(text) => {
                let text = text.to_owned();
                reader.consume(1);
                return Ok(text);
            }
            // Still incomplete: keep the byte and look at the next one.
            Err(e) if e.error_len().is_none() => {
                reader.consume(1);
                len += 1;
            }
            // `next` cannot continue the sequence: report it without consuming `next`.
            Err(e) => return Err(Error::new(ErrorKind::InvalidData, e)),
        }
    }
}

/// Blanket implementation: every type implementing [`std::io::BufRead`] gets
/// [`BufRead::read_utf8`] through the trait's default method. The impl body is intentionally
/// empty, as the trait declares no required method.
impl<R: io::BufRead> BufRead for R {}

#[cfg(test)]
mod tests {
    use crate::BufRead;
    use std::io::BufReader;

    /// Smoke test: a lone multi-byte codepoint that fits in the buffer is returned whole by a
    /// single `read_utf8` call.
    #[test]
    fn readme_simple_example() {
        let mut reader = BufReader::new("💖".as_bytes());
        let decoded = reader.read_utf8().unwrap();

        assert_eq!("💖", decoded);
    }

    // TODO more / other ?
}