utf8-bufread 0.1.4

Functions for BufRead to read large text file without worrying about newlines
Documentation
//! This crate provides a trait, [`BufRead`], providing functions to read utf-8 text streams
//! using an [`io::BufRead`] without waiting for newline delimiters.
//!
//! # Quick Start
//!
//! The simplest way to read a whole file with a [`BufRead`] type is to repeatedly call its
//! [`read_utf8`] method:
//!
//! ```
//! use utf8_bufread::BufRead;
//! use std::io::BufReader;
//!
//! // Reader may be any type implementing io::BufRead
//! // We'll just use a BufReader wrapping a slice for this example
//! let mut reader = BufReader::<&[u8]>::new("💖".as_ref());
//! // The string we'll use to store the text of the read file
//! let mut text = String::new();
//! loop { // Loop until EOF
//!     match reader.read_utf8(&mut text) {
//!         Ok(0) => break, // EOF
//!         Ok(_) => continue,
//!         Err(e) => panic!("{}", e), // io::Error or Utf8Error
//!     }
//! }
//! assert_eq!("💖", text.as_str());
//! ```
//!
//! *Note that this example does exactly what this crate tries to avoid: storing the whole file in
//! memory.*
//!
//! See [`BufRead`]'s documentation for more.
//!
//! [`BufRead`]: self::BufRead
//! [`io::BufRead`]: std::io::BufRead
//! [`read_utf8`]: self::BufRead::read_utf8

use std::io::{self, Error, ErrorKind};
use std::str::{from_utf8, from_utf8_unchecked};

#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]

/// A trait implemented for all types implementing [`io::BufRead`], providing functions to
/// read utf-8 text streams without waiting for newline delimiters.
///
/// [`io::BufRead`]: std::io::BufRead
pub trait BufRead: io::BufRead {
    /// Reads the bytes currently available in the reader's internal buffer, appends the valid
    /// utf-8 text they form to the provided `buf`, and returns the number of bytes appended as
    /// an [`io::Result`]`<`[`usize`]`>`.
    ///
    /// A call decodes what a single [`fill_buf`] returns, stopping at the first invalid or
    /// incomplete codepoint. Everything before that codepoint is appended to `buf` and consumed
    /// from the reader; the codepoint itself is left for the next call.
    ///
    /// When the buffer *starts* with an incomplete codepoint (it was cut by the end of the
    /// buffer on the previous call), the function consumes those bytes, calls [`fill_buf`] once
    /// more and completes the codepoint with as many of the new bytes as needed. In that case
    /// exactly one codepoint is appended.
    ///
    /// The returned value is a number of bytes, **not** a number of [`char`]s, as utf-8 is a
    /// variable-length encoding.
    ///
    /// If this function returns `Ok(0)`, the stream has reached EOF.
    ///
    /// This function avoids the usual issue of using [`read_line`] or [`lines`] on big text
    /// files without newline delimiters: it will not load the whole file in memory.
    ///
    /// [`io::Result`]: std::io::Result
    /// [`read_line`]: std::io::BufRead::read_line
    /// [`lines`]: std::io::BufRead::lines
    ///
    /// # Errors
    ///
    /// This function immediately returns any error returned by [`fill_buf`], including
    /// [`ErrorKind`]`::Interrupted`, which is not retried.
    ///
    /// If an invalid or incomplete codepoint is found after at least one valid codepoint, the
    /// valid text is returned and no error is reported. This allows not to lose any valid data;
    /// the error will be reported by the next call.
    ///
    /// An [`ErrorKind`]`::`[`InvalidData`] error wrapping the [`Utf8Error`] is returned when:
    ///
    /// - the buffer starts with a sequence that can never be valid utf-8. Nothing is consumed,
    ///   so the offending bytes can still be read manually from the reader;
    /// - the buffer starts with an incomplete codepoint and the stream reaches EOF before it is
    ///   complete;
    /// - the buffer starts with an incomplete codepoint and the following bytes do not
    ///   complete it.
    ///
    /// An [`ErrorKind`]`::`[`InvalidInput`] error is returned when the buffer starts with an
    /// incomplete codepoint and the single additional [`fill_buf`] call does not provide enough
    /// bytes to complete it. This is what happens with a reader whose buffer is only one byte
    /// long.
    ///
    /// Whenever an incomplete codepoint at the start of the buffer ends in an error, its leading
    /// bytes have already been consumed from the reader and are lost.
    ///
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`ErrorKind`]: std::io::ErrorKind
    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
    /// [`InvalidInput`]: std::io::ErrorKind::InvalidInput
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We give the buffer more than enough capacity to be able to read all the bytes in one
    /// // call
    /// let mut reader = BufReader::with_capacity(
    ///     16,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96].as_ref(),
    /// );
    /// let mut buf = String::new();
    ///
    /// // On the first read_utf8() call, we will read up to the first byte of the invalid
    /// // codepoint (ie "foo\nbar")
    /// let n_read = reader
    ///     .read_utf8(&mut buf)
    ///     .expect("We will get all the valid bytes without error");
    /// assert_eq!("foo\nbar", buf.as_str());
    /// assert_eq!(7, n_read);
    ///
    /// // Then on the second call we will get the InvalidData error caused by the Utf8Error error,
    /// // as there is no bytes forming valid codepoints left
    /// let read_err = reader.read_utf8(&mut buf).expect_err("We will get an error");
    /// assert_eq!(ErrorKind::InvalidData, read_err.kind());
    /// assert_eq!(7, buf.len());  // no byte appended to buf
    /// ```
    fn read_utf8(&mut self, buf: &mut String) -> io::Result<usize> {
        // Any `fill_buf` error (including `ErrorKind::Interrupted`) is handed straight back.
        let read_bytes = self.fill_buf()?;
        let e = match from_utf8(read_bytes) {
            Ok(s) => {
                // The whole buffer is valid text. An empty buffer (EOF) also lands here and
                // yields `Ok(0)`.
                let used = s.len();
                buf.push_str(s);
                self.consume(used);
                return Ok(used);
            }
            Err(e) => e,
        };

        let used = e.valid_up_to();
        if used > 0 {
            // Return the valid prefix now; the invalid or incomplete codepoint stays at the
            // start of the buffer and is dealt with by the next call.
            // SAFETY: `Utf8Error::valid_up_to` guarantees `read_bytes[..used]` is valid utf-8.
            buf.push_str(unsafe { from_utf8_unchecked(&read_bytes[..used]) });
            self.consume(used);
            return Ok(used);
        }

        if e.error_len().is_some() {
            // The buffer starts with bytes that can never form a codepoint, whatever follows.
            // Nothing is consumed so the caller can still inspect them.
            return Err(Error::new(ErrorKind::InvalidData, e));
        }

        // The buffer holds nothing but the first bytes (1 to 3) of a codepoint. Keep a copy and
        // consume them, otherwise `fill_buf` would keep returning the same bytes instead of
        // reading further from the underlying stream.
        let mut pending = read_bytes.to_vec();
        self.consume(pending.len());
        let refill = self.fill_buf()?;
        if refill.is_empty() {
            // EOF in the middle of a codepoint
            return Err(Error::new(ErrorKind::InvalidData, e));
        }
        let refill_len = refill.len();
        // A codepoint is at most 4 bytes long
        let missing = 4usize.saturating_sub(pending.len());
        // Number of bytes of `refill` that were needed to complete the codepoint
        let mut taken = None;
        for (idx, &byte) in refill.iter().take(missing).enumerate() {
            pending.push(byte);
            match from_utf8(&pending) {
                Ok(s) => {
                    buf.push_str(s);
                    taken = Some(idx + 1);
                    break;
                }
                // The new byte proves the sequence is invalid, reading more will not help
                Err(err) if err.error_len().is_some() => {
                    return Err(Error::new(ErrorKind::InvalidData, err));
                }
                // Still incomplete, try with one more byte
                Err(_) => {}
            }
        }
        match taken {
            Some(n) => {
                // Only the bytes that were actually used are consumed
                self.consume(n);
                Ok(pending.len())
            }
            // The refill ran out before the codepoint was complete
            None => Err(Error::new(
                ErrorKind::InvalidInput,
                format!(
                    "Internal buffer capacity of at least 2 bytes expected to be \
                    able to read utf-8, but it is: {}",
                    refill_len
                ),
            )),
        }
    }
}

// Blanket implementation: every type implementing `io::BufRead` gets the provided methods of
// this crate's `BufRead` for free, nothing has to be overridden.
impl<R> BufRead for R where R: io::BufRead {}

#[cfg(test)]
mod tests {
    use crate::BufRead;
    use std::io::{BufReader, ErrorKind};

    /// Builds a reader over the bytes of `text` whose internal buffer is `capacity` bytes long.
    fn reader_with_capacity(capacity: usize, text: &str) -> BufReader<&[u8]> {
        BufReader::with_capacity(capacity, text.as_bytes())
    }

    /// A single call on a default-sized buffer reads the whole 4 bytes codepoint.
    #[test]
    fn readme_simple_example() {
        let mut reader = BufReader::new("💖".as_bytes());
        let mut text = String::new();
        let n_read = reader.read_utf8(&mut text).unwrap();
        assert_eq!(4, n_read);
        assert_eq!("💖", text.as_str());
    }

    /// Codepoints cut by the end of the buffer are completed by the following call.
    #[test]
    fn codepoint_on_buffer_boundary() {
        // 💖 is 4 bytes long and the buffer is 4 bytes long, so the hearts of the text below
        // get split over two buffer fills in every possible way (3+1, 2+2, 1+3 and 4+0 bytes).
        // Each entry is (bytes returned by the call, text accumulated after the call).
        let steps: [(usize, &str); 9] = [
            // 1 byte, then 3 bytes of incomplete codepoint are met
            (1, "0"),
            // the codepoint is completed
            (4, "0💖"),
            // 1 byte, then 2 bytes of incomplete codepoint are met
            (1, "0💖0"),
            // the codepoint is completed
            (4, "0💖0💖"),
            // 2 bytes, up to the end of the buffer
            (2, "0💖0💖0u"),
            // 1 byte, then 3 bytes of incomplete codepoint are met
            (1, "0💖0💖0u0"),
            // the codepoint is completed
            (4, "0💖0💖0u0💖"),
            // 3 bytes, up to the end of the buffer
            (3, "0💖0💖0u0💖0u0"),
            // 4 bytes, a whole codepoint filling the buffer
            (4, "0💖0💖0u0💖0u0💖"),
        ];

        let mut reader = reader_with_capacity(4, "0💖0💖0u0💖0u0💖");
        let mut text = String::new();
        for &(expected_read, expected_text) in &steps {
            assert_eq!(expected_read, reader.read_utf8(&mut text).unwrap());
            assert_eq!(expected_text, text.as_str());
        }
    }

    /// A 2 bytes buffer is enough to read a 4 bytes codepoint in one call.
    #[test]
    fn two_bytes_capacity() {
        let mut reader = reader_with_capacity(2, "💖");
        let mut text = String::new();
        let n_read = reader.read_utf8(&mut text).unwrap();
        assert_eq!(4, n_read);
        assert_eq!("💖", text.as_str());
    }

    /// A 1 byte buffer cannot hold enough of a multi-byte codepoint and is rejected.
    #[test]
    fn one_byte_capacity() {
        // "€" is 3 bytes long
        let mut reader = reader_with_capacity(1, "€");
        let mut text = String::new();
        let outcome = reader.read_utf8(&mut text);
        assert!(outcome.is_err());
        assert_eq!(ErrorKind::InvalidInput, outcome.unwrap_err().kind());
    }
}