utf8-bufread 1.0.0

#![feature(option_result_unwrap_unchecked)]
//! This crate provides functions to read utf-8 text from any type implementing [`io::BufRead`]
//! through a trait, [`BufRead`], without waiting for newline delimiters. These functions take
//! advantage of buffering and either return `&`[`str`] or [`char`]s. Each has an associated
//! iterator, some have an equivalent to a [`Map`] iterator that avoids allocation and cloning as
//! well.
//!
//! # Quick Start
//!
//! The simplest way to read a file using this crate may be something along the following:
//!
//! ```
//! use utf8_bufread::BufRead;
//! use std::io::{Cursor, ErrorKind};
//! use std::borrow::Cow;
//!
//! // Reader may be any type implementing io::BufRead
//! // We'll just use a cursor wrapping a slice for this example
//! let mut reader = Cursor::new("Löwe 老虎 Léopard");
//! loop { // Loop until EOF
//!     match reader.read_str() {
//!         Ok(s) => {
//!             if s.is_empty() {
//!                 break; // EOF
//!             }
//!             // Do something with `s` ...
//!             print!("{}", s);
//!         }
//!         Err(e) => {
//!             // We should try again if we get interrupted
//!             if e.kind() != ErrorKind::Interrupted {
//!                 break;
//!             }
//!         }
//!     }
//! }
//! ```
//!
//! # Reading arbitrary-length string slices
//!
//! The [`read_str`] function returns a `&`[`str`] of arbitrary length (up to the reader's buffer
//! capacity) read from the inner reader, without cloning data, unless a valid codepoint ends up
//! cut at the end of the reader's buffer. Its associated iterator can be obtained by calling
//! [`str_iter`], and since it involves cloning the data at each iteration, [`str_map`] is also
//! provided.
//!
//! # Reading codepoints
//!
//! The [`read_char`] function returns a [`char`] read from the inner reader. Its associated
//! iterator can be obtained by calling [`char_iter`].
//!
//! # Iterator types
//!
//! This crate provides several structs for several ways of iterating over the inner reader's data:
//! - [`StrIter`] and [`CodepointIter`] clone the data on each iteration, but use an [`Rc`] to
//!   check if the returned [`String`] buffer is still used. If not, it is re-used to avoid
//!   re-allocating.
//!   ```
//!   use utf8_bufread::BufRead;
//!   use std::io::Cursor;
//!
//!   let mut reader = Cursor::new("Löwe 老虎 Léopard");
//!   for s in reader.str_iter().filter_map(|r| r.ok()) {
//!       // Do something with s ...
//!       print!("{}", s);
//!   }
//!   ```
//! - [`StrMap`] and [`CodepointMap`] allow having access to read data without cloning, but then it
//!   cannot be passed to further iterator adapters.
//!   ```
//!   use utf8_bufread::BufRead;
//!   use std::io::Cursor;
//!
//!   let s = "Löwe 老虎 Léopard";
//!   let mut reader = Cursor::new(s);
//!   let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
//!   println!("There is {} valid utf-8 bytes in {}", count, s);
//!   ```
//! - [`CharIter`] is similar to [`StrIter`] and others, except it relies on [`char`]s implementing
//!   [`Copy`] and thus doesn't need a buffer nor the "`Rc` trick".
//!   ```
//!   use utf8_bufread::BufRead;
//!   use std::io::Cursor;
//!
//!   let s = "Löwe 老虎 Léopard";
//!   let mut reader = Cursor::new(s);
//!   let count = reader.char_iter().filter_map(Result::ok).filter(|c| c.is_lowercase()).count();
//!   assert_eq!(count, 9);
//!   ```
//!
//! All these iterators may read data until EOF or an invalid codepoint is found. If valid
//! codepoints are read from the inner reader, they *will* be returned before reporting an error.
//! After encountering an error or EOF, they always return `None`. They always ignore any
//! [`Interrupted`] error.
//!
//! [`read_str`]: self::BufRead::read_str
//! [`str_iter`]: self::BufRead::str_iter
//! [`str_map`]: self::BufRead::str_map
//! [`read_char`]: self::BufRead::read_char
//! [`char_iter`]: self::BufRead::char_iter
//! [`Map`]: std::iter::Map
//! [`Interrupted`]: std::io::ErrorKind::Interrupted

#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]
mod error;

use error::Result;
use std::borrow::Cow;
use std::io::{self, ErrorKind};
use std::rc::Rc;
use std::slice::from_raw_parts;
use std::str::{from_utf8, from_utf8_unchecked, FromStr};

pub use error::Error;

/// A trait implemented for all types implementing [`io::BufRead`], providing functions to
/// read utf-8 text streams without waiting for newline delimiters.
///
/// [`io::BufRead`]: std::io::BufRead
///
/// # Examples
///
/// ```
/// use std::io::Cursor;
/// use utf8_bufread::BufRead;
///
/// // Prints "I luv you too !"
/// if Cursor::new("💖").read_str().map_or(false, |s| s == "💖") {
///     println!("I luv you too !");
/// }
/// ```
pub trait BufRead: io::BufRead {
    /// Reads some bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it referring
    /// to all valid codepoints read, wrapped in an [`io::Result`].
    ///
    /// This function will read all bytes from the underlying stream until its buffer is full, an
    /// invalid or incomplete codepoint is found, or EOF is found. Once found, all codepoints up
    /// to, including the EOF (if found), but not including the invalid or incomplete codepoint
    /// (if found), will be returned. This function may read an arbitrary number of bytes,
    /// between 1 and this reader's buffer capacity (unless the buffer is not big enough to fit
    /// a unicode codepoint).
    ///
    /// The returned reference points to this reader's actual buffer, meaning it borrows the
    /// reader.
    ///
    /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
    /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
    /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
    /// in that scenario. In worst case it happens once every two calls, allocating and cloning
    /// 4 bytes every `c` bytes read, where `c` is this reader's buffer capacity.
    ///
    /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
    ///
    /// # Errors
    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
    ///
    /// If the first codepoint read from the inner reader is invalid, an [`Error`] wrapping the
    /// original [`Utf8Error`] or [`FromUtf8Error`] is returned.
    ///
    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
    ///
    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
    /// following read operation will not return any of those bytes, nor "skip" bytes from this
    /// reader.
    ///
    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
    /// the capacity yourself*.
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream and prints it to standard output.
    ///
    /// ```
    /// use std::io::{Cursor, Error, ErrorKind};
    /// use utf8_bufread::BufRead;
    /// use std::borrow::Cow;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    ///
    /// loop {
    ///     match reader.read_str() {
    ///         Ok(s) => {
    ///             if s.is_empty() {
    ///                 break; // EOF
    ///             }
    ///             print!("{}", s)
    ///         }
    ///         Err(e) => {
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 // Ignore interrupted errors
    ///                 eprintln!("{}", e);
    ///             }
    ///         }
    ///     }
    /// }
    /// ```
    ///
    /// [`kind`]: self::Error::kind
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
    /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`FromUtf8Error`]: std::string::FromUtf8Error
    fn read_str(&mut self) -> Result<Cow<str>> {
        // Fill the buffer from the inner reader and look at what is available.
        let read_bytes = self.fill_buf()?;
        let read_len = read_bytes.len();
        if read_len == 0 {
            // Nothing could be buffered: report EOF as an empty string.
            return Ok(Cow::from(""));
        }
        // Raw pointer to the start of the buffered bytes. The successful paths below have to
        // call `consume` (which needs `&mut self`) before building the returned slice, so that
        // slice cannot be derived from `read_bytes` directly.
        let ptr = read_bytes.as_ptr();
        // We attempt converting the read bytes to utf-8
        match from_utf8(read_bytes) {
            Ok(_) => {
                self.consume(read_len);
                // SAFETY:
                // - `ptr..ptr + read_len` is exactly the slice `fill_buf` just returned. This
                //   relies on `consume` only advancing the reader's cursor and leaving the
                //   buffered bytes in place, as e.g. `std::io::BufReader` and `Cursor` do.
                // - The elided lifetime ties the returned `str` to the `&mut self` borrow, so
                //   the reader cannot be used again while the slice is alive.
                // - `from_utf8` just validated these very bytes.
                // NOTE(review): the first point is not spelled out by the `io::BufRead`
                // contract for arbitrary implementors - confirm it is acceptable.
                Ok(Cow::from(unsafe {
                    from_utf8_unchecked(from_raw_parts(ptr, read_len))
                }))
            }
            Err(e) => {
                // Some prefix of the buffer may still be valid: hand it out first and leave
                // the invalid or incomplete codepoint at the front of the buffer. This lets us
                // recover from a buffer ending off a char boundary, as the next call gets a
                // chance to read the complete codepoint.
                let len = e.valid_up_to();
                if len != 0 {
                    self.consume(len);
                    // SAFETY: same reasoning as above for the memory region;
                    // `Utf8Error::valid_up_to` guarantees the first `len` bytes are valid
                    // utf-8.
                    Ok(Cow::from(unsafe {
                        from_utf8_unchecked(from_raw_parts(ptr, len))
                    }))
                } else if read_len >= codepoint_length(read_bytes[0]) {
                    // The buffer starts with a bad sequence (`read_bytes` is not empty, so
                    // indexing is fine) and already holds at least as many bytes as
                    // `codepoint_length` reports for its first byte: we couldn't get a valid
                    // codepoint despite having read enough bytes.
                    Err(Error::from(e))
                } else {
                    // Not enough bytes buffered to decide. Copy the partial codepoint *before*
                    // consuming it, so no read through `ptr` happens after `consume`, then
                    // consume it so that the next call to `fill_buf` reads more bytes from the
                    // underlying stream. `read_across_boundary` (defined elsewhere in this
                    // crate) takes over from there with the bytes gathered so far.
                    let partial = read_bytes.to_vec();
                    self.consume(read_len);
                    read_across_boundary(self, partial)
                }
            }
        }
    }

    /// Reads 1 to 4 bytes from the inner reader and returns a [`Cow`]`<&`[`str`]`>` of it
    /// referring to the valid codepoints read, wrapped in an [`io::Result`].
    ///
    /// This function will read bytes from the underlying stream until one codepoint is read, an
    /// invalid or incomplete codepoint is found, or EOF is found.
    ///
    /// The returned reference points to this reader's actual buffer, meaning it borrows the
    /// reader.
    ///
    /// A [`Cow`] is used to gracefully handle cases where a valid codepoint is cut by the end of
    /// the buffer of this reader, and more bytes may need to be read from the inner reader to form
    /// a hopefully valid codepoint. This function only allocates a new [`String`] and clones data
    /// in that scenario. In worst case it allocates and clones 4 bytes every `c` bytes read,
    /// where `c` is this reader's buffer capacity.
    ///
    /// If this function returns [`Ok`]`("")`, the stream has reached EOF.
    ///
    /// # Errors
    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
    ///
    /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
    /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
    ///
    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
    ///
    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
    /// following read operation will not return any of those bytes, nor "skip" bytes from this
    /// reader.
    ///
    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
    /// the capacity yourself*.
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream and counts the number of `🏳` character.
    ///
    /// ```
    /// use std::io::{Cursor, Error, ErrorKind};
    /// use utf8_bufread::BufRead;
    /// use std::borrow::Cow;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 🏳Léopard");
    /// let mut count = 0;
    ///
    /// loop {
    ///     match reader.read_codepoint() {
    ///         Ok(s) => {
    ///             if s.is_empty() {
    ///                 break; // EOF
    ///             }
    ///             if s == "🏳" {
    ///                 count += 1;
    ///             }
    ///         }
    ///         Err(e) => {
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 // Ignore interrupted errors
    ///                 eprintln!("{}", e);
    ///             }
    ///         }
    ///     }
    /// }
    /// assert_eq!(count, 1);
    /// ```
    #[doc(hidden)]
    fn read_codepoint(&mut self) -> Result<Cow<str>> {
        // Fill the buffer from the inner reader and look at what is available.
        let read_bytes = self.fill_buf()?;
        let read_len = read_bytes.len();
        if read_len == 0 {
            // Nothing could be buffered: report EOF as an empty string.
            return Ok(Cow::from(""));
        }
        // Raw pointer to the start of the buffered bytes: the successful path has to call
        // `consume` (which needs `&mut self`) before building the returned slice.
        let ptr = read_bytes.as_ptr();
        // Number of bytes `codepoint_length` reports for the first buffered byte.
        let len = codepoint_length(read_bytes[0]);
        if read_len < len {
            // Not enough bytes buffered for a whole codepoint. Copy the partial codepoint
            // *before* consuming it, so no read through `ptr` happens after `consume`, then
            // consume it so that the next call to `fill_buf` reads more bytes from the
            // underlying stream. `read_across_boundary` (defined elsewhere in this crate)
            // takes over from there with the bytes gathered so far.
            let partial = read_bytes.to_vec();
            self.consume(read_len);
            return read_across_boundary(self, partial);
        }
        match from_utf8(&read_bytes[..len]) {
            Ok(_) => {
                self.consume(len);
                // SAFETY:
                // - `ptr..ptr + len` lies within the slice `fill_buf` just returned
                //   (`len <= read_len`). This relies on `consume` only advancing the reader's
                //   cursor and leaving the buffered bytes in place, as e.g.
                //   `std::io::BufReader` and `Cursor` do.
                // - The elided lifetime ties the returned `str` to the `&mut self` borrow, so
                //   the reader cannot be used again while the slice is alive.
                // - `from_utf8` just validated these very bytes.
                // NOTE(review): the first point is not spelled out by the `io::BufRead`
                // contract for arbitrary implementors - confirm it is acceptable.
                Ok(Cow::from(unsafe {
                    from_utf8_unchecked(from_raw_parts(ptr, len))
                }))
            }
            // Enough bytes were available, yet they do not form a valid codepoint.
            Err(e) => Err(Error::from(e)),
        }
    }

    /// Reads 1 to 4 bytes from the inner reader and returns the [`char`] read, wrapped in an
    /// [`io::Result`].
    ///
    /// This function will read bytes from the underlying stream until one codepoint is read, an
    /// invalid or incomplete codepoint is found, or EOF is found.
    ///
    /// If this function returns [`Ok`]`('\0')`, the stream has reached EOF.
    ///
    /// # Errors
    ///
    /// If the internal call to [`fill_buf`] returns an [`io::Error`], this function immediately
    /// returns an [`Error`] wrapping the original error.
    ///
    /// If the first codepoint read from the inner reader is invalid or incomplete, an [`Error`]
    /// wrapping the original [`Utf8Error`] or [`FromUtf8Error`] is returned.
    ///
    /// If the codepoint is complete but invalid, the returned error will have a [`kind`] of
    /// [`ErrorKind`]`::`[`InvalidData`]. If EOF was encountered before the end of a codepoint,
    /// the error will have a [`kind`] of [`ErrorKind`]`::`[`UnexpectedEof`].
    ///
    /// The returned [`Error`] may contain a non-zero amount of "leftover" bytes (see
    /// [`Error::leftovers`] for more info). When it is the case, it is guaranteed that the
    /// following read operation will not return any of those bytes, nor "skip" bytes from this
    /// reader.
    ///
    /// Note that if the buffer of this reader is less than 4 bytes long it may fail to read
    /// complete codepoints and "spuriously" return the same error as when it unexpectedly
    /// encounters EOF, since we're unable to load enough bytes to form a valid codepoint. We
    /// cannot check the capacity of the buffer using the [`io::BufRead`] API only, it is then up
    /// to the user to ensure this won't happen. *It should not happen unless you explicitly set
    /// the capacity yourself*.
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream and counts the number of lowercase characters
    ///
    /// ```
    /// use std::io::{Cursor, Error, ErrorKind};
    /// use utf8_bufread::BufRead;
    /// use std::borrow::Cow;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    /// let mut count = 0;
    ///
    /// loop {
    ///     match reader.read_char() {
    ///         Ok('\0') => break, // EOF
    ///         Ok(c) => {
    ///             if c.is_lowercase() {
    ///                 count += 1;
    ///             }
    ///         }
    ///         Err(e) => {
    ///             if ErrorKind::Interrupted != e.kind() {
    ///                 // Ignore interrupted errors
    ///                 eprintln!("{}", e);
    ///             }
    ///         }
    ///     }
    /// }
    /// assert_eq!(count, 9);
    /// ```
    ///
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
    /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`FromUtf8Error`]: std::string::FromUtf8Error
    /// [`kind`]: crate::error::Error::kind
    fn read_char(&mut self) -> Result<char> {
        // We guarantee that self.read_codepoint returns:
        // - An empty string or
        // - Exactly one valid codepoint
        let c = self.read_codepoint()?;
        if c.is_empty() {
            return Ok('\0');
        }
        Ok(unsafe { char::from_str(c.as_ref()).unwrap_unchecked() })
    }

    /// Returns an iterator over string slices of this reader.
    ///
    /// It is equivalent to calling [`read_str`] in a loop, ignoring
    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
    ///
    /// The iterator returned by this function will yield instances of
    /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
    /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
    /// that's the case.
    ///
    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
    /// will only yield [`None`].
    ///
    /// # Examples
    ///
    /// This example simply reads from a string and prints it to standard output:
    ///
    /// ```
    /// use std::io::Cursor;
    /// use utf8_bufread::BufRead;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    /// // We ignore any error, we know once we encounter one we can't read any further anyway
    /// reader.str_iter().filter_map(Result::ok).for_each(|s| print!("{}", s));
    /// ```
    ///
    /// [`read_str`]: self::BufRead::read_str
    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
    fn str_iter(&mut self) -> StrIter<'_, Self> {
        // Capacity of the string buffer the iterator starts with; the iterator keeps the
        // value around in `default_cap`.
        const DEFAULT_CAP: usize = 8 * 1024;
        let buf = Rc::new(String::with_capacity(DEFAULT_CAP));
        StrIter {
            reader: self,
            buf,
            default_cap: DEFAULT_CAP,
            ended: false,
        }
    }

    /// Returns an iterator over codepoints of this reader.
    ///
    /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
    ///
    /// The iterator returned by this function will yield instances of
    /// [`io::Result`]`<`[`Rc`]`<`[`String`]`>>`. We use the [`Rc`] to check while iterating if the
    /// iterator is the only one holding a reference to it, avoiding allocating a new buffer if
    /// that's the case.
    ///
    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
    /// will only yield [`None`].
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream and counts the number of `🏳` character.
    ///
    /// ```
    /// use std::io::Cursor;
    /// use utf8_bufread::BufRead;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 🏳Léopard");
    /// let count = reader.codepoint_iter()
    ///     .filter_map(Result::ok)
    ///     .filter(|s| s.as_ref() == "🏳")
    ///     .count();
    /// assert_eq!(count, 1);
    /// ```
    #[doc(hidden)]
    fn codepoint_iter(&mut self) -> CodepointIter<'_, Self> {
        // Capacity of the string buffer the iterator starts with (a utf-8 codepoint is at
        // most 4 bytes long); the iterator keeps the value around in `default_cap`.
        const DEFAULT_CAP: usize = 4;
        let buf = Rc::new(String::with_capacity(DEFAULT_CAP));
        CodepointIter {
            reader: self,
            buf,
            default_cap: DEFAULT_CAP,
            ended: false,
        }
    }

    /// Returns an iterator over chars of this reader.
    ///
    /// It is equivalent to calling [`read_char`] in a loop, ignoring
    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
    ///
    /// The iterator returned by this function will yield instances of
    /// [`io::Result`]`<`[`char`]`>`.
    ///
    /// The iterator returned will yield at most one [`io::Error`]. Once an error is yielded, it
    /// will only yield [`None`].
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream, filtering out any whitespace:
    ///
    /// ```
    /// use std::io::Cursor;
    /// use utf8_bufread::BufRead;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    /// let result: String = reader.char_iter()
    ///     .filter_map(Result::ok)
    ///     .filter(|c| !c.is_whitespace())
    ///     .collect();
    /// assert_eq!(result.as_str(), "Löwe老虎Léopard");
    /// ```
    ///
    /// [`read_char`]: self::BufRead::read_char
    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
    fn char_iter(&mut self) -> CharIter<'_, Self> {
        // The iterator only needs the reader itself and starts in the "not ended" state.
        let reader = self;
        CharIter {
            reader,
            ended: false,
        }
    }

    /// Returns a mapping iterator over string slices of this reader.
    ///
    /// It is equivalent to calling [`read_str`] in a loop, ignoring
    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
    ///
    /// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
    /// as returned by [`read_str`], and yield instances of [`io::Result`]`<T>`. This may help
    /// avoid the allocations and cloning that [`str_iter`] performs.
    ///
    /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
    /// always be the last item.
    ///
    /// # Examples
    ///
    /// This example simply reads from a stream and counts the number of bytes read:
    ///
    /// ```
    /// use std::io::Cursor;
    /// use utf8_bufread::BufRead;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    /// let count: usize = reader.str_map(|s| s.len()).filter_map(Result::ok).sum();
    /// assert_eq!(count, 21);
    /// ```
    ///
    /// [`read_str`]: self::BufRead::read_str
    /// [`str_iter`]: self::BufRead::str_iter
    /// [`Interrupted`]: std::io::ErrorKind::Interrupted
    fn str_map<F, T>(&mut self, f: F) -> StrMap<'_, Self, F>
    where
        F: FnMut(Cow<str>) -> T,
    {
        // The mapping function is stored behind an `Rc`, as the `StrMap` struct expects.
        let map = Rc::new(f);
        StrMap {
            reader: self,
            map,
            ended: false,
        }
    }

    /// Returns a mapping iterator over codepoints of this reader.
    ///
    /// It is equivalent to calling [`read_codepoint`] in a loop, ignoring
    /// [`ErrorKind`]`::`[`Interrupted`] errors, until EOF or the first error encountered.
    ///
    /// The iterator returned by this function will call `f` with instances of [`Cow`]`<`[`str`]`>`
    /// as returned by [`read_codepoint`], and yield instances of [`io::Result`]`<T>`. This may
    /// help avoid the allocations and cloning that [`codepoint_iter`] performs.
    ///
    /// The iterator returned will yield at most one [`io::Error`], and if one is yielded it will
    /// always be the last item.
    ///
    /// # Examples
    ///
    /// This example simply maps each codepoint to its length in bytes:
    ///
    /// ```
    /// use std::io::Cursor;
    /// use utf8_bufread::BufRead;
    ///
    /// // We could use any type implementing io::BufRead, we'll just use a cursor here
    /// let mut  reader = Cursor::new("Löwe 老虎 Léopard");
    /// let lengths: Vec<_> = reader.codepoint_map(|s| s.len()).filter_map(Result::ok).collect();
    /// assert_eq!(lengths.as_ref(), [1, 2, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1]);
    /// ```
    #[doc(hidden)]
    fn codepoint_map<F, T>(&mut self, f: F) -> CodepointMap<'_, Self, F>
    where
        F: FnMut(Cow<str>) -> T,
    {
        // The mapping function is stored behind an `Rc`, as the `CodepointMap` struct expects.
        let map = Rc::new(f);
        CodepointMap {
            reader: self,
            map,
            ended: false,
        }
    }
}

// Blanket implementation: every `io::BufRead` type gets the default methods of `BufRead`.
impl<R> BufRead for R where R: io::BufRead {}

/// An iterator over the string slices read from an instance of [`io::BufRead`].
///
/// This struct is created by [`str_iter`], see its documentation for more details.
///
/// [`str_iter`]: self::BufRead::str_iter
pub struct StrIter<'r, R: ?Sized> {
    // Reader the string slices are pulled from.
    reader: &'r mut R,
    // Buffer handed out to the caller on each iteration.
    buf: Rc<String>,
    // Capacity given to `buf` when a fresh one has to be allocated.
    default_cap: usize,
    // Set once EOF or an error has been reached.
    ended: bool,
}

impl<R> Iterator for StrIter<'_, R>
where
    R: io::BufRead,
{
    type Item = Result<Rc<String>>;

    // Yields the next string slice read by `read_str`, copied into a shared buffer.
    // Interrupted reads are retried; EOF and any other error end the iteration.
    //noinspection DuplicatedCode
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        // Recycle the buffer when nobody else holds a reference to it anymore, otherwise
        // leave the shared one alone and start from a fresh allocation.
        let buf = match Rc::get_mut(&mut self.buf) {
            Some(reusable) => {
                reusable.clear();
                reusable
            }
            None => {
                self.buf = Rc::new(String::with_capacity(self.default_cap));
                Rc::make_mut(&mut self.buf)
            }
        };
        loop {
            let chunk = match self.reader.read_str() {
                Ok(chunk) => chunk,
                // Interrupted reads are simply retried
                Err(e) if matches!(e.kind(), ErrorKind::Interrupted) => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
            };
            if chunk.is_empty() {
                // EOF
                self.ended = true;
                return None;
            }
            buf.push_str(chunk.as_ref());
            return Some(Ok(Rc::clone(&self.buf)));
        }
    }
}

/// An iterator over the codepoints read from an instance of [`io::BufRead`].
///
/// This struct is created by [`codepoint_iter`], see its documentation for more details.
///
/// [`codepoint_iter`]: self::BufRead::codepoint_iter
#[doc(hidden)]
pub struct CodepointIter<'r, R: ?Sized> {
    // Reader the codepoints are pulled from.
    reader: &'r mut R,
    // Buffer handed out to the caller on each iteration.
    buf: Rc<String>,
    // Capacity given to `buf` when a fresh one has to be allocated.
    default_cap: usize,
    // Set once EOF or an error has been reached.
    ended: bool,
}

impl<R> Iterator for CodepointIter<'_, R>
where
    R: io::BufRead,
{
    type Item = Result<Rc<String>>;

    // Yields the next codepoint read by `read_codepoint`, copied into a shared buffer.
    // Interrupted reads are retried; EOF and any other error end the iteration.
    //noinspection DuplicatedCode
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        // Recycle the buffer when nobody else holds a reference to it anymore, otherwise
        // leave the shared one alone and start from a fresh allocation.
        let buf = match Rc::get_mut(&mut self.buf) {
            Some(reusable) => {
                reusable.clear();
                reusable
            }
            None => {
                self.buf = Rc::new(String::with_capacity(self.default_cap));
                Rc::make_mut(&mut self.buf)
            }
        };
        loop {
            let codepoint = match self.reader.read_codepoint() {
                Ok(codepoint) => codepoint,
                // Interrupted reads are simply retried
                Err(e) if matches!(e.kind(), ErrorKind::Interrupted) => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
            };
            if codepoint.is_empty() {
                // EOF
                self.ended = true;
                return None;
            }
            buf.push_str(codepoint.as_ref());
            return Some(Ok(Rc::clone(&self.buf)));
        }
    }
}

/// A mapping iterator over the string slices read from an instance of [`io::BufRead`].
///
/// This struct is created by [`str_map`], see its documentation for more details.
///
/// [`str_map`]: self::BufRead::str_map
pub struct StrMap<'r, R: ?Sized, F> {
    // Reader the string slices are pulled from.
    reader: &'r mut R,
    // Mapping function applied to each string slice read.
    map: Rc<F>,
    // Set once EOF or an error has been reached.
    ended: bool,
}

impl<R, F, T> Iterator for StrMap<'_, R, F>
where
    R: io::BufRead,
    F: FnMut(Cow<str>) -> T,
{
    type Item = Result<T>;

    /// Reads the next string slice and yields the result of applying the mapping function to it.
    ///
    /// Returns `None` once the reader reports an empty read (EOF) or after an error has been
    /// yielded. Reads failing with [`ErrorKind::Interrupted`] are retried transparently.
    ///
    /// # Panics
    ///
    /// Panics if the mapping function is shared, i.e. the iterator is not its sole owner.
    //noinspection DuplicatedCode
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        loop {
            match self.reader.read_str() {
                // Interrupted reads are simply retried
                Err(e) if e.kind() == ErrorKind::Interrupted => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
                // An empty read means EOF
                Ok(read) if read.is_empty() => {
                    self.ended = true;
                    return None;
                }
                Ok(read) => {
                    let mapper = Rc::get_mut(&mut self.map)
                        .expect("MappingIter's mapping function cannot be shared !");
                    return Some(Ok(mapper(read)));
                }
            }
        }
    }
}

/// A mapping iterator over codepoints of an instance of [`io::BufRead`].
///
/// Every codepoint read is passed to the mapping function, and the value it returns is what the
/// iterator yields.
// NOTE(review): the method creating this type is not visible from here. Earlier documentation
// pointed at `str_map`, which `StrMap` names as its own constructor too - confirm which method
// is meant and link it.
#[doc(hidden)]
pub struct CodepointMap<'r, R: ?Sized, F> {
    // Reader the codepoints are pulled from.
    reader: &'r mut R,
    // Mapping function; the iterator needs to be its only owner to call it.
    map: Rc<F>,
    // Set on EOF or on a non-retryable error; once set, the iterator only yields `None`.
    ended: bool,
}

impl<R, F, T> Iterator for CodepointMap<'_, R, F>
where
    R: io::BufRead,
    F: FnMut(Cow<str>) -> T,
{
    type Item = Result<T>;

    /// Reads the next codepoint and yields the result of applying the mapping function to it.
    ///
    /// Returns `None` once the reader reports an empty read (EOF) or after an error has been
    /// yielded. Reads failing with [`ErrorKind::Interrupted`] are retried transparently.
    ///
    /// # Panics
    ///
    /// Panics if the mapping function is shared, i.e. the iterator is not its sole owner.
    //noinspection DuplicatedCode
    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        loop {
            let codepoint = match self.reader.read_codepoint() {
                Ok(codepoint) => codepoint,
                // Interrupted reads are simply retried
                Err(e) if e.kind() == ErrorKind::Interrupted => continue,
                Err(e) => {
                    self.ended = true;
                    return Some(Err(e));
                }
            };
            if codepoint.is_empty() {
                // An empty read means EOF
                self.ended = true;
                return None;
            }
            let mapper = Rc::get_mut(&mut self.map)
                .expect("MappingIter's mapping function cannot be shared !");
            return Some(Ok(mapper(codepoint)));
        }
    }
}

/// An iterator over chars of an instance of [`io::BufRead`], created by [`char_iter`], see its
/// documentation for more details.
///
/// Each item is a single [`char`] read from the underlying reader.
///
/// [`char_iter`]: self::BufRead::char_iter
pub struct CharIter<'r, R: ?Sized> {
    // Reader the chars are pulled from.
    reader: &'r mut R,
    // Set on EOF or on an error; once set, the iterator only yields `None`.
    ended: bool,
}

impl<R> Iterator for CharIter<'_, R>
where
    R: io::BufRead,
{
    type Item = Result<char>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.ended {
            return None;
        }
        match self.reader.read_char() {
            Ok(c) => {
                if c == '\0' {
                    self.ended = true;
                    None
                } else {
                    Some(Ok(c))
                }
            }
            Err(e) => {
                self.ended = true;
                Some(Err(e))
            }
        }
    }
}

fn read_across_boundary<R>(reader: &mut R, mut leftovers: Vec<u8>) -> Result<Cow<str>>
where
    R: io::BufRead + ?Sized,
{
    debug_assert!(!leftovers.is_empty());
    // We know leftovers is not empty
    let len = codepoint_length(leftovers[0]);
    let first_read_len = leftovers.len();
    debug_assert!(len > first_read_len);
    let additional_len = (len - first_read_len) as usize;
    // Let's try reading more bytes
    let additional_bytes = &reader.fill_buf()?;
    if additional_bytes.len() < additional_len {
        // Not enough additional bytes, we reached EOF on an incomplete codepoint
        return Err(Error::from(ErrorKind::UnexpectedEof).with_leftovers(leftovers));
    }
    // we know we have enough data
    leftovers.extend_from_slice(&additional_bytes[..additional_len]);
    reader.consume(additional_len);
    match String::from_utf8(leftovers) {
        Ok(s) => Ok(Cow::from(s)),
        // We read enough bytes, they simply were not valid
        Err(e) => Err(Error::from(e)),
    }
}

/// Returns the total length in bytes of a utf-8 sequence, deduced from its first byte `x`.
///
/// No validation is performed: bytes that cannot start a sequence still get a length, `2` for
/// continuation bytes (`0x80..=0xBF`) and `4` for anything from `0xF0` up.
#[inline]
fn codepoint_length(x: u8) -> usize {
    match x {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}

#[cfg(test)]
mod read_str_tests {
    use crate::BufRead;
    use std::io::{BufReader, Cursor, ErrorKind};
    use std::str::Utf8Error;
    use std::string::FromUtf8Error;

    // An empty source is not an error, it reads as an empty string.
    #[test]
    fn empty_read() {
        let mut reader = Cursor::new("");
        let read = reader.read_str().unwrap();
        assert!(read.is_empty());
    }

    // Invalid bytes sitting entirely in the buffer are reported as invalid data wrapping a
    // `Utf8Error`.
    #[test]
    fn invalid_in_buffer() {
        let mut reader = Cursor::new([0x9fu8, 0x92, 0x96, 0x0]);
        let err = reader.read_str().unwrap_err();
        assert_eq!(err.kind(), ErrorKind::InvalidData);
        let inner = err.into_inner_checked().unwrap().unwrap();
        assert!(inner.is::<Utf8Error>());
    }

    // A codepoint missing its last byte, with the whole input in the buffer, is an unexpected
    // EOF carrying the bytes read so far and no inner error.
    #[test]
    fn incomplete_in_buffer() {
        let mut reader = Cursor::new(&"💖".as_bytes()[..3]);
        let err = reader.read_str().unwrap_err();
        assert_eq!(err.kind(), ErrorKind::UnexpectedEof);
        assert!(!err.leftovers().is_empty());
        assert!(err.into_inner_lossy().is_none());
    }

    // Invalid bytes spread over two buffer fills are reported as invalid data wrapping a
    // `FromUtf8Error`, along with the bytes read.
    #[test]
    fn invalid_across_boundary() {
        let mut reader = BufReader::<&[u8]>::with_capacity(2, [0xffu8, 0x92, 0x96, 0x0].as_ref());
        let err = reader.read_str().unwrap_err();
        assert_eq!(err.kind(), ErrorKind::InvalidData);
        assert!(!err.leftovers().is_empty());
        let inner = err.into_inner_lossy().unwrap();
        assert!(inner.is::<FromUtf8Error>());
    }

    // A codepoint missing its last byte and spread over two buffer fills is an unexpected EOF
    // with no inner error.
    #[test]
    fn incomplete_across_boundary() {
        let mut reader = BufReader::<&[u8]>::with_capacity(2, &"💖".as_bytes()[..3]);
        let err = reader.read_str().unwrap_err();
        assert_eq!(err.kind(), ErrorKind::UnexpectedEof);
        assert!(err.into_inner_lossy().is_none());
    }

    // A single codepoint fitting in the buffer is read back as is.
    #[test]
    fn complete_successful_read() {
        let mut reader = Cursor::new("💖");
        let read = reader.read_str().unwrap();
        assert_eq!(read, "💖");
    }

    // Valid text followed by invalid bytes: the first read returns the valid part only.
    #[test]
    fn incomplete_successful_read() {
        let mut reader = Cursor::new([0x6fu8, 0xa, 0x9f, 0x92, 0x96, 0x0]);
        let read = reader.read_str().unwrap();
        assert_eq!(read, "o\n");
    }

    // A codepoint larger than the buffer is put back together from two buffer fills.
    #[test]
    fn read_across_boundary() {
        let mut reader = BufReader::<&[u8]>::with_capacity(2, "💖".as_ref());
        let read = reader.read_str().unwrap();
        assert_eq!(read, "💖");
    }

    // Codepoints of various lengths are read in one go, the following read reports EOF.
    #[test]
    fn multi_codepoints_read() {
        let mut reader = Cursor::new("foo💖bär€");
        let first = reader.read_str().unwrap();
        assert_eq!(first, "foo💖bär€");
        let second = reader.read_str().unwrap();
        assert_eq!(second, "");
    }
}

// Tests of `read_str` on readers whose buffer capacity (1 to 3 bytes) is smaller than or
// barely as large as the codepoints being read.
#[cfg(test)]
mod buf_too_small_tests {
    // Generates one test reading `$input` through a `BufReader` of capacity `$cap`.
    //
    // - `success`: every call to `read_str` must succeed until EOF (an empty string), and at
    //   least one call must return data.
    // - `failure`: calls to `read_str` must return data until one of them fails with an
    //   `UnexpectedEof` error carrying leftovers and no inner error; reaching EOF without
    //   failing is a test failure.
    //
    // `BufRead`, `BufReader` and (for `failure`) `ErrorKind` are not imported by the macro,
    // they have to be in scope in the module invoking it.
    macro_rules! buf_too_small_test {
        ($name:ident $cap:literal $input:literal: success) => {
            #[test]
            fn $name() {
                let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                let mut call_count = 0;
                // Reading until EOF
                loop {
                    let s = r.read_str();
                    assert!(s.is_ok());
                    let s = s.unwrap();
                    if s.is_empty() {
                        break;
                    } else {
                        call_count += 1;
                    }
                }
                // Asserting we did not encounter EOF on the first call
                assert_ne!(call_count, 0);
            }
        };
        ($name:ident $cap:literal $input:literal: failure) => {
            #[test]
            fn $name() {
                let mut r = BufReader::<&[u8]>::with_capacity($cap, $input.as_bytes());
                // Reading until we fail
                loop {
                    let e = r.read_str();
                    match e {
                        Ok(s) => {
                            // We shouldn't reach EOF without failing a read
                            assert!(!s.is_empty());
                        }
                        Err(e) => {
                            // The codepoint could not be completed: the bytes read so far are
                            // handed back and there is no underlying error
                            assert_eq!(e.kind(), ErrorKind::UnexpectedEof);
                            assert!(!e.leftovers().is_empty());
                            let e = e.into_inner_lossy();
                            assert!(e.is_none());
                            break;
                        }
                    }
                }
            }
        };
    }
    // Test names read `codepoint_length_<bytes>_offset_<n>`: a codepoint of `<bytes>` bytes
    // preceded by `<n>` one-byte characters.

    // 1-byte buffer: codepoints of 3 bytes or more are expected to fail.
    mod buf_capacity_1 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 1 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 1 "ä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 1 "€": failure);
        buf_too_small_test!(codepoint_length_4_offset_0 1 "💖": failure);
    }

    // 2-byte buffer: only a 4-byte codepoint starting at offset 1 is expected to fail.
    mod buf_capacity_2 {
        use crate::BufRead;
        use std::io::{BufReader, ErrorKind};

        buf_too_small_test!(codepoint_length_1_offset_0 2 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 2 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 2 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 2 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 2 "x€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 2 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 2 "x💖": failure);
    }

    // 3-byte buffer: every case is expected to succeed, hence no `ErrorKind` import.
    mod buf_capacity_3 {
        use crate::BufRead;
        use std::io::BufReader;

        buf_too_small_test!(codepoint_length_1_offset_0 3 "f": success);
        buf_too_small_test!(codepoint_length_2_offset_0 3 "ä": success);
        buf_too_small_test!(codepoint_length_2_offset_1 3 "xä": success);
        buf_too_small_test!(codepoint_length_3_offset_0 3 "€": success);
        buf_too_small_test!(codepoint_length_3_offset_1 3 "x€": success);
        buf_too_small_test!(codepoint_length_3_offset_2 3 "xx€": success);
        buf_too_small_test!(codepoint_length_4_offset_0 3 "💖": success);
        buf_too_small_test!(codepoint_length_4_offset_1 3 "x💖": success);
        buf_too_small_test!(codepoint_length_4_offset_2 3 "xx💖": success);
    }
}