//! This crate provides a trait, [`BufRead`], providing functions to read utf-8 text streams
//! using an [`io::BufRead`] without waiting for newline delimiters.
//!
//! # Quick Start
//!
//! The simplest way to read a whole file with a [`BufRead`] type is to repeatedly call its
//! [`read_utf8`] method:
//!
//! ```
//! use utf8_bufread::BufRead;
//! use std::io::BufReader;
//!
//! // Reader may be any type implementing io::BufRead
//! // We'll just use a BufReader wrapping a slice for this example
//! let mut reader = BufReader::<&[u8]>::new("💖".as_ref());
//! // The string we'll use to store the text of the read file
//! let mut text = String::new();
//! loop { // Loop until EOF
//!     match reader.read_utf8(&mut text) {
//!         Ok(0) => break, // EOF
//!         Ok(_) => continue,
//!         Err(e) => panic!("{}", e), // io::Error, possibly caused by a Utf8Error
//!     }
//! }
//! assert_eq!("💖", text.as_str());
//! ```
//!
//! *Note that this example does exactly what this crate tries to avoid: storing the whole file in
//! memory.*
//!
//! See [`BufRead`]'s documentation for more.
//!
//! [`BufRead`]: self::BufRead
//! [`io::BufRead`]: std::io::BufRead
//! [`read_utf8`]: self::BufRead::read_utf8

use std::cell::Cell;
use std::io::{self, Error, ErrorKind};
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::str::{from_utf8, from_utf8_unchecked};

#[deny(missing_crate_level_docs, missing_docs, missing_doc_code_examples)]

/// A trait implemented for all types implementing [`io::BufRead`], providing  functions to
/// read utf-8 text streams without waiting for newline delimiters.
///
/// [`io::BufRead`]: std::io::BufRead
pub trait BufRead: io::BufRead {
    /// Read some bytes from the inner reader, and push their utf-8 representation in the provided
    /// `buf`. Return the number of bytes read as a [`io::Result`]`<`[`usize`]`>`.
    ///
    /// This functions calls [`with_utf8_chunk`] and push passed `&`[`str`] to `buf` (which means
    /// it clones the bytes), see its documentation for more info.
    ///
    /// # Errors
    ///
    /// This function follows the same error policy as [`with_utf8_chunk`].
    ///
    /// [`io::Result`]: std::io::Result
    /// [`with_utf8_chunk`]: self::BufRead::with_utf8_chunk
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We give the buffer more than enough capacity to be able to read all the bytes in one
    /// // call
    /// let mut reader = BufReader::with_capacity(
    ///     16,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    /// let mut buf = String::new();
    ///
    /// // On the first read_utf8() call, we will read up to the first byte of the invalid
    /// // codepoint (ie "foo\nbar")
    /// let n_read = reader
    ///     .read_utf8(&mut buf)
    ///     .expect("We will get all the valid bytes without error");
    /// assert_eq!("foo\nbar", buf.as_str());
    /// assert_eq!(7, n_read);
    ///
    /// // Then on the second call we will get the InvalidData error caused by the Utf8Error error,
    /// // as there is no bytes forming valid codepoints left
    /// let read_err = reader.read_utf8(&mut buf).expect_err("We will get an error");
    /// assert_eq!(ErrorKind::InvalidData, read_err.kind());
    /// assert_eq!(7, buf.len());  // no byte appended to buf
    /// ```
    fn read_utf8(&mut self, buf: &mut String) -> io::Result<usize> {
        self.with_utf8_chunk(|s| buf.push_str(s))
    }

    /// Read some bytes from the inner reader, and call provided function with a reference to read
    /// data as an UTF-8 [`str`]. Returns the number of bytes read as a
    /// [`io::Result`]`<`[`usize`]`>`.
    ///
    /// `f` is called if and only if we read a non-zero amount of valid UTF-8 bytes.
    ///
    /// If the operation is successful, this function returns the number of bytes read. Note this
    /// may **not** be the number of [`char`]s read, as UTF-8 is a variable-length encoding.
    ///
    /// If this function returns [`Ok(0)`], the stream has reached EOF.
    ///
    /// This function will read bytes from the underlying stream until its buffer is full, an
    /// invalid or incomplete codepoint is found, or EOF is found. Once found, all codepoints
    /// up to, including the EOF (if found), but not including the invalid or incomplete codepoint
    /// (if found), will be passed as `f`'s argument. Note this may allow you to manipulate the
    /// [`str`] without cloning data.
    ///
    /// This function avoids the usual issues of using [`BufRead`]`::`[`read_line`]`(&self, &mut `
    /// [`String`]`)` or [`BufRead`]`::`[`lines`]`(&self)` on big text file without newline
    /// delimiters: It will not load the whole file in memory.
    ///
    /// The amount of byte read depends on the size of the underlying buffer as well as previous
    /// calls. It cannot exceed the size of the buffer, unless it is not big enough to fit a
    /// unicode codepoint.
    ///
    /// # Errors
    ///
    /// This function will immediately return any errors returned by [`fill_buf`].
    ///
    /// If an [`Utf8Error`] is returned by the internal call to [`from_utf8`], all valid codepoints
    /// are returned, and no error is returned, unless no valid codepoints were read. This
    /// allows not to lose any valid data, and the error will be returned on the next call.
    ///
    /// If the first codepoint encountered by [`from_utf8`] is invalid, an
    /// [`ErrorKind`]`::`[`InvalidData`] caused by an [`Utf8Error`] is returned. You can still read
    /// bytes from this reader but any convertion to UTF-8 will fail.
    ///
    /// If EOF is encountered on an incomplete codepoint, an [`ErrorKind`]`::`[`UnexpectedEof`] is
    /// returned.
    ///
    /// Note this function will return an [`ErrorKind`]`::`[`InvalidInput`] if the buffer of this
    /// reader is too small to read a unicode codepoint. Currently, a buffer of size `1` will
    /// always reading any non-ascii codepoint, and a buffer of size `2` may or may not cause this
    /// function to fail. A buffer of size `3` will allow this function to read any codepoint
    /// correctly.
    ///
    /// [`io::Result`]: std::io::Result
    /// [`Ok(0)`]: Ok
    /// [`BufRead`]: std::io::BufRead
    /// [`read_line`]: std::io::BufRead::read_line
    /// [`lines`]: std::io::BufRead::lines
    /// [`fill_buf`]: std::io::BufRead::fill_buf
    /// [`Utf8Error`]: std::str::Utf8Error
    /// [`from_utf8`]: std::str::from_utf8
    /// [`ErrorKind`]: std::io::ErrorKind
    /// [`InvalidData`]: std::io::ErrorKind::InvalidData
    /// [`UnexpectedEof`]: std::io::ErrorKind::UnexpectedEof
    /// [`InvalidInput`]: std::io::ErrorKind::InvalidInput
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We give the buffer more than enough capacity to be able to read all the bytes in one
    /// // call
    /// let mut reader = BufReader::with_capacity(
    ///     16,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    /// // We will store data in this buffer while inside passed closure
    /// let mut buf = String::new();
    ///
    /// // On the first read_utf8() call, we will read up to the first byte of the invalid
    /// // codepoint (ie "foo\nbar")
    /// let n_read = reader
    ///     .with_utf8_chunk(|s| buf.push_str(s))
    ///     .expect("We will get all the valid bytes without error");
    /// assert_eq!("foo\nbar", buf.as_str());
    /// assert_eq!(7, n_read);
    ///
    /// // Then on the second call we will get the InvalidData error caused by the Utf8Error error,
    /// // as there is no bytes forming valid codepoints left
    /// // Passed closure will not be called
    /// let mut is_called = false;
    /// let read_err = reader.with_utf8_chunk(|_| {is_called = true;})
    ///     .expect_err("We will get an error");
    /// assert_eq!(ErrorKind::InvalidData, read_err.kind());
    /// assert!(!is_called);
    /// ```
    fn with_utf8_chunk<F>(&mut self, f: F) -> io::Result<usize>
    where
        F: FnOnce(&str),
    {
        // Fill the buffer from inner reader's data and get its content
        let read_bytes = match self.fill_buf() {
            Ok(r) => r,
            // We do not handle `ErrorKind::Interrupt`
            Err(e) => return Err(e),
        };
        // We attempt converting read bytes to utf8
        match from_utf8(read_bytes) {
            Ok(s) => {
                let used = read_bytes.len();
                f(s);
                self.consume(used);
                Ok(used)
            }
            Err(e) => {
                // If we have an error, we will first attempt to return all valid read bytes,
                // putting the invalid or incomplete codepoint at the beginning of the buffer.
                // This allows us to recover from reading up to a byte that isn't on a char
                // boundary by reading the complete codepoint on the next call
                let used = e.valid_up_to();
                if used == 0 {
                    // If we cannot decode any valid utf8 byte from the buffer, it either means
                    // - We reached EOF with an incomplete codepoint, we should return an
                    //   Utf8Error
                    // - There was a parse error earlier, and we read everything up to this
                    //   point in a previous read call, there is two possible situations again:
                    //   - There is more than 2 bytes following the first byte of the invalid
                    //     slice, this means there truly is an invalid codepoint, we should
                    //     return an Utf8Error
                    //   - There is less than 4 bytes left in the buffer, meaning we may have
                    //     an incomplete codepoint and need to read up to 3 bytes further.
                    if read_bytes.len() < 4 {
                        let mut v = Vec::from(read_bytes);
                        // Consume the last bytes, so that the next call to `fill_buff` will read
                        // more bytes from the underlying stream
                        self.consume(v.len());
                        // Let's try reading more bytes
                        let additional_bytes = match self.fill_buf() {
                            Ok(r) => r,
                            // We do not handle `ErrorKind::Interrupt`
                            Err(e) => return Err(e),
                        };
                        if additional_bytes.is_empty() {
                            // No additional bytes, we reached EOF on an incomplete codepoint
                            return Err(Error::from(ErrorKind::UnexpectedEof));
                        } else if additional_bytes.len() + v.len() < 4 {
                            // If this is true we may not be able to read a codepoint across the
                            // buffer boundary
                            return Err(Error::new(
                                ErrorKind::InvalidInput,
                                format!(
                                    "Internal buffer capacity of at least 3 bytes expected to be \
                                    able to read utf-8, but it is: {}",
                                    // One of the two must be from a read filling all the buffer
                                    // for above check to be true
                                    additional_bytes.len().max(v.len())
                                ),
                            ));
                        }
                        // Try adding bytes until our incomplete codepoint is complete, up to 3
                        for (i, b) in additional_bytes.iter().enumerate() {
                            v.push(*b);
                            if let Ok(s) = from_utf8(v.as_slice()) {
                                // Hurray, we got a valid codepoint
                                f(s);
                                // Don't forget to tell BufRead we consumed those bytes
                                self.consume(i + 1);
                                return Ok(v.len());
                            }
                        }
                    }
                    // We couldn't get a valid codepoint, return Utf8Error
                    return Err(Error::new(ErrorKind::InvalidData, e));
                }
                // This is safe, see `Utf8Error::valid_up_to(&self)` doc
                f(unsafe { from_utf8_unchecked(&read_bytes[..used]) });
                self.consume(used);
                Ok(used)
            }
        }
    }

    /// Takes a closure and creates an [`Iterator`] which calls that closure on each read chunk of
    /// data.
    ///
    /// This is equivalent to calling [`with_utf8_chunk`] in a loop.
    ///
    /// The created iterator will stop when reaching EOF or an invalid UTF-8 byte. If you wish to
    /// know the cause, see [`map_utf8_results`]
    ///
    /// [`with_utf8_chunk`]: self::BufRead::with_utf8_chunk
    /// [`map_utf8_results`]: self::BufRead::map_utf8_results
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We do not give the buffer enough capacity to read the whole slice in one call, just to
    /// // make it iterate more than once for this example
    /// let mut reader = BufReader::with_capacity(
    ///     4,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    ///
    /// // We read all the data we can, and sum the substrings length
    /// assert_eq!(7usize, reader.map_utf8(|s| s.len()).sum());
    /// ```
    fn map_utf8<F, T>(&mut self, map: F) -> ChunkSliceMap<'_, F, T, Self>
    where
        F: FnMut(&str) -> T,
    {
        ChunkIter {
            inner: Cell::new(Some(self)),
            map,
            phantom: Default::default(),
        }
    }

    /// Takes a closure and creates an [`Iterator`] which calls that closure on each read chunk of
    /// data with either an [`Ok`] containing the read `&`[`str`], or the error returned by
    /// [`with_utf8_chunk`].
    ///
    /// The created iterator will stop when reaching EOF or an invalid UTF-8 byte.
    ///
    /// [`with_utf8_chunk`]: self::BufRead::with_utf8_chunk
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We do not give the buffer enough capacity to read the whole slice in one call, just to
    /// // make it iterate more than once for this example
    /// let mut reader = BufReader::with_capacity(
    ///     4,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    ///
    /// let err = reader
    ///     // Take the length of the string or the returned error
    ///     .map_utf8_results(|r| match r { Ok(s) => Ok(s.len()), Err(e) => Err(e)})
    ///     // Sum strings length, but returns the error if encountered
    ///     // Iterator stops after returning an error, so no need to short-circuit
    ///     .fold(Ok(0), |acc, r| if let Ok(n) = r { Ok(n + acc.unwrap()) } else { r } )
    ///     // We are getting an error since we have invalid bytes
    ///     .unwrap_err();
    /// assert_eq!(ErrorKind::InvalidData, err.kind());
    /// ```
    fn map_utf8_results<F, T>(&mut self, map: F) -> ChunkResultMap<'_, F, T, Self>
    where
        F: FnMut(io::Result<&str>) -> T,
    {
        ChunkIter {
            inner: Cell::new(Some(self)),
            map,
            phantom: Default::default(),
        }
    }

    /// Creates an [`Iterator`] over the chunks of utf8 data read by this reader.
    ///
    /// This is equivalent to creating a new [`String`] and calling [`read_utf8`] in a loop.
    ///
    /// The created iterator will stop when reaching EOF or an invalid UTF-8 byte. If you wish to
    /// know the cause, see [`iter_utf8_results`].
    ///
    /// Note returned iterator always clones the data read from the reader, regardless if it is
    /// later thrown away.
    ///
    /// [`read_utf8`]: self::BufRead::read_utf8
    /// [`iter_utf8_results`]: self::BufRead::iter_utf8_results
    ///
    /// # Examples
    ///
    /// *Note the following example involves cloning each read chunk two times.*
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::BufReader;
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We do not give the buffer enough capacity to read the whole slice in one call, just to
    /// // make it iterate more than once for this example
    /// let mut reader = BufReader::with_capacity(
    ///     4,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    ///
    /// // Getting all valid data until EOF or invalid codepoint
    /// let text: String = reader.iter_utf8().collect();
    /// assert_eq!("foo\nbar", text.as_str());
    /// ```
    fn iter_utf8(&mut self) -> ChunkSliceIter<'_, Self> {
        ChunkIter {
            inner: Cell::new(Some(self)),
            map: |s| String::from(s),
            phantom: Default::default(),
        }
    }

    /// Creates an [`Iterator`] over the chunks of utf8 data read by this reader.
    ///
    /// This is equivalent to creating a new [`String`] and calling [`read_utf8`] in a loop.
    ///
    /// Note returned iterator always clones the data read from the reader, regardless if it is
    /// later thrown away.
    ///
    /// [`read_utf8`]: self::BufRead::read_utf8
    ///
    /// # Examples
    ///
    /// *Note the following example still involves cloning each read chunk one time.*
    ///
    /// ```
    /// use utf8_bufread::BufRead;
    /// use std::io::{BufReader, ErrorKind};
    ///
    /// // "foo\nbar" + some invalid bytes
    /// // We do not give the buffer enough capacity to read the whole slice in one call, just to
    /// // make it iterate more than once for this example
    /// let mut reader = BufReader::with_capacity(
    ///     4,
    ///     [0x66u8, 0x6f, 0x6f, 0xa, 0x62, 0x61, 0x72, 0x9f, 0x92, 0x96, 0x0].as_ref(),
    /// );
    ///
    /// // We just take the last element which should be the error cause by the invalid bytes
    /// let err = reader.iter_utf8_results().last().unwrap();
    /// assert!(err.is_err());
    /// assert_eq!(ErrorKind::InvalidData, err.unwrap_err().kind());
    /// ```
    fn iter_utf8_results(&mut self) -> ChunkResultIter<'_, Self> {
        ChunkIter {
            inner: Cell::new(Some(self)),
            map: |r| match r {
                Ok(s) => Ok(String::from(s)),
                Err(e) => Err(e),
            },
            phantom: Default::default(),
        }
    }
}

// Blanket implementation: every type implementing `io::BufRead` gets the provided methods of
// our `BufRead` trait. The body is empty because all the methods have a default implementation.
impl<R: io::BufRead> BufRead for R {}

/// Iterator over the chunks of UTF-8 data read from a reader, returned by the `map_utf8`,
/// `map_utf8_results`, `iter_utf8` and `iter_utf8_results` methods of `BufRead`.
///
/// Each call to `next` reads one chunk from the reader and yields the value returned by the
/// `map` closure for that chunk.
///
/// - `R` is the type of the borrowed reader.
/// - `F` is the type of the closure called on each chunk.
/// - `A` is the type of the closure's argument.
/// - `T` is the type returned by the closure, which is also the iterator's item type.
pub struct ChunkIter<'r, R, F, A, T>
where
    R: ?Sized,
    F: FnOnce(A) -> T,
{
    // The borrowed reader. It is stored in a `Cell<Option<_>>` so that `next` can move the
    // mutable reference out while reading and put it back afterwards. The `io::Result` flavour
    // of the iterator leaves it empty after yielding an error, which ends the iteration.
    inner: Cell<Option<&'r mut R>>,
    // The closure called on each read chunk
    map: F,
    // This field allows us to put A as a generic type for this struct which allows us to implement
    // the iterator trait for different map input's arguments types
    // Otherwise, rust would not consider the types to be different and would cause an E0119
    phantom: PhantomData<*const A>,
}

// Just some aliases, because spelling out the closure argument type `A` of `ChunkIter` in every
// signature would introduce quite the boilerplate
// Returned by `map_utf8`: the closure receives each chunk as a `&str`
type ChunkSliceMap<'s, F, T, R> = ChunkIter<'s, R, F, &'s str, T>;
// Returned by `map_utf8_results`: the closure receives each chunk or error as an `io::Result`
type ChunkResultMap<'s, F, T, R> = ChunkIter<'s, R, F, io::Result<&'s str>, T>;
// Returned by `iter_utf8`: each chunk is mapped to an owned `String`
type ChunkSliceIter<'s, R> = ChunkIter<'s, R, fn(&str) -> String, &'s str, String>;
// Returned by `iter_utf8_results`: each chunk or error is mapped to an `io::Result<String>`
type ChunkResultIter<'s, R> = ChunkIter<
    's,
    R,
    fn(io::Result<&str>) -> io::Result<String>,
    io::Result<&'s str>,
    io::Result<String>,
>;

/// Iteration over the read chunks as `&str`: the closure is called with each chunk and the
/// iteration ends (returns `None`) on EOF as well as on any error.
impl<R, F, T> Iterator for ChunkIter<'_, R, F, &str, T>
where
    R: io::BufRead,
    F: FnMut(&str) -> T,
{
    type Item = T;

    /// Reads one chunk from the reader and returns the closure's result for it, or `None` if
    /// EOF was reached or if reading failed.
    ///
    /// # Panics
    ///
    /// Panics if the reader is missing from the cell, which only happens if a previous call
    /// unwound before putting it back.
    fn next(&mut self) -> Option<T> {
        // We move the reader out of the cell, so that the closure below can borrow `self` (to
        // call `self.map`) while the reader is mutably borrowed by `with_utf8_chunk`.
        // This function requires a mutable reference to self, so nobody else can observe the
        // cell while it is empty.
        let reader = self
            .inner
            .take()
            .expect("No inner reader in ChunkMapIter !");
        // Stays `None` unless the closure is called. Using an `Option` rather than
        // uninitialized memory keeps this safe and makes sure the value is always dropped.
        let mut mapped = None;
        let outcome = reader.with_utf8_chunk(|s| mapped = Some((self.map)(s)));
        // Put back our reader in the cell
        self.inner.set(Some(reader));
        match outcome {
            // EOF or error: stop here, the error itself is discarded
            Ok(0) | Err(_) => None,
            Ok(_) => mapped,
        }
    }
}

/// Iteration over the read chunks as `io::Result<&str>`: the closure is called with each chunk,
/// or with the error that prevented reading it. The iteration ends on EOF, or right after an
/// error has been passed to the closure.
impl<R, F, T> Iterator for ChunkIter<'_, R, F, io::Result<&str>, T>
where
    R: io::BufRead,
    F: FnMut(io::Result<&str>) -> T,
{
    type Item = T;

    /// Reads one chunk from the reader and returns the closure's result for it (or for the
    /// error encountered), or `None` if EOF was reached or an error was previously returned.
    fn next(&mut self) -> Option<T> {
        // We move the reader out of the cell, so that the closure below can borrow `self` (to
        // call `self.map`) while the reader is mutably borrowed by `with_utf8_chunk`.
        // A none here means we previously had an Error, we shouldn't iterate further
        let reader = self.inner.take()?;
        // Stays `None` unless the closure is called. Using an `Option` rather than
        // uninitialized memory keeps this safe and makes sure the value is always dropped.
        let mut mapped = None;
        let outcome = reader.with_utf8_chunk(|s| mapped = Some((self.map)(Ok(s))));
        match outcome {
            Ok(n_read) => {
                // Put back our reader in the cell
                self.inner.set(Some(reader));
                // Reading nothing means EOF, which ends the iteration
                if n_read == 0 {
                    None
                } else {
                    mapped
                }
            }
            // Returning without putting back the reader in the cell, so that the next call
            // returns `None`
            Err(e) => Some((self.map)(Err(e))),
        }
    }
}