#![no_std]
//! # Rut
//! 
//! Rut is a small UTF-8 parsing library for applications that need to parse individual `char`s.\
//! It provides a byte-wise parsing mechanism, and functions for processing byte slices.
//! 
//! It is completely `#[no_std]` and should produce very small binaries.<sup>[[*citation needed*](index.html)]</sup>
//! 
//! # Conformance
//! 
//! Rut aims to be fully conformant to the specifications and restrictions of the [Unicode standard].\
//! Due to the nature of byte-wise parsing, some [extra caution] might be required when using Rut.
//! 
//! The [`parse_one`] and [`parse`] functions take care of this.
//! 
//! # Testing
//! 
//! A few tests validating the expected behavior are already in place, but the test suite is not comprehensive by any means yet.
//! More tests will be added.
//! 
//! I have thrown a fuzzer at it for several minutes, and it passes this [stress test for UTF-8 decoders](https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt).
//!
//! # Examples
//!
//! ```
//! use rut::Utf8Parser;
//!
//! // UTF-8 encoding of '€'
//! let bytes = [0xE2, 0x82, 0xAC];
//!
//! let mut p = Utf8Parser::new();
//! 
//! assert_eq!(p.parse_byte(bytes[0]), Ok(None));
//! assert_eq!(p.parse_byte(bytes[1]), Ok(None));
//! assert_eq!(p.parse_byte(bytes[2]), Ok(Some('€')));
//! ```
//!
//! [Unicode standard]: http://www.unicode.org/standard/standard.html
//! [extra caution]: struct.Utf8Parser.html#conformance
//! [`parse_one`]: fn.parse_one.html
//! [`parse`]: fn.parse.html

/// A UTF-8 parser error.
///
/// All variants except `TruncatedSequence` are produced by `Utf8Parser::parse_byte`;
/// `TruncatedSequence` is only produced by the slice-based `parse_one` (and therefore `parse`).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Utf8ParseError {
    /// A byte value of `C0`, `C1`, or `F5..=FF` was encountered.
    InvalidByte,
    /// A continuation byte (`80..=BF`) was encountered outside of a sequence.
    UnexpectedContinuation,
    /// A sequence was terminated by a byte that isn't a valid continuation.
    ///
    /// The offending byte is not consumed as part of a character and should be parsed again.
    BrokenSequence,
    /// An [overlong encoding](https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings) was encountered.
    OverlongEncoding,
    /// An encoding which would result in an invalid Unicode scalar value (U+D800-U+DFFF or > U+10FFFF) was encountered.
    InvalidCodePoint,
    /// The end of input was reached before the sequence could be fully parsed.
    TruncatedSequence,
}

/// A byte-wise UTF-8 parser.
///
/// Bytes are fed in one at a time through `parse_byte`; the parser keeps the
/// state of the sequence currently being decoded between calls.
#[derive(Copy, Clone, Debug, Default)]
pub struct Utf8Parser {
    // Code point bits accumulated from the bytes consumed so far in the current sequence.
    value: u32,
    // Number of continuation bytes still expected in this sequence.
    // Zero means the parser is in the base state (not inside a sequence).
    length: u8,
    // Leading byte of this sequence.
    // Set back to 0 once the first continuation byte has passed the
    // overlong / invalid code point checks, so those checks run only once.
    lead: u8,
}

/// Bitmasks selecting the payload bits of a leading byte:
/// 110x xxxx for 2 byte,
/// 1110 xxxx for 3 byte, and
/// 1111 0xxx for 4 byte sequences.
///
/// Indexed by the number of continuation bytes in the sequence minus one.
const LEAD_MASK: [u8; 3] = [0x1F, 0x0F, 0x07];

/// Bitmask selecting the six payload bits of a continuation byte.
/// Continuation bytes always look like 10xx xxxx.
const CONT_MASK: u8 = 0x3F;

impl Utf8Parser {
    /// Creates a parser in the base state, i.e. not inside any sequence.
    ///
    /// Equivalent to `Utf8Parser::default()`.
    #[inline]
    pub fn new() -> Utf8Parser {
        Self::default()
    }

    /// Puts the parser back into the base state, abandoning any partially parsed sequence.
    ///
    /// Calling this by hand is rarely needed: after `parse_byte` reports an error,
    /// the parser is already back in the base state.
    #[inline]
    pub fn reset(&mut self) {
        // Clearing `length` is sufficient: `value` and `lead` are overwritten
        // as soon as `parse_byte` accepts the next leading byte.
        self.length = 0;
    }

    /// Reports whether the parser is currently in the middle of a multi-byte sequence.
    ///
    /// The parser has no notion of 'end of input', so callers can use this after
    /// feeding their last byte to detect a sequence that was never completed.
    #[inline]
    pub fn is_mid_parse(&self) -> bool {
        self.length > 0
    }

    /// Feeds a single byte of UTF-8 input to the parser.
    ///
    /// Returns `Ok(Some(char))` when `byte` completes a character,
    /// `Ok(None)` when `byte` was accepted but more bytes are needed,
    /// and `Err(Utf8ParseError)` when `byte` cannot be accepted.
    ///
    /// # Errors
    ///
    /// * `InvalidByte` - `byte` is `C0`, `C1` or `F5..=FF`.
    /// * `UnexpectedContinuation` - a continuation byte arrived while no sequence was in progress.
    /// * `BrokenSequence` - a sequence was in progress and `byte` is not a continuation byte.
    /// * `OverlongEncoding` / `InvalidCodePoint` - the first continuation byte shows that the
    ///   sequence is a non-shortest form or encodes an invalid scalar value.
    ///
    /// After any error the parser is in the base state again.
    ///
    /// # Conformance
    ///
    /// Because input arrives one byte at a time, careless use can produce results that do not
    /// conform to the Unicode standard, which forbids treating a well-formed sequence as part
    /// of an ill-formed one.
    ///
    /// For example, feeding `C2 41 42` naively yields `<None> <Error> B`,
    /// whereas Unicode expects `<Error> A B`.
    ///
    /// To get the expected result, whenever `parse_byte` returns `Err(BrokenSequence)`
    /// call it again with the *same* byte before moving on to the next one.
    ///
    /// The example below does exactly that, emitting the Unicode replacement character (�) for errors.
    ///
    /// # Example
    ///
    /// ```
    /// use std::fmt::Write;
    /// use rut::{Utf8Parser, Utf8ParseError::*};
    ///
    /// let mut s = String::new();
    /// let mut p = Utf8Parser::new();
    ///
    /// // An ill-formed UTF-8 sequence as described above.
    /// let bytes = [0xC2, 0x41, 0x42];
    ///
    /// for &byte in &bytes {
    ///     let mut result = p.parse_byte(byte);
    ///
    ///     // The sequence was cut short by `byte`: report the error, then give
    ///     // `byte` a second chance so that it isn't "swallowed" by the parser.
    ///     if let Err(BrokenSequence) = result {
    ///         write!(&mut s, "�").unwrap();
    ///         result = p.parse_byte(byte);
    ///     }
    ///
    ///     match result {
    ///         Ok(Some(c)) => write!(&mut s, "{}", c).unwrap(),
    ///         Err(_)      => write!(&mut s, "�").unwrap(),
    ///         _           => ()
    ///     }
    /// }
    ///
    /// assert_eq!(&s, "�AB")
    /// ```
    pub fn parse_byte(&mut self, byte: u8) -> Result<Option<char>, Utf8ParseError> {
        use core::convert::TryFrom;
        use self::Utf8ParseError::*;

        // Bytes that may never appear in UTF-8 abort whatever sequence is in progress.
        if !is_valid_utf8_byte(byte) {
            self.reset();
            return Err(InvalidByte);
        }

        let is_continuation = is_utf8_continuation_byte(byte);

        // Base state: `byte` has to begin a new character.
        if self.length == 0 {
            if byte < 0x80 {
                // ASCII is a complete character on its own.
                return Ok(Some(char::from(byte)));
            }
            if is_continuation {
                // Nothing to reset, the parser already is in the base state.
                return Err(UnexpectedContinuation);
            }

            // `byte` is a leading byte; remember how many bytes have to follow it.
            let trailing = utf8_length(byte) - 1;
            self.lead = byte;
            self.length = trailing;
            self.value = u32::from(byte & LEAD_MASK[usize::from(trailing - 1)]);

            // Accepted, but the character is not complete yet.
            return Ok(None);
        }

        // Inside a sequence: only continuation bytes (0x80..=0xBF) may follow.
        if !is_continuation {
            self.reset();
            return Err(BrokenSequence);
        }

        // `lead` is still set only while handling the first continuation byte,
        // which is where ill-formed sequences can be told apart.
        if self.lead != 0 {
            let rejection = match self.lead {
                // E0 XX for XX < A0 and F0 XX for XX < 90 are non-shortest encodings.
                0xE0 if byte < 0xA0 => Some(OverlongEncoding),
                0xF0 if byte < 0x90 => Some(OverlongEncoding),
                // ED XX for XX > 9F and F4 XX for XX > 8F are illegal Unicode scalar values.
                0xED if byte > 0x9F => Some(InvalidCodePoint),
                0xF4 if byte > 0x8F => Some(InvalidCodePoint),
                _ => None,
            };

            if let Some(error) = rejection {
                self.reset();
                return Err(error);
            }

            self.lead = 0;
        }

        // Append the six payload bits of this continuation byte.
        self.value = (self.value << 6) | u32::from(byte & CONT_MASK);
        self.length -= 1;

        if self.length > 0 {
            // Accepted, but the character is not complete yet.
            return Ok(None);
        }

        // The sequence is complete.
        let c = char::try_from(self.value).expect("Uncought Invalid Code Point");
        Ok(Some(c))
    }
}
/// Parses a single UTF-8 sequence from the front of a byte slice.
///
/// Returns the outcome together with the part of the slice that has not been consumed.
/// When a sequence is cut short by a byte that is not a continuation (`BrokenSequence`),
/// that byte is left in the remainder so it can start the next sequence.
/// When the input ends in the middle of a sequence, `Err(TruncatedSequence)` and an
/// empty remainder are returned.
///
/// # Panics
///
/// Panics if `bytes` is empty.
///
/// # Examples
///
/// ```
/// // Valid UTF-8 encoding of '€'
/// let bytes = [0xE2, 0x82, 0xAC];
///
/// let (result, rest) = rut::parse_one(&bytes);
///
/// assert_eq!(result, Ok('€'));
/// assert!(rest.is_empty());
/// ```
///
/// ```
/// use rut::Utf8ParseError::*;
///
/// // Ill-formed sequence followed by 2 valid characters
/// let bytes = [0xC2, 0x41, 0x42];
///
/// let (result1, rest1) = rut::parse_one(&bytes);
/// let (result2, rest2) = rut::parse_one(rest1);
/// let (result3, rest3) = rut::parse_one(rest2);
///
/// assert_eq!(result1, Err(BrokenSequence));
/// assert_eq!(result2, Ok('A'));
/// assert_eq!(result3, Ok('B'));
/// assert_eq!(rest1, &[0x41, 0x42]);
/// assert_eq!(rest2, &[0x42]);
/// assert!(rest3.is_empty());
/// ```
pub fn parse_one<'a>(bytes: &'a [u8]) -> (Result<char, Utf8ParseError>, &'a [u8]) {
    use self::Utf8ParseError::*;

    assert!(!bytes.is_empty());

    let mut parser = Utf8Parser::new();

    for (pos, &byte) in bytes.iter().enumerate() {
        // Everything after the byte currently being looked at.
        let after = &bytes[pos + 1..];

        match parser.parse_byte(byte) {
            // Nothing to report yet, keep feeding bytes.
            Ok(None) => continue,
            // A full character: the next parse starts right after it.
            Ok(Some(c)) => return (Ok(c), after),
            // The current byte ended the sequence early and must be parsed again.
            Err(BrokenSequence) => return (Err(BrokenSequence), &bytes[pos..]),
            // Any other error consumes the current byte.
            Err(other) => return (Err(other), after),
        }
    }

    // The input ran out while a sequence was still in progress.
    (Err(TruncatedSequence), &[])
}

/// An iterator for parsing UTF-8 sequences from a byte slice.
///
/// Each item is the result of parsing one sequence from the front of the remaining input.
///
/// See the [`parse`](fn.parse.html) function for more information.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Clone)]
pub struct Parse<'a> {
    // The part of the input that has not been parsed yet.
    bytes: &'a [u8]
}

/// Creates an iterator for parsing UTF-8 sequences from a byte slice.
///
/// The iterator yields one `Result<char, Utf8ParseError>` per parsed sequence
/// and ends once the whole slice has been consumed.
///
/// # Examples
///
/// ```
/// // Valid UTF-8 encoding of '€'
/// let bytes = [0xE2, 0x82, 0xAC];
///
/// let mut it = rut::parse(&bytes);
///
/// assert_eq!(it.next(), Some(Ok('€')));
/// assert_eq!(it.next(), None);
/// ```
///
/// ```
/// use rut::Utf8ParseError::*;
///
/// // Ill-formed sequence followed by 2 valid characters
/// let bytes = [0xC2, 0x41, 0x42];
///
/// let mut it = rut::parse(&bytes);
///
/// assert_eq!(it.next(), Some(Err(BrokenSequence)));
/// assert_eq!(it.next(), Some(Ok('A')));
/// assert_eq!(it.next(), Some(Ok('B')));
/// assert_eq!(it.next(), None);
/// ```
#[inline]
pub fn parse<'a>(bytes: &'a [u8]) -> Parse<'a> {
    Parse { bytes }
}

impl ::core::iter::Iterator for Parse<'_> {
    type Item = Result<char, Utf8ParseError>;

    /// Parses the next sequence from the remaining input,
    /// or returns `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // `parse_one` panics on empty input, so stop before calling it.
        if self.bytes.is_empty() {
            return None;
        }

        let (outcome, remainder) = parse_one(self.bytes);
        self.bytes = remainder;
        Some(outcome)
    }
}

// Once the remaining input is empty, `next` returns `None` without touching any state,
// so every later call returns `None` as well.
impl ::core::iter::FusedIterator for Parse<'_> {}

// Helper functions

/// Returns `false` for byte values that can never appear in UTF-8
/// (`C0`, `C1` and `F5..=FF`), and `true` for every other byte.
#[inline]
fn is_valid_utf8_byte(byte: u8) -> bool {
    byte != 0xC0 && byte != 0xC1 && byte < 0xF5
}

/// Returns `true` if `byte` has the form 10xx xxxx, i.e. is a UTF-8 continuation byte.
#[inline]
fn is_utf8_continuation_byte(byte: u8) -> bool {
    // Keep only the top two bits and compare them against the `10` marker.
    (byte & 0xC0) == 0x80
}

/// Returns the total length in bytes of the sequence introduced by `byte`.
///
/// `byte` is expected to be a valid UTF-8 byte that is *not* a continuation byte;
/// this is checked in debug builds only.
#[inline]
fn utf8_length(byte: u8) -> u8 {
    debug_assert!(is_valid_utf8_byte(byte) && !is_utf8_continuation_byte(byte));

    match byte {
        // ASCII bytes stand alone.
        0x00..=0x7F => 1,
        // For leading bytes, the number of leading one bits is the sequence length;
        // inverting the byte turns those into leading zeros that can be counted.
        _ => (!byte).leading_zeros() as u8,
    }
}