#![no_std]
//! Rut is a small UTF-8 decoding library for applications that need to decode individual characters.\
//! It provides a bytewise decoder, and functions for decoding byte slices.
//!
//! It is completely `no_std` and should provide good performance.<sup>[*citation needed*]</sup>
//!
//! # Conformance
//!
//! Rut is fully conformant to the specifications and restrictions of the [Unicode standard][unicode].\
//! Additionally, it follows [W3C's standard for UTF-8 decoding][w3c] with regards to error signalling.
//!
//! # Testing
//!
//! Some tests are in place, but the test suite is not comprehensive yet.
//! That said, Rut has been fuzzed fairly thoroughly on random input and passes this [stress test for UTF-8 decoders][stress].
//!
//! # As Seen on TV!
//!
//! Rut began life in, and is still used by, [Termiku], a terminal emulator written in Rust.
//!
//! [unicode]: https://www.unicode.org/versions/latest/
//! [w3c]: https://www.w3.org/TR/encoding/#utf-8-decoder
//! [stress]: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
//! [Termiku]: https://github.com/ShinySaana/Termiku

/// Payload bitmasks for UTF-8 bytes, indexed by the number of continuation bytes that follow.
///
/// - `MASK[0]`: `10xx xxxx`, continuation bytes (6 payload bits),
/// - `MASK[1]`: `110x xxxx`, leading byte of a 2 byte sequence (5 payload bits),
/// - `MASK[2]`: `1110 xxxx`, leading byte of a 3 byte sequence (4 payload bits),
/// - `MASK[3]`: `1111 0xxx`, leading byte of a 4 byte sequence (3 payload bits).
const MASK: [u8; 4] = [
    0b0011_1111,
    0b0001_1111,
    0b0000_1111,
    0b0000_0111,
];

/// UTF-8 sequence length for every possible leading byte.
///
/// An entry of `0` marks a byte that can never start a sequence:
/// continuation bytes (`0x80-0xBF`) and the bytes that are never valid in UTF-8
/// (`0xC0`, `0xC1` and `0xF5-0xFF`).
///
/// The values mirror the table behind corelib's `utf8_char_width`.
/// That API is unstable, so we supply our own:
/// <https://doc.rust-lang.org/stable/core/str/fn.utf8_char_width.html>
const UTF8_LENGTH_TABLE: [u8; 256] = {
    let mut widths = [0u8; 256];
    let mut byte = 0usize;

    while byte < 256 {
        widths[byte] = match byte {
            // ASCII: 1 byte.
            0x00..=0x7F => 1,
            // Leading bytes of 2 byte sequences.
            0xC2..=0xDF => 2,
            // Leading bytes of 3 byte sequences.
            0xE0..=0xEF => 3,
            // Leading bytes of 4 byte sequences.
            0xF0..=0xF4 => 4,
            // Continuation bytes and bytes that are never valid.
            _ => 0,
        };
        byte += 1;
    }

    widths
};

/// Result type for the [`Decoder::decode_byte`](struct.Decoder.html#method.decode_byte) method.
///
/// Note that this is *not* an alias for Rust's default `Result` type.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum DecoderResult {
    /// The byte was accepted, but more bytes are needed to complete the character.
    Continue,
    /// A character was decoded successfully.
    Char(char),
    /// An error occurred.
    Error(self::Error),
}

/// The error type returned by all decoding methods.
///
/// Implements `Display` so that errors can be shown in user-facing messages.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum Error {
    /// An invalid byte value was encountered.
    InvalidByte,

    /// A continuation byte was encountered outside of a sequence.
    UnexpectedContinuation,

    /// A sequence was terminated by a byte that isn't a valid continuation.
    BrokenSequence,

    /// An [overlong encoding](https://en.wikipedia.org/wiki/UTF-8#Overlong_encodings) was encountered.
    OverlongEncoding,

    /// An encoding for an invalid unicode scalar value was encountered.
    InvalidCodePoint,

    /// The end of the sequence was reached before it could be fully decoded.
    TruncatedSequence,
}

impl core::fmt::Display for Error {
    /// Writes a short, lowercase description of the error.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let description = match *self {
            Error::InvalidByte => "invalid byte value",
            Error::UnexpectedContinuation => "continuation byte outside of a sequence",
            Error::BrokenSequence => "sequence terminated by a byte that is not a valid continuation",
            Error::OverlongEncoding => "overlong encoding",
            Error::InvalidCodePoint => "encoding of an invalid unicode scalar value",
            Error::TruncatedSequence => "end of input reached inside of a sequence",
        };

        f.write_str(description)
    }
}

/// A bytewise UTF-8 decoder.
///
/// The decoder is fed one byte at a time through `decode_byte`
/// and keeps the state of a partially decoded sequence between calls.
#[derive(Copy, Clone, Debug, Default)]
pub struct Decoder {
    /// Unicode Scalar Value accumulated from the bytes of the current sequence so far.
    value: u32,
    /// Number of continuation bytes still needed to complete the current sequence.
    /// `0` means that no sequence is in progress.
    needed: u8,
    /// Leading byte of the current sequence.
    /// Reset to `0` once the first continuation byte has been checked against it.
    lead: u8,
}

impl Decoder {
    /// Creates a new decoder with no sequence in progress.
    #[inline]
    pub fn new() -> Decoder {
        Decoder::default()
    }

    /// Decodes a single byte.
    ///
    /// Returns `Continue` if the byte was accepted but the character is not complete yet,
    /// `Char` once a full character has been decoded, and `Error` if the byte is not valid at this position.\
    /// After `Char` or `Error` the decoder is ready for the start of a new sequence.
    ///
    /// # Correct Use
    ///
    /// Due to the nature of bytewise UTF-8 decoding, blindly continuing through errors will lead to vastly incorrect results.
    ///
    /// For example, the byte sequence `C2, 41, 42` would return the results `<Continue>, <Error>, B`.\
    /// However, Unicode expects this sequence to be treated as `<Error>, A, B`.
    ///
    /// This means that, when an error is encountered inside of a sequence
    /// (`BrokenSequence`, `OverlongEncoding` or `InvalidCodePoint`),
    /// the offending byte should be passed to `decode_byte` again.\
    /// This is because an ill-formed sequence ends at the last valid byte, not at the first invalid byte.
    ///
    /// This also applies to bytes which are never valid in UTF-8 (`C0`, `C1`, `F5` through `FF`):
    /// inside of a sequence they first terminate the sequence with `BrokenSequence`,
    /// and only yield `InvalidByte` once they are passed to `decode_byte` again.
    ///
    /// This is essentially what the [`decode_one`](fn.decode_one.html) function does.
    ///
    /// # Example
    ///
    /// Performing lossy decoding with `Decoder`:
    ///
    /// ```
    /// use std::fmt::Write;
    /// use rut::{Decoder, DecoderResult, Error::*};
    ///
    /// let mut s = String::new();
    /// let mut d = Decoder::new();
    ///
    /// // An ill-formed UTF-8 sequence as described above.
    /// let bytes = [0xC2, 0x41, 0x42];
    ///
    /// for &byte in &bytes {
    ///     let mut result = d.decode_byte(byte);
    ///
    ///     if let DecoderResult::Error(e) = result {
    ///         match e {
    ///             BrokenSequence
    ///             | OverlongEncoding
    ///             | InvalidCodePoint => {
    ///                 write!(&mut s, "\u{FFFD}").unwrap();
    ///                 result = d.decode_byte(byte);
    ///             },
    ///             _ => {}
    ///         }
    ///     }
    ///
    ///     match result {
    ///         DecoderResult::Continue => continue,
    ///         DecoderResult::Char(c)  => write!(&mut s, "{}", c).unwrap(),
    ///         DecoderResult::Error(_) => write!(&mut s, "\u{FFFD}").unwrap()
    ///     }
    /// }
    ///
    /// assert_eq!(&s, "\u{FFFD}AB");
    /// ```
    ///
    /// A never-valid byte inside of a sequence produces two errors,
    /// one for the truncated sequence and one for the byte itself:
    ///
    /// ```
    /// use rut::{Decoder, DecoderResult, Error::*};
    ///
    /// let mut d = Decoder::new();
    ///
    /// assert_eq!(d.decode_byte(0xE2), DecoderResult::Continue);
    /// assert_eq!(d.decode_byte(0xC0), DecoderResult::Error(BrokenSequence));
    /// assert_eq!(d.decode_byte(0xC0), DecoderResult::Error(InvalidByte));
    /// ```
    pub fn decode_byte(&mut self, byte: u8) -> DecoderResult {
        use self::Error::*;

        // No sequence in progress: the byte has to stand alone or start a new sequence.
        if self.needed == 0 {
            return match byte {
                0x00..=0x7F => DecoderResult::Char(char::from(byte)),
                0x80..=0xBF => DecoderResult::Error(UnexpectedContinuation),
                // These values can never appear.
                // C0 and C1 can only produce overlong encodings of U+0000 through U+007F.
                // F5 through FF can only produce scalar values greater than U+10FFFF,
                // or encodings longer than 4 bytes.
                0xC0 | 0xC1 | 0xF5..=0xFF => DecoderResult::Error(InvalidByte),
                // Only valid leading bytes (C2 through F4) are left, so the table entry is 2, 3 or 4.
                _ => {
                    let needed = UTF8_LENGTH_TABLE[byte as usize] - 1;

                    self.value = u32::from(byte & MASK[needed as usize]);
                    self.needed = needed;
                    self.lead = byte;

                    DecoderResult::Continue
                }
            };
        }

        // Inside of a sequence, anything that isn't a continuation byte ends the sequence.
        // This deliberately includes the never-valid bytes: the ill-formed sequence is reported first,
        // and the byte itself is reported as `InvalidByte` once the caller feeds it again.
        // This is equivalent to step 4 in https://www.w3.org/TR/encoding/#utf-8-decoder.
        if !(0x80..=0xBF).contains(&byte) {
            self.needed = 0;
            return DecoderResult::Error(BrokenSequence);
        }

        // Check for the validity of this sequence.
        if self.lead != 0 {
            // E0 80 through E0 9F and F0 80 through F0 8F
            // produce overlong encodings.
            if self.lead == 0xE0 && byte < 0xA0 ||
               self.lead == 0xF0 && byte < 0x90 {
                self.needed = 0;
                return DecoderResult::Error(OverlongEncoding);
            }

            // ED A0 through ED BF (surrogates) and F4 90 through F4 BF (above U+10FFFF)
            // produce invalid unicode scalar values.
            if self.lead == 0xED && byte > 0x9F ||
               self.lead == 0xF4 && byte > 0x8F {
                self.needed = 0;
                return DecoderResult::Error(InvalidCodePoint);
            }

            // We only want to do this check for the first continuation byte, so we reset `lead` here.
            self.lead = 0;
        }

        self.value = (self.value << 6) | u32::from(byte & MASK[0]);
        self.needed -= 1;

        // We're done!
        if self.needed == 0 {
            // SAFETY: Only the leading bytes C2 through F4 start a sequence, and the first
            // continuation byte has been checked against the leading byte above. This rules out
            // overlong encodings, the surrogates U+D800 through U+DFFF and values above U+10FFFF,
            // so `value` is a valid unicode scalar value.
            return DecoderResult::Char(unsafe {
                core::char::from_u32_unchecked(self.value)
            });
        }

        DecoderResult::Continue
    }
}

/// Decodes one character from a byte slice, returning a `Result` and the remainder of the slice.
///
/// # Panics
///
/// Panics if `bytes` is empty.
///
/// # Examples
///
/// ```rust
/// // Valid UTF-8 encoding of '€'
/// let bytes = [0xE2, 0x82, 0xAC];
///
/// let (result, rest) = rut::decode_one(&bytes);
///
/// assert_eq!(result, Ok('€'));
/// assert_eq!(rest, &[]);
/// ```
///
/// ```rust
/// use rut::Error::*;
///
/// // Ill-formed sequence followed by 2 valid characters
/// let bytes = [0xC2, 0x41, 0x42];
///
/// let (result1, rest1) = rut::decode_one(&bytes);
/// let (result2, rest2) = rut::decode_one(rest1);
/// let (result3, rest3) = rut::decode_one(rest2);
///
/// assert_eq!(result1, Err(BrokenSequence));
/// assert_eq!(result2, Ok('A'));
/// assert_eq!(result3, Ok('B'));
/// assert_eq!(rest1, &[0x41, 0x42]);
/// assert_eq!(rest2, &[0x42]);
/// assert_eq!(rest3, &[]);
/// ```
pub fn decode_one(bytes: &[u8]) -> (Result<char, Error>, &[u8]) {
    use self::Error::*;

    // An empty slice would return `TruncatedSequence`, which doesn't make much sense.
    // Just panic instead.
    assert!(!bytes.is_empty());

    let mut p = Decoder::new();

    for (idx, &byte) in bytes.iter().enumerate() {
        match p.decode_byte(byte) {
            DecoderResult::Continue => continue,
            DecoderResult::Char(c)  => return (Ok(c), &bytes[idx + 1..]),
            // To match W3C's / Unicode's behavior,
            // errors which occur inside a sequence require
            // the current byte to get decoded again.
            // This is equivalent to step 4.2 in https://www.w3.org/TR/encoding/#utf-8-decoder.
            DecoderResult::Error(e) => match e {
                BrokenSequence
                | OverlongEncoding
                | InvalidCodePoint => {
                    return (Err(e), &bytes[idx..])
                }
                _ => return (Err(e), &bytes[idx + 1..]),
            },
        }
    }

    // We didn't get anything out of the slice (i.e. every byte returned `Continue`).
    (Err(TruncatedSequence), &[])
}

/// An iterator that decodes characters from a byte slice.
///
/// This `struct` is created by the [`decode`](fn.decode.html) function.
/// See its documentation for more.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Clone, Debug)]
pub struct Decode<'a> {
    /// The part of the byte slice that has not been decoded yet.
    bytes: &'a [u8],
}

/// Creates an iterator for decoding characters from a byte slice.
///
/// The iterator yields one `Result` per decoded character or decoding error.\
/// This is done by calling [`decode_one`](fn.decode_one.html) repeatedly until the slice has been exhausted.
///
/// # Examples
///
/// ```rust
/// // Valid UTF-8 encoding of '€'
/// let bytes = [0xE2, 0x82, 0xAC];
///
/// let mut it = rut::decode(&bytes);
///
/// assert_eq!(it.next(), Some(Ok('€')));
/// assert_eq!(it.next(), None);
/// ```
///
/// ```rust
/// use rut::Error::*;
///
/// // Ill-formed sequence followed by 2 valid characters
/// let bytes = [0xC2, 0x41, 0x42];
///
/// let mut it = rut::decode(&bytes);
///
/// assert_eq!(it.next(), Some(Err(BrokenSequence)));
/// assert_eq!(it.next(), Some(Ok('A')));
/// assert_eq!(it.next(), Some(Ok('B')));
/// assert_eq!(it.next(), None);
/// ```
#[inline]
pub fn decode(bytes: &[u8]) -> Decode<'_> {
    Decode { bytes }
}

impl<'a> Decode<'a> {
    /// Returns the unprocessed part of the stored byte slice.
    ///
    /// The returned slice borrows from the original input rather than from the iterator,
    /// so it stays usable while the iterator is advanced further.
    ///
    /// # Example
    ///
    /// ```rust
    /// // "ABC"
    /// let bytes = &[0x41, 0x42, 0x43];
    ///
    /// let mut d = rut::decode(bytes);
    ///
    /// assert_eq!(d.next(), Some(Ok('A')));
    /// assert_eq!(d.rest(), &[0x42, 0x43]);
    ///
    /// // The remainder can be kept around while decoding continues.
    /// let rest = d.rest();
    ///
    /// assert_eq!(d.next(), Some(Ok('B')));
    /// assert_eq!(rest, &[0x42, 0x43]);
    /// ```
    pub fn rest(&self) -> &'a [u8] {
        self.bytes
    }
}

impl ::core::iter::Iterator for Decode<'_> {
    type Item = Result<char, self::Error>;

    /// Decodes the next character, or returns `None` once every byte has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        // `decode_one` panics on an empty slice, so the end of the input is handled here.
        if self.bytes.is_empty() {
            return None;
        }

        let (item, remaining) = decode_one(self.bytes);
        self.bytes = remaining;

        Some(item)
    }
}

// `next` returns `None` whenever the stored slice is empty, and nothing ever refills the slice,
// so the iterator keeps returning `None` once it is exhausted.
impl ::core::iter::FusedIterator for Decode<'_> {}

#[cfg(test)]
mod tests {
    use super::{*, Error::*};

    /// Decodes the first character of `bytes` with `decode_one`
    /// and checks both the result and the remaining bytes.
    fn assert_decodes(bytes: &[u8], expected: Result<char, Error>, expected_rest: &[u8]) {
        let (result, rest) = decode_one(bytes);

        assert_eq!(result, expected);
        assert_eq!(rest, expected_rest);
    }

    // First and last valid encodings for every sequence length.

    #[test]
    fn first_valid_1() {
        assert_decodes(&[0x00], Ok('\u{0000}'), &[]);
    }

    #[test]
    fn first_valid_2() {
        assert_decodes(&[0xC2, 0x80], Ok('\u{0080}'), &[]);
    }

    #[test]
    fn first_valid_3() {
        assert_decodes(&[0xE0, 0xA0, 0x80], Ok('\u{0800}'), &[]);
    }

    #[test]
    fn first_valid_4() {
        assert_decodes(&[0xF0, 0x90, 0x80, 0x80], Ok('\u{10000}'), &[]);
    }

    #[test]
    fn last_valid_1() {
        assert_decodes(&[0x7F], Ok('\u{007F}'), &[]);
    }

    #[test]
    fn last_valid_2() {
        assert_decodes(&[0xDF, 0xBF], Ok('\u{07FF}'), &[]);
    }

    #[test]
    fn last_valid_3() {
        assert_decodes(&[0xEF, 0xBF, 0xBF], Ok('\u{FFFF}'), &[]);
    }

    #[test]
    fn last_valid_4() {
        assert_decodes(&[0xF4, 0x8F, 0xBF, 0xBF], Ok('\u{10FFFF}'), &[]);
    }

    // Boundaries of the surrogate range.

    #[test]
    fn first_before_surrogates() {
        assert_decodes(&[0xED, 0x9F, 0xBF], Ok('\u{D7FF}'), &[]);
    }

    #[test]
    fn first_after_surrogates() {
        assert_decodes(&[0xEE, 0x80, 0x80], Ok('\u{E000}'), &[]);
    }

    // Single bytes that are errors on their own.

    #[test]
    fn invalid_bytes() {
        let bytes: [u8; 13] = [0xC0, 0xC1,
                               0xF5, 0xF6, 0xF7, 0xF8,
                               0xF9, 0xFA, 0xFB, 0xFC,
                               0xFD, 0xFE, 0xFF];

        for &b in &bytes {
            assert_decodes(&[b], Err(InvalidByte), &[]);
        }
    }

    #[test]
    fn continuation_bytes() {
        for b in 0x80..=0xBFu8 {
            assert_decodes(&[b], Err(UnexpectedContinuation), &[]);
        }
    }

    // Errors inside of a sequence. The offending byte must stay in the remainder.

    #[test]
    fn overlong_3() {
        assert_decodes(&[0xE0, 0x80, 0x80], Err(OverlongEncoding), &[0x80, 0x80]);
        assert_decodes(&[0xE0, 0x9F, 0xBF], Err(OverlongEncoding), &[0x9F, 0xBF]);
    }

    #[test]
    fn overlong_4() {
        assert_decodes(&[0xF0, 0x80, 0x80, 0x80], Err(OverlongEncoding), &[0x80, 0x80, 0x80]);
        assert_decodes(&[0xF0, 0x8F, 0xBF, 0xBF], Err(OverlongEncoding), &[0x8F, 0xBF, 0xBF]);
    }

    #[test]
    fn surrogates() {
        assert_decodes(&[0xED, 0xA0, 0x80], Err(InvalidCodePoint), &[0xA0, 0x80]);
        assert_decodes(&[0xED, 0xBF, 0xBF], Err(InvalidCodePoint), &[0xBF, 0xBF]);
    }

    #[test]
    fn above_last_scalar_value() {
        assert_decodes(&[0xF4, 0x90, 0x80, 0x80], Err(InvalidCodePoint), &[0x90, 0x80, 0x80]);
    }

    #[test]
    fn broken_sequences() {
        assert_decodes(&[0xC2, 0x41, 0x42], Err(BrokenSequence), &[0x41, 0x42]);
        assert_decodes(&[0xE2, 0x82, 0x41], Err(BrokenSequence), &[0x41]);
        assert_decodes(&[0xF0, 0x90, 0x80, 0x41], Err(BrokenSequence), &[0x41]);
    }

    #[test]
    fn truncated_sequences() {
        assert_decodes(&[0xC2], Err(TruncatedSequence), &[]);
        assert_decodes(&[0xE2, 0x82], Err(TruncatedSequence), &[]);
        assert_decodes(&[0xF0, 0x90, 0x80], Err(TruncatedSequence), &[]);
    }

    // The iterator built on top of `decode_one`.

    #[test]
    fn iterator_recovers_after_error() {
        let bytes = [0xC2u8, 0x41, 0x42];

        let mut it = decode(&bytes);

        assert_eq!(it.next(), Some(Err(BrokenSequence)));
        assert_eq!(it.rest(), &[0x41u8, 0x42][..]);
        assert_eq!(it.next(), Some(Ok('A')));
        assert_eq!(it.next(), Some(Ok('B')));
        assert_eq!(it.next(), None);
        assert_eq!(it.next(), None);
        assert!(it.rest().is_empty());
    }

    #[test]
    fn iterator_on_empty_input() {
        let mut it = decode(&[]);

        assert_eq!(it.next(), None);
    }
}