// sipp 0.2.1
//
// Simple parser package
// Documentation
use std::io::{Error, ErrorKind, Read};

use crate::{buffer::ByteBuffer, decoder::ByteStreamCharDecoder};

/**
A decoder for a byte stream which is using UTF-16 little-endian character encoding.

The decoder owns a `ByteBuffer` over the underlying source `R` and reads bytes from it each time
a character is requested. Instances are created through the `ByteStreamCharDecoder` trait
(`wrap` for a raw `Read` type, `wrap_buffer` for an existing `ByteBuffer`), so that trait must
be in scope wherever the decoder is used.
*/
pub struct Utf16LittleEndianDecoder<R> {
    // Source of the raw bytes; `decode_char` reads two bytes from it per UTF-16 code unit.
    byte_buffer: ByteBuffer<R>,
}

impl<R: Read> ByteStreamCharDecoder<R> for Utf16LittleEndianDecoder<R> {
    /**
    Wraps the given `Read` type as a byte stream and uses UTF-16 encoding, with **little
    endian** byte order, to convert bytes into characters.

    # Examples

    If you know that your application will only receive UTF-16LE encoded files, then you can
    wrap the file in a decoder directly and simply check for a literal BOM character (Unicode
    codepoint U+FEFF "Zero Width No-break Space"). (Note that the **codepoint** of the BOM does
    not change regardless of whether big endian or little endian byte order is being used. So
    we use the Rust literal `'\u{FEFF}'` to represent the BOM character no matter the encoding.)

    ```
    // Important: `use` both Utf16LittleEndianDecoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf16LittleEndianDecoder, ByteStreamCharDecoder};
    use std::fs::File;
    use std::io::Error;

    fn main() -> Result<(), Error> {
        let file = File::open("test_resources/xml_utf16LE_BOM.xml")?;
        let mut decoder = Utf16LittleEndianDecoder::wrap(file);
        let first_character = decoder.decode_char()?;
        if first_character == Some('\u{FEFF}') {
            println!("First read character is the Unicode BOM!");
        } else {
            println!("First read character is not the BOM!");
        }
        # assert_eq!(first_character, Some('\u{FEFF}'));
        Ok(())
    }
    ```
    */
    fn wrap(reader: R) -> Self {
        Utf16LittleEndianDecoder {
            byte_buffer: ByteBuffer::wrap(reader),
        }
    }

    /**
    Wraps the given `ByteBuffer` type as a byte stream and uses UTF-16 encoding, with
    **little endian** byte order, to convert bytes into characters.

    # Examples

    If you need to read from a UTF-16 LE file (or any other `Read` type) but you need to check for
    a BOM (byte-order mark) at the start of the byte stream first, then you can wrap the `File`
    in a `ByteBuffer`, check for the BOM and skip past it, then wrap the `ByteBuffer` with a
    `Utf16LittleEndianDecoder` and start reading the actual content from it one character at a time.

    ```
    // Important: `use` both Utf16LittleEndianDecoder and the trait ByteStreamCharDecoder!
    use sipp::{buffer::ByteBuffer, decoder::{Utf16LittleEndianDecoder, ByteStreamCharDecoder}};
    use std::fs::File;
    use std::io::Error;

    // A UTF-16 little endian BOM is two bytes: 0xFF 0xFE
    const BOM_UTF16LE: &[u8] = &[0xFF, 0xFE];

    fn main() -> Result<(), Error> {
        let file = File::open("test_resources/xml_utf16LE_BOM.xml")?;
        let mut byte_buffer = ByteBuffer::wrap(file);
        let first_bytes = byte_buffer.peek()?;
        # let mut found_byte_order_mark = false;
        if first_bytes.len() > 3 && first_bytes[0..2] == *BOM_UTF16LE {
            println!("Found input which starts with UTF-16LE BOM!");
            // To rule out the possibility that this is a UCS-4 BOM, we need to check that at
            // least one of the next two bytes are non-zero.
            if first_bytes[2] == 0_u8 && first_bytes[3] == 0_u8 {
                println!("This is actually a UCS-4 encoded file!");
                panic!("This application is not expecting UCS-4 encoding!");
            } else {
                # found_byte_order_mark = true;
                // Now read past the two bytes which make up the UTF-16LE BOM.
                assert_eq!(byte_buffer.read_next()?, Some(0xFF));
                assert_eq!(byte_buffer.read_next()?, Some(0xFE));
            }
        } else {
            println!("No BOM found!");
        }
        # assert!(found_byte_order_mark);
        // Now the BOM is out of the way, you can wrap the ByteBuffer with
        // Utf16LittleEndianDecoder so that it's ready to decode actual character content.
        let mut decoder = Utf16LittleEndianDecoder::wrap_buffer(byte_buffer);
        # assert_eq!(decoder.decode_char()?, Some('<'));
        Ok(())
    }
    ```

    Note that in a real application, you would also want to check for a UTF-16BE (big endian)
    byte-order mark before you assume that it's a valid UTF-16LE file.
    */
    fn wrap_buffer(byte_buffer: ByteBuffer<R>) -> Self {
        Utf16LittleEndianDecoder { byte_buffer }
    }

    /**
    Returns the next character represented by the byte stream. If there are no bytes remaining
    in the input stream then this method will return `None`.

    Each UTF-16 code unit is read as two bytes, least significant byte first. A code unit
    outside the surrogate block (U+D800 to U+DFFF) is a complete character. A high surrogate
    (U+D800 to U+DBFF) must be followed immediately by a low surrogate (U+DC00 to U+DFFF), and
    the two are combined into a single supplementary-plane character.

    This method will not (must not) return Unicode surrogate codepoint characters.

    # Errors

    If the byte stream contains a sequence of bytes which do not represent a valid character
    under UTF-16LE encoding, or if something goes wrong while reading the byte stream, then this
    method will return an `std::io::Error` variant. Invalid input is reported with
    `ErrorKind::InvalidData` in each of these cases:

    * the stream ends part-way through a code unit or part-way through a surrogate pair;
    * a low surrogate appears where a character should start (only the two bytes of that code
      unit are consumed);
    * a high surrogate is followed by anything other than a low surrogate (both code units are
      consumed).

    # Examples

    Reading from a file we know to be UTF-16LE with a BOM, you can see how `decode_char` works.

    ```
    // Important: must `use` both Utf16LittleEndianDecoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf16LittleEndianDecoder, ByteStreamCharDecoder};
    use std::io::Error;
    use std::fs::File;

    fn main() -> Result<(), Error> {
        let file = File::open("test_resources/xml_utf16LE_BOM.xml")?;
        let mut decoder = Utf16LittleEndianDecoder::wrap(file);
        // Confirm that the BOM exists.
        let first_character = decoder.decode_char()?;
        assert_eq!(first_character, Some('\u{FEFF}'));
        // While there is content, Some(c) will be returned.
        while let Some(c) = decoder.decode_char()? {
            println!("Found character '{}'", c);
        }
        // Once we've consumed all of the content, None will be returned.
        assert_eq!(decoder.decode_char()?, None);
        Ok(())
    }
    ```

    As an example of what might cause an error to be returned, see what happens if we ask
    `Utf16LittleEndianDecoder` to decode a byte sequence which is not valid UTF-16.

    ```
    // Important: must `use` both Utf16LittleEndianDecoder and the trait ByteStreamCharDecoder!
    use sipp::decoder::{Utf16LittleEndianDecoder, ByteStreamCharDecoder};
    use std::io::Error;
    use std::fs::File;

    fn main() -> Result<(), Error> {
        // UTF-16LE representation of "Hi" followed by two *high* surrogate codepoints
        let bytes: &[u8] = &[0x48, 0x00, 0x69, 0x00, 0x00, 0xD8, 0x01, 0xD8];
        let mut decoder = Utf16LittleEndianDecoder::wrap(bytes);
        // Reading works fine while we have valid UTF-16LE bytes to decode:
        assert_eq!(decoder.decode_char()?, Some('H'));
        assert_eq!(decoder.decode_char()?, Some('i'));
        // But once the decoder reaches the byte sequence of two high surrogate codepoints,
        // which is not a valid surrogate pair (should be high paired with low), then an error
        // will be returned.
        let invalid_read = decoder.decode_char();
        assert!(invalid_read.is_err());
        # let invalid_read = decoder.decode_char()?;
        # assert!(invalid_read.is_none());
        Ok(())
    }
    ```

    While you may be able to keep reading after an error has been returned, it is recommended
    that an error is considered to indicate an invalid or corrupt UTF-16 stream, and no further
    reading should be attempted.
    */
    fn decode_char(&mut self) -> Result<Option<char>, Error> {
        // Every malformed-input condition below is reported as `InvalidData`.
        let invalid_data = |message: &'static str| Error::new(ErrorKind::InvalidData, message);

        // A clean end of stream is only possible on a code unit boundary.
        let first_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            None => return Ok(None),
        };
        let second_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            None => return Err(invalid_data("Input is not valid UTF-16.")),
        };
        let first_unit = u32::from(u16::from_le_bytes([first_byte, second_byte]));

        // Anything outside the surrogate block is a complete character on its own.
        if !(0xD800..=0xDFFF).contains(&first_unit) {
            return Self::u32_to_char(first_unit);
        }
        // A pair must open with a high surrogate (U+D800..=U+DBFF); a low surrogate in this
        // position is unpaired, so reject it without consuming the code unit which follows.
        if first_unit > 0xDBFF {
            return Err(invalid_data(
                "Input contains invalid surrogate pair (start of pair).",
            ));
        }

        let third_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            None => return Err(invalid_data("Input contains incomplete surrogate pair.")),
        };
        let fourth_byte = match self.byte_buffer.read_next()? {
            Some(byte) => byte,
            None => return Err(invalid_data("Input contains invalid lower surrogate.")),
        };
        let second_unit = u32::from(u16::from_le_bytes([third_byte, fourth_byte]));

        // The pair must close with a low surrogate (U+DC00..=U+DFFF). Both bounds matter:
        // without the upper bound a unit such as U+E000 would be folded into a bogus codepoint.
        if !(0xDC00..=0xDFFF).contains(&second_unit) {
            return Err(invalid_data(
                "Input contains invalid surrogate pair (end of pair).",
            ));
        }

        // Each surrogate carries ten bits of the codepoint's offset above U+10000.
        let codepoint = 0x10000 + ((first_unit - 0xD800) << 10) + (second_unit - 0xDC00);
        Self::u32_to_char(codepoint)
    }
}

impl<R: Read> Utf16LittleEndianDecoder<R> {
    /**
    Converts a numeric codepoint into the result shape used by `decode_char`.

    Returns `Ok(Some(c))` when `codepoint` is a Unicode scalar value. Anything which
    `char::from_u32` rejects (surrogate codepoints, or values above U+10FFFF) produces an
    `ErrorKind::InvalidData` error instead.
    */
    fn u32_to_char(codepoint: u32) -> Result<Option<char>, Error> {
        char::from_u32(codepoint).map(Some).ok_or_else(|| {
            Error::new(
                ErrorKind::InvalidData,
                "Invalid UTF-16: byte sequence maps to illegal codepoint.",
            )
        })
    }
}

#[cfg(test)]
mod tests {
    use std::fs::File;

    // Glob-import the parent module so the decoder, its trait and the `std::io` names used
    // by the helpers below are all in scope.
    use super::*;

    /// Decodes characters until the stream reports its end and returns them as one `String`.
    /// The first decoding error stops the loop and is handed back to the caller.
    fn drain<R: Read>(decoder: &mut Utf16LittleEndianDecoder<R>) -> Result<String, Error> {
        let mut text = String::new();
        loop {
            match decoder.decode_char()? {
                Some(c) => text.push(c),
                None => return Ok(text),
            }
        }
    }

    /// Opens `filename` and wraps the file in a UTF-16LE decoder.
    fn decode_file(filename: &str) -> Result<Utf16LittleEndianDecoder<File>, Error> {
        let decoder = Utf16LittleEndianDecoder::wrap(File::open(filename)?);
        Ok(decoder)
    }

    #[test]
    fn empty() -> Result<(), Error> {
        let input = "";
        let mut decoder = Utf16LittleEndianDecoder::wrap(input.as_bytes());
        assert_eq!(drain(&mut decoder)?, input);
        Ok(())
    }

    #[test]
    fn simple_characters() -> Result<(), Error> {
        let expected = "\u{FEFF}Simple text with no exotic characters.
Just some Latin letters, digits, and standard whitespace.
    <-Tab here.
1234567890
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
";
        let mut decoder = decode_file("test_resources/simple_utf16_LE_withBOM.txt")?;
        let decoded = drain(&mut decoder)?;
        assert_eq!(decoded.as_str(), expected);
        Ok(())
    }

    #[test]
    fn european_characters() -> Result<(), Error> {
        let expected = "\u{FEFF}Swedish: Svenska är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige.
Ukrainian: Украї́нська мо́ва - національна мова українців. Належить до східнослов'янської групи слов'янських мов, що входять до індоєвропейської мовної сім'ї, поряд з романськими, германськими, кельтськими, грецькою, албанською, вірменською та найближче спорідненими зі слов'янськими балтійськими мовами.
Greek: Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου, ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου. Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.
";
        let mut decoder = decode_file("test_resources/European_utf16_LE_withBOM.txt")?;
        let decoded = drain(&mut decoder)?;
        assert_eq!(decoded.as_str(), expected);
        Ok(())
    }

    #[test]
    fn japanese_and_math_characters() -> Result<(), Error> {
        let expected = "\u{FEFF}Japanese: 日本語 は、日本国内や、かつての日本領だった国、そして国外移民や移住者を含む日本人同士の間で使用されている言語。
Mathematical symbols: ∀ x ∃ ∅ ∌ x
";
        let mut decoder = decode_file("test_resources/Japanese_and_math_utf16_LE_withBOM.txt")?;
        let decoded = drain(&mut decoder)?;
        assert_eq!(decoded.as_str(), expected);
        Ok(())
    }

    #[test]
    fn fuzz_test_crash_1() -> Result<(), Error> {
        // The input holds a broken surrogate pair, so at least one decode call must fail
        // before the end of the sequence is reported. Decoding deliberately carries on after
        // a failure, until the stream is exhausted.
        let bad_data: &[u8] = &[
            0xFF, 0xFE, 0xFF, 0xDB, 0xDB, 0xDB, 0xDB, 0xDB, 0x0A, 0x02, 0x0D, 0x3C, 0x3F, 0x3A,
        ];
        let mut decoder = Utf16LittleEndianDecoder::wrap(bad_data);
        let mut error_count = 0_usize;
        loop {
            match decoder.decode_char() {
                Ok(None) => break,
                Ok(Some(_)) => {}
                Err(_) => error_count += 1,
            }
        }
        assert!(error_count > 0);
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_first() -> Result<(), Error> {
        // U+D800, the first high surrogate, encoded as UTF-16LE.
        let unit: &[u8] = &[0x00, 0xD8];
        assert!(Utf16LittleEndianDecoder::wrap(unit).decode_char().is_err());
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_high_surrogate_last() -> Result<(), Error> {
        // U+DBFF, the last high surrogate, encoded as UTF-16LE.
        let unit: &[u8] = &[0xFF, 0xDB];
        assert!(Utf16LittleEndianDecoder::wrap(unit).decode_char().is_err());
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_first() -> Result<(), Error> {
        // U+DC00, the first low surrogate, encoded as UTF-16LE.
        let unit: &[u8] = &[0x00, 0xDC];
        assert!(Utf16LittleEndianDecoder::wrap(unit).decode_char().is_err());
        Ok(())
    }

    #[test]
    fn invalid_unicode_codepoint_low_surrogate_last() -> Result<(), Error> {
        // U+DFFF, the last low surrogate, encoded as UTF-16LE.
        let unit: &[u8] = &[0xFF, 0xDF];
        assert!(Utf16LittleEndianDecoder::wrap(unit).decode_char().is_err());
        Ok(())
    }
}