// trivet 3.1.0
//
// The trivet Parser Library
// Documentation
// Trivet
// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! Decode a byte stream into characters for subsequent processing.
//!
//! This module is important for the implementation of parsing on Windows.
//! Other platforms (Linux, OS X) have settled on UTF-8 as the native
//! preferred encoding, but Windows has settled on UTF-16 for historical
//! reasons.  As such, we have to be careful about reading and handling
//! files.  We use a process here which borrows heavily from the
//! [Encoding Standard][].
//!
//! ## Use
//!
//! To use this struct, make a new instance providing the source as a vector
//! of bytes.  For example, the following reads the content of the standard
//! input and passes it to the decoder, then writes out the decoded bytes
//! as UTF-8.
//!
//! ```rust
//! use trivet::decoder::Decode;
//! use std::io::Read;
//!
//! let mut bytes = vec![];
//! std::io::stdin().read_to_end(&mut bytes).unwrap();
//! let decode = Decode::new(bytes);
//! for ch in decode {
//!     print!("{}", ch);
//! }
//! ```
//!
//! ## UTF-8
//! The UTF-8 decoding method is entirely due to Bjoern Hoehrmann
//! <bjoern@hoehrmann.de>.  See the details [here][Hoehrmann].  This is used
//! pursuant to the license given below (though the code has been re-implemented in
//! Rust; original code is available as a non-doc comment in the source).
//!
//! ### License
//! Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//!
//! Permission is hereby granted, free of charge, to any person obtaining a copy of
//! this software and associated documentation files (the "Software"), to deal in the
//! Software without restriction, including without limitation the rights to use,
//! copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
//! Software, and to permit persons to whom the Software is furnished to do so,
//! subject to the following conditions:
//!
//! The above copyright notice and this permission notice shall be included in all
//! copies or substantial portions of the Software.
//!
//! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
//! SOFTWARE.
//!
//! [Hoehrmann]: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
//! [Unicode Standard]: http://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#G7404
//! [Encoding Standard]: https://encoding.spec.whatwg.org/
//!

/// Provide a common interface for stream decoders.
///
/// Implementors are character iterators that can additionally fetch
/// characters in bulk, either into a fresh vector (`next_n`) or into a
/// caller-supplied buffer (`fill_n`).
pub trait Decoder: Iterator<Item = char> {
    /// Fetch a sequence of characters.  The returned vector will have the
    /// number of characters requested, or fewer if the end of the stream is
    /// encountered first.
    fn next_n(&mut self, n: usize) -> Vec<char>;

    /// Given a mutable slice, start filling it from position zero with up to
    /// the specified number of characters.  The actual number of characters
    /// written (which may be less than requested if the stream ends first) is
    /// returned.  The caller must supply a slice of at least `n` elements.
    fn fill_n(&mut self, n: usize, target: &mut [char]) -> usize;
}

/// Determine if a 32-bit value is in the range for a Unicode scalar
/// value.  If so, return it as a character; if not, return the Unicode
/// "replacement character" (U+FFFD) instead.
#[inline]
fn check(cp: u32) -> char {
    match std::char::from_u32(cp) {
        Some(ch) => ch,
        None => std::char::REPLACEMENT_CHARACTER,
    }
}

/// Possible encodings.
///
/// Only the encodings that the decoder can detect via a byte order mark
/// (or assume per-platform) are represented; see `Decode::guess_encoding`.
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Encoding {
    /// UTF-8 encoding.
    UTF8,

    /// UTF-16 encoding, little endian (low-order byte first).
    UTF16LE,

    /// UTF-16 encoding, big endian (high-order byte first).
    UTF16BE,
}

/// The tricky state transition table.
///
/// This is Bjoern Hoehrmann's DFA for UTF-8 decoding (see the module docs
/// for the license and a link to the original).  The first 256 entries map
/// each input byte to a character class; the remaining entries map
/// `state * 16 + class` to the next state (see the indexing in
/// `Decode::next_utf8`).  State 0 accepts and state 1 rejects; see
/// `UTF8_ACCEPT_STATE` and `UTF8_REJECT_STATE` below.
#[rustfmt::skip]
const UTF8D: [u8; 400] = [
    // These bytes accept (state 0) immediately.  They are the ASCII
    // characters that translate directly to Unicode; single-byte UTF-8.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    // The first 16 of these immediately reject (state 1).  These are not
    // legal lead bytes in the UTF-8 encoding system; the remainder are
    // character classes for continuation and multi-byte lead bytes.
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    // Transition table: rows are DFA states, columns are character classes.
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
];
/// The DFA state indicating a complete, valid code point has been decoded.
const UTF8_ACCEPT_STATE: u8 = 0;
/// The DFA state indicating an invalid byte sequence was encountered.
const UTF8_REJECT_STATE: u8 = 1;

/// Implement decoding of a stream.
///
/// Attempt to decode a stream and provide character-by-character access to the
/// contents.  The constructor looks for the byte order mark (BOM) at the start
/// of the stream to guess the correct encoding and, if it does not find one,
/// makes an assumption based on the platform.  Specifically, on Windows it
/// assumes UTF-16 LE, whereas on every other platform it assumes UTF-8.
///
/// # Example
///
/// The following reads from the provided string.
///
/// ```rust
/// use trivet::decoder::Decode;
///
/// let source = "This is a string to parse";
/// let decode = Decode::new(source.bytes().collect());
/// for ch in decode {
///     print!("{}", ch);
/// }
/// ```
///
#[derive(Debug)]
pub struct Decode {
    /// A buffer holding the bytes read.
    buffer: Vec<u8>,

    /// The size of the buffer.  We hold this for quick reference later.
    size: usize,

    /// The index of the next byte to examine in the buffer.  This starts
    /// just past the BOM, if one was found.
    next: usize,

    /// The encoding to use.
    encoding: Encoding,
}

impl Decode {
    /// Make a new instance from the given byte sequence.
    ///
    /// The start of the sequence is examined for a byte order mark (BOM) to
    /// determine the encoding; any BOM found is skipped.
    pub fn new(bytes: Vec<u8>) -> Self {
        // Take ownership of everything to get started.
        let length = bytes.len();
        let mut decode = Decode {
            buffer: bytes,
            size: length,
            next: 0,
            encoding: Encoding::UTF8,
        };
        // Guess the encoding (and skip any BOM) before characters are read.
        decode.guess_encoding();
        decode
    }

    /// Decode from the given string.
    pub fn from_string(string: &str) -> Self {
        Decode::new(string.bytes().collect())
    }

    /// Examine the first few bytes and try to determine the encoding.  After this
    /// the pointer into the buffer should be set to the first byte of the encoded
    /// text and there is a chance that the encoding is set to the correct value.
    /// We assume here that the buffer has been filled.
    ///
    /// We use the following strategy.  On Windows we *assume* the encoding is UTF16LE.
    /// Otherwise we *assume* the encoding is UTF8.  We then examine the first bytes
    /// (see section 6 of the Encoding Standard) for the byte order mark (BOM).  If
    /// we don't find *any* BOM, then we assume the platform-specific encoding.
    ///
    /// ```text
    ///     | 0xEF 0xBB 0xBF | UTF-8    |
    ///     | 0xFE 0xFF      | UTF-16BE |
    ///     | 0xFF 0xFE      | UTF-16LE |
    /// ```
    fn guess_encoding(&mut self) {
        if self.size >= 3 && self.buffer[0..3] == [0xEF, 0xBB, 0xBF] {
            self.encoding = Encoding::UTF8;
            self.next = 3;
        } else if self.size >= 2 && self.buffer[0..2] == [0xFE, 0xFF] {
            self.encoding = Encoding::UTF16BE;
            self.next = 2;
        } else if self.size >= 2 && self.buffer[0..2] == [0xFF, 0xFE] {
            self.encoding = Encoding::UTF16LE;
            self.next = 2;
        } else {
            // No BOM; fall back to the platform-specific assumption.
            #[cfg(windows)]
            {
                self.encoding = Encoding::UTF16LE;
            }
            #[cfg(not(windows))]
            {
                self.encoding = Encoding::UTF8;
            }
        }
    }

    /// Process and return the next UTF-8 encoded character.
    ///
    /// If an invalid code point is detected, then the Unicode "replacement character"
    /// is returned.
    ///
    /// If the end of the stream is reached, then `None` is returned.
    fn next_utf8(&mut self) -> Option<char> {
        let mut state: u8 = UTF8_ACCEPT_STATE;
        let mut codep: u32 = 0;
        loop {
            if self.next >= self.size {
                return None;
            }
            let byte = self.buffer[self.next];
            self.next += 1;

            // The original algorithm (in C) without the outer loop to fully collect a character.
            //
            // uint32_t inline
            // decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
            //   uint32_t type = utf8d[byte];
            //
            //   *codep = (*state != UTF8_ACCEPT) ?
            //     (byte & 0x3fu) | (*codep << 6) :
            //     (0xff >> type) & (byte);
            //
            //   *state = utf8d[256 + *state*16 + type];
            //   return *state;
            // }

            let kind = UTF8D[byte as usize];
            codep = if state != UTF8_ACCEPT_STATE {
                // Continuation byte: take the low six bits.
                (byte as u32 & 0x3Fu32) | (codep << 6)
            } else {
                // Lead byte: the class tells us how many bits to keep.
                (0xFFu32 >> kind) & (byte as u32)
            };
            state = UTF8D[256 + (state as usize) * 16 + (kind as usize)];
            if state == UTF8_ACCEPT_STATE {
                // The DFA only accepts valid Unicode scalar values, but use
                // `check` rather than `unwrap` so that a defect in the table
                // can never panic; it would yield the replacement character.
                return Some(check(codep));
            } else if state == UTF8_REJECT_STATE {
                return Some(std::char::REPLACEMENT_CHARACTER);
            }
        } // Loop until a character is accepted and returned.
    }

    /// Obtain the next two-byte UTF-16 code unit from the source.  This
    /// method handles the endianness.
    fn next_utf16_surrogate(&mut self) -> Option<u16> {
        // Convert the two bytes into a 16-bit integer based on the
        // byte order.
        if self.next + 2 > self.size {
            return None;
        }
        let first = self.buffer[self.next];
        self.next += 1;
        let second = self.buffer[self.next];
        self.next += 1;
        if self.encoding == Encoding::UTF16BE {
            // High order byte is first.
            Some(((first as u16) << 8) | second as u16)
        } else {
            // Low order byte is first.
            Some(((second as u16) << 8) | first as u16)
        }
    }

    /// Process and return the next UTF-16 encoded character.  The
    /// endianness is obtained from the struct.
    ///
    /// Unpaired or malformed surrogates yield the Unicode "replacement
    /// character"; `None` is returned at the end of the stream.
    fn next_utf16(&mut self) -> Option<char> {
        // We do not have a prior surrogate pair.  The high-order bits
        // will tell us what to do.
        let surrogate1 = self.next_utf16_surrogate()?;
        if !(0xD800..0xE000).contains(&surrogate1) {
            // Not a surrogate; a BMP character.  Convert and check.
            Some(check(surrogate1 as u32))
        } else {
            // This could be the legal first part of a surrogate pair.  Get
            // the next code unit.  See The Unicode Standard 3.0 (oddly not
            // later), section 3.7 on surrogates.
            let surrogate2 = match self.next_utf16_surrogate() {
                None => return Some(std::char::REPLACEMENT_CHARACTER),
                Some(value) => value,
            };
            // The order of the pairs themselves is independent of the endianness.
            // That is, the d8xx value always comes before the dcxx value.
            let (high, low) = (surrogate1 as u32, surrogate2 as u32);
            // Convert now and check.  Really this might be "< 0xD800" or
            // "< 0xDC00", but just the one check is sufficient.  This is done
            // to make sure we don't underflow on subtract.  An out-of-range
            // high surrogate overshoots 0x10FFFF and `check` catches it.
            if high < 0xD800 || low < 0xDC00 {
                // Invalid surrogate pair.
                Some(std::char::REPLACEMENT_CHARACTER)
            } else {
                // Correctly encoded character.
                Some(check((high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000))
            }
        }
    }
}

impl Decoder for Decode {
    /// Fetch a sequence of characters.  The returned vector will have the
    /// number of characters requested, or fewer if the end of the stream is
    /// encountered first.
    fn next_n(&mut self, n: usize) -> Vec<char> {
        let mut result = vec![];
        // `encoding` is `Copy`, so match it by value for uniform patterns.
        match self.encoding {
            Encoding::UTF8 => {
                for _ in 0..n {
                    match self.next_utf8() {
                        Some(ch) => result.push(ch),
                        // Stop at end of stream rather than spinning through
                        // the remaining (useless) iterations.
                        None => break,
                    }
                }
            }
            Encoding::UTF16LE | Encoding::UTF16BE => {
                for _ in 0..n {
                    match self.next_utf16() {
                        Some(ch) => result.push(ch),
                        None => break,
                    }
                }
            }
        }
        result
    }

    /// Given a mutable slice, start filling it from position zero with up to
    /// the specified number of characters.  The actual number of characters
    /// written (which may be less than requested) is returned.
    fn fill_n(&mut self, n: usize, target: &mut [char]) -> usize {
        let mut count = n;
        match self.encoding {
            Encoding::UTF8 => {
                // Clippy is wrong.  We are not iterating over the array.
                #[allow(clippy::needless_range_loop)]
                for index in 0..n {
                    match self.next_utf8() {
                        Some(ch) => target[index] = ch,
                        None => {
                            // Stream exhausted; report how many were written.
                            count = index;
                            break;
                        }
                    }
                }
                count
            }
            Encoding::UTF16LE | Encoding::UTF16BE => {
                // Clippy is wrong.  We are not iterating over the array.
                #[allow(clippy::needless_range_loop)]
                for index in 0..n {
                    match self.next_utf16() {
                        Some(ch) => target[index] = ch,
                        None => {
                            count = index;
                            break;
                        }
                    }
                }
                count
            }
        }
    }
}

/// Provide a character iterator for the decoder.
impl Iterator for Decode {
    /// Iterate over characters.
    type Item = char;

    /// Decode and return the next character from the source.  If the
    /// source is exhausted, then `None` is returned; invalid sequences
    /// yield the Unicode replacement character rather than ending the
    /// iteration.
    fn next(&mut self) -> Option<char> {
        if self.encoding == Encoding::UTF8 {
            self.next_utf8()
        } else {
            // Both UTF-16 variants share one decoder; it reads the
            // endianness from `self.encoding`.
            self.next_utf16()
        }
    }
}