// bufjson 0.1.4 - Docs.rs

use std::fmt;
use std::pin::Pin;
use std::task::{Context, Poll};

pub mod buf;
pub mod state;

/// Type of a JSON lexical token, such as begin object (`{`), literal true (`true`), or string.
///
/// The variants enumerate the lexical token types described in the [JSON spec][rfc], and each
/// variant is named after the corresponding term in the spec.
///
/// A `Token` identifies only the *type* of a token, never its value. The value of some token
/// types is fixed (*e.g.*, [`ArrBegin`] is always `'['`), while for others it varies with the JSON
/// text being analyzed (*e.g.*, [`Str`]).
///
/// [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
/// [`ArrBegin`]: Token::ArrBegin
/// [`Str`]: Token::Str
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Token {
    // The derived `Ord`/`PartialOrd` follow declaration order, so reordering the variants changes
    // how tokens compare with each other.
    /// Begin array token; its literal value is always `[`.
    ArrBegin,
    /// End array token; its literal value is always `]`.
    ArrEnd,
    /// Pseudo-token marking the end of the JSON text (end of file).
    Eof,
    /// Pseudo-token marking an unrecoverable lexical error detected in the JSON text.
    Err,
    /// The literal value `false`.
    LitFalse,
    /// The literal value `null`.
    LitNull,
    /// The literal value `true`.
    LitTrue,
    /// Name separator token; its literal value is always `:`.
    NameSep,
    /// A number token such as `0`, `123.4`, or `-1.25e+6`.
    Num,
    /// Begin object token; its literal value is always `{`.
    ObjBegin,
    /// End object token; its literal value is always `}`.
    ObjEnd,
    /// A string token, such as `""`, `"foo"`, or `"Hello,\u0020world! 🌎"`.
    Str,
    /// Value separator token; its literal value is always `,`.
    ValueSep,
    /// A maximal run of insignificant whitespace.
    White,
}

impl fmt::Display for Token {
    /// Writes the fixed literal for tokens that have one, and a short descriptive word for the
    /// others. Width, fill and precision requested by the caller are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match *self {
            // Punctuation tokens: shown as the exact character they stand for.
            Token::ArrBegin => "[",
            Token::ArrEnd => "]",
            Token::ObjBegin => "{",
            Token::ObjEnd => "}",
            Token::NameSep => ":",
            Token::ValueSep => ",",
            // Literal name tokens: shown as the literal name.
            Token::LitFalse => "false",
            Token::LitNull => "null",
            Token::LitTrue => "true",
            // Variable-value tokens and pseudo-tokens: shown as a description.
            Token::Num => "number",
            Token::Str => "string",
            Token::White => "whitespace",
            Token::Eof => "EOF",
            Token::Err => "error",
        };

        f.write_str(text)
    }
}

/// JSON lexical token value.
///
/// Contains the actual *value* of the JSON token read from the JSON text. This is in contrast to
/// [`Token`], which only indicates the *type* of the token.
///
/// For example, consider the following JSON text:
///
/// ```json
/// "foo"
/// ```
///
/// The above JSON text contains one token whose type is [`Token::Str`] and whose value is `"foo"`.
///
/// Use [`literal`] to get the raw token text, [`is_escaped`] to find out whether that text
/// contains escape sequences, and [`unescaped`] to get the text with escape sequences expanded.
///
/// [`literal`]: method@Value::literal
/// [`is_escaped`]: method@Value::is_escaped
/// [`unescaped`]: method@Value::unescaped
pub trait Value {
    /// Returns the literal value of the token exactly as it appears in the JSON text.
    ///
    /// # Fixed value tokens
    ///
    /// For token types with a fixed literal value, *e.g.* [`Token::NameSep`], the value returned
    /// is the fixed value.
    ///
    /// # Numbers
    ///
    /// For number tokens, the value returned is the literal text of the number token.
    ///
    /// # Strings
    ///
    /// For string tokens, the value returned is the literal text of the string token *including*
    /// the opening and closing double quote (`"`) characters. Therefore, every string token has
    /// length at least two and the unquoted value can be extracted by dropping the first and last
    /// characters.
    ///
    /// Because the return value contains the entire literal string token as it appears in the JSON
    /// text, any escape sequences the string may contain are not expanded. This has the benefit
    /// of supporting the following use cases: allowing lexical analyzer implementations to minimize
    /// or eliminate allocations when returning token values; and allowing applications to observe
    /// or edit a stream of JSON tokens without making any unintended changes to the raw JSON input.
    ///
    /// Some applications need to have escape sequences expanded in order to work with normalized
    /// strings. For example, it's pretty hard to reliably do a dictionary lookup for the name
    /// `"foo"` if the literal value might be `"fo\u006f"`, `"f\u006f\u006f"`, `"\u0066oo"`, *etc.*
    /// To check if the string contains an escape sequence, use [`is_escaped`]; and to obtain the
    /// normalized value with all escape sequences expanded, use [`unescaped`].
    ///
    /// [`is_escaped`]: method@Self::is_escaped
    /// [`unescaped`]: method@Self::unescaped
    ///
    /// # Whitespace
    ///
    /// For whitespace tokens, the value returned is the literal string of whitespace characters.
    ///
    /// # End of file
    ///
    /// For the pseudo-token [`Token::Eof`], the value is the empty string.
    fn literal(&self) -> &str;

    /// Indicates whether the token value contains escape sequences.
    ///
    /// This method must always return `false` for all token types except [`Token::Str`]. For
    /// [`Token::Str`], it must return `true` if the literal text of the string token contains at
    /// least one escape sequence, and `false` otherwise.
    fn is_escaped(&self) -> bool;

    /// Returns a normalized version of [`literal`] with all escape sequences encountered in the
    /// JSON text fully expanded.
    ///
    /// For non-string tokens, and string tokens for which [`is_escaped`] returns `false`, this
    /// method does nothing and simply returns the same value returned by [`literal`].
    ///
    /// For string tokens containing one or more escape sequences, this method returns a normalized
    /// version of the string value with the escape sequences expanded. At least one allocation is
    /// likely to be triggered the first time this method is called for a string token value
    /// containing escape sequences.
    ///
    /// As described in the [JSON spec][rfc], the following escape sequence expansions are done:
    ///
    /// | Sequence | Expands to |
    /// |-|-|
    /// | `\"` | Quotation mark, `"`, U+0022 |
    /// | `\\` | Reverse solidus, `\`, U+005c |
    /// | `\/` | Solidus, `/`, U+002f |
    /// | `\b` | Backspace, U+0008 |
    /// | `\f` | Form feed, U+000c |
    /// | `\n` | Line feed, U+000a |
    /// | `\r` | Carriage return, U+000d |
    /// | `\t` | Horizontal tab, U+0009 |
    /// | `\uXXXX` | Any Unicode character in basic multilingual plane, U+0000 through U+ffff |
    /// | `\uHHHH\uLLLL` | Unicode characters outside the basic multilingual plane represented as a high/low surrogate pair |
    ///
    /// [`literal`]: method@Self::literal
    /// [`is_escaped`]: method@Self::is_escaped
    /// [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
    // NOTE(review): this takes `&mut self`, presumably so an implementation can store the expanded
    // text inside the value and hand out a borrow of it — confirm against implementors.
    fn unescaped(&mut self) -> &str;
}

/// A location within an input buffer or stream.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Pos {
    /// Byte offset from the start of the stream, counted from zero.
    ///
    /// The first byte of the stream is at `offset` zero, the next one at `offset` one, *etc.*
    pub offset: usize,

    /// Line number within the stream, counted from one.
    ///
    /// The first byte of the stream is on `line` one, and the first byte after the first line
    /// breaking sequence is on `line` two, *etc.* Lines are counted from one because line numbers
    /// are mainly read by humans, unlike byte offsets, which are mainly consumed by computers.
    pub line: usize,

    /// Column number within the line, counted from one and measured in characters. Columns are
    /// counted from one because column numbers are mainly read by humans, unlike byte offsets,
    /// which are mainly consumed by computers.
    ///
    /// The first byte of the stream is at `col` one, and each time the line number goes up, the
    /// first byte of the new line is at `col` one again. Every increment of the column number
    /// stands for one complete, valid UTF-8 character.
    ///
    /// Note that the [JSON spec][rfc] permits multi-byte UTF-8 only inside string values. Outside
    /// of strings one byte is always one column; inside a string, a valid two-, three-, or
    /// four-byte UTF-8 sequence advances the column count by just 1.
    ///
    /// [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
    pub col: usize,
}

impl Pos {
    /// Consumes one byte and moves to the first column of the following line.
    #[inline(always)]
    pub(crate) fn advance_line(&mut self) {
        self.advance_offset(1);
        self.advance_line_no_offset();
    }

    /// Moves to the first column of the following line without touching the byte offset.
    #[inline(always)]
    pub(crate) fn advance_line_no_offset(&mut self) {
        self.col = 1;
        self.line += 1;
    }

    /// Consumes one byte and moves one column to the right on the current line.
    #[inline(always)]
    pub(crate) fn advance_col(&mut self) {
        self.advance_offset(1);
        self.col += 1;
    }

    /// Adds `by` bytes to the byte offset, leaving line and column as they are.
    #[inline(always)]
    pub(crate) fn advance_offset(&mut self, by: usize) {
        self.offset += by;
    }
}

impl Default for Pos {
    /// Returns the position of the very first byte: offset zero, line one, column one.
    fn default() -> Self {
        Pos { offset: 0, line: 1, col: 1 }
    }
}

impl fmt::Display for Pos {
    /// Writes the position in the form `line L, column C (offset: O)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Pos { offset, line, col } = self;

        write!(f, "line {line}, column {col} (offset: {offset})")
    }
}

/// Character or class of characters that a lexical analyzer expects to see at the next input
/// position.
///
/// This enumeration is used to provide detailed information for [`ErrorKind::UnexpectedByte`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Expect {
    /// Any token boundary character.
    ///
    /// One of:
    /// - `'{'` (opening brace, U+007B)
    /// - `'}'` (closing brace, U+007D)
    /// - `'['` (opening bracket, U+005B)
    /// - `']'` (closing bracket, U+005D)
    /// - `':'` (colon, U+003A)
    /// - `','` (comma, U+002C)
    /// - `' '` (space, U+0020)
    /// - `'\t'` (horizontal tab, U+0009)
    /// - `'\n'` (line feed, U+000A)
    /// - `'\r'` (carriage return, U+000D).
    Boundary,

    /// A specific character.
    Char(char),

    /// Any decimal digit character, `'0'`..`'9'` (U+0030..U+0039).
    Digit,

    /// Any decimal digit character ([`Digit`]) or one of the two exponent sign characters `'+'`
    /// (U+002B) or `'-'` (U+002D).
    ///
    /// [`Digit`]: Expect::Digit
    DigitOrExpSign,

    /// Any decimal digit character ([`Digit`]) or token boundary character ([`Boundary`]).
    ///
    /// [`Digit`]: Expect::Digit
    /// [`Boundary`]: Expect::Boundary
    DigitOrBoundary,

    /// The dot or period character `'.'` (U+002E) or any token boundary character ([`Boundary`]).
    ///
    /// [`Boundary`]: Expect::Boundary
    DotOrBoundary,

    /// Any character that completes a short-form escape sequence or starts a Unicode escape
    /// sequence.
    ///
    /// One of:
    /// - `'"'` (double quotation mark, U+0022)
    /// - `'\\'` (reverse solidus, U+005C)
    /// - `'/'` (solidus, U+002F)
    /// - `'b'` (lowercase 'b', U+0062)
    /// - `'f'` (lowercase 'f', U+0066)
    /// - `'n'` (lowercase 'n', U+006E)
    /// - `'r'` (lowercase 'r', U+0072)
    /// - `'t'` (lowercase 't', U+0074)
    /// - `'u'` (lowercase 'u', U+0075)
    EscChar,

    /// Any character that is valid in a JSON string token, or the string token termination
    /// character `'"'` (double quotation mark, U+0022).
    ///
    /// This essentially means any valid Unicode character at or above the space `' '` (U+0020).
    StrChar,

    /// Any character that validly starts a JSON token.
    ///
    /// One of:
    ///
    /// - A boundary character ([`Boundary`])
    /// - A digit ([`Digit`])
    /// - `'-'` (minus sign, U+002D), which starts a negative number
    /// - `'"'` (double quotation mark, U+0022)
    /// - `'f'` (U+0066)
    /// - `'n'` (U+006E)
    /// - `'t'` (U+0074)
    ///
    /// [`Digit`]: Expect::Digit
    /// [`Boundary`]: Expect::Boundary
    TokenStartChar,

    /// Any hexadecimal digit character allowed in a Unicode escape sequence.
    ///
    /// One of:
    /// - A decimal digit character ([`Digit`])
    /// - An uppercase letter `'A'`..`'F'` (U+0041..U+0046)
    /// - A lowercase letter `'a'`..`'f'` (U+0061..U+0066)
    ///
    /// [`Digit`]: Expect::Digit
    UnicodeEscHexDigit,
}

impl fmt::Display for Expect {
    /// Writes a short human-readable description of the expected character or character class,
    /// phrased so that it reads naturally after the word "expected".
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match self {
            // The only variant carrying data; everything else is a fixed phrase.
            Self::Char(c) => return write!(f, "character '{c}'"),
            Self::Boundary => "boundary character or EOF",
            Self::Digit => "digit character '0'..'9'",
            Self::DigitOrBoundary => "digit character '0'..'9', boundary character, or EOF",
            Self::DigitOrExpSign => {
                "exponent sign character '+' or '-', or exponent digit character '0'..'9'"
            }
            Self::DotOrBoundary => "character '.', boundary character, or EOF",
            // Lists every character documented on `EscChar`, including 'b' and 'f'.
            Self::EscChar => {
                "escape sequence character '\\', '\"', '/', 'b', 'f', 'n', 'r', 't', or 'u'"
            }
            Self::StrChar => "string character",
            Self::TokenStartChar => "token start character",
            Self::UnicodeEscHexDigit => {
                "Unicode escape sequence hex digit '0'..'9', 'A'..'F', or 'a'..'f'"
            }
        };

        f.write_str(text)
    }
}

/// Category of an error, together with the details specific to that category.
///
/// The `Display` implementation turns each variant into a human-readable message.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ErrorKind {
    /// A Unicode escape sequence contains a bad surrogate.
    ///
    /// When the second value is `None`, the first value is a low surrogate that was not preceded
    /// by a high surrogate. When the second value is `Some`, the first value is a high surrogate
    /// and the second is the invalid value that followed it in place of a low surrogate.
    BadSurrogatePair(u16, Option<u16>),
    /// A bad UTF-8 byte sequence.
    BadUtf8Seq,
    /// A bad continuation byte inside a multi-byte UTF-8 sequence.
    BadUtf8ContByte {
        /// Length, in bytes, of the UTF-8 sequence containing the bad byte.
        seq_len: u8,
        /// Number of the bad byte within the sequence.
        // NOTE(review): whether this is counted from zero or from one is not visible here —
        // confirm against the code that constructs this variant.
        offset: u8,
        /// Value of the bad continuation byte.
        value: u8,
    },
    /// A read error.
    // NOTE(review): presumably a failure reading from the underlying input — confirm.
    Read,
    /// A byte other than the expected one was found.
    UnexpectedByte {
        /// Type of the token in which the byte was found, if any.
        token: Option<Token>,
        /// Character or class of characters that was expected instead.
        expect: Expect,
        /// The byte that was actually found.
        actual: u8,
    },
    /// The end of the input was reached inside a token of the given type.
    UnexpectedEof(Token),
}

impl ErrorKind {
    /// Builds an [`ErrorKind::BadUtf8ContByte`] from its three fields.
    pub(crate) fn bad_utf8_cont_byte(seq_len: u8, offset: u8, value: u8) -> ErrorKind {
        ErrorKind::BadUtf8ContByte {
            seq_len,
            offset,
            value,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a `token` token where a
    /// boundary character was expected.
    pub(crate) fn expect_boundary(token: Token, actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(token),
            expect: Expect::Boundary,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a `token` token where
    /// the specific character `expect` was expected.
    pub(crate) fn expect_char(token: Token, actual: u8, expect: char) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(token),
            expect: Expect::Char(expect),
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a number token where a
    /// digit was expected.
    pub(crate) fn expect_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::Digit,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a number token where a
    /// digit or a boundary character was expected.
    pub(crate) fn expect_digit_or_boundary(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DigitOrBoundary,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a number token where a
    /// dot or a boundary character was expected.
    pub(crate) fn expect_dot_or_boundary(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DotOrBoundary,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a string token where an
    /// escape sequence character was expected.
    pub(crate) fn expect_esc_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::EscChar,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a number token where an
    /// exponent sign or a digit was expected.
    pub(crate) fn expect_exp_sign_or_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Num),
            expect: Expect::DigitOrExpSign,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a string token where a
    /// string character was expected.
    pub(crate) fn expect_str_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::StrChar,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`], not tied to any token type, for byte `actual`
    /// found where a token start character was expected.
    pub(crate) fn expect_token_start_char(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: None,
            expect: Expect::TokenStartChar,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a string token where a
    /// hexadecimal digit of a Unicode escape sequence was expected.
    pub(crate) fn expect_unicode_esc_hex_digit(actual: u8) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::UnicodeEscHexDigit,
            actual,
        }
    }

    /// Builds an [`ErrorKind::UnexpectedByte`] for byte `actual` found in a string token where
    /// the specific character `expect` was expected.
    // NOTE(review): judging by the name, this is used while reading the low surrogate half of a
    // surrogate pair escape — confirm against callers.
    pub(crate) fn expect_unicode_esc_lo_surrogate(actual: u8, expect: char) -> ErrorKind {
        ErrorKind::UnexpectedByte {
            token: Some(Token::Str),
            expect: Expect::Char(expect),
            actual,
        }
    }

    /// Writes the human-readable message for this error kind to `f`.
    ///
    /// When `pos` is `Some`, the message is followed by `" at "` and the position; when it is
    /// `None`, only the message is written.
    pub(crate) fn fmt_at(&self, f: &mut fmt::Formatter, pos: Option<&Pos>) -> fmt::Result {
        match self {
            // With no second value, the first value is a low surrogate that had no high surrogate
            // in front of it.
            Self::BadSurrogatePair(lo, None) => {
                write!(
                    f,
                    "bad Unicode escape sequence: low surrogate '\\u{lo:04X}' without preceding high surrogate"
                )?;
            }

            Self::BadSurrogatePair(hi, Some(lo)) => {
                write!(
                    f,
                    "bad Unicode escape sequence surrogate pair: high surrogate '\\u{hi:04X}' followed by invalid low surrogate '\\u{lo:04X}'"
                )?;
            }

            Self::BadUtf8Seq => f.write_str("bad UTF-8 byte sequence")?,

            Self::BadUtf8ContByte {
                seq_len,
                offset,
                value,
            } => {
                write!(
                    f,
                    "bad continuation byte 0x{value:02x} in {seq_len}-byte UTF-8 sequence (byte #{offset})"
                )?;
            }

            Self::Read => f.write_str("read error")?,

            Self::UnexpectedByte {
                token,
                expect,
                actual,
            } => {
                // Printable ASCII is shown as a character as well as in hex; anything else is
                // shown in hex only, since it has no readable character form.
                if (b' '..=0x7e).contains(actual) {
                    write!(
                        f,
                        "expected {expect} but got character '{}' (ASCII 0x{actual:02x})",
                        *actual as char
                    )?;
                } else {
                    write!(f, "expected {expect} but got byte 0x{actual:02x}")?;
                }

                if let Some(t) = token {
                    write!(f, " in {t} token")?;
                }
            }

            Self::UnexpectedEof(token) => {
                write!(f, "unexpected EOF in {token} token")?;
            }
        }

        if let Some(p) = pos {
            write!(f, " at {p}")?;
        }

        Ok(())
    }
}

impl fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.fmt_at(f, None)
    }
}

/// An error that carries a category ([`ErrorKind`]) and a position ([`Pos`]).
///
/// Every implementor must also implement [`std::error::Error`].
pub trait Error: std::error::Error {
    /// Returns the category of this error.
    fn kind(&self) -> ErrorKind;

    /// Returns the position associated with this error.
    // NOTE(review): presumably the position in the JSON text at which the error was detected —
    // confirm against implementors.
    fn pos(&self) -> &Pos;
}

/// Interface of a synchronous lexical analyzer.
///
/// An analyzer hands out token types through [`next`](Self::next), a token value or an error
/// through [`value`](Self::value), and a position through [`pos`](Self::pos).
// NOTE(review): the calling protocol (e.g. whether `value` and `pos` describe the token most
// recently returned by `next`) is not visible in this file — confirm against implementors.
pub trait Analyzer {
    /// Type of the token value returned by [`value`](Self::value) on success.
    type Value: Value;
    /// Type of the error returned by [`value`](Self::value) on failure.
    type Error: Error;

    /// Returns a token type.
    // NOTE(review): presumably advances to the next token of the JSON text and returns its type,
    // with `Token::Eof` and `Token::Err` signalling end of input and failure — confirm.
    fn next(&mut self) -> Token;

    /// Returns a token value, or an error.
    fn value(&self) -> Result<Self::Value, Self::Error>;

    /// Returns a reference to a position.
    // NOTE(review): presumably the analyzer's current position in the JSON text — confirm.
    fn pos(&self) -> &Pos;
}

/// Interface of a poll-based (asynchronous) lexical analyzer.
///
/// This mirrors [`Analyzer`], with token types obtained by polling through
/// [`poll_next`](Self::poll_next) instead of a blocking call.
// NOTE(review): the calling protocol (e.g. which token `value` and `pos` refer to) is not visible
// in this file — confirm against implementors.
pub trait AsyncAnalyzer {
    /// Type of the token value returned by [`value`](Self::value) on success.
    type Value: Value;
    /// Type of the error returned by [`value`](Self::value) on failure.
    type Error: Error;

    /// Polls for a token type.
    // NOTE(review): presumably `Poll::Ready(Some(token))` yields the next token type and
    // `Poll::Ready(None)` means no more tokens will be produced — confirm.
    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Token>>;

    /// Returns a token value or an error, or `None` when neither is available.
    // NOTE(review): the conditions under which `None` is returned are not visible here — confirm.
    fn value(&self) -> Option<Result<Self::Value, Self::Error>>;

    /// Returns a position, by value.
    // NOTE(review): `Analyzer::pos` returns `&Pos` whereas this returns `Pos` — confirm that the
    // difference is intentional.
    fn pos(&self) -> Pos;
}

/// Converts one ASCII hexadecimal digit to the number it denotes.
///
/// Both uppercase (`A`-`F`) and lowercase (`a`-`f`) letters are accepted.
///
/// # Panics
///
/// Panics if `b` is not an ASCII hexadecimal digit.
pub(crate) fn hex2u16(b: u8) -> u16 {
    match b {
        b'0'..=b'9' => u16::from(b - b'0'),
        b'a'..=b'f' => u16::from(b - b'a') + 10,
        b'A'..=b'F' => u16::from(b - b'A') + 10,
        _ => panic!("invalid hex character: 0x{b:02x}"),
    }
}

/// Appends to `buf` the UTF-8 bytes of `literal` with every JSON escape sequence expanded.
///
/// `literal` is the full text of a string token, *including* the enclosing double quotes, which
/// are copied to `buf` like any other unescaped byte. Existing content of `buf` is kept; the
/// expanded text is added after it.
///
/// A `\uHHHH\uLLLL` surrogate pair is expanded to the single character it encodes. The two
/// escapes of a pair must be directly adjacent.
///
/// # Panics
///
/// `literal` is expected to be a string token that has already passed lexical analysis. This
/// function panics if it is not, namely when:
///
/// - a backslash is followed by a byte that does not form a valid escape sequence;
/// - a Unicode escape contains a byte that is not a hexadecimal digit, or is cut short;
/// - a high surrogate escape is not immediately followed by a low surrogate escape;
/// - a low surrogate escape appears without a high surrogate escape right before it.
pub(crate) fn unescape(literal: &str, buf: &mut Vec<u8>) {
    debug_assert!(literal.len() >= 2);
    debug_assert!(literal.starts_with('"'));
    debug_assert!(literal.ends_with('"'));

    let bytes = literal.as_bytes();

    // Every escape sequence expands to fewer bytes than it occupies, so when the literal has at
    // least one of them, len-1 bytes are always enough. It is only a capacity hint: the buffer
    // still grows on demand if the literal turns out to have no escape sequence at all.
    buf.reserve(bytes.len().saturating_sub(1));

    // Decodes the four hexadecimal digits starting at index `at` into one UTF-16 code unit.
    let hex4 = |at: usize| -> u32 {
        bytes[at..at + 4]
            .iter()
            .fold(0, |acc, &b| acc << 4 | u32::from(hex2u16(b)))
    };

    // `i` is the start of the run of plain bytes not copied yet; `j` is the scan position.
    let (mut i, mut j) = (0usize, 0usize);
    while j < bytes.len() {
        if bytes[j] != b'\\' {
            j += 1;
            continue;
        }

        // Flush the plain bytes seen since the previous escape sequence.
        buf.extend_from_slice(&bytes[i..j]);

        let x = bytes[j + 1];
        // Number of input bytes taken up by the escape sequence starting at `j`.
        let len = match x {
            b'u' => {
                let first = hex4(j + 2);
                let (code_point, consumed) = match first {
                    0xd800..=0xdbff => {
                        // A high surrogate is only meaningful as the first half of a pair, so
                        // the low surrogate escape has to come right after it.
                        if !bytes[j + 6..].starts_with(b"\\u") {
                            panic!(
                                "high surrogate not followed by a low surrogate escape: [0x{first:04x}]"
                            );
                        }

                        let second = hex4(j + 8);
                        if !(0xdc00..=0xdfff).contains(&second) {
                            panic!(
                                "high surrogate followed by invalid low surrogate: [0x{first:04x}], [0x{second:04x}]"
                            );
                        }

                        (
                            0x10000 + ((first - 0xd800) << 10 | (second - 0xdc00)),
                            12,
                        )
                    }
                    0xdc00..=0xdfff => panic!(
                        "low surrogate without preceding high surrogate: [0x{first:04x}]"
                    ),
                    _ => (first, 6),
                };

                let c = char::from_u32(code_point)
                    .expect("surrogate code units are rejected or combined above");
                let mut seq = [0u8; 4];
                buf.extend_from_slice(c.encode_utf8(&mut seq).as_bytes());

                consumed
            }
            _ => {
                buf.push(match x {
                    b'"' | b'\\' | b'/' => x,
                    b'b' => b'\x08',
                    b't' => b'\t',
                    b'f' => b'\x0c',
                    b'n' => b'\n',
                    b'r' => b'\r',
                    _ => panic!("invalid escape sequence byte after '\\': 0x{x:02x}"),
                });

                2
            }
        };

        j += len;
        i = j;
    }

    // Copy whatever plain bytes follow the last escape sequence.
    buf.extend_from_slice(&bytes[i..j]);
}

// Unit tests for `unescape`, parameterized with `rstest`. Every literal is written as a raw string
// holding a complete string token, enclosing double quotes included.
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Each case is (string token literal, expected text after expanding escape sequences).
    #[rstest]
    // Literals without any escape sequence come out unchanged.
    #[case(r#""""#, r#""""#)]
    #[case(r#""f""#, r#""f""#)]
    #[case(r#""fo""#, r#""fo""#)]
    #[case(r#""foo""#, r#""foo""#)]
    // Short-form escape sequences.
    #[case(r#""\\""#, r#""\""#)]
    #[case(r#""\/""#, r#""/""#)]
    #[case(r#""\"""#, r#"""""#)]
    #[case(r#""\b""#, "\"\x08\"")]
    #[case(r#""\t""#, "\"\t\"")]
    #[case(r#""\f""#, "\"\x0c\"")]
    #[case(r#""\n""#, "\"\n\"")]
    #[case(r#""\r""#, "\"\r\"")]
    // Unicode escape sequences for control characters, with lowercase and uppercase hex digits.
    #[case(r#""\u0000""#, "\"\0\"")]
    #[case(r#""\u0008""#, "\"\x08\"")]
    #[case(r#""\u0009""#, "\"\t\"")]
    #[case(r#""\u000c""#, "\"\x0c\"")]
    #[case(r#""\u000C""#, "\"\x0C\"")]
    #[case(r#""\u000a""#, "\"\n\"")]
    #[case(r#""\u000A""#, "\"\n\"")]
    #[case(r#""\u000d""#, "\"\r\"")]
    #[case(r#""\u000D""#, "\"\r\"")]
    // NOTE(review): the next case is an exact duplicate of the one right above it.
    #[case(r#""\u000D""#, "\"\r\"")]
    // Unicode escape sequences for printable characters, by increasing UTF-8 encoded length.
    #[case(r#""\u0021""#, r#""!""#)]
    #[case(r#""\u0030""#, r#""0""#)]
    #[case(r#""\u0041""#, r#""A""#)]
    #[case(r#""\u0062""#, r#""b""#)]
    #[case(r#""\u007F""#, "\"\x7f\"")] // DEL (U+007F, highest 1-byte UTF-8)
    #[case(r#""\u00A9""#, r#""©""#)] // Copyright sign (U+00A9, 2-byte UTF-8)
    #[case(r#""\u03A9""#, r#""Ω""#)] // Greek capital Omega (U+03A9, 2-byte UTF-8)
    #[case(r#""\u0080""#, "\"\u{80}\"")] // First 2-byte UTF-8 code point
    #[case(r#""\u07FF""#, "\"\u{7ff}\"")] // Last 2-byte UTF-8 code point
    #[case(r#""\u20AC""#, r#""€""#)] // Euro sign (U+20AC, 3-byte UTF-8)
    #[case(r#""\u2603""#, r#""☃""#)] // Snowman (U+2603, 3-byte UTF-8)
    #[case(r#""\u0800""#, "\"\u{800}\"")] // First 3-byte UTF-8 code point
    #[case(r#""\uFFFF""#, "\"\u{ffff}\"")] // Last valid BMP code point (3-byte UTF-8)
    // Surrogate pairs, mixing uppercase and lowercase hex digits.
    #[case(r#""\ud83D\uDe00""#, r#""😀""#)] // Grinning face emoji (U+1F600, 4-byte UTF-8)
    #[case(r#""\ud800\uDC00""#, "\"\u{10000}\"")] // First 4-byte UTF-8 code point
    #[case(r#""\uDBFF\udfff""#, "\"\u{10FFFF}\"")] // Highest valid Unicode scalar value
    fn test_unescape_ok(#[case] literal: &str, #[case] expect: &str) {
        // Test with an empty buffer.
        {
            let mut buf = Vec::new();

            unescape(literal, &mut buf);
            let actual = String::from_utf8(buf).unwrap();

            assert_eq!(actual, expect);
        }

        // Test with a non-empty buffer.
        {
            let mut buf = Vec::new();

            // Pre-fill the buffer to check that `unescape` appends instead of overwriting.
            buf.extend_from_slice(b"foo");
            unescape(literal, &mut buf);
            let actual = String::from_utf8(buf).unwrap();

            assert_eq!(actual, format!("foo{expect}"));
        }
    }

    // A high surrogate followed by a Unicode escape that is not a low surrogate must panic.
    #[rstest]
    #[case(r#""\ud800\u0000""#)]
    #[case(r#""\uDBFF\ud800""#)]
    #[should_panic(expected = "high surrogate followed by invalid low surrogate")]
    fn test_unescape_panic_invalid_surrogate_pair(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    // A backslash followed by a byte that does not start a valid escape sequence must panic.
    #[rstest]
    #[case(r#""\a""#)]
    #[case(r#""\U""#)]
    #[case(r#""\:""#)]
    #[should_panic(expected = "invalid escape sequence byte after '\\'")]
    fn test_unescape_panic_invalid_esc_seq_byte(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }
}