// trivet 3.1.0 - Docs.rs

// Trivet
// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! Parse strings.
//!
//! This supports many different approaches for parsing strings.  Choosing a particular
//! mode re-configures the options of the string parser to handle that specific syntax.

#[cfg(not(feature = "no_ucd"))]
use super::ucd::UCD;
use super::C_ESCAPES;
use super::JSON_ESCAPES;
use super::PYTHON_ESCAPES;
use super::RUST_ESCAPES;
use super::TOML_ESCAPES;
use super::TRIVET_ESCAPES;
use crate::decoder::Decode;
use crate::strings::EscapeType;
use crate::strings::IllegalUnicodeProtocol;
use crate::strings::StringStandard;
use crate::strings::UnknownEscapeProtocol;
use crate::{
    errors::{syntax_error, unexpected_character_error, ParseResult},
    Loc, ParserCore,
};
use std::collections::BTreeMap;
#[cfg(not(feature = "no_ucd"))]
use std::rc::Rc;

/// The initial capacity of strings.
const CAPACITY: usize = 64;

/// Build the Unicode character database (UCD) lookup table and return it.
///
/// Building the table walks every entry of the static database, so it is a relatively
/// costly operation that should be done only *once*.  Keep the result around and use it
/// to initialize every string parser that must handle named Unicode escapes.  It is not
/// needed otherwise.
///
/// The map is boxed to avoid passing a huge data structure on the stack, and it is
/// reference counted so that a single copy can be used repeatedly.
#[cfg(not(feature = "no_ucd"))]
pub fn get_ucd() -> Box<Rc<BTreeMap<&'static str, char>>> {
    // The entries are fed in one at a time instead of using `BTreeMap::from`, because
    // `from` would require passing the massive array on the stack.  (`lazy_static`
    // would be the usual tool here, but it would add an external dependency.)
    let mut names = BTreeMap::new();
    names.extend(UCD.iter().map(|&(name, code_point)| (name, code_point)));
    Box::new(Rc::new(names))
}

/// Implement parsing of strings.
///
/// This is intended to be a very flexible parsing system, and implements
/// some common string formats.  Specific features can be enabled and disabled
/// by setting the flags and providing a map for escape handling rules.
///
/// # Escape Handling
///
/// Specify escape handling rules by creating a `BTreeMap` mapping characters
/// to escape handling rules.  The character is the character following the
/// escape character.  Escape handling rules are specified by [`EscapeType`].
///
/// **Note**: You cannot have both a `\0` escape and support octal escapes, or
/// octal escapes with a leading zero will not work.
///
/// As an example, here are the escape handling rules for Python.
///
/// ```rust
/// use std::collections::BTreeMap;
/// use trivet::strings::EscapeType;
///
/// let escapes = BTreeMap::from([
///     ('\n', EscapeType::Discard),
///     ('\\', EscapeType::Char('\\')),
///     ('\'', EscapeType::Char('\'')),
///     ('\"', EscapeType::Char('\"')),
///     ('a', EscapeType::Char('\x07')),
///     ('b', EscapeType::Char('\x08')),
///     ('f', EscapeType::Char('\x0c')),
///     ('n', EscapeType::Char('\n')),
///     ('r', EscapeType::Char('\r')),
///     ('t', EscapeType::Char('\t')),
///     ('v', EscapeType::Char('\x0b')),
///     ('x', EscapeType::NakedByte),
///     ('N', EscapeType::BracketUNamed),
///     ('u', EscapeType::NakedU4),
///     ('U', EscapeType::NakedU8),
/// ]);
/// ```
///
/// # Unicode Database
///
/// Note: The feature `no_ucd` will disable use of the Unicode database.
///
/// The parser is capable of looking up Unicode code points by their name
/// or alias.  This is provided by a map that encodes the entire space.  This
/// map must be provided to every new parser instance.
///
/// Creating a default instance (with [`Self::default`]) does this for you.
/// If you only use this string parser instance from then on, then you do not
/// need to worry about this.
///
/// If you plan to create many string parser instances, then you should instead
/// get the UCD database yourself via [`get_ucd`], which returns a boxed,
/// reference-counted copy.
///
/// # Example
///
/// ```rust
/// use trivet::strings::StringParser;
/// use trivet::parse_from_string;
/// use trivet::Parser;
///
/// // Make a new string parser.
/// let mut strpar = StringParser::new();
///
/// // Make a parser around a string.
/// let mut parser = parse_from_string(r#""This\nis\na\nstring.""#);
/// match parser.parse_string_match_delimiter() {
///     Ok(value) => println!("{}", value),
///     Err(err) => println!("ERROR: {}", err),
/// }
/// ```
#[derive(Clone)]
pub struct StringParser {
    /// If true, parse escape sequences.
    pub enable_escapes: bool,

    /// Character used to introduce an escape.  Usually `\`.
    pub escape_char: char,

    /// If true, permit "naked" control characters to be present in the stream.  Otherwise
    /// generate an error.  This applies to all character values below `\u0020` and to only
    /// those characters (so delete and a few other control characters are not included).
    pub permit_low_control_characters: bool,

    /// How to handle unrecognized escape sequences.
    pub unknown_escape_protocol: UnknownEscapeProtocol,

    /// If true, and if the current result looks like the first half of a UTF-16 surrogate
    /// pair (it is in the range U+D800 up to U+DBFF) then try to find and parse a second
    /// surrogate and generate the corresponding character.
    ///
    /// If false, treat this as an invalid escape.  For instance, Rust does not permit
    /// surrogate pairs in this way.
    pub allow_surrogate_pairs: bool,

    /// How to handle invalid Unicode values that arise from parsing hexadecimal escapes.
    /// This includes surrogate pairs when those are not allowed.
    pub illegal_unicode_protocol: IllegalUnicodeProtocol,

    /// Permit octal escapes.  These have the form `[escape]` followed by (usually) one to
    /// three octal digits (but see [`Self::octal_escapes_are_flexible`]).  Parsing of
    /// octal escapes is performed *before* handling other escapes to permit `[escape]0` to
    /// be handled correctly, if present.
    pub allow_octal_escapes: bool,

    /// Allow flexible octal escapes.  These consist of one to three octal digits.  Python
    /// uses this approach, so `"\12k"` encodes the string `"\nk"`.  If this is disabled,
    /// then octal escapes must have *exactly* three octal digits.
    pub octal_escapes_are_flexible: bool,

    /// Provide interpretation for escapes.  Each entry maps a specific character to the
    /// character's meaning when that character follows the escape character.  For example,
    /// in C we would have `n` map to `EscapeType::Char('\n')`.
    ///
    /// See [ASCII](https://www.ascii-code.com/) for the meaning of characters in the ASCII
    /// range, and consult the Unicode standard for others.
    escapes: BTreeMap<char, EscapeType>,

    /// Provide a fast lookup table for escapes in the ASCII range.  The table is indexed
    /// by the character's code and is rebuilt from `escapes` whenever that map changes.
    fast_escapes: [EscapeType; 128],

    /// The Unicode database of names and aliases to code points.
    #[cfg(not(feature = "no_ucd"))]
    pub ucd: Rc<BTreeMap<&'static str, char>>,
}

impl StringParser {
    /// Make and return a new string parser.  The initial parsing standard is set to
    /// [`StringStandard::Trivet`].
    #[cfg(not(feature = "no_ucd"))]
    pub fn new() -> Self {
        let mut parser = StringParser {
            enable_escapes: true,
            permit_low_control_characters: true,
            escape_char: '\\',
            allow_octal_escapes: true,
            octal_escapes_are_flexible: true,
            allow_surrogate_pairs: true,
            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
            escapes: BTreeMap::from(TRIVET_ESCAPES),
            fast_escapes: [EscapeType::Undefined; 128],
            ucd: *get_ucd(),
        };
        parser.fix_escapes();
        parser
    }
    /// Create a string parser whose initial parsing standard is
    /// [`StringStandard::Trivet`].
    ///
    /// This variant is compiled when the `no_ucd` feature is enabled, so the parser
    /// carries no Unicode database.
    #[cfg(feature = "no_ucd")]
    pub fn new() -> Self {
        let mut fresh = Self {
            enable_escapes: true,
            escape_char: '\\',
            permit_low_control_characters: true,
            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
            allow_surrogate_pairs: true,
            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
            allow_octal_escapes: true,
            octal_escapes_are_flexible: true,
            escapes: BTreeMap::from(TRIVET_ESCAPES),
            // Placeholder only; populated from `escapes` just below.
            fast_escapes: [EscapeType::Undefined; 128],
        };
        fresh.fix_escapes();
        fresh
    }

    /// Make and return a new string parser.  The initial parsing mode is set to Trivet.
    #[cfg(not(feature = "no_ucd"))]
    pub fn new_from_db(ucd: &Rc<BTreeMap<&'static str, char>>) -> Self {
        let mut parser = StringParser {
            enable_escapes: true,
            permit_low_control_characters: true,
            escape_char: '\\',
            allow_octal_escapes: true,
            octal_escapes_are_flexible: true,
            allow_surrogate_pairs: true,
            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
            escapes: BTreeMap::from(TRIVET_ESCAPES),
            fast_escapes: [EscapeType::Undefined; 128],
            ucd: ucd.clone(),
        };
        parser.fix_escapes();
        parser
    }

    /// Configure all settings to conform to a given standard.  See
    /// [`StringStandard`] for the available standards.
    pub fn set(&mut self, std: StringStandard) {
        match std {
            StringStandard::Trivet => {
                self.enable_escapes = true;
                self.permit_low_control_characters = true;
                self.escape_char = '\\';
                self.allow_octal_escapes = true;
                self.octal_escapes_are_flexible = true;
                self.allow_surrogate_pairs = true;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
                self.escapes = BTreeMap::from(TRIVET_ESCAPES);
            }
            StringStandard::C => {
                self.enable_escapes = true;
                self.permit_low_control_characters = true;
                self.escape_char = '\\';
                self.allow_octal_escapes = true;
                self.octal_escapes_are_flexible = true;
                self.allow_surrogate_pairs = false;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
                self.escapes = BTreeMap::from(C_ESCAPES);
            }
            StringStandard::Rust => {
                self.enable_escapes = true;
                self.permit_low_control_characters = true;
                self.escape_char = '\\';
                self.allow_octal_escapes = false;
                self.allow_surrogate_pairs = false;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
                self.escapes = BTreeMap::from(RUST_ESCAPES);
            }
            StringStandard::JSON => {
                self.enable_escapes = true;
                self.permit_low_control_characters = false;
                self.escape_char = '\\';
                self.allow_octal_escapes = false;
                self.allow_surrogate_pairs = true;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
                self.escapes = BTreeMap::from(JSON_ESCAPES);
            }
            StringStandard::TOML => {
                self.enable_escapes = true;
                self.permit_low_control_characters = false;
                self.escape_char = '\\';
                self.allow_octal_escapes = false;
                self.allow_surrogate_pairs = false;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
                self.escapes = BTreeMap::from(TOML_ESCAPES);
            }
            StringStandard::Python => {
                self.enable_escapes = true;
                self.permit_low_control_characters = true;
                self.escape_char = '\\';
                self.allow_octal_escapes = true;
                self.octal_escapes_are_flexible = true;
                self.allow_surrogate_pairs = false;
                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
                self.escapes = BTreeMap::from(PYTHON_ESCAPES);
            }
        }
        self.fix_escapes();
    }

    /// Set the escapes for this parser instance.
    ///
    /// `escapes` maps the character that follows the escape character to the rule used
    /// to handle it.  The map replaces the current one, and the fast lookup table for
    /// escapes is rebuilt from it.
    pub fn set_escapes(&mut self, escapes: BTreeMap<char, EscapeType>) {
        self.escapes = escapes;
        // Rebuild the fast table so it matches the map just installed.
        self.fix_escapes();
    }

    /// Create the fast escape table.
    ///
    /// The table has one slot per ASCII character (128 entries) and is indexed by the
    /// character's code.  Every slot is first reset to [`EscapeType::Undefined`], then
    /// each ASCII key of the escape map is copied into its slot.  Non-ASCII keys have no
    /// slot and remain available only through the escape map itself.
    fn fix_escapes(&mut self) {
        self.fast_escapes = [EscapeType::Undefined; 128];
        for (&key, &value) in &self.escapes {
            // Only U+0000..=U+007F fit in the table.  U+0080 must be excluded: it would
            // index one past the end of the 128-entry array and panic.
            if key.is_ascii() {
                self.fast_escapes[key as usize] = value;
            }
        }
    }

    /// Correctly handle an invalid escape.
    fn invalid_escape(&self, ch: char, loc: Loc, string: &mut String) -> ParseResult<()> {
        match self.unknown_escape_protocol {
            UnknownEscapeProtocol::Discard => Ok(()),
            UnknownEscapeProtocol::DropEscape => {
                string.push(ch);
                Ok(())
            }
            UnknownEscapeProtocol::Error => Err(syntax_error(
                loc,
                format!("Invalid escape '{}{}'", self.escape_char, ch).as_str(),
            )),
            UnknownEscapeProtocol::LiteralEscape => {
                string.push(self.escape_char);
                string.push(ch);
                Ok(())
            }
            UnknownEscapeProtocol::Replace(ch) => {
                string.push(ch);
                Ok(())
            }
            UnknownEscapeProtocol::ReplacementCharacter => {
                string.push(char::REPLACEMENT_CHARACTER);
                Ok(())
            }
        }
    }

    /// Correctly handle an invalid Unicode value.
    fn handle_illegal_unicode(&self, value: u32, loc: Loc, string: &mut String) -> ParseResult<()> {
        match self.illegal_unicode_protocol {
            IllegalUnicodeProtocol::Discard => Ok(()),
            IllegalUnicodeProtocol::Error => Err(syntax_error(
                loc,
                format!("Value is not a valid Unicode code point: {:04x}", value).as_str(),
            )),
            IllegalUnicodeProtocol::Replace(ch) => {
                string.push(ch);
                Ok(())
            }
            IllegalUnicodeProtocol::ReplacementCharacter => {
                string.push(char::REPLACEMENT_CHARACTER);
                Ok(())
            }
        }
    }

    /// Handle something that looks like a surrogate pair.
    ///
    /// `first` is the value of the escape that was just parsed, `loc` is the location
    /// used for error reports, and the decoded character is appended to `string`.  On
    /// entry the parser is assumed to be pointing to the escape character of the second
    /// element of the pair.  On success the entire second element has been consumed.
    ///
    /// The second element must be one of the hexadecimal Unicode escapes in the escape
    /// map.  Fixed-width forms must consist of exactly the expected number of
    /// hexadecimal digits.
    ///
    /// # Errors
    ///
    /// * If the next character is not the escape character, `first` is handled as
    ///   illegal Unicode per [`Self::illegal_unicode_protocol`].
    /// * A syntax error is returned if the escape that follows is not a hexadecimal
    ///   Unicode escape, if its digits are malformed, if surrogate pairs are not
    ///   allowed, or if the two values do not form a valid high/low pair.
    fn parse_surrogate_pair(
        &self,
        parser: &mut ParserCore,
        first: u32,
        loc: Loc,
        string: &mut String,
    ) -> ParseResult<()> {
        // True when `digits` is exactly `count` ASCII hexadecimal digits.
        fn is_exact_hex(digits: &str, count: usize) -> bool {
            digits.len() == count && digits.bytes().all(|byte| byte.is_ascii_hexdigit())
        }

        // At this point we have parsed the first element of the pair.  We should
        // *expect* the next thing in the stream to be an escape character.  If it
        // isn't, then there is no second element.
        if !parser.peek_and_consume(self.escape_char) {
            // This is not what we expect, and the whole thing is wrong.
            return self.handle_illegal_unicode(first, loc, string);
        }

        // We need to process the next escape, but it must be a hexadecimal escape of
        // at least 16 bits or we can't get a second surrogate.
        let ch = parser.peek();
        parser.consume();
        let second = match self.escapes.get(&ch) {
            Some(EscapeType::BraceU18) => self.parse_braced_hex(parser, 1, 8, true)?,
            Some(EscapeType::BraceU16) => self.parse_braced_hex(parser, 1, 6, false)?,
            Some(EscapeType::NakedU4) => {
                let digits = parser.peek_n(4);
                parser.consume_n(4);
                match u16::from_str_radix(&digits, 16) {
                    // `from_str_radix` tolerates a leading `+`, and would also accept
                    // a result from `peek_n` that is shorter than requested, so the
                    // digit run is checked explicitly.
                    Ok(value) if is_exact_hex(&digits, 4) => u32::from(value),
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid hex value (ref:1) '{}': expected exactly 4 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid hex value (ref:1) '{}': {}", digits, err).as_str(),
                        ))
                    }
                }
            }
            Some(EscapeType::NakedU8) => {
                let digits = parser.peek_n(8);
                parser.consume_n(8);
                match u32::from_str_radix(&digits, 16) {
                    // Same explicit digit check as for the four-digit form.
                    Ok(value) if is_exact_hex(&digits, 8) => value,
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid hex value (ref:2) '{}': expected exactly 8 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid hex value (ref:2) '{}': {}", digits, err).as_str(),
                        ))
                    }
                }
            }
            _ => {
                // Some other escape (or none at all) follows, so there is no second half.
                return Err(syntax_error(loc,
                    "Found what seems to be the first half of a surrogate pair, but no second half was found."
                ));
            }
        };

        // Do we even allow surrogate pairs?
        if !self.allow_surrogate_pairs {
            return Err(syntax_error(loc, "Surrogate pairs are not permitted"));
        }

        // The first element must be a high surrogate and the second a low surrogate.
        if !(0xd800..0xdc00).contains(&first) || !(0xdc00..0xe000).contains(&second) {
            return Err(syntax_error(
                loc,
                format!("Invalid surrogate pair {:04x},{:04x}", first, second).as_str(),
            ));
        }

        // Combine the halves.  Having checked everything above, the conversion to a
        // character should never fail.
        let value = (first - 0xD800) * 0x400 + (second - 0xDC00) + 0x10000;
        self.u32_to_char(value, loc, string)
    }

    /// Parse a hexadecimal value enclosed in braces and return it as a `u32`.
    ///
    /// The value is returned as parsed; it is not converted to a character or checked
    /// for being a valid code point.
    ///
    /// On entry the parser must be pointing to the opening brace (this is checked).  On
    /// exit the closing brace has been consumed.  `low` and `high` give the inclusive
    /// bounds on the number of digits, and `underscores` selects the digit reader that
    /// also takes `_` into account.
    ///
    /// # Errors
    ///
    /// Returns an error if either brace is missing or if the number of digits is
    /// outside `low..=high`.
    fn parse_braced_hex(
        &self,
        parser: &mut ParserCore,
        low: usize,
        high: usize,
        underscores: bool,
    ) -> ParseResult<u32> {
        let start = parser.loc();

        // The escape must open with a brace.
        if !parser.peek_and_consume('{') {
            return Err(unexpected_character_error(start, "{", parser.peek()));
        }

        // Gather the hexadecimal digits.
        let hex = if underscores {
            parser.take_while_unless(|c| c.is_ascii_hexdigit(), |c| c == '_')
        } else {
            parser.take_while(|c| c.is_ascii_hexdigit())
        };

        // ...and it must close with one.
        if !parser.peek_and_consume('}') {
            return Err(unexpected_character_error(parser.loc(), "}", parser.peek()));
        }

        // The digits are all in the ASCII range, so the byte length is the digit count.
        let count = hex.len();
        if count < low {
            return Err(syntax_error(start, "Too few digits given in escape"));
        }
        if count > high {
            return Err(syntax_error(start, "Too many digits given in escape"));
        }
        Ok(u32::from_str_radix(&hex, 16).unwrap())
    }

    /// Convert `value` to a character and append it to `string`.
    ///
    /// If `value` is not a valid Unicode scalar value, it is handed to
    /// [`Self::handle_illegal_unicode`] along with `loc`, and that result is returned.
    fn u32_to_char(&self, value: u32, loc: Loc, string: &mut String) -> ParseResult<()> {
        if let Some(decoded) = char::from_u32(value) {
            string.push(decoded);
            return Ok(());
        }
        // Not a scalar value; let the configured protocol decide what happens.
        self.handle_illegal_unicode(value, loc, string)
    }

    /// Parse the next escape sequence and append its meaning to `string`.
    ///
    /// The initial escape character is assumed to have been consumed prior to entry, and
    /// thus the parser is pointing to the first character after the escape.  On exit the
    /// parser is pointing to the first character following the escape sequence.
    ///
    /// ASCII characters are looked up in the fast escape table and all other characters
    /// in the escape map.  A character with no rule is tried as an octal escape (when
    /// those are enabled) and is otherwise handled per
    /// [`Self::unknown_escape_protocol`].
    ///
    /// Fixed-width hexadecimal escapes must consist of exactly the expected number of
    /// hexadecimal digits; a sign character or a short run of digits is rejected.
    ///
    /// # Errors
    ///
    /// Returns a syntax error for malformed escapes, and for unknown escapes or illegal
    /// Unicode values when the corresponding protocol is set to report an error.
    fn parse_escape(&self, parser: &mut ParserCore, string: &mut String) -> ParseResult<()> {
        // True when `digits` is exactly `count` ASCII hexadecimal digits.  This is
        // needed because `from_str_radix` tolerates a leading `+`, and would also
        // accept a result from `peek_n` that is shorter than requested.
        fn is_exact_hex(digits: &str, count: usize) -> bool {
            digits.len() == count && digits.bytes().all(|byte| byte.is_ascii_hexdigit())
        }

        let loc = parser.loc();
        let mut ch = parser.peek();
        parser.consume();

        // Find the rule for this character: the fast table covers the ASCII range and
        // the map covers everything else.
        let esc_type = if ch.is_ascii() {
            &self.fast_escapes[ch as usize]
        } else if let Some(esc_type) = self.escapes.get(&ch) {
            esc_type
        } else {
            &EscapeType::Undefined
        };

        match esc_type {
            EscapeType::Char(rp) => {
                string.push(*rp);
                Ok(())
            }
            EscapeType::Undefined => {
                // Look for an octal escape if we are allowing them.
                if self.allow_octal_escapes && ('0'..='7').contains(&ch) {
                    // Parse this as an octal escape.  We can grab up to two additional digits.
                    let mut value = (ch as u32) - ('0' as u32);
                    for _ in 0..2 {
                        ch = parser.peek();
                        if ('0'..='7').contains(&ch) {
                            value *= 8;
                            value += (ch as u32) - ('0' as u32);
                            parser.consume();
                        } else {
                            if !self.octal_escapes_are_flexible {
                                return Err(syntax_error(
                                    loc,
                                    "Octal escape must have three digits",
                                ));
                            }
                            break;
                        }
                    }
                    return self.u32_to_char(value, loc, string);
                }
                self.invalid_escape(ch, loc, string)
            }
            EscapeType::BraceU18 => {
                let value = self.parse_braced_hex(parser, 1, 8, true)?;
                if (0xd800..0xe000).contains(&value) {
                    // This is the start of a surrogate pair.
                    self.parse_surrogate_pair(parser, value, loc, string)
                } else {
                    self.u32_to_char(value, loc, string)
                }
            }
            EscapeType::BraceU16 => {
                let value = self.parse_braced_hex(parser, 1, 6, false)?;
                if (0xd800..0xe000).contains(&value) {
                    // This is the start of a surrogate pair.
                    self.parse_surrogate_pair(parser, value, loc, string)
                } else {
                    self.u32_to_char(value, loc, string)
                }
            }
            EscapeType::BracketUNamed => {
                #[cfg(not(feature = "no_ucd"))]
                {
                    // Expect an opening brace.
                    if !parser.peek_and_consume('{') {
                        // Malformed escape.
                        return Err(unexpected_character_error(loc, "{", parser.peek()));
                    }
                    // Get the content of the braces.
                    let name = parser.take_while(|ch| ch != '}');
                    // The next thing must be the closing brace.
                    if !parser.peek_and_consume('}') {
                        // Malformed escape.
                        return Err(unexpected_character_error(loc, "}", parser.peek()));
                    }
                    // Names are looked up in upper case in the Unicode database.
                    let name = name.to_uppercase();
                    match self.ucd.get(name.as_str()) {
                        Some(ch) => {
                            string.push(*ch);
                            Ok(())
                        }
                        None => Err(syntax_error(
                            loc,
                            format!("Unknown Unicode character name '{}'", name).as_str(),
                        )),
                    }
                }
                #[cfg(feature = "no_ucd")]
                {
                    Err(syntax_error(loc, "Unicode name lookup is not enabled."))
                }
            }
            EscapeType::Discard => Ok(()),
            EscapeType::DiscardWS => {
                parser.consume_ws_only();
                Ok(())
            }
            EscapeType::NakedASCII => {
                let digits = parser.peek_n(2);
                parser.consume_n(2);
                // Try to convert to a byte.
                let value = match u8::from_str_radix(&digits, 16) {
                    Ok(value) if is_exact_hex(&digits, 2) => value,
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid ASCII hex value '{}': expected exactly 2 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid ASCII hex value '{}': {}", digits, err).as_str(),
                        ))
                    }
                };
                if value > 0x7f {
                    return Err(syntax_error(
                        loc,
                        format!("Invalid ASCII value (too high): '{}'", digits).as_str(),
                    ));
                }
                // Every byte value maps directly to a code point.
                string.push(char::from(value));
                Ok(())
            }
            EscapeType::NakedByte => {
                let digits = parser.peek_n(2);
                parser.consume_n(2);
                // Try to convert to a byte.
                let value = match u8::from_str_radix(&digits, 16) {
                    Ok(value) if is_exact_hex(&digits, 2) => value,
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid hex value (ref:3) '{}': expected exactly 2 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid hex value (ref:3) '{}': {}", digits, err).as_str(),
                        ))
                    }
                };
                // None of the code points this can match are invalid, so we don't need to
                // check.  Note that this will behave differently from C in that the value
                // will be treated as a Unicode code point.
                string.push(char::from(value));
                Ok(())
            }
            EscapeType::NakedU4 => {
                let digits = parser.peek_n(4);
                parser.consume_n(4);
                // Try to convert to a 16-bit value, widened to u32.
                let value = match u16::from_str_radix(&digits, 16) {
                    Ok(value) if is_exact_hex(&digits, 4) => u32::from(value),
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid hex value (ref:4) '{}': expected exactly 4 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid hex value (ref:4) '{}': {}", digits, err).as_str(),
                        ))
                    }
                };
                if (0xd800..0xe000).contains(&value) {
                    // This is the start of a surrogate pair.
                    return self.parse_surrogate_pair(parser, value, loc, string);
                }
                // Because surrogates are extracted above, every remaining 16-bit value
                // is a valid code point, so this conversion always succeeds.
                self.u32_to_char(value, loc, string)
            }
            EscapeType::NakedU8 => {
                let digits = parser.peek_n(8);
                parser.consume_n(8);
                // Try to convert to a u32.
                let value = match u32::from_str_radix(&digits, 16) {
                    Ok(value) if is_exact_hex(&digits, 8) => value,
                    Ok(_) => {
                        return Err(syntax_error(
                            loc,
                            format!(
                                "Invalid hex value (ref:5) '{}': expected exactly 8 hexadecimal digits",
                                digits
                            )
                            .as_str(),
                        ))
                    }
                    Err(err) => {
                        return Err(syntax_error(
                            loc,
                            format!("Invalid hex value (ref:5) '{}': {}", digits, err).as_str(),
                        ))
                    }
                };
                if (0xd800..0xe000).contains(&value) {
                    // This is the start of a surrogate pair.
                    return self.parse_surrogate_pair(parser, value, loc, string);
                }
                // Values above the Unicode range are handled as illegal Unicode.
                self.u32_to_char(value, loc, string)
            }
        }
    }

    // Methods that require a terminal delimiter.

    /// Parse a string up to `terminal`, processing escapes and accepting every other
    /// character (including low control characters) as is.
    ///
    /// The terminal is consumed but not included in the result.  Reaching the end of
    /// the stream before the terminal is an error reported at the starting location.
    fn parse_esc_con_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
        let mut text = String::with_capacity(CAPACITY);
        let start = parser.loc();
        while !parser.is_at_eof() {
            // Every case consumes the character just examined, so do that up front.
            let next = parser.peek();
            parser.consume();
            if next == terminal {
                return Ok(text);
            }
            if next == self.escape_char {
                self.parse_escape(parser, &mut text)?;
            } else {
                text.push(next);
            }
        }
        Err(syntax_error(start, "Found unterminated string."))
    }

    /// Parse a string up to `terminal`, processing escapes and rejecting low control
    /// characters (those below `\x20`) other than the terminal itself.
    ///
    /// The terminal is consumed but not included in the result.  Reaching the end of
    /// the stream before the terminal is an error reported at the starting location.
    fn parse_esc_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
        let mut text = String::with_capacity(CAPACITY);
        let start = parser.loc();
        while !parser.is_at_eof() {
            let next = parser.peek();
            // The terminal wins over the control check; anything else below a space is
            // rejected, and reported before it is consumed.
            if next != terminal && next < '\x20' {
                return Err(syntax_error(
                    parser.loc(),
                    &format!(
                        "Control characters are not permitted in strings: '{:?}'",
                        next
                    ),
                ));
            }
            parser.consume();
            if next == terminal {
                return Ok(text);
            }
            if next == self.escape_char {
                self.parse_escape(parser, &mut text)?;
            } else {
                text.push(next);
            }
        }
        Err(syntax_error(start, "Found unterminated string."))
    }

    /// Parse a string up to `terminal` with no escape processing, accepting every
    /// character (including low control characters) as is.
    ///
    /// The terminal is consumed but not included in the result.  Reaching the end of
    /// the stream before the terminal is an error reported at the starting location.
    fn parse_con_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
        let mut text = String::with_capacity(CAPACITY);
        let start = parser.loc();
        while !parser.is_at_eof() {
            // Both cases consume the character just examined, so do that up front.
            let next = parser.peek();
            parser.consume();
            if next == terminal {
                return Ok(text);
            }
            text.push(next);
        }
        Err(syntax_error(start, "Found unterminated string."))
    }

    /// Parse a terminated string with no escape processing, rejecting low control
    /// characters.
    ///
    /// Characters are copied into the result until `terminal` is seen.  The terminal is
    /// consumed but is not part of the returned string.
    ///
    /// # Errors
    ///
    /// * A character below U+0020 (other than the terminal itself) is found; the error
    ///   is reported at the parser's current location and the character is not consumed.
    /// * The end of the stream is reached before the terminal; the error is reported at
    ///   the location where this method started.
    fn parse_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
        let start = parser.loc();
        let mut text = String::with_capacity(CAPACITY);
        loop {
            if parser.is_at_eof() {
                return Err(syntax_error(start, "Found unterminated string."));
            }
            let current = parser.peek();
            // The terminal takes precedence over the control-character check.
            if current == terminal {
                parser.consume();
                return Ok(text);
            }
            if current < '\x20' {
                let at = parser.loc();
                let message = format!(
                    "Control characters are not permitted in strings: '{:?}'",
                    current
                );
                return Err(syntax_error(at, &message));
            }
            parser.consume();
            text.push(current);
        }
    }

    // Methods that do not require a terminal character.

    /// Read everything that `take_while` yields for an always-true predicate, with no
    /// escape processing and no restriction on control characters.  Never fails.
    fn read_c(&self, parser: &mut ParserCore) -> ParseResult<String> {
        let everything = parser.take_while(|_| true);
        Ok(everything)
    }

    /// Read to the end of the stream, processing escapes and permitting control
    /// characters.
    ///
    /// Each character is consumed; the escape character hands control to
    /// `parse_escape`, and anything else is appended to the result unchanged.
    ///
    /// # Errors
    ///
    /// Any error reported by `parse_escape` is propagated.
    fn read_ce(&self, parser: &mut ParserCore) -> ParseResult<String> {
        let mut text = String::with_capacity(CAPACITY);
        while !parser.is_at_eof() {
            match parser.peek() {
                esc if esc == self.escape_char => {
                    parser.consume();
                    self.parse_escape(parser, &mut text)?;
                }
                plain => {
                    parser.consume();
                    text.push(plain);
                }
            }
        }
        Ok(text)
    }

    /// Read to the end of the stream with no escape processing, rejecting low control
    /// characters.
    ///
    /// # Errors
    ///
    /// If a character below U+0020 stops the scan before the end of the stream, a
    /// syntax error is reported at the parser's current location.
    fn read(&self, parser: &mut ParserCore) -> ParseResult<String> {
        let text = parser.take_while(|candidate| candidate >= '\x20');
        if !parser.is_at_eof() {
            // The scan stopped early, so the next character must be a control code.
            let offender = parser.peek();
            let at = parser.loc();
            let message = format!(
                "Control characters are not permitted in strings: '{:?}'",
                offender
            );
            return Err(syntax_error(at, &message));
        }
        Ok(text)
    }

    /// Read to the end of the stream, processing escapes and rejecting low control
    /// characters.
    ///
    /// The escape character is checked before the control-character test; when seen it
    /// is consumed and `parse_escape` handles the rest of the escape.
    ///
    /// # Errors
    ///
    /// * A character below U+0020 (that is not the escape character) is found; the
    ///   error is reported at the parser's current location and the character is not
    ///   consumed.
    /// * `parse_escape` reports an error.
    fn read_e(&self, parser: &mut ParserCore) -> ParseResult<String> {
        let mut text = String::with_capacity(CAPACITY);
        while !parser.is_at_eof() {
            match parser.peek() {
                esc if esc == self.escape_char => {
                    parser.consume();
                    self.parse_escape(parser, &mut text)?;
                }
                control if control < '\x20' => {
                    let at = parser.loc();
                    let message = format!(
                        "Control characters are not permitted in strings: '{:?}'",
                        control
                    );
                    return Err(syntax_error(at, &message));
                }
                plain => {
                    parser.consume();
                    text.push(plain);
                }
            }
        }
        Ok(text)
    }

    /// Parse a string from the given parser.
    ///
    /// `terminal` names the character that ends the string.  When it is `None`,
    /// *everything* up to the end of the stream is treated as part of the string.
    /// When it is `Some`, the terminal must be present; if it is not found an error
    /// is generated.
    ///
    /// The work is delegated to one of eight specialized routines, selected by whether
    /// a terminal was given and by the current values of `enable_escapes` and
    /// `permit_low_control_characters`.
    pub fn process(&self, parser: &mut ParserCore, terminal: Option<char>) -> ParseResult<String> {
        let escapes = self.enable_escapes;
        let controls = self.permit_low_control_characters;
        match (terminal, escapes, controls) {
            // No terminal: read to the end of the stream.
            (None, true, true) => self.read_ce(parser),
            (None, true, false) => self.read_e(parser),
            (None, false, true) => self.read_c(parser),
            (None, false, false) => self.read(parser),
            // Terminal given: read up to and including it.
            (Some(end), true, true) => self.parse_esc_con_ter(parser, end),
            (Some(end), true, false) => self.parse_esc_ter(parser, end),
            (Some(end), false, true) => self.parse_con_ter(parser, end),
            (Some(end), false, false) => self.parse_ter(parser, end),
        }
    }

    /// Parse the whole of `value` as a string.
    ///
    /// The bytes of `value` are wrapped in a fresh parser core named `<string>` and
    /// passed to [`StringParser::process`] with no terminal, so the entire text is
    /// parsed.  Any error from `process` is returned unchanged.
    pub fn parse_string(&self, value: &str) -> ParseResult<String> {
        let bytes = value.bytes().collect();
        let mut core = ParserCore::new("<string>", Decode::new(bytes));
        self.process(&mut core, None)
    }
}

impl Default for StringParser {
    /// Make and return a new string parser.  The initial parsing mode is set to Rust.
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod test {
    use std::collections::BTreeMap;

    use super::StringParser;
    use crate::parse_from_string;
    use crate::strings::{EscapeType, IllegalUnicodeProtocol, UnknownEscapeProtocol};

    // Basic sanity checks.  For more testing, see the strings_test module.

    /// Run a table of cases through `sp.process`.
    ///
    /// Each case is `(input, terminal, expected)`.  An empty `expected` is a marker
    /// meaning the parse must fail; otherwise the parse must succeed and produce
    /// exactly `expected`.  Failure messages name the offending input so a broken
    /// case can be identified without a debugger.
    fn check_cases(sp: &StringParser, cases: &[(&str, Option<char>, &str)]) {
        for (in_str, term, out_str) in cases {
            let mut parser = parse_from_string(in_str);
            let result = sp.process(parser.borrow_core(), *term);
            if out_str.is_empty() {
                assert!(
                    result.is_err(),
                    "expected an error for input {:?} with terminal {:?}",
                    in_str,
                    term
                )
            } else {
                assert_eq!(
                    &result.unwrap(),
                    out_str,
                    "wrong output for input {:?} with terminal {:?}",
                    in_str,
                    term
                )
            }
        }
    }

    /// No escapes, no low control characters.
    #[test]
    fn simple_test() {
        let mut sp = StringParser::new();
        sp.enable_escapes = false;
        sp.permit_low_control_characters = false;
        check_cases(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\\n."),
                ("This is a control code\x02.", None, ""),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\\n.",
                ),
                ("This is a control code\x02.\"", Some('"'), ""),
            ],
        );
    }

    /// No escapes, low control characters permitted.
    #[test]
    fn control_test() {
        let mut sp = StringParser::new();
        sp.enable_escapes = false;
        sp.permit_low_control_characters = true;
        check_cases(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\\n."),
                (
                    "This is a control code\x02.",
                    None,
                    "This is a control code\x02.",
                ),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\\n.",
                ),
                (
                    "This is a control code\x02.\"",
                    Some('"'),
                    "This is a control code\x02.",
                ),
            ],
        );
    }

    /// Escapes enabled, no low control characters.
    #[test]
    fn escape_test() {
        let mut sp = StringParser::new();
        sp.enable_escapes = true;
        sp.permit_low_control_characters = false;
        check_cases(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\n."),
                ("This is a control code\x02.", None, ""),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\n.",
                ),
                ("This is a control code\x02.\"", Some('"'), ""),
            ],
        );
    }

    /// A custom escape table, surrogate pairs, and error handling for unknown escapes
    /// and malformed surrogate pairs.
    #[test]
    fn odd_escapes_test() {
        let mut sp = StringParser::new();
        sp.enable_escapes = true;
        sp.permit_low_control_characters = true;
        sp.allow_surrogate_pairs = true;
        let escapes = BTreeMap::from([
            ('\n', EscapeType::Discard),
            ('\\', EscapeType::Char('\\')),
            ('\'', EscapeType::Char('\'')),
            ('\"', EscapeType::Char('\"')),
            ('a', EscapeType::Char('\x07')),
            ('b', EscapeType::Char('\x08')),
            ('f', EscapeType::Char('\x0c')),
            ('n', EscapeType::Char('\n')),
            ('r', EscapeType::Char('\r')),
            ('t', EscapeType::Char('\t')),
            ('v', EscapeType::Char('\x0b')),
            ('x', EscapeType::NakedByte),
            ('N', EscapeType::BracketUNamed),
            ('u', EscapeType::NakedU4),
            ('U', EscapeType::NakedU8),
            ('z', EscapeType::Char('0')),
            ('å', EscapeType::Discard),
        ]);
        sp.unknown_escape_protocol = UnknownEscapeProtocol::Error;
        sp.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
        sp.set_escapes(escapes);

        // Every escape in the custom table, including the discarded `\å`.
        let mut parser =
            parse_from_string(r#"A very \\escaped\\ string. \'\"\a\b\f\n\r\t\v\z\å\z"#);
        let result = sp.process(parser.borrow_core(), None);
        assert_eq!(
            result.unwrap(),
            "A very \\escaped\\ string. '\"\u{7}\u{8}\u{c}\n\r\t\u{b}00"
        );

        // A valid surrogate pair combines into a single character.
        let mut parser = parse_from_string(r#"\ud801\udce0"#);
        let result = sp.process(parser.borrow_core(), None);
        assert_eq!(result.unwrap(), "𐓠");

        // A high surrogate followed by a non-surrogate escape is an error.
        let mut parser = parse_from_string(r#"\ud801\u002e"#);
        let result = sp.process(parser.borrow_core(), None);
        assert!(result.is_err());

        // A high surrogate followed by a plain character is an error.
        let mut parser = parse_from_string(r#"\ud801*"#);
        let result = sp.process(parser.borrow_core(), None);
        assert!(result.is_err());

        // An escape that is not in the table is an error under this protocol.
        let mut parser = parse_from_string(r#"\ß"#);
        let result = sp.process(parser.borrow_core(), None);
        assert!(result.is_err());
    }

    /// Escapes enabled, low control characters permitted.
    #[test]
    fn control_escape_test() {
        let mut sp = StringParser::new();
        sp.enable_escapes = true;
        sp.permit_low_control_characters = true;
        check_cases(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\n."),
                (
                    "This is a control code\x02.",
                    None,
                    "This is a control code\x02.",
                ),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\n.",
                ),
                (
                    "This is a control code\x02.\"",
                    Some('"'),
                    "This is a control code\x02.",
                ),
            ],
        );
    }
}