// trivet 3.1.0
//
// The trivet Parser Library
// Documentation
// Trivet
// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! The different escaping standards.

use core::fmt;
use std::ops::Range;

/// The string standards implemented for the string parser and encoder.  Picking a standard selects a
/// whole "bundle" of options at once, configuring how strings are parsed or encoded.
///
/// These are *mostly* faithful to the specified implementation, *except* that the source and destination
/// strings are Rust UTF-8 strings.  This means some things won't work exactly the same when the underlying
/// platform does not enforce UTF-8.  For instance, C uses null-terminated byte arrays, but Rust strings can
/// contain nulls.
///
/// This is a plain, field-less selector, so in addition to being `Copy` it can be compared with `==`,
/// hashed, and used as a key in hash-based collections.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum StringStandard {
    /// Trivet has its own, very permissive string standard, providing basic character escapes, including
    /// both "naked" two-digit hexadecimal (`\x0a`) and longer brace Unicode (`\u{a}`) escapes.
    ///
    /// Trivet allows octal escapes such as `\0` (null) and `\12` (newline), permits
    /// surrogate pairs, replaces illegal Unicode characters with U+FFFD, and allows using the escape
    /// character to indicate a literal value not otherwise covered (so `\q` is treated as just `q`).
    Trivet,

    /// Parse according to the Rust standard found
    /// [here](https://doc.rust-lang.org/reference/tokens.html#string-literals).
    Rust,

    /// Parse according to the JSON standard defined by
    /// [ECMA-404 JSON Data Interchange Syntax](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
    JSON,

    /// Parse according to the TOML specification defined by
    /// [TOML 1.0.0](https://toml.io/en/v1.0.0#string).
    TOML,

    /// Parse according to the C standard
    /// [ISO/IEC 9899:2018 (C18)](https://blog.ansi.org/2018/11/c-language-standard-iso-iec-9899-2018-c18/).
    C,

    /// Parse according to the current
    /// [Python standard](https://docs.python.org/3/reference/lexical_analysis.html).
    Python,
}

/// Displays a standard as its bare name (for example `JSON` or `Python`).
impl fmt::Display for StringStandard {
    fn fmt(&self, form: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the name first, then emit it with a single write.  The name is written
        // verbatim: any width or alignment requested by the caller is not applied.
        let name = match *self {
            Self::Trivet => "Trivet",
            Self::Rust => "Rust",
            Self::JSON => "JSON",
            Self::TOML => "TOML",
            Self::C => "C",
            Self::Python => "Python",
        };
        form.write_str(name)
    }
}

/// What to do when parsing encounters an unknown or invalid escape.
///
/// Equality is intentionally not derived: `ReplacementCharacter` and `Replace('\u{FFFD}')` describe the
/// same outcome but would compare as different values.  Use `matches!` to test for a specific variant.
#[derive(Debug, Clone)]
pub enum UnknownEscapeProtocol {
    /// Just discard.  This is rarely used.
    Discard,

    /// Drop the escape character.  This transforms, for instance, an undefined `\m` into `m` and is
    /// common in shell languages like Bash in order to escape arbitrary characters.
    DropEscape,

    /// Escape the escape.  This transforms an undefined `\m` into `\\m`.  This is the
    /// protocol used in Python.
    LiteralEscape,

    /// Replace with the Unicode replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.  This generalizes the `ReplacementCharacter` option.
    /// Some implementations map unknown escapes to a specific character.
    Replace(char),

    /// Generate an error.
    Error,
}

/// What to do when parsing encounters an invalid Unicode hexadecimal escape.  This also applies to the
/// escapes of a surrogate pair when surrogate pairs are not allowed.
///
/// Equality is intentionally not derived: `ReplacementCharacter` and `Replace('\u{FFFD}')` describe the
/// same outcome but would compare as different values.  Use `matches!` to test for a specific variant.
#[derive(Debug, Clone)]
pub enum IllegalUnicodeProtocol {
    /// Just discard.
    Discard,

    /// Generate an error.
    Error,

    /// Replace with the replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.
    Replace(char),
}

/// The different escape types.  These are used to interpret escape sequences found in
/// strings and to determine how escapes are used when encoding.
///
/// Values are small and `Copy`, and can be compared with `==` (for example when checking what
/// an escape table maps a character to).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscapeType {
    /// A character escape.  For instance, this can be used to map `\n` to a newline.
    Char(char),

    /// A "naked" ASCII escape of exactly two hex digits of value at most 7F.  This is how
    /// Rust handles `\xNN`.
    NakedASCII,

    /// A "naked" byte escape of exactly two hex digits of any value.  This is how Python
    /// handles `\xNN`.
    NakedByte,

    /// A "naked" Unicode escape of exactly four hex digits.  This is used in JSON,
    /// Python, and C.
    NakedU4,

    /// A "naked" Unicode escape of exactly eight hex digits.  This is used in Python and C.
    NakedU8,

    /// A brace Unicode escape of 1-6 hex digits.  This is used in Rust.
    BraceU16,

    /// A brace Unicode escape of 1-8 hex digits with underscores permitted.
    BraceU18,

    /// A brace named Unicode escape.  These are given by `{n}`, where `n`
    /// is a name in [the Unicode database](https://unicode.org/ucd/).  See also
    /// [Name Aliases](https://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt).
    /// The names are not case sensitive.
    BracketUNamed,

    /// Discard.  This can be used to join lines; for instance, some standards allow
    /// `[escape][newline]` to join lines together.
    Discard,

    /// Discard, but also consume whitespace to the first non-whitespace character.
    DiscardWS,

    /// Special escape type representing an undefined escape.
    Undefined,
}

/// The standard used to determine which characters to encode as escapes.
///
/// Equality is intentionally not derived: `ASCII` and `EncodeAbove(0x7f)` are documented as equivalent
/// but would compare as different values.  Use `matches!` to test for a specific variant.
#[derive(Debug, Clone)]
pub enum EncodingStandard {
    /// Encode only control characters.
    OnlyControl,
    /// Encode anything that is outside the ASCII range or a control character.
    ASCII,
    /// Encode everything above the given limit.  Control characters are also encoded.
    /// Note that this strictly means *above* the limit.  Thus `ASCII` is equivalent to
    /// `EncodeAbove(0x7f)`.
    EncodeAbove(u32),
    /// Encode everything in the specified ranges.  Control characters are also encoded.
    /// Each entry is a `Range<u32>`, written `start..end`.
    EncodeRanges(Vec<Range<u32>>),
}

/// Determine how Unicode characters that require encoding should be handled.  If ASCII encoding
/// is enabled, this only applies to characters that need encoding and are outside the ASCII range.
///
/// This is a plain, field-less selector: it is `Copy` and can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingMethod {
    /// Encode using four-character hex.  Anything out of this range must be encoded
    /// using a surrogate pair.  This is used by JSON, for example.  Note that values
    /// that require more than 20 bits (anything above 0x3ffff) cannot be encoded.
    // NOTE(review): the two figures above disagree -- 0x3ffff is an 18-bit limit, while a 20-bit
    // limit would be 0xfffff.  Confirm the encoder's actual upper bound and correct the doc.
    Naked4,
    /// Encode using four-character or eight-character hex.  This is the standard used
    /// by Python, for example.
    Naked48,
    /// Encode as a brace hex of at least one and no more than six digits.  This is
    /// the standard used by Rust.
    Braced6,
    /// Encode as a brace hex of one to eight characters.
    Braced8,
    /// Encode everything using two-digit naked hex escapes.  This is pathological, but might be
    /// useful for debugging or with some older software.  In general you do not want to use this.
    Naked2,
}

/// Escape table for the Trivet standard, which is very permissive with its escapes.
///
/// Each entry pairs a character with the [`EscapeType`] it selects.  This variant is compiled
/// when the `no_ucd` feature is off and includes the named Unicode escape `N`.
#[cfg(not(feature = "no_ucd"))]
pub const TRIVET_ESCAPES: [(char, EscapeType); 16] = [
    // Escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric and named escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BraceU18),
    ('N', EscapeType::BracketUNamed),
    // Literal question mark.
    ('?', EscapeType::Char('?')),
];
/// Escape table for the Trivet standard when the `no_ucd` feature is on.  It has the same
/// entries as the other variant except that the named Unicode escape `N` is absent.
#[cfg(feature = "no_ucd")]
pub const TRIVET_ESCAPES: [(char, EscapeType); 15] = [
    // Escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BraceU18),
    // Literal question mark.
    ('?', EscapeType::Char('?')),
];

/// Escape table for the C standard.  See
/// [C](https://en.wikipedia.org/wiki/Escape_sequences_in_C).
///
/// Each entry pairs a character with the [`EscapeType`] it selects.
pub const C_ESCAPES: [(char, EscapeType); 16] = [
    // Escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    // NOTE(review): `\e` looks like a compiler extension rather than ISO C -- confirm it is wanted.
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
    // Literal question mark.
    ('?', EscapeType::Char('?')),
];

/// Escape table for the Python standard.  See
/// [Python](https://docs.python.org/3/reference/lexical_analysis.html).
///
/// Each entry pairs a character with the [`EscapeType`] it selects.  This variant is compiled
/// when the `no_ucd` feature is off and includes the named Unicode escape `N`.
#[cfg(not(feature = "no_ucd"))]
pub const PYTHON_ESCAPES: [(char, EscapeType); 15] = [
    // Escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric and named escapes.
    ('x', EscapeType::NakedByte),
    ('N', EscapeType::BracketUNamed),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
];
/// Escape table for the Python standard when the `no_ucd` feature is on.  It has the same
/// entries as the other variant except that the named Unicode escape `N` is absent.
#[cfg(feature = "no_ucd")]
pub const PYTHON_ESCAPES: [(char, EscapeType); 14] = [
    // Escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
];

/// Escape table for the Rust standard.  See
/// [Rust](https://doc.rust-lang.org/reference/tokens.html#string-literals).
///
/// Each entry pairs a character with the [`EscapeType`] it selects.
pub const RUST_ESCAPES: [(char, EscapeType); 10] = [
    // Escaped newline is discarded together with the whitespace that follows it.
    ('\n', EscapeType::DiscardWS),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('"', EscapeType::Char('"')),
    // Control characters.
    ('0', EscapeType::Char('\0')), // U+0000
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    // Numeric escapes.
    ('x', EscapeType::NakedASCII),
    ('u', EscapeType::BraceU16),
];

/// Escape table for the JSON standard.  See
/// [JSON](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
///
/// Each entry pairs a character with the [`EscapeType`] it selects.
pub const JSON_ESCAPES: [(char, EscapeType); 10] = [
    // Characters that stand for themselves.
    ('"', EscapeType::Char('"')),
    ('\\', EscapeType::Char('\\')),
    // Escaped newline is discarded.
    // NOTE(review): ECMA-404 does not appear to define a backslash-newline escape -- confirm
    // that accepting it here is an intentional leniency.
    ('\n', EscapeType::Discard),
    ('/', EscapeType::Char('/')),
    // Control characters.
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    // Numeric escape.
    ('u', EscapeType::NakedU4),
];

/// Escape table for the TOML standard.  See [TOML](https://toml.io/).
///
/// Each entry pairs a character with the [`EscapeType`] it selects.
pub const TOML_ESCAPES: [(char, EscapeType); 10] = [
    // Characters that stand for themselves.
    ('"', EscapeType::Char('"')),
    ('\\', EscapeType::Char('\\')),
    // Escaped newline is discarded together with the whitespace that follows it.
    ('\n', EscapeType::DiscardWS),
    // Control characters.
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    // Numeric escapes.
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
];