trivet 3.1.0

The trivet Parser Library
Documentation
// Trivet
// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! Support writing strings in various forms.

use super::hex::byte_to_two_digit_hex;
use super::hex::char_to_four_digit_hex;
use super::hex::char_to_six_digit_hex;
use super::hex::char_to_surrogate_pair;
use super::hex::dword_to_eight_digit_hex;
use super::EncodingMethod;
use super::EncodingStandard;
use super::EscapeType;
use super::C_ESCAPES;
use super::JSON_ESCAPES;
use super::PYTHON_ESCAPES;
use super::RUST_ESCAPES;
use super::TOML_ESCAPES;
use super::TRIVET_ESCAPES;
use crate::strings::StringStandard;
use std::collections::BTreeMap;
#[cfg(not(feature = "no_ucd"))]
use {crate::strings::UCD, std::rc::Rc};

/// Build the reverse Unicode character database: a map from each character to its name.
///
/// Every call walks the entire `UCD` table and allocates a new map, so this is a relatively
/// costly operation.  Call it *once*, keep the result, and hand clones of it to any string
/// encoder that must generate named Unicode escapes.  Nothing else needs it.
///
/// The map is reference counted so that a single copy can be shared by many users.
#[cfg(not(feature = "no_ucd"))]
pub fn get_ucd_reverse() -> Rc<BTreeMap<char, &'static str>> {
    // A lazily initialized static would be the natural home for this map, but `lazy_static`
    // is an external dependency.  `BTreeMap::from` is avoided as well, because it would
    // take the massive array by value (that is, on the stack).
    let mut names_by_char: BTreeMap<char, &'static str> = BTreeMap::new();
    // Entries are inserted one at a time in table order; should a character occur more
    // than once, the name listed last replaces the earlier one.
    UCD.into_iter().for_each(|(name, ch)| {
        names_by_char.insert(*ch, *name);
    });
    Rc::new(names_by_char)
}

/// Provide a configurable string encoder.  This takes strings and produces an encoded version of
/// the string.  This complements the [`crate::strings::StringParser`].
///
/// Every setting is a public field.  [`StringEncoder::set`] fills the fields in for one of the
/// supported standards; individual fields can be adjusted afterward.
pub struct StringEncoder {
    /// Character used to introduce an escape.  Usually `\`.  It is written in front of every
    /// escape sequence the encoder produces.
    pub escape_char: char,

    /// If a character needs to be encoded and is in the ASCII range (at most `\x7f`), use a
    /// simple two-digit naked encoding.  This is checked before named escapes and before the
    /// encoding method, so it overrides both.  For instance, `\x09`.
    pub use_ascii_escapes: bool,

    /// The character to introduce a naked two-digit hex ASCII escape.  Typically this is `x`.
    /// For instance, `\x09`.  The same character introduces each byte written by
    /// [`EncodingMethod::Naked2`].
    pub ascii_escape: char,

    /// The character to introduce a "low" naked Unicode escape (four digits).  Typically this is `u`.
    /// For instance, `\u2135` = `ℵ`.  With [`EncodingMethod::Naked4`] it also introduces each half
    /// of a surrogate pair for characters above `U+FFFF`.
    pub low_unicode_escape: char,

    /// The character to introduce a "high" naked Unicode escape (eight digits).  Typically this is `U`.
    /// For instance, `\U00002135` = `ℵ`.  [`EncodingMethod::Naked48`] uses it only for characters
    /// above `U+FFFF`.
    pub high_unicode_escape: char,

    /// The character to introduce a brace Unicode escape.  Typically this is `u`.  For instance,
    /// `\u{2135}` = `ℵ`.  The encoder always writes six hex digits inside the braces.
    pub brace_unicode_escape: char,

    /// The character to introduce a named Unicode escape.  Typically this is `N`.  For instance,
    /// `\N{alef symbol}` = `ℵ`.
    pub named_unicode_escape: char,

    /// If a character needs to be encoded and it has a name, use a named escape.  This will apply
    /// to ASCII values if [`Self::use_ascii_escapes`] is not enabled.  For instance, `\N{alef symbol}` = `ℵ`.
    ///
    /// This flag has no effect when the crate is built with the `no_ucd` feature, because the
    /// name lookup is compiled out.
    pub use_names: bool,

    /// The encoding standard to use.  This determines *what* gets encoded.  Characters listed in
    /// [`Self::escapes`] and control characters are encoded regardless of this setting.
    pub encoding_standard: EncodingStandard,

    /// The encoding method to use.  This determines *how* encoding is done when neither a
    /// two-digit ASCII escape nor a named escape applies.
    pub encoding_method: EncodingMethod,

    /// Declare special handling of certain characters.  This maps a character to its encoding,
    /// minus the escape character.  These entries are consulted first and override every other
    /// rule.
    ///
    /// See [ASCII](https://www.ascii-code.com/) for the meaning of characters in the ASCII
    /// range, and consult the Unicode standard for others.
    ///
    /// The following builds such a table for the single-character escapes of Python.
    ///
    /// ```rust
    /// use std::collections::BTreeMap;
    ///
    /// let escapes: BTreeMap<char, String> = BTreeMap::from([
    ///     ('\\', "\\".to_string()),
    ///     ('\'', "'".to_string()),
    ///     ('\"', "\"".to_string()),
    ///     ('\x07', "a".to_string()),
    ///     ('\x08', "b".to_string()),
    ///     ('\x0c', "f".to_string()),
    ///     ('\n', "n".to_string()),
    ///     ('\r', "r".to_string()),
    ///     ('\t', "t".to_string()),
    ///     ('\x0b', "v".to_string()),
    /// ]);
    /// ```
    ///
    pub escapes: BTreeMap<char, String>,

    /// The Unicode database mapping code points to names.  Only present when the crate is
    /// built without the `no_ucd` feature; it is consulted only when [`Self::use_names`] is set.
    #[cfg(not(feature = "no_ucd"))]
    pub ucd: Rc<BTreeMap<char, &'static str>>,
}

impl StringEncoder {
    /// Create a new string encoder instance.  By default this uses the
    /// [`StringStandard::Trivet`] standard.
    pub fn new() -> Self {
        let mut encoder = StringEncoder {
            escape_char: '\\',
            use_ascii_escapes: true,
            ascii_escape: 'x',
            low_unicode_escape: 'u',
            high_unicode_escape: 'U',
            brace_unicode_escape: 'u',
            named_unicode_escape: 'N',
            encoding_method: EncodingMethod::Braced8,
            encoding_standard: EncodingStandard::OnlyControl,
            use_names: true,
            escapes: BTreeMap::new(),
            #[cfg(not(feature = "no_ucd"))]
            ucd: get_ucd_reverse(),
        };
        encoder.set(StringStandard::Trivet);
        encoder
    }

    /// Install the settings for the given string standard.  See
    /// [`StringStandard`] for the available standards.
    fn install(&mut self, table: &[(char, EscapeType)]) {
        let mut tree = BTreeMap::new();
        for (ch, escape_type) in table {
            match escape_type {
                EscapeType::BraceU18 => {
                    // Apparently this standard allows the U18 encoding.  Turn it on.
                    self.brace_unicode_escape = *ch;
                    self.encoding_method = EncodingMethod::Braced8;
                }
                EscapeType::BraceU16 => {
                    // Apparently this standard allows the U16 encoding.  Turn it on.
                    self.brace_unicode_escape = *ch;
                    self.encoding_method = EncodingMethod::Braced6;
                }
                EscapeType::BracketUNamed => {
                    // This allows named escapes.  Turn them on.
                    self.named_unicode_escape = *ch;
                }
                EscapeType::Char(code) => {
                    tree.insert(*code, ch.to_string());
                }
                EscapeType::Discard | EscapeType::DiscardWS | EscapeType::Undefined => {
                    // Don't care.
                }
                EscapeType::NakedASCII => {
                    self.ascii_escape = *ch;
                    self.use_ascii_escapes = true;
                }
                EscapeType::NakedByte => {
                    self.ascii_escape = *ch;
                    self.use_ascii_escapes = true;
                }
                EscapeType::NakedU4 => {
                    // This uses the naked 4-digit encoding.
                    self.low_unicode_escape = *ch;
                    self.encoding_method = EncodingMethod::Naked4;
                    self.encoding_standard = EncodingStandard::EncodeAbove(0xffff);
                }
                EscapeType::NakedU8 => {
                    // This uses the naked 8-digit encoding.
                    self.high_unicode_escape = *ch;
                    self.encoding_method = EncodingMethod::Naked48;
                }
            }
        }
        self.escapes = tree;
    }

    /// Configure the encoder to conform to a given standard.
    ///
    /// The two `use_*` flags are cleared and the encoding standard is reset to
    /// [`EncodingStandard::OnlyControl`]; the escape table for `std` is then installed, which
    /// replaces [`Self::escapes`] and may turn some of those settings back on.  The escape
    /// character, the introducer characters, and the encoding method are not reset here;
    /// they change only if the installed table assigns them.
    pub fn set(&mut self, std: StringStandard) {
        // Start from a clean slate for everything the table may switch on.
        self.encoding_standard = EncodingStandard::OnlyControl;
        self.use_names = false;
        self.use_ascii_escapes = false;

        // Apply the table that belongs to the requested standard.
        match std {
            StringStandard::C => self.install(&C_ESCAPES),
            StringStandard::JSON => self.install(&JSON_ESCAPES),
            StringStandard::TOML => self.install(&TOML_ESCAPES),
            StringStandard::Python => self.install(&PYTHON_ESCAPES),
            StringStandard::Rust => self.install(&RUST_ESCAPES),
            StringStandard::Trivet => self.install(&TRIVET_ESCAPES),
        }
    }

    /// Append the escaped form of `ch` to the end of `result`.
    ///
    /// The first rule that applies decides the form:
    ///
    /// 1. If [`Self::use_ascii_escapes`] is set and `ch` is at most `\x7f`, a two-digit hex
    ///    escape is written.
    /// 2. If [`Self::use_names`] is set and `ch` has an entry in the name table, a named
    ///    escape is written.  This step is compiled out under the `no_ucd` feature.
    /// 3. Otherwise the form is chosen by [`Self::encoding_method`].
    fn encode_character(&self, ch: char, result: &mut String) {
        let escape = self.escape_char;

        // Forced two-digit escapes for the ASCII range.
        if self.use_ascii_escapes && ch <= '\x7f' {
            let hex = byte_to_two_digit_hex(ch as u8);
            result.push(escape);
            result.push(self.ascii_escape);
            result.extend(&hex[..2]);
            return;
        }

        // Named escapes, when enabled and a name is known.
        #[cfg(not(feature = "no_ucd"))]
        if self.use_names {
            if let Some(name) = self.ucd.get(&ch) {
                result.push(escape);
                result.push(self.named_unicode_escape);
                result.push('{');
                result.push_str(name);
                result.push('}');
                return;
            }
        }

        // Fall back to the configured numeric form.
        match self.encoding_method {
            EncodingMethod::Naked2 => {
                // One two-digit escape per byte of the code point, least significant
                // byte first.  At least one escape is always written.
                let mut remaining = ch as u32;
                loop {
                    let hex = byte_to_two_digit_hex((remaining & 0xff) as u8);
                    result.push(escape);
                    result.push(self.ascii_escape);
                    result.extend(&hex[..2]);
                    remaining >>= 8;
                    if remaining == 0 {
                        break;
                    }
                }
            }
            EncodingMethod::Naked4 => {
                if ch > '\u{ffff}' {
                    // Too large for four digits: write a surrogate pair as two
                    // four-digit escapes, in the order the helper returns them.
                    let pair = char_to_surrogate_pair(ch);
                    for half in [&pair[..4], &pair[4..8]] {
                        result.push(escape);
                        result.push(self.low_unicode_escape);
                        result.extend(half);
                    }
                } else {
                    let hex = char_to_four_digit_hex(ch);
                    result.push(escape);
                    result.push(self.low_unicode_escape);
                    result.extend(&hex[..4]);
                }
            }
            EncodingMethod::Naked48 => {
                // Four digits when they suffice, eight digits otherwise.
                result.push(escape);
                if ch > '\u{ffff}' {
                    let hex = dword_to_eight_digit_hex(ch as u32);
                    result.push(self.high_unicode_escape);
                    result.extend(&hex[..8]);
                } else {
                    let hex = char_to_four_digit_hex(ch);
                    result.push(self.low_unicode_escape);
                    result.extend(&hex[..4]);
                }
            }
            EncodingMethod::Braced6 | EncodingMethod::Braced8 => {
                // A Unicode character never needs more than six hex digits, so both
                // braced forms always write exactly six.
                let hex = char_to_six_digit_hex(ch);
                result.push(escape);
                result.push(self.brace_unicode_escape);
                result.push('{');
                result.extend(&hex[..6]);
                result.push('}');
            }
        }
    }

    /// Encode the provided string and return the result.
    ///
    /// Each character of `value` is handled by the first rule that applies:
    ///
    /// 1. A character listed in [`Self::escapes`] is written as the escape character
    ///    followed by its table entry.
    /// 2. A control character is always encoded.
    /// 3. A character selected by [`Self::encoding_standard`] is encoded.
    /// 4. Any other character is copied through unchanged.
    pub fn encode(&self, value: &str) -> String {
        let mut encoded = String::new();
        for ch in value.chars() {
            // Explicit escape sequences take precedence over everything else.
            if let Some(short_form) = self.escapes.get(&ch) {
                encoded.push(self.escape_char);
                encoded.push_str(short_form);
                continue;
            }

            // Control characters are encoded no matter what; for the rest, ask the
            // encoding standard.
            let code_point = ch as u32;
            let must_encode = ch.is_control()
                || match self.encoding_standard {
                    EncodingStandard::ASCII => !ch.is_ascii(),
                    EncodingStandard::EncodeAbove(limit) => code_point > limit,
                    EncodingStandard::EncodeRanges(ref ranges) => {
                        ranges.iter().any(|range| range.contains(&code_point))
                    }
                    EncodingStandard::OnlyControl => false,
                };

            if must_encode {
                self.encode_character(ch, &mut encoded);
            } else {
                encoded.push(ch);
            }
        }
        encoded
    }
}

impl Default for StringEncoder {
    fn default() -> Self {
        Self::new()
    }
}