1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
// Trivet
// Copyright (c) 2023 by Stacy Prowell. All rights reserved.
// https://gitlab.com/binary-tools/trivet
//! The different escaping standards.
use core::fmt;
use std::ops::Range;
/// String standards that are implemented for the string parser and encoder.  Each variant selects
/// a "bundle" of options at once to configure how strings are parsed or encoded.
///
/// These are *mostly* faithful to the specified implementation, *except* that the source and
/// destination strings are Rust UTF-8 strings.  This means some things won't work exactly the
/// same when the underlying platform does not enforce UTF-8.  For instance, C uses
/// null-terminated byte arrays, but Rust strings can contain nulls.
///
/// The type is a plain tag: it is `Copy`, can be compared with `==`, and can be used as a key in
/// hashed collections.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum StringStandard {
    /// Trivet has its own, very permissive string standard, providing basic character escapes,
    /// including both "naked" two-digit ASCII (`\x0a`) and longer bracketed Unicode (`\u{a}`)
    /// escapes.
    ///
    /// Trivet allows octal escapes such as `\0` (null) and `\12` (newline), permits
    /// surrogate pairs, replaces illegal Unicode characters with U+FFFD, and allows using the
    /// escape character to indicate a literal value not otherwise covered (so `\q` is treated as
    /// just `q`).
    Trivet,

    /// Parse according to the Rust standard found
    /// [here](https://doc.rust-lang.org/reference/tokens.html#string-literals).
    Rust,

    /// Parse according to the JSON standard defined by
    /// [ECMA-404 JSON Data Interchange Syntax](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
    JSON,

    /// Parse according to the latest C standard
    /// [ISO/IEC 9899:2018 (C18)](https://blog.ansi.org/2018/11/c-language-standard-iso-iec-9899-2018-c18/).
    C,

    /// Parse according to the current
    /// [Python standard](https://docs.python.org/3/reference/lexical_analysis.html).
    Python,
}
impl fmt::Display for StringStandard {
    /// Write the human-readable name of the standard (`"C"`, `"JSON"`, `"Python"`, `"Rust"`, or
    /// `"Trivet"`).
    ///
    /// The name is emitted through [`fmt::Formatter::pad`] so that width, fill, and alignment
    /// flags supplied by the caller (for example `{:>8}`) are applied.  With a plain `{}` the
    /// output is exactly the bare name.
    fn fmt(&self, form: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::C => "C",
            Self::JSON => "JSON",
            Self::Python => "Python",
            Self::Rust => "Rust",
            Self::Trivet => "Trivet",
        };
        form.pad(name)
    }
}
/// How to handle parsing unknown and invalid escapes.
///
/// Every variant carries at most a single `char`, so the type is cheap to copy and can be
/// compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnknownEscapeProtocol {
    /// Just discard.  This is rarely used.
    Discard,

    /// Drop the escape character.  This transforms, for instance, an undefined `\m` into `m` and
    /// is common in shell languages like Bash in order to escape arbitrary characters.
    DropEscape,

    /// Escape the escape.  This transforms an undefined `\m` into `\\m`.  This is the
    /// protocol used in Python.
    LiteralEscape,

    /// Replace with the Unicode replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.  This generalizes the `ReplacementCharacter` option.
    /// Some implementations map unknown escapes to a specific character.
    Replace(char),

    /// Generate an error.
    Error,
}
/// How to handle parsing an invalid Unicode hexadecimal escape.  This also applies to the
/// escapes of a surrogate pair when surrogate pairs are not allowed.
///
/// Every variant carries at most a single `char`, so the type is cheap to copy and can be
/// compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IllegalUnicodeProtocol {
    /// Just discard.
    Discard,

    /// Generate an error.
    Error,

    /// Replace with the replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.
    Replace(char),

    /// Preserve the value; the result *may* be an invalid Unicode string.  Note that this will
    /// not work the way you might think, and is overall a Bad Idea.  Prefer the replacement
    /// character.
    Preserve,
}
/// Define the different escape types.  These are used to interpret escape sequences found in
/// strings and to determine how escapes are used when encoding.
///
/// The only payload any variant carries is a single `char`, so the type is cheap to copy and can
/// be compared directly (useful when searching an escape table for a particular kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscapeType {
    /// A character escape.  For instance, this can be used to map `\n` to a newline.
    Char(char),

    /// A "naked" ASCII escape of exactly two hex digits of value at most 7F.  This is how
    /// Rust handles `\xNN`.
    NakedASCII,

    /// A "naked" byte escape of exactly two hex digits of any value.  This is how Python
    /// handles `\xNN`.
    NakedByte,

    /// A "naked" Unicode escape of exactly four hex digits.  This is used in JSON, Python,
    /// and C.
    NakedU4,

    /// A "naked" Unicode escape of exactly eight digits.  This is used in Python and C.
    NakedU8,

    /// A bracketed Unicode escape of 1-6 hex digits.  This is used in Rust.
    BracketU16,

    /// A bracketed Unicode escape of 1-8 hex digits with underscores permitted.
    BracketU18,

    /// A bracketed named Unicode escape.  These are given by `{n}`, where `n`
    /// is a name in [the Unicode database](https://unicode.org/ucd/).  See also
    /// [Name Aliases](https://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt).
    /// The names are not case sensitive.
    BracketUNamed,

    /// Discard.  This can be used to join lines; for instance, some standards allow
    /// `[escape][newline]` to join lines together.
    Discard,

    /// Discard, but also consume whitespace to the first non-whitespace character.
    DiscardWS,
}
/// The standard used to determine which characters to encode as escapes.
///
/// The derived equality is structural: two values are equal only when they are the same variant
/// with the same payload.  In particular `ASCII` and `EncodeAbove(0x7f)` compare as *different*
/// values even though they are documented below as selecting the same characters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingStandard {
    /// Encode only control characters.
    OnlyControl,

    /// Encode anything that is outside the ASCII range or a control character.
    ASCII,

    /// Encode everything above the given limit.  Control characters are also encoded.
    /// Note that this strictly means *above* the limit.  Thus `ASCII` is equivalent to
    /// `EncodeAbove(0x7f)`.
    EncodeAbove(u32),

    /// Encode everything in the specified ranges only.  Control characters are also encoded.
    EncodeRanges(Vec<Range<u32>>),
}
/// Determine how Unicode characters that require encoding should be handled.  If ASCII encoding
/// is enabled, this only applies to characters that need encoding and are outside the ASCII range.
///
/// The variants carry no data, so the type is cheap to copy and can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EncodingMethod {
    /// Encode using four-character hex.  Anything out of this range must be encoded
    /// using a surrogate pair.  This is used by JSON, for example.  Note that values
    /// that require more than 20 bits (anything above 0x3ffff) cannot be encoded.
    ///
    /// NOTE(review): `0x3ffff` is an 18-bit value, which does not agree with "20 bits" in the
    /// sentence above; confirm the actual limit against the encoder implementation.
    Naked4,

    /// Encode using four-character or eight-character hex.  This is the standard used
    /// by Python, for example.
    Naked48,

    /// Encode as a bracketed hex of at least four and no more than six digits.  This is
    /// the standard used by Rust.
    Bracketed46,

    /// Encode as a bracketed hex of four to eight characters.
    Bracketed48,

    /// Encode everything using two-digit naked hex escapes.  This is pathological, but might be
    /// useful for debugging or with some older software.  In general you do not want to use this.
    Naked2,
}
/// Escape table for the Trivet standard, which is very permissive with its escapes.
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.  This is the variant built when the `no_ucd` feature is *not* enabled, so it
/// includes the named-character escape `N`.
#[cfg(not(feature = "no_ucd"))]
pub const TRIVET_ESCAPES: [(char, EscapeType); 16] = [
    // An escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-letter control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric and named code point escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BracketU18),
    ('N', EscapeType::BracketUNamed),
    // An escaped question mark is a plain question mark.
    ('?', EscapeType::Char('?')),
];
/// Escape table for the Trivet standard when the `no_ucd` feature is enabled.
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.  This variant has no named-character (`N`) entry, so it has fifteen entries.
#[cfg(feature = "no_ucd")]
pub const TRIVET_ESCAPES: [(char, EscapeType); 15] = [
    // An escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-letter control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric code point escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BracketU18),
    // An escaped question mark is a plain question mark.
    ('?', EscapeType::Char('?')),
];
/// Escape table for the C standard.  See
/// [C](https://en.wikipedia.org/wiki/Escape_sequences_in_C).
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.
pub const C_ESCAPES: [(char, EscapeType); 16] = [
    // An escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-letter control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    // NOTE(review): `\e` is believed to be a compiler extension rather than ISO C; confirm that
    // accepting it here is intended.
    ('e', EscapeType::Char('\x1b')), // U+001B
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric code point escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
    // An escaped question mark is a plain question mark.
    ('?', EscapeType::Char('?')),
];
/// Escape table for the Python standard.  See
/// [Python](https://docs.python.org/3/reference/lexical_analysis.html).
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.  This is the variant built when the `no_ucd` feature is *not* enabled, so it
/// includes the named-character escape `N`.
#[cfg(not(feature = "no_ucd"))]
pub const PYTHON_ESCAPES: [(char, EscapeType); 15] = [
    // An escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-letter control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric and named code point escapes.
    ('x', EscapeType::NakedByte),
    ('N', EscapeType::BracketUNamed),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
];
/// Escape table for the Python standard when the `no_ucd` feature is enabled.
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.  This variant has no named-character (`N`) entry, so it has fourteen entries.
#[cfg(feature = "no_ucd")]
pub const PYTHON_ESCAPES: [(char, EscapeType); 14] = [
    // An escaped newline is discarded.
    ('\n', EscapeType::Discard),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-letter control characters.
    ('a', EscapeType::Char('\x07')), // U+0007
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // U+000B
    // Numeric code point escapes.
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4),
    ('U', EscapeType::NakedU8),
];
/// Escape table for the Rust standard.  See
/// [Rust](https://doc.rust-lang.org/reference/tokens.html#string-literals).
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.
pub const RUST_ESCAPES: [(char, EscapeType); 10] = [
    // An escaped newline is discarded together with the whitespace that follows it.
    ('\n', EscapeType::DiscardWS),
    // Characters that stand for themselves.
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    // Single-character control escapes.
    ('0', EscapeType::Char('\0')),
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    // Numeric code point escapes.
    ('x', EscapeType::NakedASCII),
    ('u', EscapeType::BracketU16),
];
/// Escape table for the JSON standard.  See
/// [JSON](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
///
/// Each entry pairs the character that follows the escape character with the way it is
/// interpreted.
pub const JSON_ESCAPES: [(char, EscapeType); 10] = [
    // Characters that stand for themselves.
    ('\"', EscapeType::Char('\"')),
    ('\\', EscapeType::Char('\\')),
    // An escaped newline is discarded.
    // NOTE(review): ECMA-404 does not appear to define a backslash-newline escape, so this looks
    // like a deliberate leniency; confirm before relying on strict JSON behavior.
    ('\n', EscapeType::Discard),
    ('/', EscapeType::Char('/')),
    // Single-letter control characters.
    ('b', EscapeType::Char('\x08')), // U+0008
    ('f', EscapeType::Char('\x0c')), // U+000C
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    // Four-hex-digit code point escape.
    ('u', EscapeType::NakedU4),
];