1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
// Trivet
// Copyright (c) 2023 by Stacy Prowell.  All rights reserved.
// https://gitlab.com/binary-tools/trivet

//! The different escaping standards.

use core::fmt;
use std::ops::Range;

/// String standards that are implemented for the string parser and encoder.  These select a "bundle"
/// of options at once to configure how strings are parsed or encoded.
///
/// These are *mostly* faithful to the specified implementation, *except* that the source and destination
/// strings are Rust UTF-8 strings.  This means some things won't work exactly the same when the underlying
/// platform does not enforce UTF-8.  For instance, C uses null-terminated byte arrays, but Rust strings can
/// contain nulls.
///
/// This is a plain, field-less selector: it is cheap to copy, can be compared with `==`, and can be
/// used as a key in hashed collections.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum StringStandard {
    /// Trivet has its own, very permissive string standard, providing basic character escapes, including
    /// both "naked" two-digit ASCII (`\x0a`) and longer bracketed Unicode (`\u{a}`) escapes.
    ///
    /// Trivet allows octal escapes such as `\0` (null) and `\12` (newline), permits
    /// surrogate pairs, replaces illegal Unicode characters with U+FFFD, and allows using the escape
    /// character to indicate a literal value not otherwise covered (so `\q` is treated as just `q`).
    Trivet,

    /// Parse according to the Rust standard found
    /// [here](https://doc.rust-lang.org/reference/tokens.html#string-literals).
    Rust,

    /// Parse according to the JSON standard defined by
    /// [ECMA-404 JSON Data Interchange Syntax](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
    JSON,

    /// Parse according to the latest C standard
    /// [ISO/IEC 9899:2018 (C18)](https://blog.ansi.org/2018/11/c-language-standard-iso-iec-9899-2018-c18/).
    C,

    /// Parse according to the current
    /// [Python standard](https://docs.python.org/3/reference/lexical_analysis.html).
    Python,
}

impl fmt::Display for StringStandard {
    /// Write the human-readable name of the standard (`C`, `JSON`, `Python`, `Rust`, or `Trivet`).
    ///
    /// The name is emitted through [`fmt::Formatter::pad`], so width, fill, alignment, and precision
    /// given in the format string (for example `{:<8}`) are applied just as they are for a `&str`.
    /// With no flags the output is exactly the bare name.
    fn fmt(&self, form: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive on purpose: adding a variant must force a decision about its display name.
        let name = match self {
            Self::C => "C",
            Self::JSON => "JSON",
            Self::Python => "Python",
            Self::Rust => "Rust",
            Self::Trivet => "Trivet",
        };
        form.pad(name)
    }
}

/// How to handle parsing unknown and invalid escapes.
///
/// The type is a small value (at most one `char` of payload), so it is `Copy` and comparable.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum UnknownEscapeProtocol {
    /// Just discard.  This is rarely used.
    Discard,

    /// Drop the escape character.  This transforms, for instance, an undefined `\m` into `m` and is
    /// common in shell languages like Bash in order to escape arbitrary characters.
    DropEscape,

    /// Escape the escape.  This transforms an undefined `\m` into `\\m`.  This is the
    /// protocol used in Python.
    LiteralEscape,

    /// Replace with the Unicode replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.  This generalizes the `ReplacementCharacter` option.
    /// Some implementations map unknown escapes to a specific character.
    Replace(char),

    /// Generate an error.
    Error,
}

/// How to handle parsing an invalid Unicode hexadecimal escape.  This also applies to the
/// escapes of a surrogate pair when surrogate pairs are not allowed.
///
/// The type is a small value (at most one `char` of payload), so it is `Copy` and comparable.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum IllegalUnicodeProtocol {
    /// Just discard.
    Discard,

    /// Generate an error.
    Error,

    /// Replace with the replacement character, U+FFFD.
    ReplacementCharacter,

    /// Substitute the given character.
    Replace(char),

    /// Preserve the value; the result *may* be an invalid Unicode string.  Note that this will
    /// not work the way you might think, and is overall a Bad Idea.  Prefer the replacement
    /// character.
    Preserve,
}

/// Define the different escape types.  These are used to interpret escape sequences found in
/// strings and to determine how escapes are used when encoding.
///
/// The type is a small value (at most one `char` of payload), so it is `Copy` and comparable.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EscapeType {
    /// A character escape.  For instance, this can be used to map `\n` to a newline.
    Char(char),

    /// A "naked" ASCII escape of exactly two hex digits of value at most 7F.  This is how
    /// Rust handles `\xNN`.
    NakedASCII,

    /// A "naked" byte escape of exactly two hex digits of any value.  This is how Python
    /// handles `\xNN`.
    NakedByte,

    /// A "naked" Unicode escape of exactly four hex digits.  This is used in JSON,
    /// Python, and C.
    NakedU4,

    /// A "naked" Unicode escape of exactly eight hex digits.  This is used in Python and C.
    NakedU8,

    /// A bracketed Unicode escape of 1-6 hex digits.  This is used in Rust.
    BracketU16,

    /// A bracketed Unicode escape of 1-8 hex digits with underscores permitted.
    BracketU18,

    /// A bracketed named Unicode escape.  These are given by `{n}`, where `n`
    /// is a name in [the Unicode database](https://unicode.org/ucd/).  See also
    /// [Name Aliases](https://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt).
    /// The names are not case sensitive.
    BracketUNamed,

    /// Discard.  This can be used to join lines; for instance, some standards allow
    /// `[escape][newline]` to join lines together.
    Discard,

    /// Discard, but also consume whitespace to the first non-whitespace character.
    DiscardWS,
}

/// The standard used to determine which characters to encode as escapes.
///
/// Because `EncodeRanges` owns a `Vec`, the type is `Clone` but not `Copy`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EncodingStandard {
    /// Encode only control characters.
    OnlyControl,
    /// Encode anything that is outside the ASCII range or a control character.
    ASCII,
    /// Encode everything above the given limit.  Control characters are also encoded.
    /// Note that this strictly means *above* the limit.  Thus `ASCII` is equivalent to
    /// `EncodeAbove(0x7f)`.
    EncodeAbove(u32),
    /// Encode everything in the specified ranges only.  Control characters are also encoded.
    ///
    /// NOTE(review): a `Range` is half-open (`start..end` excludes `end`); presumably the encoder
    /// follows that convention -- confirm against the encoder.
    EncodeRanges(Vec<Range<u32>>),
}

/// Determine how Unicode characters that require encoding should be handled.  If ASCII encoding
/// is enabled, this only applies to characters that need encoding and are outside the ASCII range.
///
/// This is a plain, field-less selector, so it is `Copy` and comparable.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EncodingMethod {
    /// Encode using four-character hex.  Anything out of this range must be encoded
    /// using a surrogate pair.  This is used by JSON, for example.  Note that a surrogate
    /// pair carries a 20-bit offset from 0x10000, so this scheme tops out at 0x10ffff
    /// (which is also the largest Unicode code point).
    Naked4,
    /// Encode using four-character or eight-character hex.  This is the standard used
    /// by Python, for example.
    Naked48,
    /// Encode as a bracketed hex of at least four and no more than six digits.  This is
    /// the standard used by Rust.
    Bracketed46,
    /// Encode as a bracketed hex of four to eight characters.
    Bracketed48,
    /// Encode everything using two-digit naked hex escapes.  This is pathological, but might be
    /// useful for debugging or with some older software.  In general you do not want to use this.
    Naked2,
}

/// Trivet is very permissive with its escapes.
///
/// Each entry pairs the character that follows the escape character with the [`EscapeType`]
/// that says how the escape is interpreted.  This is the table compiled when the `no_ucd`
/// feature is *not* enabled; it differs from the `no_ucd` table only by the extra `N`
/// (named character) entry.
#[cfg(not(feature = "no_ucd"))]
pub const TRIVET_ESCAPES: [(char, EscapeType); 16] = [
    ('\n', EscapeType::Discard), // escape + newline: dropped
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('a', EscapeType::Char('\x07')), // BEL (alert)
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('e', EscapeType::Char('\x1b')), // ESC (escape)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // VT (vertical tab)
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BracketU18),
    ('N', EscapeType::BracketUNamed),
    ('?', EscapeType::Char('?')),
];
/// Trivet escapes for builds with the `no_ucd` feature enabled.
///
/// Identical to the default table except that the `N` (named character) escape is absent --
/// presumably because name lookup needs the Unicode character database; confirm against the
/// parser.  Note the array is one element shorter (15 instead of 16).
#[cfg(feature = "no_ucd")]
pub const TRIVET_ESCAPES: [(char, EscapeType); 15] = [
    ('\n', EscapeType::Discard), // escape + newline: dropped
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('a', EscapeType::Char('\x07')), // BEL (alert)
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('e', EscapeType::Char('\x1b')), // ESC (escape)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // VT (vertical tab)
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::BracketU18),
    ('?', EscapeType::Char('?')),
];

/// See [C](https://en.wikipedia.org/wiki/Escape_sequences_in_C).
///
/// Each entry pairs the character that follows the escape character with the [`EscapeType`]
/// that says how the escape is interpreted.  Octal escapes do not appear in this table.
///
/// NOTE(review): `e` (ESC) is a common compiler extension rather than part of ISO C, and ISO C
/// lets `\x` consume any number of hex digits whereas `NakedByte` is exactly two -- confirm
/// both deviations are intended.
pub const C_ESCAPES: [(char, EscapeType); 16] = [
    ('\n', EscapeType::Discard), // escape + newline: dropped
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('a', EscapeType::Char('\x07')), // BEL (alert)
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('e', EscapeType::Char('\x1b')), // ESC (escape)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // VT (vertical tab)
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4), // exactly four hex digits
    ('U', EscapeType::NakedU8), // exactly eight hex digits
    ('?', EscapeType::Char('?')),
];

/// See [Python](https://docs.python.org/3/reference/lexical_analysis.html).
///
/// Each entry pairs the character that follows the escape character with the [`EscapeType`]
/// that says how the escape is interpreted.  This is the table compiled when the `no_ucd`
/// feature is *not* enabled; it differs from the `no_ucd` table only by the extra `N`
/// (named character) entry.  Octal escapes do not appear in this table.
#[cfg(not(feature = "no_ucd"))]
pub const PYTHON_ESCAPES: [(char, EscapeType); 15] = [
    ('\n', EscapeType::Discard), // escape + newline: dropped
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('a', EscapeType::Char('\x07')), // BEL (alert)
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // VT (vertical tab)
    ('x', EscapeType::NakedByte),
    ('N', EscapeType::BracketUNamed),
    ('u', EscapeType::NakedU4), // exactly four hex digits
    ('U', EscapeType::NakedU8), // exactly eight hex digits
];
/// Python escapes for builds with the `no_ucd` feature enabled.
///
/// Identical to the default table except that the `N` (named character) escape is absent --
/// presumably because name lookup needs the Unicode character database; confirm against the
/// parser.  Note the array is one element shorter (14 instead of 15).
#[cfg(feature = "no_ucd")]
pub const PYTHON_ESCAPES: [(char, EscapeType); 14] = [
    ('\n', EscapeType::Discard), // escape + newline: dropped
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('a', EscapeType::Char('\x07')), // BEL (alert)
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('v', EscapeType::Char('\x0b')), // VT (vertical tab)
    ('x', EscapeType::NakedByte),
    ('u', EscapeType::NakedU4), // exactly four hex digits
    ('U', EscapeType::NakedU8), // exactly eight hex digits
];

/// See [Rust](https://doc.rust-lang.org/reference/tokens.html#string-literals).
///
/// Each entry pairs the character that follows the escape character with the [`EscapeType`]
/// that says how the escape is interpreted.
pub const RUST_ESCAPES: [(char, EscapeType); 10] = [
    // Escape + newline: the newline and the whitespace after it are dropped.
    ('\n', EscapeType::DiscardWS),
    ('\\', EscapeType::Char('\\')),
    ('\'', EscapeType::Char('\'')),
    ('\"', EscapeType::Char('\"')),
    ('0', EscapeType::Char('\0')), // NUL
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('x', EscapeType::NakedASCII), // two hex digits, value at most 7F
    ('u', EscapeType::BracketU16), // bracketed, 1-6 hex digits
];

/// See [JSON](https://www.ecma-international.org/publications-and-standards/standards/ecma-404/).
///
/// Each entry pairs the character that follows the escape character with the [`EscapeType`]
/// that says how the escape is interpreted.
///
/// NOTE(review): ECMA-404 does not define an escape followed by a newline, yet this table
/// discards one -- confirm the leniency is intended.
pub const JSON_ESCAPES: [(char, EscapeType); 10] = [
    ('\"', EscapeType::Char('\"')),
    ('\\', EscapeType::Char('\\')),
    ('\n', EscapeType::Discard), // escape + newline: dropped (see review note above)
    ('/', EscapeType::Char('/')),
    ('b', EscapeType::Char('\x08')), // BS (backspace)
    ('f', EscapeType::Char('\x0c')), // FF (form feed)
    ('n', EscapeType::Char('\n')),
    ('r', EscapeType::Char('\r')),
    ('t', EscapeType::Char('\t')),
    ('u', EscapeType::NakedU4), // exactly four hex digits
];