trivet/strings/
decoder.rs

1// Trivet
2// Copyright (c) 2025 by Stacy Prowell.  All rights reserved.
3// https://gitlab.com/binary-tools/trivet
4
5//! Parse strings.
6//!
7//! This supports many different approaches for parsing strings.  Choosing a particular
8//! mode re-configures the options of the string parser to handle that specific syntax.
9
10#[cfg(not(feature = "no_ucd"))]
11use super::ucd::UCD;
12use super::C_ESCAPES;
13use super::JSON_ESCAPES;
14use super::PYTHON_ESCAPES;
15use super::RUST_ESCAPES;
16use super::TOML_ESCAPES;
17use super::TRIVET_ESCAPES;
18use crate::decoder::Decode;
19use crate::strings::EscapeType;
20use crate::strings::IllegalUnicodeProtocol;
21use crate::strings::StringStandard;
22use crate::strings::UnknownEscapeProtocol;
23use crate::{
24    errors::{syntax_error, unexpected_character_error, ParseResult},
25    Loc, ParserCore,
26};
27use std::collections::BTreeMap;
28#[cfg(not(feature = "no_ucd"))]
29use std::rc::Rc;
30
/// The initial capacity, in bytes, reserved for each result string the parser builds.
/// It is passed to `String::with_capacity`, so it is only a starting size; a result
/// string can still grow past it.
const CAPACITY: usize = 64;
33
34/// Construct the UCD and return it.  This is a relatively costly operation and you
35/// should only do it *once*.  Once you have done this you can keep it around and use
36/// it to initialize string parsers that handle named Unicode escapes.  It is not needed
37/// otherwise.
38///
39/// Why is this boxed?  To prevent passing a huge data structure on the stack.
40///
41/// Why is this reference counted?  So a single copy can be used repeatedly.
42#[cfg(not(feature = "no_ucd"))]
43pub fn get_ucd() -> Box<Rc<BTreeMap<&'static str, char>>> {
44    // This is where I would use lazy_static, but that would add an external
45    // dependency.  Unfortunately `from` requires that we pass the massive array
46    // on the stack, so let's not do that.
47    let mut map = BTreeMap::new();
48    for (key, value) in UCD {
49        map.insert(*key, *value);
50    }
51    Box::new(Rc::new(map))
52}
53
54/// Implement parsing of strings.
55///
56/// This is intended to be a very flexible parsing system, and implements
/// some common string formats.  Specific features can be enabled and disabled
58/// by setting the flags and providing a map for escape handling rules.
59///
60/// # Escape Handling
61///
62/// Specify escape handling rules by creating a `BTreeMap` mapping characters
63/// to escape handling rules.  The character is the character following the
64/// escape character.  Escape handling rules are specified by [`EscapeType`].
65///
66/// **Note**: You cannot have both a `\0` escape and support octal escapes, or
67/// octal escapes with a leading zero will not work.
68///
69/// As an example, here are the escape handling rules for Python.
70///
71/// ```rust
72/// use std::collections::BTreeMap;
73/// use trivet::strings::EscapeType;
74///
75/// let escapes = BTreeMap::from([
76///     ('\n', EscapeType::Discard),
77///     ('\\', EscapeType::Char('\\')),
78///     ('\'', EscapeType::Char('\'')),
79///     ('\"', EscapeType::Char('\"')),
80///     ('a', EscapeType::Char('\x07')),
81///     ('b', EscapeType::Char('\x08')),
82///     ('f', EscapeType::Char('\x0c')),
83///     ('n', EscapeType::Char('\n')),
84///     ('r', EscapeType::Char('\r')),
85///     ('t', EscapeType::Char('\t')),
86///     ('v', EscapeType::Char('\x0b')),
87///     ('x', EscapeType::NakedByte),
88///     ('N', EscapeType::BracketUNamed),
89///     ('u', EscapeType::NakedU4),
90///     ('U', EscapeType::NakedU8),
91/// ]);
92/// ```
93///
94/// # Unicode Database
95///
96/// Note: The feature `no_ucd` will disable use of the Unicode database.
97///
98/// The parser is capable of looking up Unicode code points by their name
99/// or alias.  This is provided by a map that encodes the entire space.  This
100/// map must be provided to every new parser instance.
101///
102/// Creating a default instance (with [`Self::default`]) does this for you.
103/// If you only use this string parser instance from then on, then you do not
104/// need to worry about this.
105///
106/// If you plan to create many string parser instances, then you should instead
107/// get the UCD database yourself via [`get_ucd`], which returns a boxed,
108/// reference-counted copy.
109///
110/// # Example
111///
112/// ```rust
113/// use trivet::strings::StringParser;
114/// use trivet::parse_from_string;
115/// use trivet::Parser;
116///
117/// // Make a new string parser.
118/// let mut strpar = StringParser::new();
119///
120/// // Make a parser around a string.
121/// let mut parser = parse_from_string(r#""This\nis\na\nstring.""#);
122/// match parser.parse_string_match_delimiter() {
123///     Ok(value) => println!("{}", value),
124///     Err(err) => println!("ERROR: {}", err),
125/// }
126/// ```
#[derive(Clone)]
pub struct StringParser {
    /// If true, parse escape sequences.
    pub enable_escapes: bool,

    /// Character used to introduce an escape.  Usually `\`.
    pub escape_char: char,

    /// If true, permit "naked" control characters to be present in the stream.  Otherwise
    /// generate an error.  This applies to all character values below `\u0020` and to only
    /// those characters (so delete and a few other control characters are not included).
    pub permit_low_control_characters: bool,

    /// How to handle unrecognized escape sequences.
    pub unknown_escape_protocol: UnknownEscapeProtocol,

    /// If true, and if the current result looks like a UTF-16 surrogate pair (it is in
    /// the range U+D800 up to U+DBFF) then try to find and parse a second surrogate and
    /// generate the corresponding character.
    ///
    /// If false, treat this as an invalid escape.  For instance, Rust does not permit
    /// surrogate pairs in this way.
    pub allow_surrogate_pairs: bool,

    /// How to handle invalid Unicode values that arise from parsing hexadecimal escapes.
    /// This includes surrogate pairs when those are not allowed.
    pub illegal_unicode_protocol: IllegalUnicodeProtocol,

    /// Permit octal escapes.  These have the form `[escape]` followed by (usually) one to
    /// three octal digits (but see [`Self::octal_escapes_are_flexible`]).  Octal escapes
    /// are only tried when the character following the escape has no usable entry in the
    /// escape table, so an explicit table entry for a digit (such as `0`) takes precedence
    /// over the octal interpretation.
    pub allow_octal_escapes: bool,

    /// Allow flexible octal escapes.  These consist of one to three octal digits.  Python
    /// uses this approach, so `"\12k"` encodes the string `"\nk"`.  If this is disabled,
    /// then octal escapes must have *exactly* three octal digits.
    pub octal_escapes_are_flexible: bool,

    /// Provide interpretation for escapes.  Each entry maps a specific character to the
    /// character's meaning when that character follows the escape character.  For example,
    /// in C we would have `n` map to `EscapeType::Char('\n')`.
    ///
    /// See [ASCII](https://www.ascii-code.com/) for the meaning of characters in the ASCII
    /// range, and consult the Unicode standard for others.
    ///
    /// This field is private; replace it through [`Self::set_escapes`] or [`Self::set`] so
    /// that the fast lookup table is rebuilt to match.
    escapes: BTreeMap<char, EscapeType>,

    /// Provide a fast lookup table for escapes in the ASCII range.  The table is indexed
    /// by the character's code and is rebuilt from `escapes` whenever the escape map is
    /// replaced; slots with no entry hold `EscapeType::Undefined`.
    fast_escapes: [EscapeType; 128],

    /// The Unicode database of names and aliases to code points.
    #[cfg(not(feature = "no_ucd"))]
    pub ucd: Rc<BTreeMap<&'static str, char>>,
}
181
182impl StringParser {
183    /// Make and return a new string parser.  The initial parsing standard is set to
184    /// [`StringStandard::Trivet`].
185    #[cfg(not(feature = "no_ucd"))]
186    pub fn new() -> Self {
187        let mut parser = StringParser {
188            enable_escapes: true,
189            permit_low_control_characters: true,
190            escape_char: '\\',
191            allow_octal_escapes: true,
192            octal_escapes_are_flexible: true,
193            allow_surrogate_pairs: true,
194            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
195            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
196            escapes: BTreeMap::from(TRIVET_ESCAPES),
197            fast_escapes: [EscapeType::Undefined; 128],
198            ucd: *get_ucd(),
199        };
200        parser.fix_escapes();
201        parser
202    }
203    #[cfg(feature = "no_ucd")]
204    pub fn new() -> Self {
205        let mut parser = StringParser {
206            enable_escapes: true,
207            permit_low_control_characters: true,
208            escape_char: '\\',
209            allow_octal_escapes: true,
210            octal_escapes_are_flexible: true,
211            allow_surrogate_pairs: true,
212            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
213            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
214            escapes: BTreeMap::from(TRIVET_ESCAPES),
215            fast_escapes: [EscapeType::Undefined; 128],
216        };
217        parser.fix_escapes();
218        parser
219    }
220
221    /// Make and return a new string parser.  The initial parsing mode is set to Trivet.
222    #[cfg(not(feature = "no_ucd"))]
223    pub fn new_from_db(ucd: &Rc<BTreeMap<&'static str, char>>) -> Self {
224        let mut parser = StringParser {
225            enable_escapes: true,
226            permit_low_control_characters: true,
227            escape_char: '\\',
228            allow_octal_escapes: true,
229            octal_escapes_are_flexible: true,
230            allow_surrogate_pairs: true,
231            illegal_unicode_protocol: IllegalUnicodeProtocol::ReplacementCharacter,
232            unknown_escape_protocol: UnknownEscapeProtocol::LiteralEscape,
233            escapes: BTreeMap::from(TRIVET_ESCAPES),
234            fast_escapes: [EscapeType::Undefined; 128],
235            ucd: ucd.clone(),
236        };
237        parser.fix_escapes();
238        parser
239    }
240
241    /// Configure all settings to conform to a given standard.  See
242    /// [`StringStandard`] for the available standards.
243    pub fn set(&mut self, std: StringStandard) {
244        match std {
245            StringStandard::Trivet => {
246                self.enable_escapes = true;
247                self.permit_low_control_characters = true;
248                self.escape_char = '\\';
249                self.allow_octal_escapes = true;
250                self.octal_escapes_are_flexible = true;
251                self.allow_surrogate_pairs = true;
252                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
253                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
254                self.escapes = BTreeMap::from(TRIVET_ESCAPES);
255            }
256            StringStandard::C => {
257                self.enable_escapes = true;
258                self.permit_low_control_characters = true;
259                self.escape_char = '\\';
260                self.allow_octal_escapes = true;
261                self.octal_escapes_are_flexible = true;
262                self.allow_surrogate_pairs = false;
263                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
264                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
265                self.escapes = BTreeMap::from(C_ESCAPES);
266            }
267            StringStandard::Rust => {
268                self.enable_escapes = true;
269                self.permit_low_control_characters = true;
270                self.escape_char = '\\';
271                self.allow_octal_escapes = false;
272                self.allow_surrogate_pairs = false;
273                self.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
274                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
275                self.escapes = BTreeMap::from(RUST_ESCAPES);
276            }
277            StringStandard::JSON => {
278                self.enable_escapes = true;
279                self.permit_low_control_characters = false;
280                self.escape_char = '\\';
281                self.allow_octal_escapes = false;
282                self.allow_surrogate_pairs = true;
283                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
284                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
285                self.escapes = BTreeMap::from(JSON_ESCAPES);
286            }
287            StringStandard::TOML => {
288                self.enable_escapes = true;
289                self.permit_low_control_characters = false;
290                self.escape_char = '\\';
291                self.allow_octal_escapes = false;
292                self.allow_surrogate_pairs = false;
293                self.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
294                self.unknown_escape_protocol = UnknownEscapeProtocol::Error;
295                self.escapes = BTreeMap::from(TOML_ESCAPES);
296            }
297            StringStandard::Python => {
298                self.enable_escapes = true;
299                self.permit_low_control_characters = true;
300                self.escape_char = '\\';
301                self.allow_octal_escapes = true;
302                self.octal_escapes_are_flexible = true;
303                self.allow_surrogate_pairs = false;
304                self.illegal_unicode_protocol = IllegalUnicodeProtocol::ReplacementCharacter;
305                self.unknown_escape_protocol = UnknownEscapeProtocol::LiteralEscape;
306                self.escapes = BTreeMap::from(PYTHON_ESCAPES);
307            }
308        }
309        self.fix_escapes();
310    }
311
    /// Set the escapes for this parser instance.
    ///
    /// `escapes` maps the character that follows the escape character to the
    /// [`EscapeType`] describing how to handle it.  The previous map is discarded and the
    /// fast ASCII lookup table is rebuilt from the new one.
    pub fn set_escapes(&mut self, escapes: BTreeMap<char, EscapeType>) {
        self.escapes = escapes;
        // The fast table is derived from the map, so it must be regenerated.
        self.fix_escapes();
    }
317
318    /// Create the fast escape table.
319    fn fix_escapes(&mut self) {
320        self.fast_escapes = [EscapeType::Undefined; 128];
321        for (key, value) in self.escapes.iter() {
322            if key <= &'\u{80}' {
323                self.fast_escapes[*key as usize] = *value
324            }
325        }
326    }
327
328    /// Correctly handle an invalid escape.
329    fn invalid_escape(&self, ch: char, loc: Loc, string: &mut String) -> ParseResult<()> {
330        match self.unknown_escape_protocol {
331            UnknownEscapeProtocol::Discard => Ok(()),
332            UnknownEscapeProtocol::DropEscape => {
333                string.push(ch);
334                Ok(())
335            }
336            UnknownEscapeProtocol::Error => Err(syntax_error(
337                loc,
338                format!("Invalid escape '{}{}'", self.escape_char, ch).as_str(),
339            )),
340            UnknownEscapeProtocol::LiteralEscape => {
341                string.push(self.escape_char);
342                string.push(ch);
343                Ok(())
344            }
345            UnknownEscapeProtocol::Replace(ch) => {
346                string.push(ch);
347                Ok(())
348            }
349            UnknownEscapeProtocol::ReplacementCharacter => {
350                string.push(char::REPLACEMENT_CHARACTER);
351                Ok(())
352            }
353        }
354    }
355
356    /// Correctly handle an invalid Unicode value.
357    fn handle_illegal_unicode(&self, value: u32, loc: Loc, string: &mut String) -> ParseResult<()> {
358        match self.illegal_unicode_protocol {
359            IllegalUnicodeProtocol::Discard => Ok(()),
360            IllegalUnicodeProtocol::Error => Err(syntax_error(
361                loc,
362                format!("Value is not a valid Unicode code point: {:04x}", value).as_str(),
363            )),
364            IllegalUnicodeProtocol::Replace(ch) => {
365                string.push(ch);
366                Ok(())
367            }
368            IllegalUnicodeProtocol::ReplacementCharacter => {
369                string.push(char::REPLACEMENT_CHARACTER);
370                Ok(())
371            }
372        }
373    }
374
375    /// Handle something that looks like a surrogate pair.  On entry the parser is assumed
376    /// to be pointing to the escape character of the second element of the pair.  On exit
377    /// the entire second element been consumed.  If no second element is found, then treat
378    /// this as illegal Unicode and handle appropriately.
379    fn parse_surrogate_pair(
380        &self,
381        parser: &mut ParserCore,
382        first: u32,
383        loc: Loc,
384        string: &mut String,
385    ) -> ParseResult<()> {
386        // At this point we have parsed the first surrogate pair.  Now we need to see if
387        // there is a second element.  We should *expect* the next thing in the stream to
388        // be an escape character.  If it isn't, then we don't have a second surrogate
389        // pair.
390        if !parser.peek_and_consume(self.escape_char) {
391            // This is not what we expect, and the whole thing is wrong.
392            return self.handle_illegal_unicode(first, loc, string);
393        }
394
395        // We need to process the next escape, but it must be a hexadecimal escape of at least 16 bits
396        // or we can't get a second surrogate pair.
397        let ch = parser.peek();
398        parser.consume();
399        let second = match self.escapes.get(&ch) {
400            Some(EscapeType::BraceU18) => {
401                // Get the hex code.
402                self.parse_braced_hex(parser, 1, 8, true)?
403            }
404            Some(EscapeType::BraceU16) => {
405                // Get the hex code.
406                self.parse_braced_hex(parser, 1, 6, false)?
407            }
408            Some(EscapeType::NakedU4) => {
409                // Get the hex code.
410                let digits = parser.peek_n(4);
411                parser.consume_n(4);
412                // Try to convert to a u32.
413                (match u16::from_str_radix(&digits, 16) {
414                    Ok(value) => value,
415                    Err(err) => {
416                        return Err(syntax_error(
417                            loc,
418                            format!("Invalid hex value (ref:1) '{}': {}", digits, err).as_str(),
419                        ))
420                    }
421                }) as u32
422            }
423            Some(EscapeType::NakedU8) => {
424                // Get the hex code.
425                let digits = parser.peek_n(8);
426                parser.consume_n(8);
427                // Try to convert to a u32.
428                match u32::from_str_radix(&digits, 16) {
429                    Ok(value) => value,
430                    Err(err) => {
431                        return Err(syntax_error(
432                            loc,
433                            format!("Invalid hex value (ref:2) '{}': {}", digits, err).as_str(),
434                        ))
435                    }
436                }
437            }
438            _ => {
439                // Well this is clearly wrong.
440                return Err(syntax_error(loc,
441                    "Found what seems to be the first half of a surrogate pair, but no second half was found."
442                ));
443            }
444        };
445
446        // Do we even allow surrogate pairs?
447        if !self.allow_surrogate_pairs {
448            // No.
449            return Err(syntax_error(loc, "Surrogate pairs are not permitted"));
450        }
451
452        // Okay, check the parts for this surrogate pair.
453        if !(0xd800..0xdc00).contains(&first) || !(0xdc00..0xe000).contains(&second) {
454            // This is not a valid surrogate pair.
455            return Err(syntax_error(
456                loc,
457                format!("Invalid surrogate pair {:04x},{:04x}", first, second).as_str(),
458            ));
459        }
460
461        // Compute the actual value.  Having checked everything above, this should never
462        // fail.
463        let value = (first - 0xD800) * 0x400 + (second - 0xDC00) + 0x10000;
464        self.u32_to_char(value, loc, string)?;
465        Ok(())
466    }
467
468    /// Process braced hexadecimal values.  This returns the u32 that is parsed, if any.
469    /// It does not transform it into a Unicode character or check that.
470    ///
471    /// On entry the parser is assumed to be pointing to the opening brace, and this is checked.
472    /// On exit the closing brace is consumed.
473    fn parse_braced_hex(
474        &self,
475        parser: &mut ParserCore,
476        low: usize,
477        high: usize,
478        underscores: bool,
479    ) -> ParseResult<u32> {
480        let loc = parser.loc();
481        // Expect an opening brace.
482        if !parser.peek_and_consume('{') {
483            // Malformed escape.
484            return Err(unexpected_character_error(loc, "{", parser.peek()));
485        }
486        // Read the hexadecimal characters.
487        let digits = if underscores {
488            parser.take_while_unless(|ch| ch.is_ascii_hexdigit(), |ch| ch == '_')
489        } else {
490            parser.take_while(|ch| ch.is_ascii_hexdigit())
491        };
492        // The next thing must be the closing brace.
493        if !parser.peek_and_consume('}') {
494            // Malformed escape.
495            return Err(unexpected_character_error(parser.loc(), "}", parser.peek()));
496        }
497        // Check the number of digits.  Because they are in the ASCII range we can use length.
498        if !(low..=high).contains(&digits.len()) {
499            if digits.len() < low {
500                return Err(syntax_error(loc, "Too few digits given in escape"));
501            }
502            return Err(syntax_error(loc, "Too many digits given in escape"));
503        }
504        Ok(u32::from_str_radix(&digits, 16).unwrap())
505    }
506
507    /// Handle a u32 conversion to a char.  This also handles the failure.
508    fn u32_to_char(&self, value: u32, loc: Loc, string: &mut String) -> ParseResult<()> {
509        match char::from_u32(value) {
510            None => {
511                // Failed.
512                self.handle_illegal_unicode(value, loc, string)
513            }
514            Some(ch) => {
515                string.push(ch);
516                Ok(())
517            }
518        }
519    }
520
521    /// Parse the next escape sequence.  The initial escape character is assumed to have been
522    /// consumed prior to entry, and thus the parser is pointing to the first character after
523    /// the escape.  On exit the parser is pointing to the first character following the escape
524    /// sequence.
525    fn parse_escape(&self, parser: &mut ParserCore, string: &mut String) -> ParseResult<()> {
526        let loc = parser.loc();
527        let mut ch = parser.peek();
528        parser.consume();
529
530        let esc_type = if ch.is_ascii() {
531            &self.fast_escapes[ch as usize]
532        } else if let Some(esc_type) = self.escapes.get(&ch) {
533            esc_type
534        } else {
535            &EscapeType::Undefined
536        };
537
538        // Check for a known escape code.
539        match esc_type {
540            EscapeType::Char(rp) => {
541                string.push(*rp);
542                Ok(())
543            }
544            EscapeType::Undefined => {
545                // Look for an octal escape if we are allowing them.
546                if self.allow_octal_escapes && ('0'..='7').contains(&ch) {
547                    // Parse this as an octal escape.  We can grab up to two additional digits.
548                    let mut value = (ch as u32) - ('0' as u32);
549                    for _ in 0..2 {
550                        ch = parser.peek();
551                        if ('0'..='7').contains(&ch) {
552                            value *= 8;
553                            value += (ch as u32) - ('0' as u32);
554                            parser.consume();
555                        } else {
556                            if !self.octal_escapes_are_flexible {
557                                return Err(syntax_error(
558                                    loc,
559                                    "Octal escape must have three digits",
560                                ));
561                            }
562                            break;
563                        }
564                    }
565                    self.u32_to_char(value, loc, string)?;
566                    return Ok(());
567                }
568                self.invalid_escape(ch, loc, string)?;
569                Ok(())
570            }
571            EscapeType::BraceU18 => {
572                let value = self.parse_braced_hex(parser, 1, 8, true)?;
573                if (0xd800..0xe000).contains(&value) {
574                    // This is the start of a surrogate pair.
575                    self.parse_surrogate_pair(parser, value, loc, string)?
576                } else {
577                    self.u32_to_char(value, loc, string)?
578                };
579                Ok(())
580            }
581            EscapeType::BraceU16 => {
582                let value = self.parse_braced_hex(parser, 1, 6, false)?;
583                if (0xd800..0xe000).contains(&value) {
584                    // This is the start of a surrogate pair.
585                    self.parse_surrogate_pair(parser, value, loc, string)?
586                } else {
587                    self.u32_to_char(value, loc, string)?
588                };
589                Ok(())
590            }
591            EscapeType::BracketUNamed => {
592                #[cfg(not(feature = "no_ucd"))]
593                {
594                    // Expect an opening brace.
595                    if !parser.peek_and_consume('{') {
596                        // Malformed escape.
597                        return Err(unexpected_character_error(loc, "{", parser.peek()));
598                    }
599                    // Get the content of the braces.
600                    let name = parser.take_while(|ch| ch != '}');
601                    // The next thing must be the closing brace.
602                    if !parser.peek_and_consume('}') {
603                        // Malformed escape.
604                        return Err(unexpected_character_error(loc, "}", parser.peek()));
605                    }
606                    // Try to find the character in the Unicode database.
607                    let name = name.to_uppercase();
608                    match self.ucd.get(name.as_str()) {
609                        Some(ch) => {
610                            string.push(*ch);
611                            Ok(())
612                        }
613                        None => Err(syntax_error(
614                            loc,
615                            format!("Unknown Unicode character name '{}'", name).as_str(),
616                        )),
617                    }
618                }
619                #[cfg(feature = "no_ucd")]
620                {
621                    Err(syntax_error(loc, "Unicode name lookup is not enabled."))
622                }
623            }
624            EscapeType::Discard => Ok(()),
625            EscapeType::DiscardWS => {
626                parser.consume_ws_only();
627                Ok(())
628            }
629            EscapeType::NakedASCII => {
630                let digits = parser.peek_n(2);
631                parser.consume_n(2);
632                // Try to convert to a byte.
633                let value = match u8::from_str_radix(&digits, 16) {
634                    Ok(value) => value,
635                    Err(err) => {
636                        return Err(syntax_error(
637                            loc,
638                            format!("Invalid ASCII hex value '{}': {}", digits, err).as_str(),
639                        ))
640                    }
641                };
642                if value > 0x7f {
643                    return Err(syntax_error(
644                        loc,
645                        format!("Invalid ASCII value (too high): '{}'", digits).as_str(),
646                    ));
647                }
648                string.push(unsafe { char::from_u32_unchecked(value as u32) });
649                Ok(())
650            }
651            EscapeType::NakedByte => {
652                let digits = parser.peek_n(2);
653                parser.consume_n(2);
654                // Try to convert to a byte.
655                let value = match u8::from_str_radix(&digits, 16) {
656                    Ok(value) => value,
657                    Err(err) => {
658                        return Err(syntax_error(
659                            loc,
660                            format!("Invalid hex value (ref:3) '{}': {}", digits, err).as_str(),
661                        ))
662                    }
663                } as u32;
664                // None of the code points this can match are invalid, so we don't need to
665                // check.  Note that this will behave differently from C in that the value
666                // will be treated as a Unicode code point.
667                string.push(char::from_u32(value).unwrap());
668                Ok(())
669            }
670            EscapeType::NakedU4 => {
671                let digits = parser.peek_n(4);
672                parser.consume_n(4);
673                // Try to convert to a u32.
674                let value = match u16::from_str_radix(&digits, 16) {
675                    Ok(value) => value,
676                    Err(err) => {
677                        return Err(syntax_error(
678                            loc,
679                            format!("Invalid hex value (ref:4) '{}': {}", digits, err).as_str(),
680                        ))
681                    }
682                } as u32;
683                if (0xd800..0xe000).contains(&value) {
684                    // This is the start of a surrogate pair.
685                    return self.parse_surrogate_pair(parser, value, loc, string);
686                }
687                // Because surrogate pairs are extracted above, we have nothing here that could
688                // be a problem.
689                string.push(unsafe { char::from_u32_unchecked(value) });
690                Ok(())
691            }
692            EscapeType::NakedU8 => {
693                let digits = parser.peek_n(8);
694                parser.consume_n(8);
695                // Try to convert to a u32.
696                let value = match u32::from_str_radix(&digits, 16) {
697                    Ok(value) => value,
698                    Err(err) => {
699                        return Err(syntax_error(
700                            loc,
701                            format!("Invalid hex value (ref:5) '{}': {}", digits, err).as_str(),
702                        ))
703                    }
704                };
705                if (0xd800..0xe000).contains(&value) {
706                    // This is the start of a surrogate pair.
707                    return self.parse_surrogate_pair(parser, value, loc, string);
708                }
709                match char::from_u32(value) {
710                    Some(ch) => {
711                        string.push(ch);
712                        Ok(())
713                    }
714                    None => self.handle_illegal_unicode(value, loc, string),
715                }
716            }
717        }
718    }
719
720    // Methods that require a terminal delimiter.
721
722    /// Parse a string.
723    fn parse_esc_con_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
724        // Parse every character until we encounter the specified terminal.
725        let mut result = String::with_capacity(CAPACITY);
726        let loc = parser.loc();
727        while !parser.is_at_eof() {
728            let ch = parser.peek();
729            if ch == terminal {
730                parser.consume();
731                return Ok(result);
732            } else if ch == self.escape_char {
733                // Process an escape.
734                parser.consume();
735                self.parse_escape(parser, &mut result)?;
736            } else {
737                parser.consume();
738                result.push(ch)
739            }
740        }
741        Err(syntax_error(loc, "Found unterminated string."))
742    }
743
744    /// Parse a string.
745    fn parse_esc_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
746        // Parse every character until we encounter the specified terminal.
747        let mut result = String::with_capacity(CAPACITY);
748        let loc = parser.loc();
749        while !parser.is_at_eof() {
750            let ch = parser.peek();
751            if ch == terminal {
752                parser.consume();
753                return Ok(result);
754            } else if ch < '\x20' {
755                // Low control code.
756                return Err(syntax_error(
757                    parser.loc(),
758                    &format!(
759                        "Control characters are not permitted in strings: '{:?}'",
760                        ch
761                    ),
762                ));
763            } else if ch == self.escape_char {
764                // Process an escape.
765                parser.consume();
766                self.parse_escape(parser, &mut result)?;
767            } else {
768                parser.consume();
769                result.push(ch)
770            }
771        }
772        Err(syntax_error(loc, "Found unterminated string."))
773    }
774
775    /// Parse a string.
776    fn parse_con_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
777        // Parse every character until we encounter the specified terminal.
778        let mut result = String::with_capacity(CAPACITY);
779        let loc = parser.loc();
780        while !parser.is_at_eof() {
781            let ch = parser.peek();
782            if ch == terminal {
783                parser.consume();
784                return Ok(result);
785            } else {
786                parser.consume();
787                result.push(ch)
788            }
789        }
790        Err(syntax_error(loc, "Found unterminated string."))
791    }
792
793    /// Parse a string.
794    fn parse_ter(&self, parser: &mut ParserCore, terminal: char) -> ParseResult<String> {
795        // Parse every character until we encounter the specified terminal.
796        let mut result = String::with_capacity(CAPACITY);
797        let loc = parser.loc();
798        while !parser.is_at_eof() {
799            let ch = parser.peek();
800            if ch == terminal {
801                parser.consume();
802                return Ok(result);
803            } else if ch < '\x20' {
804                // Low control code.
805                return Err(syntax_error(
806                    parser.loc(),
807                    &format!(
808                        "Control characters are not permitted in strings: '{:?}'",
809                        ch
810                    ),
811                ));
812            } else {
813                parser.consume();
814                result.push(ch)
815            }
816        }
817        Err(syntax_error(loc, "Found unterminated string."))
818    }
819
820    // Methods that do not require a terminal character.
821
822    fn read_c(&self, parser: &mut ParserCore) -> ParseResult<String> {
823        Ok(parser.take_while(|_| true))
824    }
825
826    fn read_ce(&self, parser: &mut ParserCore) -> ParseResult<String> {
827        let mut result = String::with_capacity(CAPACITY);
828        while !parser.is_at_eof() {
829            let ch = parser.peek();
830            parser.consume();
831            if ch == self.escape_char {
832                self.parse_escape(parser, &mut result)?
833            } else {
834                result.push(ch)
835            }
836        }
837        Ok(result)
838    }
839
840    fn read(&self, parser: &mut ParserCore) -> ParseResult<String> {
841        let result = parser.take_while(|ch| ch >= '\x20');
842        if parser.is_at_eof() {
843            Ok(result)
844        } else {
845            let ch = parser.peek();
846            Err(syntax_error(
847                parser.loc(),
848                &format!(
849                    "Control characters are not permitted in strings: '{:?}'",
850                    ch
851                ),
852            ))
853        }
854    }
855
856    fn read_e(&self, parser: &mut ParserCore) -> ParseResult<String> {
857        let mut result = String::with_capacity(CAPACITY);
858        while !parser.is_at_eof() {
859            let ch = parser.peek();
860            if ch == self.escape_char {
861                parser.consume();
862                self.parse_escape(parser, &mut result)?
863            } else if ch < '\x20' {
864                return Err(syntax_error(
865                    parser.loc(),
866                    &format!(
867                        "Control characters are not permitted in strings: '{:?}'",
868                        ch
869                    ),
870                ));
871            } else {
872                parser.consume();
873                result.push(ch)
874            }
875        }
876        Ok(result)
877    }
878
879    /// Parse a string from the given parser.  The `terminal` specifies a terminal character
880    /// that ends the string.  If the terminal is `None`, then *everything* is parsed as part
881    /// of the string until the end of stream is reached.
882    ///
883    /// If a terminal is specified (is not `None`) but is not found, an error is generated.
884    ///
885    pub fn process(&self, parser: &mut ParserCore, terminal: Option<char>) -> ParseResult<String> {
886        match terminal {
887            None => {
888                if self.enable_escapes {
889                    if self.permit_low_control_characters {
890                        self.read_ce(parser)
891                    } else {
892                        self.read_e(parser)
893                    }
894                } else if self.permit_low_control_characters {
895                    self.read_c(parser)
896                } else {
897                    self.read(parser)
898                }
899            }
900            Some(terminal) => {
901                if self.enable_escapes {
902                    if self.permit_low_control_characters {
903                        self.parse_esc_con_ter(parser, terminal)
904                    } else {
905                        self.parse_esc_ter(parser, terminal)
906                    }
907                } else if self.permit_low_control_characters {
908                    self.parse_con_ter(parser, terminal)
909                } else {
910                    self.parse_ter(parser, terminal)
911                }
912            }
913        }
914    }
915
916    /// Parse a string from the given value.  The entire string is parsed.
917    pub fn parse_string(&self, value: &str) -> ParseResult<String> {
918        let decoder = Decode::new(value.bytes().collect());
919        let mut parser = ParserCore::new("<string>", decoder);
920        self.process(&mut parser, None)
921    }
922}
923
924impl Default for StringParser {
925    /// Make and return a new string parser.  The initial parsing mode is set to Rust.
926    fn default() -> Self {
927        Self::new()
928    }
929}
930
#[cfg(test)]
mod test {
    use std::collections::BTreeMap;

    use super::StringParser;
    use crate::parse_from_string;
    use crate::strings::{EscapeType, IllegalUnicodeProtocol, UnknownEscapeProtocol};

    // Basic sanity checks.  For more testing, see the strings_test module.

    /// Make a parser from `StringParser::new` with the two switches exercised by the
    /// table-driven tests set explicitly.
    fn parser_with(escapes: bool, low_controls: bool) -> StringParser {
        let mut sp = StringParser::new();
        sp.enable_escapes = escapes;
        sp.permit_low_control_characters = low_controls;
        sp
    }

    /// Run every row of `table` through `sp`.  A row is the input text, the optional
    /// terminal, and the expected output; an empty expected output marks a row that must
    /// produce an error.
    fn check_table(sp: &StringParser, table: &[(&str, Option<char>, &str)]) {
        for (input, terminal, expected) in table {
            let mut parser = parse_from_string(input);
            let outcome = sp.process(parser.borrow_core(), *terminal);
            if expected.is_empty() {
                assert!(outcome.is_err());
                continue;
            }
            assert_eq!(&outcome.unwrap(), expected)
        }
    }

    /// Escapes off, low control characters rejected.
    #[test]
    fn simple_test() {
        let sp = parser_with(false, false);
        check_table(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\\n."),
                ("This is a control code\x02.", None, ""),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\\n.",
                ),
                ("This is a control code\x02.\"", Some('"'), ""),
            ],
        );
    }

    /// Escapes off, low control characters accepted.
    #[test]
    fn control_test() {
        let sp = parser_with(false, true);
        check_table(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\\n."),
                (
                    "This is a control code\x02.",
                    None,
                    "This is a control code\x02.",
                ),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\\n.",
                ),
                (
                    "This is a control code\x02.\"",
                    Some('"'),
                    "This is a control code\x02.",
                ),
            ],
        );
    }

    /// Escapes on, low control characters rejected.
    #[test]
    fn escape_test() {
        let sp = parser_with(true, false);
        check_table(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\n."),
                ("This is a control code\x02.", None, ""),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\n.",
                ),
                ("This is a control code\x02.\"", Some('"'), ""),
            ],
        );
    }

    /// A custom escape table, surrogate pairs, and the error protocols.
    #[test]
    fn odd_escapes_test() {
        let escapes = BTreeMap::from([
            ('\n', EscapeType::Discard),
            ('\\', EscapeType::Char('\\')),
            ('\'', EscapeType::Char('\'')),
            ('\"', EscapeType::Char('\"')),
            ('a', EscapeType::Char('\x07')),
            ('b', EscapeType::Char('\x08')),
            ('f', EscapeType::Char('\x0c')),
            ('n', EscapeType::Char('\n')),
            ('r', EscapeType::Char('\r')),
            ('t', EscapeType::Char('\t')),
            ('v', EscapeType::Char('\x0b')),
            ('x', EscapeType::NakedByte),
            ('N', EscapeType::BracketUNamed),
            ('u', EscapeType::NakedU4),
            ('U', EscapeType::NakedU8),
            ('z', EscapeType::Char('0')),
            ('å', EscapeType::Discard),
        ]);
        let mut sp = parser_with(true, true);
        sp.allow_surrogate_pairs = true;
        sp.unknown_escape_protocol = UnknownEscapeProtocol::Error;
        sp.illegal_unicode_protocol = IllegalUnicodeProtocol::Error;
        sp.set_escapes(escapes);

        // Parse one unterminated input with the parser configured above.
        let run = |text: &str| {
            let mut parser = parse_from_string(text);
            sp.process(parser.borrow_core(), None)
        };

        // Every configured single-character escape, including the discarded one.
        assert_eq!(
            run(r#"A very \\escaped\\ string. \'\"\a\b\f\n\r\t\v\z\å\z"#).unwrap(),
            "A very \\escaped\\ string. '\"\u{7}\u{8}\u{c}\n\r\t\u{b}00"
        );

        // A well-formed surrogate pair combines into one character.
        assert_eq!(run(r#"\ud801\udce0"#).unwrap(), "𐓠");

        // A leading surrogate followed by an escape that is not a surrogate is an error.
        assert!(run(r#"\ud801\u002e"#).is_err());

        // A leading surrogate followed by a plain character is an error.
        let result = run(r#"\ud801*"#);
        println!("{:?}", result);
        assert!(result.is_err());

        // An escape that is not in the table is an error under the `Error` protocol.
        assert!(run(r#"\ß"#).is_err());
    }

    /// Escapes on, low control characters accepted.
    #[test]
    fn control_escape_test() {
        let sp = parser_with(true, true);
        check_table(
            &sp,
            &[
                (
                    r#"This is a simple string."#,
                    None,
                    "This is a simple string.",
                ),
                (r#"This is an escape\n."#, None, "This is an escape\n."),
                (
                    "This is a control code\x02.",
                    None,
                    "This is a control code\x02.",
                ),
                (
                    r#"This is a simple string.""#,
                    Some('"'),
                    "This is a simple string.",
                ),
                (r#"This is a simple string."#, Some('"'), ""),
                (
                    r#"This is an escape\n.""#,
                    Some('"'),
                    "This is an escape\n.",
                ),
                (
                    "This is a control code\x02.\"",
                    Some('"'),
                    "This is a control code\x02.",
                ),
            ],
        );
    }
}
trivet/strings/decoder.rs

trivet/strings/
decoder.rs