cel_parser/
parse.rs

1use std::iter::Enumerate;
2use std::num::ParseIntError;
3use std::str::Chars;
4
/// Error returned by [`parse_string`] and [`parse_bytes`] when the input is not a valid
/// literal.
///
/// Every `index` is a character offset (not a byte offset) into `string`, and `string` is a
/// copy of the complete input that was being parsed.
#[derive(Debug, PartialEq)]
pub enum ParseSequenceError {
    /// A symbol that is not allowed in the input. No parser in this file currently
    /// constructs this variant.
    InvalidSymbol {
        symbol: String,
        index: usize,
        string: String,
    },
    /// A backslash that does not start a supported escape sequence, or an escape sequence
    /// that is cut short or contains invalid digits.
    InvalidEscape {
        escape: String,
        index: usize,
        string: String,
    },
    /// A `\x`, `\u`, `\U` or octal escape inside a quoted string whose digits could not be
    /// converted into a `char`; `source` carries the underlying reason.
    InvalidUnicode {
        source: ParseUnicodeError,
        index: usize,
        string: String,
    },
    /// The input does not start with a quote, or text was found outside of any quotes.
    MissingOpeningQuote,
    /// The input ended while a quote was still open.
    MissingClosingQuote,
}

impl std::fmt::Display for ParseSequenceError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidSymbol {
                symbol,
                index,
                string,
            } => write!(f, "invalid symbol {symbol} at {index} in {string}"),
            Self::InvalidEscape {
                escape,
                index,
                string,
            } => write!(f, "invalid escape {escape} at {index} in {string}"),
            Self::InvalidUnicode {
                source,
                index,
                string,
            } => write!(
                f,
                "escape could not be parsed at {index} in {string}: {source}"
            ),
            Self::MissingOpeningQuote => write!(f, "missing opening quote"),
            Self::MissingClosingQuote => write!(f, "missing closing quote"),
        }
    }
}

impl std::error::Error for ParseSequenceError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::InvalidUnicode { source, .. } => Some(source),
            _ => None,
        }
    }
}

/// Source error type of [`ParseSequenceError::InvalidUnicode`].
#[derive(Debug, PartialEq, Clone)]
pub enum ParseUnicodeError {
    /// `string` could not be parsed as a hexadecimal `u32`.
    ParseHexFailed {
        source: ParseIntError,
        string: String,
    },
    /// `string` could not be parsed as an octal `u32`.
    ParseOctFailed {
        source: ParseIntError,
        string: String,
    },
    /// The digits were a valid number, but `value` is not an accepted code point.
    ParseUnicodeFailed {
        value: u32,
    },
}

impl std::fmt::Display for ParseUnicodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ParseHexFailed { source, string } => {
                write!(f, "could not parse {string} as u32 hex: {source}")
            }
            Self::ParseOctFailed { source, string } => {
                write!(f, "could not parse {string} as u32 octal: {source}")
            }
            Self::ParseUnicodeFailed { value } => {
                write!(f, "could not parse {value} as a unicode char")
            }
        }
    }
}

impl std::error::Error for ParseUnicodeError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::ParseHexFailed { source, .. } | Self::ParseOctFailed { source, .. } => {
                Some(source)
            }
            Self::ParseUnicodeFailed { .. } => None,
        }
    }
}
49
50pub fn parse_bytes(s: &str) -> Result<Vec<u8>, ParseSequenceError> {
51    let mut chars = s.chars().enumerate();
52    let mut res: Vec<u8> = Vec::with_capacity(s.len());
53
54    while let Some((idx, c)) = chars.next() {
55        if c == '\\' {
56            match chars.next() {
57                None => {
58                    return Err(ParseSequenceError::InvalidEscape {
59                        escape: format!("{}", c),
60                        index: idx,
61                        string: String::from(s),
62                    });
63                }
64                Some((idx, c2)) => {
65                    let byte: u8 = match c2 {
66                        'x' => {
67                            let hex: String = [
68                                chars
69                                    .next()
70                                    .ok_or(ParseSequenceError::InvalidEscape {
71                                        escape: "\\x".to_string(),
72                                        index: idx,
73                                        string: s.to_string(),
74                                    })?
75                                    .1,
76                                chars
77                                    .next()
78                                    .ok_or(ParseSequenceError::InvalidEscape {
79                                        escape: "\\x".to_string(),
80                                        index: idx,
81                                        string: s.to_string(),
82                                    })?
83                                    .1,
84                            ]
85                            .iter()
86                            .collect();
87                            u8::from_str_radix(&hex, 16).map_err(|_| {
88                                ParseSequenceError::InvalidEscape {
89                                    escape: hex,
90                                    index: idx,
91                                    string: s.to_string(),
92                                }
93                            })?
94                        }
95                        n if ('0'..='3').contains(&n) => {
96                            let octal: String = [
97                                n,
98                                chars
99                                    .next()
100                                    .ok_or(ParseSequenceError::InvalidEscape {
101                                        escape: format!("\\{n}"),
102                                        index: idx,
103                                        string: s.to_string(),
104                                    })?
105                                    .1,
106                                chars
107                                    .next()
108                                    .ok_or(ParseSequenceError::InvalidEscape {
109                                        escape: format!("\\{n}"),
110                                        index: idx,
111                                        string: s.to_string(),
112                                    })?
113                                    .1,
114                            ]
115                            .iter()
116                            .collect();
117                            u8::from_str_radix(&octal, 8).map_err(|_| {
118                                ParseSequenceError::InvalidEscape {
119                                    escape: octal,
120                                    index: idx,
121                                    string: s.to_string(),
122                                }
123                            })?
124                        }
125                        _ => {
126                            return Err(ParseSequenceError::InvalidEscape {
127                                escape: format!("{}{}", c, c2),
128                                index: idx,
129                                string: String::from(s),
130                            });
131                        }
132                    };
133
134                    res.push(byte);
135                    continue;
136                }
137            };
138        }
139        let size = c.len_utf8();
140        let mut buffer = [0; 4];
141        c.encode_utf8(&mut buffer);
142        res.extend_from_slice(&buffer[..size]);
143    }
144    Ok(res)
145}
146
147/// Parse the provided quoted string.
148/// This function was adopted from [snailquote](https://docs.rs/snailquote/latest/snailquote/).
149///
150/// # Details
151///
152/// Parses a single or double quoted string and interprets escape sequences such as
153/// '\n', '\r', '\'', etc.
154///
155/// Supports raw strings prefixed with `r` or `R` in which case all escape sequences are ignored.///
156///
157/// The full set of supported escapes between quotes may be found below:
158///
159/// | Escape     | Code       | Description                              |
160/// |------------|------------|------------------------------------------|
161/// | \a         | 0x07       | Bell                                     |
162/// | \b         | 0x08       | Backspace                                |
163/// | \v         | 0x0B       | Vertical tab                             |
164/// | \f         | 0x0C       | Form feed                                |
165/// | \n         | 0x0A       | Newline                                  |
166/// | \r         | 0x0D       | Carriage return                          |
167/// | \t         | 0x09       | Tab                                      |
168/// | \\         | 0x5C       | Backslash                                |
169/// | \?         | 0x??       | Question mark                            |
170/// | \"         | 0x22       | Double quote                             |
171/// | \'         | 0x27       | Single quote                             |
172/// | \`         | 0x60       | Backtick                                 |
173/// | \xDD       | 0xDD       | Unicode character with hex code DD       |
174/// | \uDDDD     | 0xDDDD     | Unicode character with hex code DDDD     |
175/// | \UDDDDDDDD | 0xDDDDDDDD | Unicode character with hex code DDDDDDDD |
176/// | \DDD       | 0DDD       | Unicode character with octal code DDD    |
177///
178/// # Errors
179///
180/// The returned result can display a human readable error if the string cannot be parsed as a
181/// valid quoted string.
182pub fn parse_string(s: &str) -> Result<String, ParseSequenceError> {
183    let mut chars = s.chars().enumerate();
184    let res = String::with_capacity(s.len());
185
186    match chars.next() {
187        Some((_, c)) if c == 'r' || c == 'R' => parse_raw_string(&mut chars, res),
188        Some((_, c)) if c == '\'' || c == '"' => parse_quoted_string(s, &mut chars, res, c),
189        _ => Err(ParseSequenceError::MissingOpeningQuote),
190    }
191}
192
193fn parse_raw_string(
194    chars: &mut Enumerate<Chars>,
195    mut res: String,
196) -> Result<String, ParseSequenceError> {
197    let mut in_single_quotes = false;
198    let mut in_double_quotes = false;
199
200    while let Some((_, c)) = chars.next() {
201        let in_quotes = in_single_quotes || in_double_quotes;
202
203        if c == '\\' && in_quotes {
204            match chars.next() {
205                Some((_, c2)) => {
206                    match c2 {
207                        '"' => {
208                            if in_single_quotes {
209                                res.push(c);
210                            }
211                        }
212                        '\'' => {
213                            if in_double_quotes {
214                                res.push(c);
215                            }
216                        }
217                        _ => {
218                            res.push(c);
219                        }
220                    };
221                    res.push(c2);
222                    continue;
223                }
224                _ => {
225                    res.push(c);
226                    continue;
227                }
228            };
229        } else if c == '\'' {
230            if in_double_quotes {
231                res.push(c);
232                continue;
233            }
234
235            in_single_quotes = !in_single_quotes;
236            continue;
237        } else if c == '"' {
238            if in_single_quotes {
239                res.push(c);
240                continue;
241            }
242
243            in_double_quotes = !in_double_quotes;
244            continue;
245        } else if !in_quotes {
246            return Err(ParseSequenceError::MissingOpeningQuote);
247        }
248
249        res.push(c);
250    }
251
252    Ok(res)
253}
254
255fn parse_quoted_string(
256    s: &str,
257    mut chars: &mut Enumerate<Chars>,
258    mut res: String,
259    quote: char,
260) -> Result<String, ParseSequenceError> {
261    let mut in_single_quotes = quote == '\'';
262    let mut in_double_quotes = quote == '"';
263
264    while let Some((idx, c)) = chars.next() {
265        let in_quotes = in_single_quotes || in_double_quotes;
266
267        if c == '\\' && in_quotes {
268            match chars.next() {
269                None => {
270                    return Err(ParseSequenceError::InvalidEscape {
271                        escape: format!("{}", c),
272                        index: idx,
273                        string: String::from(s),
274                    });
275                }
276                Some((idx, c2)) => {
277                    let mut push_escape_character = false;
278
279                    let value = match c2 {
280                        'a' => '\u{07}',
281                        'b' => '\u{08}',
282                        'v' => '\u{0B}',
283                        'f' => '\u{0C}',
284                        'n' => '\n',
285                        'r' => '\r',
286                        't' => '\t',
287                        '\\' => c2,
288                        '?' => c2,
289                        '\'' => {
290                            push_escape_character = in_double_quotes;
291                            c2
292                        }
293                        '"' => {
294                            push_escape_character = in_single_quotes;
295                            c2
296                        }
297                        '`' => c2,
298                        'x' | 'u' | 'U' => {
299                            let length = match c2 {
300                                'x' => 2,
301                                'u' => 4,
302                                'U' => 8,
303                                _ => unreachable!(),
304                            };
305
306                            parse_unicode_hex(length, &mut chars).map_err(|x| {
307                                ParseSequenceError::InvalidUnicode {
308                                    source: x.clone(),
309                                    index: idx,
310                                    string: String::from(s),
311                                }
312                            })?
313                        }
314                        n if ('0'..='3').contains(&n) => parse_unicode_oct(&n, &mut chars)
315                            .map_err(|x| ParseSequenceError::InvalidUnicode {
316                                source: x.clone(),
317                                index: idx,
318                                string: String::from(s),
319                            })?,
320                        _ => {
321                            return Err(ParseSequenceError::InvalidEscape {
322                                escape: format!("{}{}", c, c2),
323                                index: idx,
324                                string: String::from(s),
325                            });
326                        }
327                    };
328
329                    if push_escape_character {
330                        res.push(c);
331                    }
332
333                    res.push(value);
334
335                    continue;
336                }
337            };
338        } else if c == '\'' {
339            if in_double_quotes {
340                res.push(c);
341                continue;
342            }
343
344            in_single_quotes = !in_single_quotes;
345            continue;
346        } else if c == '"' {
347            if in_single_quotes {
348                res.push(c);
349                continue;
350            }
351
352            in_double_quotes = !in_double_quotes;
353            continue;
354        } else if !in_quotes {
355            return Err(ParseSequenceError::MissingOpeningQuote);
356        }
357
358        res.push(c);
359    }
360
361    // Ensure string has a closing quote
362    if in_single_quotes || in_double_quotes {
363        return Err(ParseSequenceError::MissingClosingQuote);
364    }
365
366    Ok(res)
367}
368
369fn parse_unicode_hex<I>(length: usize, chars: &mut I) -> Result<char, ParseUnicodeError>
370where
371    I: Iterator<Item = (usize, char)>,
372{
373    let unicode_seq: String = chars.take(length).map(|(_, c)| c).collect();
374
375    u32::from_str_radix(&unicode_seq, 16)
376        .map_err(|e| ParseUnicodeError::ParseHexFailed {
377            source: e,
378            string: unicode_seq,
379        })
380        .and_then(|u| char::from_u32(u).ok_or(ParseUnicodeError::ParseUnicodeFailed { value: u }))
381}
382
383fn parse_unicode_oct<I>(first_char: &char, chars: &mut I) -> Result<char, ParseUnicodeError>
384where
385    I: Iterator<Item = (usize, char)>,
386{
387    let mut unicode_seq: String = String::with_capacity(3);
388    unicode_seq.push(*first_char);
389    chars.take(2).for_each(|(_, c)| unicode_seq.push(c));
390
391    u32::from_str_radix(&unicode_seq, 8)
392        .map_err(|e| ParseUnicodeError::ParseOctFailed {
393            source: e,
394            string: unicode_seq,
395        })
396        .and_then(|u| {
397            if u <= 255 {
398                char::from_u32(u).ok_or(ParseUnicodeError::ParseUnicodeFailed { value: u })
399            } else {
400                Err(ParseUnicodeError::ParseUnicodeFailed { value: u })
401            }
402        })
403}
404
#[cfg(test)]
mod tests {
    use super::{parse_bytes, parse_string, ParseSequenceError};

    /// One `parse_string` input together with the result it must produce.
    type Case = (&'static str, Result<String, ParseSequenceError>);

    /// Shorthand for the expected value of a successful parse.
    fn ok(expected: &str) -> Result<String, ParseSequenceError> {
        Ok(String::from(expected))
    }

    /// Runs `parse_string` on every case and names the offending input on failure.
    fn check(cases: Vec<Case>) {
        for (input, expected) in cases {
            assert_eq!(parse_string(input), expected, "Testing {}", input);
        }
    }

    #[test]
    fn single_quotes_interprets_escapes() {
        check(vec![
            ("'Hello \\a'", ok("Hello \u{07}")),
            ("'Hello \\b'", ok("Hello \u{08}")),
            ("'Hello \\v'", ok("Hello \u{0b}")),
            ("'Hello \\f'", ok("Hello \u{0c}")),
            ("'Hello \\n'", ok("Hello \u{0a}")),
            ("'Hello \\r'", ok("Hello \u{0d}")),
            ("'Hello \\t'", ok("Hello \u{09}")),
            ("'Hello \\\\'", ok("Hello \\")),
            ("'Hello \\?'", ok("Hello ?")),
            ("'Hello \"'", ok("Hello \"")),
            ("'Hello \\''", ok("Hello '")),
            ("'Hello \\`'", ok("Hello `")),
            ("'Hello \\x20'", ok("Hello  ")),
            ("'Hello \\u270c'", ok("Hello ✌")),
            ("'Hello \\U0001f431'", ok("Hello 🐱")),
            ("'Hello \\040'", ok("Hello  ")),
            (
                "Missing opening quote'",
                Err(ParseSequenceError::MissingOpeningQuote),
            ),
            (
                "'Missing closing quote",
                Err(ParseSequenceError::MissingClosingQuote),
            ),
            // Testing octal value is out of range
            (
                "'\\440'",
                Err(ParseSequenceError::InvalidEscape {
                    escape: String::from("\\4"),
                    index: 2,
                    string: String::from("'\\440'"),
                }),
            ),
        ]);
    }

    #[test]
    fn double_quotes_interprets_escapes() {
        check(vec![
            ("\"Hello \\a\"", ok("Hello \u{07}")),
            ("\"Hello \\b\"", ok("Hello \u{08}")),
            ("\"Hello \\v\"", ok("Hello \u{0b}")),
            ("\"Hello \\f\"", ok("Hello \u{0c}")),
            ("\"Hello \\n\"", ok("Hello \u{0a}")),
            ("\"Hello \\r\"", ok("Hello \u{0d}")),
            ("\"Hello \\t\"", ok("Hello \u{09}")),
            ("\"Hello \\\\\"", ok("Hello \\")),
            ("\"Hello \\?\"", ok("Hello ?")),
            ("\"Hello \\\"\"", ok("Hello \"")),
            // An escaped single quote inside double quotes keeps its backslash.
            ("\"Hello \\'\"", ok("Hello \\'")),
            ("\"Hello \\`\"", ok("Hello `")),
            ("\"Hello \\x20 \"", ok("Hello   ")),
            ("\"Hello \\x60\"", ok("Hello `")),
            ("\"Hello \\u270c\"", ok("Hello ✌")),
            ("\"Hello \\U0001f431\"", ok("Hello 🐱")),
            ("\"Hello \\040\"", ok("Hello  ")),
            (
                "Missing opening quote\"",
                Err(ParseSequenceError::MissingOpeningQuote),
            ),
            (
                "\"Missing closing quote",
                Err(ParseSequenceError::MissingClosingQuote),
            ),
            // Testing octal value is out of range
            (
                "\"\\440\"",
                Err(ParseSequenceError::InvalidEscape {
                    escape: String::from("\\4"),
                    index: 2,
                    string: String::from("\"\\440\""),
                }),
            ),
        ]);
    }

    #[test]
    fn raw_string_does_not_interpret_escapes() {
        check(vec![
            // Raw string in double quotes
            // r"Hello \a \" ' \' \U0001f431 " => Hello \a " ' \' \U0001f431
            // R"Hello \a \" ' \' \U0001f431 " => Hello \a " ' \' \U0001f431
            (
                "r\"Hello \\a \\\" ' \\' \\U0001f431 \"",
                ok("Hello \\a \" ' \\' \\U0001f431 "),
            ),
            (
                "R\"Hello \\a \\\" ' \\' \\U0001f431 \"",
                ok("Hello \\a \" ' \\' \\U0001f431 "),
            ),
            // Raw string in single quotes
            // r'Hello \a \" " \' \U0001f431 ' => Hello \a \" " ' \U0001f431
            // R'Hello \a \" " \' \U0001f431 ' => Hello \a \" " ' \U0001f431
            (
                "r'Hello \\a \\\" \" \\' \\U0001f431 '",
                ok("Hello \\a \\\" \" ' \\U0001f431 "),
            ),
            (
                "R'Hello \\a \\\" \" \\' \\U0001f431 '",
                ok("Hello \\a \\\" \" ' \\U0001f431 "),
            ),
        ]);
    }

    #[test]
    fn parses_bytes() {
        // Plain text is copied as UTF-8; `\xFF` and `\376` each become a single byte.
        let bytes = parse_bytes("abc💖\\xFF\\376").expect("Must parse!");
        assert_eq!([97, 98, 99, 240, 159, 146, 150, 255, 254], *bytes)
    }
}