cel_parser/
parse.rs

1use std::iter::Enumerate;
2use std::num::ParseIntError;
3use std::str::Chars;
4
/// Error returned by [`parse_string`] and [`parse_bytes`] when a literal cannot be decoded.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so callers can print or
/// propagate it like any other error.
#[derive(Debug, PartialEq)]
pub enum ParseSequenceError {
    /// A backslash introduced an escape that is unknown, incomplete, or malformed.
    InvalidEscape {
        /// The offending escape text as reported by the parser.
        escape: String,
        /// Character (not byte) index in `string` at which the problem was detected.
        index: usize,
        /// The complete input that was being parsed.
        string: String,
    },
    /// A hexadecimal or octal escape did not decode to a character.
    InvalidUnicode {
        /// Why the digits of the escape could not be turned into a character.
        source: ParseUnicodeError,
        /// Character (not byte) index in `string` of the escape's selector character.
        index: usize,
        /// The complete input that was being parsed.
        string: String,
    },
    /// Content was found outside of any quotes, or the input does not start with a quote.
    MissingOpeningQuote,
    /// The input ended while a quote was still open.
    MissingClosingQuote,
}

impl std::fmt::Display for ParseSequenceError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidEscape {
                escape,
                index,
                string,
            } => write!(f, "invalid escape {escape} at {index} in {string}"),
            Self::InvalidUnicode {
                source,
                index,
                string,
            } => write!(
                f,
                "escape at {index} in {string} could not be parsed: {source}"
            ),
            Self::MissingOpeningQuote => f.write_str("missing opening quote"),
            Self::MissingClosingQuote => f.write_str("missing closing quote"),
        }
    }
}

impl std::error::Error for ParseSequenceError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::InvalidUnicode { source, .. } => Some(source),
            Self::InvalidEscape { .. } | Self::MissingOpeningQuote | Self::MissingClosingQuote => {
                None
            }
        }
    }
}

/// Source error type of [`ParseSequenceError::InvalidUnicode`].
#[derive(Debug, PartialEq, Clone)]
pub enum ParseUnicodeError {
    /// The digits of a `\x`, `\u` or `\U` escape are not a valid hexadecimal number.
    Hex {
        /// The integer parsing failure.
        source: ParseIntError,
        /// The characters that were read as the escape's digits.
        string: String,
    },
    /// The digits of an octal escape are not a valid octal number.
    Oct {
        /// The integer parsing failure.
        source: ParseIntError,
        /// The characters that were read as the escape's digits.
        string: String,
    },
    /// The number was parsed but is not an acceptable character value.
    Unicode {
        /// The rejected numeric value.
        value: u32,
    },
}

impl std::fmt::Display for ParseUnicodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Hex { source, string } => {
                write!(f, "could not parse {string} as u32 hex: {source}")
            }
            Self::Oct { source, string } => {
                write!(f, "could not parse {string} as u32 octal: {source}")
            }
            Self::Unicode { value } => write!(f, "could not parse {value} as a unicode char"),
        }
    }
}

impl std::error::Error for ParseUnicodeError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Hex { source, .. } | Self::Oct { source, .. } => Some(source),
            Self::Unicode { .. } => None,
        }
    }
}
44
45pub fn parse_bytes(s: &str) -> Result<Vec<u8>, ParseSequenceError> {
46    let mut chars = s.chars().enumerate();
47    let mut res: Vec<u8> = Vec::with_capacity(s.len());
48
49    while let Some((idx, c)) = chars.next() {
50        if c == '\\' {
51            match chars.next() {
52                None => {
53                    return Err(ParseSequenceError::InvalidEscape {
54                        escape: format!("{c}"),
55                        index: idx,
56                        string: String::from(s),
57                    });
58                }
59                Some((idx, c2)) => {
60                    let byte: u8 = match c2 {
61                        'x' => {
62                            let hex: String = [
63                                chars
64                                    .next()
65                                    .ok_or(ParseSequenceError::InvalidEscape {
66                                        escape: "\\x".to_string(),
67                                        index: idx,
68                                        string: s.to_string(),
69                                    })?
70                                    .1,
71                                chars
72                                    .next()
73                                    .ok_or(ParseSequenceError::InvalidEscape {
74                                        escape: "\\x".to_string(),
75                                        index: idx,
76                                        string: s.to_string(),
77                                    })?
78                                    .1,
79                            ]
80                            .iter()
81                            .collect();
82                            u8::from_str_radix(&hex, 16).map_err(|_| {
83                                ParseSequenceError::InvalidEscape {
84                                    escape: hex,
85                                    index: idx,
86                                    string: s.to_string(),
87                                }
88                            })?
89                        }
90                        n if ('0'..='3').contains(&n) => {
91                            let octal: String = [
92                                n,
93                                chars
94                                    .next()
95                                    .ok_or(ParseSequenceError::InvalidEscape {
96                                        escape: format!("\\{n}"),
97                                        index: idx,
98                                        string: s.to_string(),
99                                    })?
100                                    .1,
101                                chars
102                                    .next()
103                                    .ok_or(ParseSequenceError::InvalidEscape {
104                                        escape: format!("\\{n}"),
105                                        index: idx,
106                                        string: s.to_string(),
107                                    })?
108                                    .1,
109                            ]
110                            .iter()
111                            .collect();
112                            u8::from_str_radix(&octal, 8).map_err(|_| {
113                                ParseSequenceError::InvalidEscape {
114                                    escape: octal,
115                                    index: idx,
116                                    string: s.to_string(),
117                                }
118                            })?
119                        }
120                        _ => {
121                            return Err(ParseSequenceError::InvalidEscape {
122                                escape: format!("{c}{c2}"),
123                                index: idx,
124                                string: String::from(s),
125                            });
126                        }
127                    };
128
129                    res.push(byte);
130                    continue;
131                }
132            };
133        }
134        let size = c.len_utf8();
135        let mut buffer = [0; 4];
136        c.encode_utf8(&mut buffer);
137        res.extend_from_slice(&buffer[..size]);
138    }
139    Ok(res)
140}
141
142/// Parse the provided quoted string.
143/// This function was adopted from [snailquote](https://docs.rs/snailquote/latest/snailquote/).
144///
145/// # Details
146///
147/// Parses a single or double quoted string and interprets escape sequences such as
148/// '\n', '\r', '\'', etc.
149///
150/// Supports raw strings prefixed with `r` or `R` in which case all escape sequences are ignored.///
151///
152/// The full set of supported escapes between quotes may be found below:
153///
154/// | Escape     | Code       | Description                              |
155/// |------------|------------|------------------------------------------|
156/// | \a         | 0x07       | Bell                                     |
157/// | \b         | 0x08       | Backspace                                |
158/// | \v         | 0x0B       | Vertical tab                             |
159/// | \f         | 0x0C       | Form feed                                |
160/// | \n         | 0x0A       | Newline                                  |
161/// | \r         | 0x0D       | Carriage return                          |
162/// | \t         | 0x09       | Tab                                      |
163/// | \\         | 0x5C       | Backslash                                |
164/// | \?         | 0x??       | Question mark                            |
165/// | \"         | 0x22       | Double quote                             |
166/// | \'         | 0x27       | Single quote                             |
167/// | \`         | 0x60       | Backtick                                 |
168/// | \xDD       | 0xDD       | Unicode character with hex code DD       |
169/// | \uDDDD     | 0xDDDD     | Unicode character with hex code DDDD     |
170/// | \UDDDDDDDD | 0xDDDDDDDD | Unicode character with hex code DDDDDDDD |
171/// | \DDD       | 0DDD       | Unicode character with octal code DDD    |
172///
173/// # Errors
174///
175/// The returned result can display a human readable error if the string cannot be parsed as a
176/// valid quoted string.
177pub fn parse_string(s: &str) -> Result<String, ParseSequenceError> {
178    let mut chars = s.chars().enumerate();
179    let res = String::with_capacity(s.len());
180
181    match chars.next() {
182        Some((_, c)) if c == 'r' || c == 'R' => parse_raw_string(&mut chars, res),
183        Some((_, c)) if c == '\'' || c == '"' => parse_quoted_string(s, &mut chars, res, c),
184        _ => Err(ParseSequenceError::MissingOpeningQuote),
185    }
186}
187
188fn parse_raw_string(
189    chars: &mut Enumerate<Chars>,
190    mut res: String,
191) -> Result<String, ParseSequenceError> {
192    let mut in_single_quotes = false;
193    let mut in_double_quotes = false;
194
195    while let Some((_, c)) = chars.next() {
196        let in_quotes = in_single_quotes || in_double_quotes;
197
198        if c == '\\' && in_quotes {
199            match chars.next() {
200                Some((_, c2)) => {
201                    match c2 {
202                        '"' => {
203                            if in_single_quotes {
204                                res.push(c);
205                            }
206                        }
207                        '\'' => {
208                            if in_double_quotes {
209                                res.push(c);
210                            }
211                        }
212                        _ => {
213                            res.push(c);
214                        }
215                    };
216                    res.push(c2);
217                    continue;
218                }
219                _ => {
220                    res.push(c);
221                    continue;
222                }
223            };
224        } else if c == '\'' {
225            if in_double_quotes {
226                res.push(c);
227                continue;
228            }
229
230            in_single_quotes = !in_single_quotes;
231            continue;
232        } else if c == '"' {
233            if in_single_quotes {
234                res.push(c);
235                continue;
236            }
237
238            in_double_quotes = !in_double_quotes;
239            continue;
240        } else if !in_quotes {
241            return Err(ParseSequenceError::MissingOpeningQuote);
242        }
243
244        res.push(c);
245    }
246
247    Ok(res)
248}
249
250fn parse_quoted_string(
251    s: &str,
252    mut chars: &mut Enumerate<Chars>,
253    mut res: String,
254    quote: char,
255) -> Result<String, ParseSequenceError> {
256    let mut in_single_quotes = quote == '\'';
257    let mut in_double_quotes = quote == '"';
258
259    while let Some((idx, c)) = chars.next() {
260        let in_quotes = in_single_quotes || in_double_quotes;
261
262        if c == '\\' && in_quotes {
263            match chars.next() {
264                None => {
265                    return Err(ParseSequenceError::InvalidEscape {
266                        escape: format!("{c}"),
267                        index: idx,
268                        string: String::from(s),
269                    });
270                }
271                Some((idx, c2)) => {
272                    let mut push_escape_character = false;
273
274                    let value = match c2 {
275                        'a' => '\u{07}',
276                        'b' => '\u{08}',
277                        'v' => '\u{0B}',
278                        'f' => '\u{0C}',
279                        'n' => '\n',
280                        'r' => '\r',
281                        't' => '\t',
282                        '\\' => c2,
283                        '?' => c2,
284                        '\'' => {
285                            push_escape_character = in_double_quotes;
286                            c2
287                        }
288                        '"' => {
289                            push_escape_character = in_single_quotes;
290                            c2
291                        }
292                        '`' => c2,
293                        'x' | 'u' | 'U' => {
294                            let length = match c2 {
295                                'x' => 2,
296                                'u' => 4,
297                                'U' => 8,
298                                _ => unreachable!(),
299                            };
300
301                            parse_unicode_hex(length, &mut chars).map_err(|x| {
302                                ParseSequenceError::InvalidUnicode {
303                                    source: x.clone(),
304                                    index: idx,
305                                    string: String::from(s),
306                                }
307                            })?
308                        }
309                        n if ('0'..='3').contains(&n) => parse_unicode_oct(&n, &mut chars)
310                            .map_err(|x| ParseSequenceError::InvalidUnicode {
311                                source: x.clone(),
312                                index: idx,
313                                string: String::from(s),
314                            })?,
315                        _ => {
316                            return Err(ParseSequenceError::InvalidEscape {
317                                escape: format!("{c}{c2}"),
318                                index: idx,
319                                string: String::from(s),
320                            });
321                        }
322                    };
323
324                    if push_escape_character {
325                        res.push(c);
326                    }
327
328                    res.push(value);
329
330                    continue;
331                }
332            };
333        } else if c == '\'' {
334            if in_double_quotes {
335                res.push(c);
336                continue;
337            }
338
339            in_single_quotes = !in_single_quotes;
340            continue;
341        } else if c == '"' {
342            if in_single_quotes {
343                res.push(c);
344                continue;
345            }
346
347            in_double_quotes = !in_double_quotes;
348            continue;
349        } else if !in_quotes {
350            return Err(ParseSequenceError::MissingOpeningQuote);
351        }
352
353        res.push(c);
354    }
355
356    // Ensure string has a closing quote
357    if in_single_quotes || in_double_quotes {
358        return Err(ParseSequenceError::MissingClosingQuote);
359    }
360
361    Ok(res)
362}
363
364fn parse_unicode_hex<I>(length: usize, chars: &mut I) -> Result<char, ParseUnicodeError>
365where
366    I: Iterator<Item = (usize, char)>,
367{
368    let unicode_seq: String = chars.take(length).map(|(_, c)| c).collect();
369
370    u32::from_str_radix(&unicode_seq, 16)
371        .map_err(|e| ParseUnicodeError::Hex {
372            source: e,
373            string: unicode_seq,
374        })
375        .and_then(|u| char::from_u32(u).ok_or(ParseUnicodeError::Unicode { value: u }))
376}
377
378fn parse_unicode_oct<I>(first_char: &char, chars: &mut I) -> Result<char, ParseUnicodeError>
379where
380    I: Iterator<Item = (usize, char)>,
381{
382    let mut unicode_seq: String = String::with_capacity(3);
383    unicode_seq.push(*first_char);
384    chars.take(2).for_each(|(_, c)| unicode_seq.push(c));
385
386    u32::from_str_radix(&unicode_seq, 8)
387        .map_err(|e| ParseUnicodeError::Oct {
388            source: e,
389            string: unicode_seq,
390        })
391        .and_then(|u| {
392            if u <= 255 {
393                char::from_u32(u).ok_or(ParseUnicodeError::Unicode { value: u })
394            } else {
395                Err(ParseUnicodeError::Unicode { value: u })
396            }
397        })
398}
399
#[cfg(test)]
mod tests {
    use super::{parse_bytes, parse_string, ParseSequenceError};

    /// Expected outcome of a `parse_string` call.
    type Expected = Result<String, ParseSequenceError>;

    /// Shorthand for a successful parse yielding `text`.
    fn ok(text: &str) -> Expected {
        Ok(String::from(text))
    }

    /// Runs every case through `parse_string`, naming the failing input on mismatch.
    fn assert_parses(cases: Vec<(&str, Expected)>) {
        for (s, expected) in cases {
            assert_eq!(parse_string(s), expected, "Testing {s}");
        }
    }

    #[test]
    fn single_quotes_interprets_escapes() {
        assert_parses(vec![
            ("'Hello \\a'", ok("Hello \u{07}")),
            ("'Hello \\b'", ok("Hello \u{08}")),
            ("'Hello \\v'", ok("Hello \u{0b}")),
            ("'Hello \\f'", ok("Hello \u{0c}")),
            ("'Hello \\n'", ok("Hello \u{0a}")),
            ("'Hello \\r'", ok("Hello \u{0d}")),
            ("'Hello \\t'", ok("Hello \u{09}")),
            ("'Hello \\\\'", ok("Hello \\")),
            ("'Hello \\?'", ok("Hello ?")),
            ("'Hello \"'", ok("Hello \"")),
            ("'Hello \\''", ok("Hello '")),
            ("'Hello \\`'", ok("Hello `")),
            ("'Hello \\x20'", ok("Hello \u{20}")),
            ("'Hello \\u270c'", ok("Hello ✌")),
            ("'Hello \\U0001f431'", ok("Hello 🐱")),
            ("'Hello \\040'", ok("Hello \u{20}")),
            (
                "Missing opening quote'",
                Err(ParseSequenceError::MissingOpeningQuote),
            ),
            (
                "'Missing closing quote",
                Err(ParseSequenceError::MissingClosingQuote),
            ),
            // Testing octal value is out of range
            (
                "'\\440'",
                Err(ParseSequenceError::InvalidEscape {
                    escape: String::from("\\4"),
                    index: 2,
                    string: String::from("'\\440'"),
                }),
            ),
        ]);
    }

    #[test]
    fn double_quotes_interprets_escapes() {
        assert_parses(vec![
            ("\"Hello \\a\"", ok("Hello \u{07}")),
            ("\"Hello \\b\"", ok("Hello \u{08}")),
            ("\"Hello \\v\"", ok("Hello \u{0b}")),
            ("\"Hello \\f\"", ok("Hello \u{0c}")),
            ("\"Hello \\n\"", ok("Hello \u{0a}")),
            ("\"Hello \\r\"", ok("Hello \u{0d}")),
            ("\"Hello \\t\"", ok("Hello \u{09}")),
            ("\"Hello \\\\\"", ok("Hello \\")),
            ("\"Hello \\?\"", ok("Hello ?")),
            ("\"Hello \\\"\"", ok("Hello \"")),
            // An escaped single quote keeps its backslash inside double quotes.
            ("\"Hello \\'\"", ok("Hello \\'")),
            ("\"Hello \\`\"", ok("Hello `")),
            ("\"Hello \\x20 \"", ok("Hello \u{20} ")),
            ("\"Hello \\x60\"", ok("Hello `")),
            ("\"Hello \\u270c\"", ok("Hello ✌")),
            ("\"Hello \\U0001f431\"", ok("Hello 🐱")),
            ("\"Hello \\040\"", ok("Hello \u{20}")),
            (
                "Missing opening quote\"",
                Err(ParseSequenceError::MissingOpeningQuote),
            ),
            (
                "\"Missing closing quote",
                Err(ParseSequenceError::MissingClosingQuote),
            ),
            // Testing octal value is out of range
            (
                "\"\\440\"",
                Err(ParseSequenceError::InvalidEscape {
                    escape: String::from("\\4"),
                    index: 2,
                    string: String::from("\"\\440\""),
                }),
            ),
        ]);
    }

    #[test]
    fn raw_string_does_not_interpret_escapes() {
        assert_parses(vec![
            // Raw string in double quotes
            // r"Hello \a \" ' \' \U0001f431 " => Hello \a " ' \' \U0001f431
            // R"Hello \a \" ' \' \U0001f431 " => Hello \a " ' \' \U0001f431
            (
                "r\"Hello \\a \\\" ' \\' \\U0001f431 \"",
                ok("Hello \\a \" ' \\' \\U0001f431 "),
            ),
            (
                "R\"Hello \\a \\\" ' \\' \\U0001f431 \"",
                ok("Hello \\a \" ' \\' \\U0001f431 "),
            ),
            // Raw string in single quotes
            // r'Hello \a \" " \' \U0001f431 ' => Hello \a \" " ' \U0001f431
            // R'Hello \a \" " \' \U0001f431 ' => Hello \a \" " ' \U0001f431
            (
                "r'Hello \\a \\\" \" \\' \\U0001f431 '",
                ok("Hello \\a \\\" \" ' \\U0001f431 "),
            ),
            (
                "R'Hello \\a \\\" \" \\' \\U0001f431 '",
                ok("Hello \\a \\\" \" ' \\U0001f431 "),
            ),
        ]);
    }

    #[test]
    fn parses_bytes() {
        let bytes = parse_bytes("abc💖\\xFF\\376").expect("Must parse!");
        assert_eq!([97, 98, 99, 240, 159, 146, 150, 255, 254], *bytes)
    }

    #[test]
    fn parse_bytes_rejects_invalid_escapes() {
        // (input, reported escape, reported index)
        let cases = [
            ("\\", "\\", 0),
            ("\\q", "\\q", 1),
            ("\\x1", "\\x", 1),
            ("\\xZZ", "ZZ", 1),
            ("\\1", "\\1", 1),
        ];

        for (input, escape, index) in cases {
            assert_eq!(
                parse_bytes(input),
                Err(ParseSequenceError::InvalidEscape {
                    escape: String::from(escape),
                    index,
                    string: String::from(input),
                }),
                "Testing {input}"
            );
        }
    }
}
529        assert_eq!([97, 98, 99, 240, 159, 146, 150, 255, 254], *bytes)
530    }
531}