url_cleaner_engine/glue/parse/js/
string_literal.rs

//! Parsing JavaScript string literals.
2
3use serde::{Serialize, Deserialize};
4use thiserror::Error;
5
6use crate::util::*;
7
8/// The last state of the state machine used to unescape javascript string literal prefixes.
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
10#[serde(deny_unknown_fields)]
11pub enum StringLiteralPrefixLastState {
12    /// Before the start of the string literal.
13    Outside,
14    /// Inside the string literal but nothing else special.
15    ///
16    /// When an escape sequence finishes, returns to this state.
17    Inside,
18    /// The `\` of an escape sequence.
19    Start,
20    /// The first digit of an octal escape sequence.
21    Octal1,
22    /// The second digit of an octal escape sequence.
23    Octal2,
24    /// The `u` in `\xHH`.
25    AsciiHexx,
26    /// The first digit in an ascii escape sequence.
27    AsciiHex1,
28    /// The `u` in `\uHHHH`/`\u{HHHHH}`.
29    UnicodeU,
30    /// The first digit in `\uHHHH`.
31    Unicode41,
32    /// The second digit in `\uHHHH`.
33    Unicode42,
34    /// The third digit in `\uHHHH`.
35    Unicode43,
36    /// The `{` in `\u{HHHHH}`.
37    UnicodeLeftBrace,
38    /// The first digit in `\u{HHHHH}`.
39    Unicode51,
40    /// The second digit in `\u{HHHHH}`.
41    Unicode52,
42    /// The third digit in `\u{HHHHH}`.
43    Unicode53,
44    /// The fourth digit in `\u{HHHHH}`.
45    Unicode54,
46    /// The fifth digit in `\u{HHHHH}`.
47    Unicode55
48}
49
50/// The enum of errors [`string_literal_prefix`] can return.
51#[derive(Debug, Error)]
52pub enum StringLiteralPrefixError {
53    /// Returned when a syntax error is encountered.
54    #[error("A syntax error was encountered.")]
55    SyntaxError {
56        /// The last state of the state machine before the error was encountered.
57        last_state: StringLiteralPrefixLastState,
58        /// The index of the character that triggered the error.
59        i: usize,
60        /// The character that triggered the error.
61        c: char,
62        /// The scratchspace that was calculating the unescaped character.
63        scratchspace: u32,
64        /// The quote being used.
65        quote: char,
66        /// The calculated return value prior to the error.
67        partial: String
68    },
69    /// Returned when an invalid codepoint is encountered.
70    #[error("An invalid codepoint was encountered: {0}.")]
71    InvalidCodepoint(u32)
72}
73
74/// Given a [`str`] that starts with a javascript string literal, return the value of that string.
75///
76/// TODO: Handle template strings.
77/// # Errors
78/// If a syntax error happens, returns the error [`StringLiteralPrefixError::SyntaxError`].
79///
80/// If an invalid codepoint is encountered, returns the error [`StringLiteralPrefixError::InvalidCodepoint`].
81/// # Examples
82/// ```
83/// use url_cleaner_engine::glue::*;
84///
85/// assert_eq!(parse::js::string_literal_prefix("\"abc\\n\\u000Adef\"other stuff"                                ).unwrap(), "abc\n\ndef"          );
86/// assert_eq!(parse::js::string_literal_prefix("\"1\\u{a}2\\u{0a}3\\u{00a}4\\u{000a}5\\u{0000a}6\\u000a7\\\n8\"").unwrap(), "1\n2\n3\n4\n5\n6\n78");
87/// assert_eq!(parse::js::string_literal_prefix("\"'\\\"\"outside"                                               ).unwrap(), "'\""                 );
88/// assert_eq!(parse::js::string_literal_prefix("'\"\\''outside"                                                 ).unwrap(), "\"'"                 );
89/// assert_eq!(parse::js::string_literal_prefix("'a\\na'"                                                        ).unwrap(), "a\na"                );
90/// assert_eq!(parse::js::string_literal_prefix("'a\\\na'"                                                       ).unwrap(), "aa"                  );
91///
92/// parse::js::string_literal_prefix("\"\\u{00000a}\"").unwrap_err();
93/// ```
94#[allow(clippy::missing_panics_doc, reason = "Shouldn't ever happen.")]
95#[allow(clippy::unwrap_used, reason = "Who cares?")]
96pub fn string_literal_prefix(s: &str) -> Result<String, StringLiteralPrefixError> {
97    debug!(prefix::js::string_literal_prefix, &(), s);
98    let mut ret = String::new();
99    let mut last_state = StringLiteralPrefixLastState::Outside;
100
101    let mut scratchspace: u32 = 0;
102    let mut quote = '"';
103
104    for (i, c) in s.chars().enumerate() {
105        debug!(prefix::js::string_literal_prefix, &(), i, c, last_state, scratchspace, quote, ret);
106        #[allow(clippy::arithmetic_side_effects, reason = "Shouldn't ever happen.")]
107        match (last_state, c) {
108            (StringLiteralPrefixLastState::Outside         , '"' | '\''                       ) => {last_state = StringLiteralPrefixLastState::Inside          ; quote = c;},
109            (StringLiteralPrefixLastState::Inside          , '\\'                             ) => {last_state = StringLiteralPrefixLastState::Start           ;},
110            (StringLiteralPrefixLastState::Start           , '0'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\0');},
111            (StringLiteralPrefixLastState::Start           , 'b'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\u{0008}');},
112            (StringLiteralPrefixLastState::Start           , 'g'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\u{000c}');},
113            (StringLiteralPrefixLastState::Start           , 'n'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\n');},
114            (StringLiteralPrefixLastState::Start           , '\n'                             ) => {last_state = StringLiteralPrefixLastState::Inside          ;},
115            (StringLiteralPrefixLastState::Start           , 'r'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\r');},
116            (StringLiteralPrefixLastState::Start           , 't'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\t');},
117            (StringLiteralPrefixLastState::Start           , 'v'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\u{000b}');},
118            (StringLiteralPrefixLastState::Start           , '\''                             ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\'');},
119            (StringLiteralPrefixLastState::Start           , '"'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('"') ;},
120            (StringLiteralPrefixLastState::Start           , '\\'                             ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push('\\');},
121            (StringLiteralPrefixLastState::Start           , '0'..='7'                        ) => {last_state = StringLiteralPrefixLastState::Octal1          ; scratchspace =                     c.to_digit( 8).unwrap();},
122            (StringLiteralPrefixLastState::Octal1          , '0'..='7'                        ) => {last_state = StringLiteralPrefixLastState::Octal2          ; scratchspace = scratchspace *  8 + c.to_digit( 8).unwrap();},
123            (StringLiteralPrefixLastState::Octal2          , '0'..='7'                        ) => {last_state = StringLiteralPrefixLastState::Inside          ; scratchspace = scratchspace *  8 + c.to_digit( 8).unwrap(); ret.push(char::from_u32(scratchspace).ok_or(StringLiteralPrefixError::InvalidCodepoint(scratchspace))?);},
124            (StringLiteralPrefixLastState::Start           , 'x'                              ) => {last_state = StringLiteralPrefixLastState::AsciiHexx       ;},
125            (StringLiteralPrefixLastState::AsciiHexx       , '0'..='7' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::AsciiHex1       ; scratchspace =                     c.to_digit(16).unwrap();},
126            (StringLiteralPrefixLastState::AsciiHex1       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Inside          ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap(); ret.push(char::from_u32(scratchspace).ok_or(StringLiteralPrefixError::InvalidCodepoint(scratchspace))?);},
127            (StringLiteralPrefixLastState::Start           , 'u'                              ) => {last_state = StringLiteralPrefixLastState::UnicodeU        ;},
128            (StringLiteralPrefixLastState::UnicodeU        , '{'                              ) => {last_state = StringLiteralPrefixLastState::UnicodeLeftBrace;},
129            (StringLiteralPrefixLastState::UnicodeLeftBrace, '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode51       ; scratchspace =                     c.to_digit(16).unwrap();},
130            (StringLiteralPrefixLastState::Unicode51       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode52       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
131            (StringLiteralPrefixLastState::Unicode52       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode53       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
132            (StringLiteralPrefixLastState::Unicode53       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode54       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
133            (StringLiteralPrefixLastState::Unicode54       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode55       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
134            (StringLiteralPrefixLastState::Unicode51
135                | StringLiteralPrefixLastState::Unicode52
136                | StringLiteralPrefixLastState::Unicode53
137                | StringLiteralPrefixLastState::Unicode54
138                | StringLiteralPrefixLastState::Unicode55  , '}'                              ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push(char::from_u32(scratchspace).ok_or(StringLiteralPrefixError::InvalidCodepoint(scratchspace))?);},
139            (StringLiteralPrefixLastState::UnicodeU        , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode41       ; scratchspace =                     c.to_digit(16).unwrap();},
140            (StringLiteralPrefixLastState::Unicode41       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode42       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
141            (StringLiteralPrefixLastState::Unicode42       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Unicode43       ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap();},
142            (StringLiteralPrefixLastState::Unicode43       , '0'..='9' | 'A'..='F' | 'a'..='f') => {last_state = StringLiteralPrefixLastState::Inside          ; scratchspace = scratchspace * 16 + c.to_digit(16).unwrap(); ret.push(char::from_u32(scratchspace).ok_or(StringLiteralPrefixError::InvalidCodepoint(scratchspace))?);},
143            (StringLiteralPrefixLastState::Inside          , '"' | '\''                       ) if c == quote => break,
144            (StringLiteralPrefixLastState::Start           , _                                ) => {last_state = StringLiteralPrefixLastState::Inside          ; ret.push(c);},
145            (StringLiteralPrefixLastState::Inside          , _                                ) => {ret.push(c);}
146            _ => Err(StringLiteralPrefixError::SyntaxError {last_state, i, c, scratchspace, quote, partial: ret.clone()})?
147        };
148    }
149
150    debug!(prefix::js::string_literal_prefix, &(), ret);
151
152    Ok(ret)
153}
154