parse_js/parse/
literal.rs

1use super::ParseCtx;
2use super::Parser;
3use crate::error::SyntaxErrorType;
4use crate::error::SyntaxResult;
5use crate::num::JsNumber;
6use crate::token::TokenType;
7use core::str::FromStr;
8use memchr::memchr;
9use std::str::from_utf8_unchecked;
10
/// Parses the digits of a radix-prefixed integer literal (prefix already removed) into an `f64`.
///
/// Every character of `raw` must be a digit of `radix`; an empty string, a sign, or any other
/// character yields `Err(())`.
///
/// For power-of-two radices (the binary, octal and hexadecimal literals used by the caller) the
/// literal may be arbitrarily long: the value is rounded once to the nearest `f64` (ties to even),
/// and values beyond the `f64` range become infinity. Other radices are limited to values that
/// fit in a `u64`.
fn parse_radix(raw: &str, radix: u32) -> Result<f64, ()> {
  if raw.is_empty() {
    return Err(());
  }
  if !radix.is_power_of_two() {
    // `from_str_radix` would accept a leading `+`, which is never part of a literal's digits.
    if !raw.chars().all(|c| c.is_digit(radix)) {
      return Err(());
    }
    return u64::from_str_radix(raw, radix)
      .map(|v| v as f64)
      .map_err(|_| ());
  }

  // Collect the 64 most significant bits of the literal. Any further bits only matter for
  // rounding (were any of them set?) and for the magnitude (how many were there?).
  let bits_per_digit = radix.trailing_zeros();
  let mut mantissa = 0u64;
  let mut dropped_bits = 0u32;
  let mut sticky = false;
  for c in raw.chars() {
    let digit = c.to_digit(radix).ok_or(())?;
    for shift in (0..bits_per_digit).rev() {
      let bit = u64::from((digit >> shift) & 1);
      if mantissa >> 63 == 0 {
        mantissa = (mantissa << 1) | bit;
      } else {
        dropped_bits = dropped_bits.saturating_add(1);
        sticky |= bit != 0;
      }
    }
  }
  if sticky {
    // Fold the dropped bits into the lowest kept bit. That bit is itself discarded by the
    // conversion below, so this only turns an apparent tie into "above the tie".
    mantissa |= 1;
  }
  // The integer-to-float cast rounds to nearest, ties to even.
  let mut value = mantissa as f64;
  // Scaling by two is exact; once bits have been dropped the mantissa is at least 2^63, so
  // 1100 doublings are more than enough to reach infinity.
  for _ in 0..dropped_bits.min(1100) {
    value *= 2.0;
  }
  Ok(value)
}
17
18pub fn normalise_literal_number(raw: &str) -> Option<JsNumber> {
19  // TODO We assume that the Rust parser follows ECMAScript spec and that different representations
20  // of the same value get parsed into the same f64 value/bit pattern (e.g. `5.1e10` and `0.51e11`).
21  match raw {
22    s if s.starts_with("0b") || s.starts_with("0B") => parse_radix(&s[2..], 2),
23    s if s.starts_with("0o") || s.starts_with("0o") => parse_radix(&s[2..], 8),
24    s if s.starts_with("0x") || s.starts_with("0X") => parse_radix(&s[2..], 16),
25    s => f64::from_str(s).map_err(|_| ()),
26  }
27  .map(JsNumber)
28  .ok()
29}
30
/// Normalises the raw source text of a BigInt literal.
///
/// At present this is a verbatim copy of `raw`: nothing is validated or canonicalised, and the
/// result is always `Some`.
pub fn normalise_literal_bigint(raw: &str) -> Option<String> {
  // TODO Use custom type like JsNumber.
  // TODO
  let copied = String::from(raw);
  Some(copied)
}
36
37pub fn normalise_literal_string_or_template_inner(mut raw: &[u8]) -> Option<String> {
38  let mut norm = Vec::new();
39  while !raw.is_empty() {
40    let Some(escape_pos) = memchr(b'\\', raw) else {
41      norm.extend_from_slice(raw);
42      break;
43    };
44    norm.extend_from_slice(&raw[..escape_pos]);
45    raw = &raw[escape_pos + 1..];
46    // https://mathiasbynens.be/notes/javascript-escapes
47    // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
48    // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#tagged_templates_and_escape_sequences
49    let mut tmp = [0u8; 4];
50    let (skip, add): (usize, &[u8]) = match raw[0] {
51      b'\n' => (1, b""),
52      b'b' => (1, b"\x08"),
53      b'f' => (1, b"\x0c"),
54      b'n' => (1, b"\n"),
55      b'r' => (1, b"\r"),
56      b't' => (1, b"\t"),
57      b'v' => (1, b"\x0b"),
58      b'0'..=b'7' => {
59        // Octal escape.
60        let mut len = 1;
61        if raw
62          .get(len)
63          .filter(|&c| (b'0'..=b'7').contains(c))
64          .is_some()
65        {
66          len += 1;
67          if raw
68            .get(len)
69            .filter(|&c| (b'0'..=b'7').contains(c))
70            .is_some()
71          {
72            len += 1;
73          };
74        };
75        char::from_u32(
76          u32::from_str_radix(unsafe { from_utf8_unchecked(&raw[..len]) }, 8).unwrap(),
77        )
78        .unwrap()
79        .encode_utf8(&mut tmp);
80        (len, tmp.as_slice())
81      }
82      b'x' => {
83        // Hexadecimal escape.
84        if raw.len() < 3 || !raw[1].is_ascii_hexdigit() || !raw[2].is_ascii_hexdigit() {
85          return None;
86        };
87        char::from_u32(
88          u32::from_str_radix(unsafe { from_utf8_unchecked(&raw[1..3]) }, 16).unwrap(),
89        )
90        .unwrap()
91        .encode_utf8(&mut tmp);
92        (3, tmp.as_slice())
93      }
94      b'u' => match raw.get(1) {
95        Some(b'{') => {
96          // Unicode code point escape.
97          let Some(end_pos) = memchr(b'}', raw) else {
98            return None;
99          };
100          if !(3..=8).contains(&end_pos) {
101            return None;
102          };
103          let cp =
104            u32::from_str_radix(unsafe { from_utf8_unchecked(&raw[2..end_pos]) }, 16).ok()?;
105          let c = char::from_u32(cp)?;
106          c.encode_utf8(&mut tmp);
107          (end_pos + 1, tmp.as_slice())
108        }
109        Some(_) => {
110          // Unicode escape.
111          if raw.len() < 5 {
112            return None;
113          };
114          let cp = u32::from_str_radix(unsafe { from_utf8_unchecked(&raw[1..5]) }, 16).ok()?;
115          let c = char::from_u32(cp)?;
116          c.encode_utf8(&mut tmp);
117          (5, tmp.as_slice())
118        }
119        None => {
120          return None;
121        }
122      },
123      c => (1, {
124        tmp[0] = c;
125        &tmp[..1]
126      }),
127    };
128    norm.extend_from_slice(add);
129    raw = &raw[skip..];
130  }
131  // We return str instead of [u8] so that serialisation is easy and str methods are available.
132  Some(String::from_utf8(norm).unwrap())
133}
134
135pub fn normalise_literal_string(raw: &str) -> Option<String> {
136  normalise_literal_string_or_template_inner(&raw.as_bytes()[1..raw.len() - 1])
137}
138
139impl<'a> Parser<'a> {
140  pub fn parse_and_normalise_literal_string(&mut self, ctx: ParseCtx) -> SyntaxResult<String> {
141    let t = self.require(TokenType::LiteralString)?;
142    normalise_literal_string(self.str(t.loc))
143      .ok_or_else(|| t.loc.error(SyntaxErrorType::InvalidCharacterEscape, None))
144  }
145}