rustc_lexer/
unescape.rs

1//! Utilities for validating string and char literals and turning them into
2//! values they represent.
3
4use std::str::Chars;
5use std::ops::Range;
6
7#[cfg(test)]
8mod tests;
9
/// Reasons why the contents of a char, byte, string or byte string literal
/// could not be turned into the value they represent.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Exactly one char (or byte) was expected, but the literal was empty.
    ZeroChars,
    /// Exactly one char (or byte) was expected, but input remained after it.
    MoreThanOneChar,

    /// A backslash with nothing following it.
    LoneSlash,
    /// A backslash followed by a character that starts no known escape.
    InvalidEscape,
    /// A `\r` that was not written as an escape sequence.
    BareCarriageReturn,
    /// A `\r` inside a raw string or raw byte string.
    BareCarriageReturnInRawString,
    /// A character that has to be escaped was written verbatim, e.g. a tab or
    /// newline, or the quote character that delimits the literal.
    EscapeOnlyChar,

    /// The input ended before both hex digits of a `\x` escape were read.
    TooShortHexEscape,
    /// A `\x` escape contains a character that is not a hex digit.
    InvalidCharInHexEscape,
    /// A `\x` escape above `0x7F` was used outside of a byte (string) literal.
    OutOfRangeHexEscape,

    /// `\u` is not immediately followed by `{`.
    NoBraceInUnicodeEscape,
    /// A `\u{...}` escape contains a character that is not a hex digit.
    InvalidCharInUnicodeEscape,
    /// A `\u{}` escape without any digits.
    EmptyUnicodeEscape,
    /// The input ended before the closing `}` of a `\u{...}` escape.
    UnclosedUnicodeEscape,
    /// The first character after `\u{` is an underscore.
    LeadingUnderscoreUnicodeEscape,
    /// A `\u{...}` escape with more than six hex digits.
    OverlongUnicodeEscape,
    /// A `\u{...}` escape whose value is at most `0x10FFFF` but is still not
    /// a valid `char`.
    LoneSurrogateUnicodeEscape,
    /// A `\u{...}` escape whose value is greater than `0x10FFFF`.
    OutOfRangeUnicodeEscape,

    /// A `\u{...}` escape was used in a byte or byte string literal.
    UnicodeEscapeInByte,
    /// A non-ASCII character was written verbatim in a byte literal or in a
    /// non-raw byte string literal.
    NonAsciiCharInByte,
    /// A non-ASCII character was written in a raw byte string literal.
    NonAsciiCharInByteString,
}
38
39/// Takes a contents of a char literal (without quotes), and returns an
40/// unescaped char or an error
41pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
42    let mut chars = literal_text.chars();
43    unescape_char_or_byte(&mut chars, Mode::Char)
44        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
45}
46
47/// Takes a contents of a string literal (without quotes) and produces a
48/// sequence of escaped characters or errors.
49pub fn unescape_str<F>(literal_text: &str, callback: &mut F)
50where
51    F: FnMut(Range<usize>, Result<char, EscapeError>),
52{
53    unescape_str_or_byte_str(literal_text, Mode::Str, callback)
54}
55
56pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
57    let mut chars = literal_text.chars();
58    unescape_char_or_byte(&mut chars, Mode::Byte)
59        .map(byte_from_char)
60        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
61}
62
63/// Takes a contents of a string literal (without quotes) and produces a
64/// sequence of escaped characters or errors.
65pub fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
66where
67    F: FnMut(Range<usize>, Result<u8, EscapeError>),
68{
69    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
70        callback(range, char.map(byte_from_char))
71    })
72}
73
74/// Takes a contents of a string literal (without quotes) and produces a
75/// sequence of characters or errors.
76/// NOTE: Raw strings do not perform any explicit character escaping, here we
77/// only translate CRLF to LF and produce errors on bare CR.
78pub fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
79where
80    F: FnMut(Range<usize>, Result<char, EscapeError>),
81{
82    unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
83}
84
85/// Takes a contents of a string literal (without quotes) and produces a
86/// sequence of characters or errors.
87/// NOTE: Raw strings do not perform any explicit character escaping, here we
88/// only translate CRLF to LF and produce errors on bare CR.
89pub fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
90where
91    F: FnMut(Range<usize>, Result<u8, EscapeError>),
92{
93    unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
94        callback(range, char.map(byte_from_char))
95    })
96}
97
/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for `Char` and `Byte`, `false` for `Str` and `ByteStr`.
    pub fn in_single_quotes(self) -> bool {
        !self.in_double_quotes()
    }

    /// `true` for `Str` and `ByteStr`, `false` for `Char` and `Byte`.
    pub fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for `Byte` and `ByteStr`, `false` for `Char` and `Str`.
    pub fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
125
126
127fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
128    if first_char != '\\' {
129        return match first_char {
130            '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
131            '\r' => Err(EscapeError::BareCarriageReturn),
132            '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
133            '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
134            _ => {
135                if mode.is_bytes() && !first_char.is_ascii() {
136                    return Err(EscapeError::NonAsciiCharInByte);
137                }
138                Ok(first_char)
139            }
140        };
141    }
142
143    let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
144
145    let res = match second_char {
146        '"' => '"',
147        'n' => '\n',
148        'r' => '\r',
149        't' => '\t',
150        '\\' => '\\',
151        '\'' => '\'',
152        '0' => '\0',
153
154        'x' => {
155            let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
156            let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
157
158            let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
159            let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
160
161            let value = hi * 16 + lo;
162
163            if !mode.is_bytes() && !is_ascii(value) {
164                return Err(EscapeError::OutOfRangeHexEscape);
165            }
166            let value = value as u8;
167
168            value as char
169        }
170
171        'u' => {
172            if chars.next() != Some('{') {
173                return Err(EscapeError::NoBraceInUnicodeEscape);
174            }
175
176            let mut n_digits = 1;
177            let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
178                '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
179                '}' => return Err(EscapeError::EmptyUnicodeEscape),
180                c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
181            };
182
183            loop {
184                match chars.next() {
185                    None => return Err(EscapeError::UnclosedUnicodeEscape),
186                    Some('_') => continue,
187                    Some('}') => {
188                        if n_digits > 6 {
189                            return Err(EscapeError::OverlongUnicodeEscape);
190                        }
191                        if mode.is_bytes() {
192                            return Err(EscapeError::UnicodeEscapeInByte);
193                        }
194
195                        break std::char::from_u32(value).ok_or_else(|| {
196                            if value > 0x10FFFF {
197                                EscapeError::OutOfRangeUnicodeEscape
198                            } else {
199                                EscapeError::LoneSurrogateUnicodeEscape
200                            }
201                        })?;
202                    }
203                    Some(c) => {
204                        let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
205                        n_digits += 1;
206                        if n_digits > 6 {
207                            continue;
208                        }
209                        let digit = digit as u32;
210                        value = value * 16 + digit;
211                    }
212                };
213            }
214        }
215        _ => return Err(EscapeError::InvalidEscape),
216    };
217    Ok(res)
218}
219
220fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
221    let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
222    let res = scan_escape(first_char, chars, mode)?;
223    if chars.next().is_some() {
224        return Err(EscapeError::MoreThanOneChar);
225    }
226    Ok(res)
227}
228
229/// Takes a contents of a string literal (without quotes) and produces a
230/// sequence of escaped characters or errors.
231fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
232where
233    F: FnMut(Range<usize>, Result<char, EscapeError>),
234{
235    assert!(mode.in_double_quotes());
236    let initial_len = src.len();
237    let mut chars = src.chars();
238    while let Some(first_char) = chars.next() {
239        let start = initial_len - chars.as_str().len() - first_char.len_utf8();
240
241        let unescaped_char = match first_char {
242            '\\' => {
243                let second_char = chars.clone().next();
244                match second_char {
245                    Some('\n') => {
246                        skip_ascii_whitespace(&mut chars);
247                        continue;
248                    }
249                    _ => scan_escape(first_char, &mut chars, mode),
250                }
251            }
252            '\n' => Ok('\n'),
253            '\t' => Ok('\t'),
254            _ => scan_escape(first_char, &mut chars, mode),
255        };
256        let end = initial_len - chars.as_str().len();
257        callback(start..end, unescaped_char);
258    }
259
260    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
261        let str = chars.as_str();
262        let first_non_space = str
263            .bytes()
264            .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
265            .unwrap_or(str.len());
266        *chars = str[first_non_space..].chars()
267    }
268}
269
270/// Takes a contents of a string literal (without quotes) and produces a
271/// sequence of characters or errors.
272/// NOTE: Raw strings do not perform any explicit character escaping, here we
273/// only translate CRLF to LF and produce errors on bare CR.
274fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
275where
276    F: FnMut(Range<usize>, Result<char, EscapeError>),
277{
278    assert!(mode.in_double_quotes());
279    let initial_len = literal_text.len();
280
281    let mut chars = literal_text.chars();
282    while let Some(curr) = chars.next() {
283        let start = initial_len - chars.as_str().len() - curr.len_utf8();
284
285        let result = match curr {
286            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
287            c if mode.is_bytes() && !c.is_ascii() =>
288                Err(EscapeError::NonAsciiCharInByteString),
289            c => Ok(c),
290        };
291        let end = initial_len - chars.as_str().len();
292
293        callback(start..end, result);
294    }
295}
296
/// Narrows a `char` to the byte with the same numeric value.
///
/// Panics if the code point of `c` is greater than `0xFF`.
fn byte_from_char(c: char) -> u8 {
    let code_point = c as u32;
    assert!(code_point <= 0xFF, "guaranteed because of Mode::Byte(Str)");
    code_point as u8
}
302
/// Returns `true` when `x` lies in the ASCII range `0x00..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}