ra_ap_parser/
lexed_str.rs

//! Lexing `&str` into a sequence of Rust tokens.
//!
//! Note that, strictly speaking, the parser in this crate is not required to work
//! on tokens which originated from text. Macros, e.g., can synthesize tokens out
//! of thin air. So, ideally, the lexer should be an orthogonal crate. It is, however,
//! convenient to include a text-based lexer here!
//!
//! Note that these tokens, unlike the tokens we feed into the parser, do
//! include info about comments and whitespace.
10
11use std::ops;
12
13use rustc_literal_escaper::{
14    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
15    unescape_str,
16};
17
18use crate::{
19    Edition,
20    SyntaxKind::{self, *},
21    T,
22};
23
/// The result of lexing a string: the source text plus parallel per-token tables.
///
/// `kind` and `start` always have the same length (they are only ever pushed to
/// together). A finished `LexedStr` ends with an `EOF` entry, which is why `len()`
/// reports one less than `kind.len()`.
pub struct LexedStr<'a> {
    /// The text that was lexed.
    text: &'a str,
    /// Kind of every token, followed by the trailing `EOF` entry.
    kind: Vec<SyntaxKind>,
    /// Byte offset into `text` at which the token with the same index starts.
    start: Vec<u32>,
    /// Lexing errors; `error()` binary-searches this by token index.
    error: Vec<LexError>,
}
30
/// A lexing error attached to a single token.
struct LexError {
    /// Human-readable description of the problem.
    msg: String,
    /// Index of the token (into `LexedStr::kind` / `LexedStr::start`) the error belongs to.
    token: u32,
}
35
36impl<'a> LexedStr<'a> {
37    pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
38        let _p = tracing::info_span!("LexedStr::new").entered();
39        let mut conv = Converter::new(edition, text);
40        if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
41            conv.res.push(SHEBANG, conv.offset);
42            conv.offset = shebang_len;
43        };
44
45        // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
46        // but we want to split it to two in edition <2024.
47        while let Some(token) =
48            rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
49        {
50            let token_text = &text[conv.offset..][..token.len as usize];
51
52            conv.extend_token(&token.kind, token_text);
53        }
54
55        conv.finalize_with_eof()
56    }
57
58    pub fn single_token(edition: Edition, text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
59        if text.is_empty() {
60            return None;
61        }
62
63        let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
64        if token.len as usize != text.len() {
65            return None;
66        }
67
68        let mut conv = Converter::new(edition, text);
69        conv.extend_token(&token.kind, text);
70        match &*conv.res.kind {
71            [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
72            _ => None,
73        }
74    }
75
76    pub fn as_str(&self) -> &str {
77        self.text
78    }
79
80    pub fn len(&self) -> usize {
81        self.kind.len() - 1
82    }
83
84    pub fn is_empty(&self) -> bool {
85        self.len() == 0
86    }
87
88    pub fn kind(&self, i: usize) -> SyntaxKind {
89        assert!(i < self.len());
90        self.kind[i]
91    }
92
93    pub fn text(&self, i: usize) -> &str {
94        self.range_text(i..i + 1)
95    }
96
97    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
98        assert!(r.start < r.end && r.end <= self.len());
99        let lo = self.start[r.start] as usize;
100        let hi = self.start[r.end] as usize;
101        &self.text[lo..hi]
102    }
103
104    // Naming is hard.
105    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
106        assert!(i < self.len());
107        let lo = self.start[i] as usize;
108        let hi = self.start[i + 1] as usize;
109        lo..hi
110    }
111    pub fn text_start(&self, i: usize) -> usize {
112        assert!(i <= self.len());
113        self.start[i] as usize
114    }
115    pub fn text_len(&self, i: usize) -> usize {
116        assert!(i < self.len());
117        let r = self.text_range(i);
118        r.end - r.start
119    }
120
121    pub fn error(&self, i: usize) -> Option<&str> {
122        assert!(i < self.len());
123        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
124        Some(self.error[err].msg.as_str())
125    }
126
127    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
128        self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
129    }
130
131    fn push(&mut self, kind: SyntaxKind, offset: usize) {
132        self.kind.push(kind);
133        self.start.push(offset as u32);
134    }
135}
136
/// Incrementally builds a `LexedStr` from `rustc_lexer` tokens.
struct Converter<'a> {
    /// The result under construction.
    res: LexedStr<'a>,
    /// Byte offset into the text just past the last token pushed so far.
    offset: usize,
    /// Edition passed to keyword lookup and used to decide how `GuardedStrPrefix` is handled.
    edition: Edition,
}
142
143impl<'a> Converter<'a> {
144    fn new(edition: Edition, text: &'a str) -> Self {
145        Self {
146            res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() },
147            offset: 0,
148            edition,
149        }
150    }
151
152    fn finalize_with_eof(mut self) -> LexedStr<'a> {
153        self.res.push(EOF, self.offset);
154        self.res
155    }
156
157    fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
158        self.res.push(kind, self.offset);
159        self.offset += len;
160
161        for msg in errors {
162            if !msg.is_empty() {
163                self.res.error.push(LexError { msg, token: self.res.len() as u32 });
164            }
165        }
166    }
167
168    fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, mut token_text: &str) {
169        // A note on an intended tradeoff:
170        // We drop some useful information here (see patterns with double dots `..`)
171        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
172        // being `u16` that come from `rowan::SyntaxKind`.
173        let mut errors: Vec<String> = vec![];
174
175        let syntax_kind = {
176            match kind {
177                rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
178                rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
179                    if !terminated {
180                        errors.push(
181                            "Missing trailing `*/` symbols to terminate the block comment".into(),
182                        );
183                    }
184                    COMMENT
185                }
186
187                rustc_lexer::TokenKind::Frontmatter {
188                    has_invalid_preceding_whitespace,
189                    invalid_infostring,
190                } => {
191                    if *has_invalid_preceding_whitespace {
192                        errors.push("invalid preceding whitespace for frontmatter opening".into());
193                    } else if *invalid_infostring {
194                        errors.push("invalid infostring for frontmatter".into());
195                    }
196                    FRONTMATTER
197                }
198
199                rustc_lexer::TokenKind::Whitespace => WHITESPACE,
200
201                rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
202                rustc_lexer::TokenKind::Ident => {
203                    SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
204                }
205                rustc_lexer::TokenKind::InvalidIdent => {
206                    errors.push("Ident contains invalid characters".into());
207                    IDENT
208                }
209
210                rustc_lexer::TokenKind::RawIdent => IDENT,
211
212                rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
213                    // FIXME: rustc does something better for recovery.
214                    errors.push("Invalid string literal (reserved syntax)".into());
215                    ERROR
216                }
217                rustc_lexer::TokenKind::GuardedStrPrefix => {
218                    // The token is `#"` or `##`, split it into two.
219                    token_text = &token_text[1..];
220                    POUND
221                }
222
223                rustc_lexer::TokenKind::Literal { kind, .. } => {
224                    self.extend_literal(token_text.len(), kind);
225                    return;
226                }
227
228                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
229                    if *starts_with_number {
230                        errors.push("Lifetime name cannot start with a number".into());
231                    }
232                    LIFETIME_IDENT
233                }
234                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
235                    errors.push("Unknown lifetime prefix".into());
236                    LIFETIME_IDENT
237                }
238                rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,
239
240                rustc_lexer::TokenKind::Semi => T![;],
241                rustc_lexer::TokenKind::Comma => T![,],
242                rustc_lexer::TokenKind::Dot => T![.],
243                rustc_lexer::TokenKind::OpenParen => T!['('],
244                rustc_lexer::TokenKind::CloseParen => T![')'],
245                rustc_lexer::TokenKind::OpenBrace => T!['{'],
246                rustc_lexer::TokenKind::CloseBrace => T!['}'],
247                rustc_lexer::TokenKind::OpenBracket => T!['['],
248                rustc_lexer::TokenKind::CloseBracket => T![']'],
249                rustc_lexer::TokenKind::At => T![@],
250                rustc_lexer::TokenKind::Pound => T![#],
251                rustc_lexer::TokenKind::Tilde => T![~],
252                rustc_lexer::TokenKind::Question => T![?],
253                rustc_lexer::TokenKind::Colon => T![:],
254                rustc_lexer::TokenKind::Dollar => T![$],
255                rustc_lexer::TokenKind::Eq => T![=],
256                rustc_lexer::TokenKind::Bang => T![!],
257                rustc_lexer::TokenKind::Lt => T![<],
258                rustc_lexer::TokenKind::Gt => T![>],
259                rustc_lexer::TokenKind::Minus => T![-],
260                rustc_lexer::TokenKind::And => T![&],
261                rustc_lexer::TokenKind::Or => T![|],
262                rustc_lexer::TokenKind::Plus => T![+],
263                rustc_lexer::TokenKind::Star => T![*],
264                rustc_lexer::TokenKind::Slash => T![/],
265                rustc_lexer::TokenKind::Caret => T![^],
266                rustc_lexer::TokenKind::Percent => T![%],
267                rustc_lexer::TokenKind::Unknown => ERROR,
268                rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
269                rustc_lexer::TokenKind::UnknownPrefix => {
270                    errors.push("unknown literal prefix".into());
271                    IDENT
272                }
273                rustc_lexer::TokenKind::Eof => EOF,
274            }
275        };
276
277        self.push(syntax_kind, token_text.len(), errors);
278    }
279
280    fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
281        let invalid_raw_msg = String::from("Invalid raw string literal");
282
283        let mut errors = vec![];
284        let mut no_end_quote = |c: char, kind: &str| {
285            errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
286        };
287
288        let syntax_kind = match *kind {
289            rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
290                if empty_int {
291                    errors.push("Missing digits after the integer base prefix".into());
292                }
293                INT_NUMBER
294            }
295            rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
296                if empty_exponent {
297                    errors.push("Missing digits after the exponent symbol".into());
298                }
299                FLOAT_NUMBER
300            }
301            rustc_lexer::LiteralKind::Char { terminated } => {
302                if !terminated {
303                    no_end_quote('\'', "character");
304                } else {
305                    let text = &self.res.text[self.offset + 1..][..len - 1];
306                    let text = &text[..text.rfind('\'').unwrap()];
307                    if let Err(e) = unescape_char(text) {
308                        errors.push(err_to_msg(e, Mode::Char));
309                    }
310                }
311                CHAR
312            }
313            rustc_lexer::LiteralKind::Byte { terminated } => {
314                if !terminated {
315                    no_end_quote('\'', "byte");
316                } else {
317                    let text = &self.res.text[self.offset + 2..][..len - 2];
318                    let text = &text[..text.rfind('\'').unwrap()];
319                    if let Err(e) = unescape_byte(text) {
320                        errors.push(err_to_msg(e, Mode::Byte));
321                    }
322                }
323                BYTE
324            }
325            rustc_lexer::LiteralKind::Str { terminated } => {
326                if !terminated {
327                    no_end_quote('"', "string");
328                } else {
329                    let text = &self.res.text[self.offset + 1..][..len - 1];
330                    let text = &text[..text.rfind('"').unwrap()];
331                    unescape_str(text, |_, res| {
332                        if let Err(e) = res {
333                            errors.push(err_to_msg(e, Mode::Str));
334                        }
335                    });
336                }
337                STRING
338            }
339            rustc_lexer::LiteralKind::ByteStr { terminated } => {
340                if !terminated {
341                    no_end_quote('"', "byte string");
342                } else {
343                    let text = &self.res.text[self.offset + 2..][..len - 2];
344                    let text = &text[..text.rfind('"').unwrap()];
345                    unescape_byte_str(text, |_, res| {
346                        if let Err(e) = res {
347                            errors.push(err_to_msg(e, Mode::ByteStr));
348                        }
349                    });
350                }
351                BYTE_STRING
352            }
353            rustc_lexer::LiteralKind::CStr { terminated } => {
354                if !terminated {
355                    no_end_quote('"', "C string")
356                } else {
357                    let text = &self.res.text[self.offset + 2..][..len - 2];
358                    let text = &text[..text.rfind('"').unwrap()];
359                    unescape_c_str(text, |_, res| {
360                        if let Err(e) = res {
361                            errors.push(err_to_msg(e, Mode::CStr));
362                        }
363                    });
364                }
365                C_STRING
366            }
367            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
368                if n_hashes.is_none() {
369                    errors.push(invalid_raw_msg);
370                }
371                STRING
372            }
373            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
374                if n_hashes.is_none() {
375                    errors.push(invalid_raw_msg);
376                }
377                BYTE_STRING
378            }
379            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
380                if n_hashes.is_none() {
381                    errors.push(invalid_raw_msg);
382                }
383                C_STRING
384            }
385        };
386
387        self.push(syntax_kind, len, errors);
388    }
389}
390
391fn err_to_msg(error: EscapeError, mode: Mode) -> String {
392    match error {
393        EscapeError::ZeroChars => "empty character literal",
394        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
395        EscapeError::LoneSlash => "",
396        EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
397            "unknown byte escape"
398        }
399        EscapeError::InvalidEscape => "unknown character escape",
400        EscapeError::BareCarriageReturn => "",
401        EscapeError::BareCarriageReturnInRawString => "",
402        EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
403        EscapeError::EscapeOnlyChar => "character constant must be escaped",
404        EscapeError::TooShortHexEscape => "numeric character escape is too short",
405        EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
406        EscapeError::OutOfRangeHexEscape => "out of range hex escape",
407        EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
408        EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
409        EscapeError::EmptyUnicodeEscape => "empty unicode escape",
410        EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
411        EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
412        EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
413        EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
414        EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
415        EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
416        EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
417            "non-ASCII character in byte literal"
418        }
419        EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
420            "non-ASCII character in byte string literal"
421        }
422        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
423        EscapeError::NulInCStr => "null character in C string literal",
424        EscapeError::UnskippedWhitespaceWarning => "",
425        EscapeError::MultipleSkippedLinesWarning => "",
426    }
427    .into()
428}