// ra_ap_parser — lexed_str.rs
1//! Lexing `&str` into a sequence of Rust tokens.
2//!
3//! Note that strictly speaking the parser in this crate is not required to work
4//! on tokens which originated from text. Macros, eg, can synthesize tokens out
5//! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
6//! convenient to include a text-based lexer here!
7//!
8//! Note that these tokens, unlike the tokens we feed into the parser, do
9//! include info about comments and whitespace.
10
11use std::ops;
12
13use rustc_literal_escaper::{
14    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
15    unescape_str,
16};
17
18use crate::{
19    Edition,
20    SyntaxKind::{self, *},
21    T,
22};
23
/// The result of lexing: the source text plus, for every token, its
/// `SyntaxKind` and starting byte offset, and any lexing errors keyed by
/// token index.
pub struct LexedStr<'a> {
    text: &'a str,
    // One entry per token; a trailing EOF sentinel is appended on
    // finalization, so `kind.len()` is the token count + 1.
    kind: Vec<SyntaxKind>,
    // Start offset (byte index into `text`) of each token; token `i` ends
    // where token `i + 1` (or the EOF sentinel) starts.
    start: Vec<u32>,
    // Errors in token order (pushed as tokens are lexed), which keeps the
    // vector sorted by `token` for binary search in `error()`.
    error: Vec<LexError>,
}
30
/// A lexing error attached to a token by its index.
struct LexError {
    // Human-readable message for this error.
    msg: String,
    // Index of the token the error belongs to.
    token: u32,
}
35
36impl<'a> LexedStr<'a> {
37    pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
38        let _p = tracing::info_span!("LexedStr::new").entered();
39        let mut conv = Converter::new(edition, text);
40        if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
41            if let Some(shebang) = script.shebang_span() {
42                conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
43            }
44            if script.frontmatter().is_some() {
45                conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
46            }
47        } else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
48            // Leave error reporting to `rustc_lexer`
49            conv.push(SHEBANG, shebang_len, Vec::new());
50        }
51
52        // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
53        // but we want to split it to two in edition <2024.
54        while let Some(token) =
55            rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
56        {
57            let token_text = &text[conv.offset..][..token.len as usize];
58
59            conv.extend_token(&token.kind, token_text);
60        }
61
62        conv.finalize_with_eof()
63    }
64
65    pub fn single_token(edition: Edition, text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
66        if text.is_empty() {
67            return None;
68        }
69
70        let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
71        if token.len as usize != text.len() {
72            return None;
73        }
74
75        let mut conv = Converter::new(edition, text);
76        conv.extend_token(&token.kind, text);
77        match &*conv.res.kind {
78            [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
79            _ => None,
80        }
81    }
82
83    pub fn as_str(&self) -> &str {
84        self.text
85    }
86
87    pub fn len(&self) -> usize {
88        self.kind.len() - 1
89    }
90
91    pub fn is_empty(&self) -> bool {
92        self.len() == 0
93    }
94
95    pub fn kind(&self, i: usize) -> SyntaxKind {
96        assert!(i < self.len());
97        self.kind[i]
98    }
99
100    pub fn text(&self, i: usize) -> &str {
101        self.range_text(i..i + 1)
102    }
103
104    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
105        assert!(r.start < r.end && r.end <= self.len());
106        let lo = self.start[r.start] as usize;
107        let hi = self.start[r.end] as usize;
108        &self.text[lo..hi]
109    }
110
111    // Naming is hard.
112    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
113        assert!(i < self.len());
114        let lo = self.start[i] as usize;
115        let hi = self.start[i + 1] as usize;
116        lo..hi
117    }
118    pub fn text_start(&self, i: usize) -> usize {
119        assert!(i <= self.len());
120        self.start[i] as usize
121    }
122    pub fn text_len(&self, i: usize) -> usize {
123        assert!(i < self.len());
124        let r = self.text_range(i);
125        r.end - r.start
126    }
127
128    pub fn error(&self, i: usize) -> Option<&str> {
129        assert!(i < self.len());
130        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
131        Some(self.error[err].msg.as_str())
132    }
133
134    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
135        self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
136    }
137
138    fn push(&mut self, kind: SyntaxKind, offset: usize) {
139        self.kind.push(kind);
140        self.start.push(offset as u32);
141    }
142}
143
/// Incremental builder that converts `rustc_lexer` tokens into a `LexedStr`.
struct Converter<'a> {
    // The `LexedStr` under construction.
    res: LexedStr<'a>,
    // Byte offset into `res.text` where the next token will start.
    offset: usize,
    // Edition used for keyword recognition and `GuardedStrPrefix` handling.
    edition: Edition,
}
149
impl<'a> Converter<'a> {
    /// Create a converter with an empty result positioned at offset 0.
    fn new(edition: Edition, text: &'a str) -> Self {
        Self {
            res: LexedStr {
                text,
                // Capacity guess (~3 bytes per token) to reduce reallocation;
                // a heuristic, not a bound.
                kind: Vec::with_capacity(text.len() / 3),
                start: Vec::with_capacity(text.len() / 3),
                error: Vec::new(),
            },
            offset: 0,
            edition,
        }
    }

    /// Check for likely unterminated string by analyzing STRING token content
    ///
    /// Heuristic only: inspects up to the last 5 tokens already lexed and
    /// reports `true` if any STRING token's text contains `(` together with
    /// `//` or `;\n` — patterns suggesting that source code was swallowed
    /// into a string literal whose closing quote is missing. Used solely to
    /// enrich the "unknown literal prefix" diagnostic in `extend_token`.
    fn has_likely_unterminated_string(&self) -> bool {
        let Some(last_idx) = self.res.kind.len().checked_sub(1) else { return false };

        for i in (0..=last_idx).rev().take(5) {
            if self.res.kind[i] == STRING {
                let start = self.res.start[i] as usize;
                // The most recent token has no `start[i + 1]` entry yet, so
                // fall back to the current lexing offset as its end.
                let end = self.res.start.get(i + 1).map(|&s| s as usize).unwrap_or(self.offset);
                let content = &self.res.text[start..end];

                if content.contains('(') && (content.contains("//") || content.contains(";\n")) {
                    return true;
                }
            }
        }
        false
    }

    /// Append the EOF sentinel and hand back the finished `LexedStr`.
    fn finalize_with_eof(mut self) -> LexedStr<'a> {
        self.res.push(EOF, self.offset);
        self.res
    }

    /// Record one token of `len` bytes at the current offset, attaching any
    /// non-empty error messages to it, and advance the offset.
    fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
        self.res.push(kind, self.offset);
        self.offset += len;

        for msg in errors {
            // Empty messages act as "no error" placeholders (`err_to_msg`
            // returns "" for warning-level escape issues).
            if !msg.is_empty() {
                self.res.error.push(LexError { msg, token: self.res.len() as u32 });
            }
        }
    }

    /// Translate one `rustc_lexer` token into a `SyntaxKind` (plus any
    /// diagnostics) and push it. Literals are delegated to `extend_literal`.
    fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, mut token_text: &str) {
        // A note on an intended tradeoff:
        // We drop some useful information here (see patterns with double dots `..`)
        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
        // being `u16` that come from `rowan::SyntaxKind`.
        let mut errors: Vec<String> = vec![];

        let syntax_kind = {
            match kind {
                rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
                rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
                    if !terminated {
                        errors.push(
                            "Missing trailing `*/` symbols to terminate the block comment".into(),
                        );
                    }
                    COMMENT
                }

                rustc_lexer::TokenKind::Frontmatter {
                    has_invalid_preceding_whitespace,
                    invalid_infostring,
                } => {
                    // Only the first applicable problem is reported.
                    if *has_invalid_preceding_whitespace {
                        errors.push("invalid preceding whitespace for frontmatter opening".into());
                    } else if *invalid_infostring {
                        errors.push("invalid infostring for frontmatter".into());
                    }
                    FRONTMATTER
                }

                rustc_lexer::TokenKind::Whitespace => WHITESPACE,

                // `_` is its own token kind, not an identifier.
                rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
                rustc_lexer::TokenKind::Ident => {
                    // Keyword recognition is edition-dependent.
                    SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
                }
                rustc_lexer::TokenKind::InvalidIdent => {
                    errors.push("Ident contains invalid characters".into());
                    IDENT
                }

                rustc_lexer::TokenKind::RawIdent => IDENT,

                rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
                    // FIXME: rustc does something better for recovery.
                    errors.push("Invalid string literal (reserved syntax)".into());
                    ERROR
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
                    // The token is `#"` or `##`, split it into two.
                    // Only the leading `#` is consumed here; `LexedStr::new`
                    // re-tokenizes from the new offset for the remainder.
                    token_text = &token_text[1..];
                    POUND
                }

                rustc_lexer::TokenKind::Literal { kind, .. } => {
                    // Literals push themselves (including errors) and return.
                    self.extend_literal(token_text.len(), kind);
                    return;
                }

                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    if *starts_with_number {
                        errors.push("Lifetime name cannot start with a number".into());
                    }
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
                    errors.push("Unknown lifetime prefix".into());
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,

                rustc_lexer::TokenKind::Semi => T![;],
                rustc_lexer::TokenKind::Comma => T![,],
                rustc_lexer::TokenKind::Dot => T![.],
                rustc_lexer::TokenKind::OpenParen => T!['('],
                rustc_lexer::TokenKind::CloseParen => T![')'],
                rustc_lexer::TokenKind::OpenBrace => T!['{'],
                rustc_lexer::TokenKind::CloseBrace => T!['}'],
                rustc_lexer::TokenKind::OpenBracket => T!['['],
                rustc_lexer::TokenKind::CloseBracket => T![']'],
                rustc_lexer::TokenKind::At => T![@],
                rustc_lexer::TokenKind::Pound => T![#],
                rustc_lexer::TokenKind::Tilde => T![~],
                rustc_lexer::TokenKind::Question => T![?],
                rustc_lexer::TokenKind::Colon => T![:],
                rustc_lexer::TokenKind::Dollar => T![$],
                rustc_lexer::TokenKind::Eq => T![=],
                rustc_lexer::TokenKind::Bang => T![!],
                rustc_lexer::TokenKind::Lt => T![<],
                rustc_lexer::TokenKind::Gt => T![>],
                rustc_lexer::TokenKind::Minus => T![-],
                rustc_lexer::TokenKind::And => T![&],
                rustc_lexer::TokenKind::Or => T![|],
                rustc_lexer::TokenKind::Plus => T![+],
                rustc_lexer::TokenKind::Star => T![*],
                rustc_lexer::TokenKind::Slash => T![/],
                rustc_lexer::TokenKind::Caret => T![^],
                rustc_lexer::TokenKind::Percent => T![%],
                rustc_lexer::TokenKind::Unknown => ERROR,
                // `builtin` is a known prefix (builtin# syntax), not an error.
                rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
                rustc_lexer::TokenKind::UnknownPrefix => {
                    // Heuristic: an unknown prefix often means an earlier
                    // string literal was never closed; hint at that if the
                    // recent-token scan agrees.
                    let has_unterminated = self.has_likely_unterminated_string();

                    let error_msg = if has_unterminated {
                        format!(
                            "unknown literal prefix `{token_text}` (note: check for unterminated string literal)"
                        )
                    } else {
                        "unknown literal prefix".to_owned()
                    };
                    errors.push(error_msg);
                    IDENT
                }
                rustc_lexer::TokenKind::Eof => EOF,
            }
        };

        self.push(syntax_kind, token_text.len(), errors);
    }

    /// Translate a literal token into a `SyntaxKind`, validating escapes in
    /// terminated char/byte/string literals, and push it.
    fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
        let invalid_raw_msg = String::from("Invalid raw string literal");

        let mut errors = vec![];
        // Shared formatter for "unterminated literal" diagnostics.
        let mut no_end_quote = |c: char, kind: &str| {
            errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
        };

        let syntax_kind = match *kind {
            rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
                if empty_int {
                    errors.push("Missing digits after the integer base prefix".into());
                }
                INT_NUMBER
            }
            rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
                if empty_exponent {
                    errors.push("Missing digits after the exponent symbol".into());
                }
                FLOAT_NUMBER
            }
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    no_end_quote('\'', "character");
                } else {
                    // Skip the opening `'`, then trim everything from the
                    // closing `'` onward (rfind handles possible suffixes).
                    let text = &self.res.text[self.offset + 1..][..len - 1];
                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_char(text) {
                        errors.push(err_to_msg(e, Mode::Char));
                    }
                }
                CHAR
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    no_end_quote('\'', "byte");
                } else {
                    // Skip the `b'` prefix; trim at the closing `'`.
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_byte(text) {
                        errors.push(err_to_msg(e, Mode::Byte));
                    }
                }
                BYTE
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    no_end_quote('"', "string");
                } else {
                    // Skip the opening `"`; trim at the closing `"`. Every
                    // escape error in the body is collected, not just the first.
                    let text = &self.res.text[self.offset + 1..][..len - 1];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::Str));
                        }
                    });
                }
                STRING
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    no_end_quote('"', "byte string");
                } else {
                    // Skip the `b"` prefix; trim at the closing `"`.
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_byte_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::ByteStr));
                        }
                    });
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    no_end_quote('"', "C string")
                } else {
                    // Skip the `c"` prefix; trim at the closing `"`.
                    let text = &self.res.text[self.offset + 2..][..len - 2];
                    let text = &text[..text.rfind('"').unwrap()];
                    unescape_c_str(text, |_, res| {
                        if let Err(e) = res {
                            errors.push(err_to_msg(e, Mode::CStr));
                        }
                    });
                }
                C_STRING
            }
            // For raw literals, `n_hashes == None` means the lexer could not
            // match the hash delimiters; escapes need no validation since raw
            // literals have none.
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                STRING
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if n_hashes.is_none() {
                    errors.push(invalid_raw_msg);
                }
                C_STRING
            }
        };

        self.push(syntax_kind, len, errors);
    }
}
429
430fn err_to_msg(error: EscapeError, mode: Mode) -> String {
431    match error {
432        EscapeError::ZeroChars => "empty character literal",
433        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
434        EscapeError::LoneSlash => "",
435        EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
436            "unknown byte escape"
437        }
438        EscapeError::InvalidEscape => "unknown character escape",
439        EscapeError::BareCarriageReturn => "",
440        EscapeError::BareCarriageReturnInRawString => "",
441        EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
442        EscapeError::EscapeOnlyChar => "character constant must be escaped",
443        EscapeError::TooShortHexEscape => "numeric character escape is too short",
444        EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
445        EscapeError::OutOfRangeHexEscape => "out of range hex escape",
446        EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
447        EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
448        EscapeError::EmptyUnicodeEscape => "empty unicode escape",
449        EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
450        EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
451        EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
452        EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
453        EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
454        EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
455        EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
456            "non-ASCII character in byte literal"
457        }
458        EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
459            "non-ASCII character in byte string literal"
460        }
461        EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
462        EscapeError::NulInCStr => "null character in C string literal",
463        EscapeError::UnskippedWhitespaceWarning => "",
464        EscapeError::MultipleSkippedLinesWarning => "",
465    }
466    .into()
467}