1use std::ops;
12
13use rustc_literal_escaper::{
14 EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
15 unescape_str,
16};
17
18use crate::{
19 Edition,
20 SyntaxKind::{self, *},
21 T,
22};
23
24pub struct LexedStr<'a> {
25 text: &'a str,
26 kind: Vec<SyntaxKind>,
27 start: Vec<u32>,
28 error: Vec<LexError>,
29}
30
/// A single lexer error attached to one token.
struct LexError {
    /// Human-readable description of the problem.
    msg: String,
    /// Index of the token the message belongs to.
    token: u32,
}
35
36impl<'a> LexedStr<'a> {
37 pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
38 let _p = tracing::info_span!("LexedStr::new").entered();
39 let mut conv = Converter::new(edition, text);
40 if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
41 if let Some(shebang) = script.shebang_span() {
42 conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
43 }
44 if script.frontmatter().is_some() {
45 conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
46 }
47 } else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
48 conv.push(SHEBANG, shebang_len, Vec::new());
50 }
51
52 while let Some(token) =
55 rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
56 {
57 let token_text = &text[conv.offset..][..token.len as usize];
58
59 conv.extend_token(&token.kind, token_text);
60 }
61
62 conv.finalize_with_eof()
63 }
64
65 pub fn single_token(edition: Edition, text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
66 if text.is_empty() {
67 return None;
68 }
69
70 let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
71 if token.len as usize != text.len() {
72 return None;
73 }
74
75 let mut conv = Converter::new(edition, text);
76 conv.extend_token(&token.kind, text);
77 match &*conv.res.kind {
78 [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
79 _ => None,
80 }
81 }
82
83 pub fn as_str(&self) -> &str {
84 self.text
85 }
86
87 pub fn len(&self) -> usize {
88 self.kind.len() - 1
89 }
90
91 pub fn is_empty(&self) -> bool {
92 self.len() == 0
93 }
94
95 pub fn kind(&self, i: usize) -> SyntaxKind {
96 assert!(i < self.len());
97 self.kind[i]
98 }
99
100 pub fn text(&self, i: usize) -> &str {
101 self.range_text(i..i + 1)
102 }
103
104 pub fn range_text(&self, r: ops::Range<usize>) -> &str {
105 assert!(r.start < r.end && r.end <= self.len());
106 let lo = self.start[r.start] as usize;
107 let hi = self.start[r.end] as usize;
108 &self.text[lo..hi]
109 }
110
111 pub fn text_range(&self, i: usize) -> ops::Range<usize> {
113 assert!(i < self.len());
114 let lo = self.start[i] as usize;
115 let hi = self.start[i + 1] as usize;
116 lo..hi
117 }
118 pub fn text_start(&self, i: usize) -> usize {
119 assert!(i <= self.len());
120 self.start[i] as usize
121 }
122 pub fn text_len(&self, i: usize) -> usize {
123 assert!(i < self.len());
124 let r = self.text_range(i);
125 r.end - r.start
126 }
127
128 pub fn error(&self, i: usize) -> Option<&str> {
129 assert!(i < self.len());
130 let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
131 Some(self.error[err].msg.as_str())
132 }
133
134 pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
135 self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
136 }
137
138 fn push(&mut self, kind: SyntaxKind, offset: usize) {
139 self.kind.push(kind);
140 self.start.push(offset as u32);
141 }
142}
143
144struct Converter<'a> {
145 res: LexedStr<'a>,
146 offset: usize,
147 edition: Edition,
148}
149
150impl<'a> Converter<'a> {
151 fn new(edition: Edition, text: &'a str) -> Self {
152 Self {
153 res: LexedStr {
154 text,
155 kind: Vec::with_capacity(text.len() / 3),
156 start: Vec::with_capacity(text.len() / 3),
157 error: Vec::new(),
158 },
159 offset: 0,
160 edition,
161 }
162 }
163
164 fn has_likely_unterminated_string(&self) -> bool {
166 let Some(last_idx) = self.res.kind.len().checked_sub(1) else { return false };
167
168 for i in (0..=last_idx).rev().take(5) {
169 if self.res.kind[i] == STRING {
170 let start = self.res.start[i] as usize;
171 let end = self.res.start.get(i + 1).map(|&s| s as usize).unwrap_or(self.offset);
172 let content = &self.res.text[start..end];
173
174 if content.contains('(') && (content.contains("//") || content.contains(";\n")) {
175 return true;
176 }
177 }
178 }
179 false
180 }
181
182 fn finalize_with_eof(mut self) -> LexedStr<'a> {
183 self.res.push(EOF, self.offset);
184 self.res
185 }
186
187 fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
188 self.res.push(kind, self.offset);
189 self.offset += len;
190
191 for msg in errors {
192 if !msg.is_empty() {
193 self.res.error.push(LexError { msg, token: self.res.len() as u32 });
194 }
195 }
196 }
197
198 fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, mut token_text: &str) {
199 let mut errors: Vec<String> = vec![];
204
205 let syntax_kind = {
206 match kind {
207 rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
208 rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
209 if !terminated {
210 errors.push(
211 "Missing trailing `*/` symbols to terminate the block comment".into(),
212 );
213 }
214 COMMENT
215 }
216
217 rustc_lexer::TokenKind::Frontmatter {
218 has_invalid_preceding_whitespace,
219 invalid_infostring,
220 } => {
221 if *has_invalid_preceding_whitespace {
222 errors.push("invalid preceding whitespace for frontmatter opening".into());
223 } else if *invalid_infostring {
224 errors.push("invalid infostring for frontmatter".into());
225 }
226 FRONTMATTER
227 }
228
229 rustc_lexer::TokenKind::Whitespace => WHITESPACE,
230
231 rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
232 rustc_lexer::TokenKind::Ident => {
233 SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
234 }
235 rustc_lexer::TokenKind::InvalidIdent => {
236 errors.push("Ident contains invalid characters".into());
237 IDENT
238 }
239
240 rustc_lexer::TokenKind::RawIdent => IDENT,
241
242 rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
243 errors.push("Invalid string literal (reserved syntax)".into());
245 ERROR
246 }
247 rustc_lexer::TokenKind::GuardedStrPrefix => {
248 token_text = &token_text[1..];
250 POUND
251 }
252
253 rustc_lexer::TokenKind::Literal { kind, .. } => {
254 self.extend_literal(token_text.len(), kind);
255 return;
256 }
257
258 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
259 if *starts_with_number {
260 errors.push("Lifetime name cannot start with a number".into());
261 }
262 LIFETIME_IDENT
263 }
264 rustc_lexer::TokenKind::UnknownPrefixLifetime => {
265 errors.push("Unknown lifetime prefix".into());
266 LIFETIME_IDENT
267 }
268 rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,
269
270 rustc_lexer::TokenKind::Semi => T![;],
271 rustc_lexer::TokenKind::Comma => T![,],
272 rustc_lexer::TokenKind::Dot => T![.],
273 rustc_lexer::TokenKind::OpenParen => T!['('],
274 rustc_lexer::TokenKind::CloseParen => T![')'],
275 rustc_lexer::TokenKind::OpenBrace => T!['{'],
276 rustc_lexer::TokenKind::CloseBrace => T!['}'],
277 rustc_lexer::TokenKind::OpenBracket => T!['['],
278 rustc_lexer::TokenKind::CloseBracket => T![']'],
279 rustc_lexer::TokenKind::At => T![@],
280 rustc_lexer::TokenKind::Pound => T![#],
281 rustc_lexer::TokenKind::Tilde => T![~],
282 rustc_lexer::TokenKind::Question => T![?],
283 rustc_lexer::TokenKind::Colon => T![:],
284 rustc_lexer::TokenKind::Dollar => T![$],
285 rustc_lexer::TokenKind::Eq => T![=],
286 rustc_lexer::TokenKind::Bang => T![!],
287 rustc_lexer::TokenKind::Lt => T![<],
288 rustc_lexer::TokenKind::Gt => T![>],
289 rustc_lexer::TokenKind::Minus => T![-],
290 rustc_lexer::TokenKind::And => T![&],
291 rustc_lexer::TokenKind::Or => T![|],
292 rustc_lexer::TokenKind::Plus => T![+],
293 rustc_lexer::TokenKind::Star => T![*],
294 rustc_lexer::TokenKind::Slash => T![/],
295 rustc_lexer::TokenKind::Caret => T![^],
296 rustc_lexer::TokenKind::Percent => T![%],
297 rustc_lexer::TokenKind::Unknown => ERROR,
298 rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
299 rustc_lexer::TokenKind::UnknownPrefix => {
300 let has_unterminated = self.has_likely_unterminated_string();
301
302 let error_msg = if has_unterminated {
303 format!(
304 "unknown literal prefix `{token_text}` (note: check for unterminated string literal)"
305 )
306 } else {
307 "unknown literal prefix".to_owned()
308 };
309 errors.push(error_msg);
310 IDENT
311 }
312 rustc_lexer::TokenKind::Eof => EOF,
313 }
314 };
315
316 self.push(syntax_kind, token_text.len(), errors);
317 }
318
319 fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
320 let invalid_raw_msg = String::from("Invalid raw string literal");
321
322 let mut errors = vec![];
323 let mut no_end_quote = |c: char, kind: &str| {
324 errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
325 };
326
327 let syntax_kind = match *kind {
328 rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
329 if empty_int {
330 errors.push("Missing digits after the integer base prefix".into());
331 }
332 INT_NUMBER
333 }
334 rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
335 if empty_exponent {
336 errors.push("Missing digits after the exponent symbol".into());
337 }
338 FLOAT_NUMBER
339 }
340 rustc_lexer::LiteralKind::Char { terminated } => {
341 if !terminated {
342 no_end_quote('\'', "character");
343 } else {
344 let text = &self.res.text[self.offset + 1..][..len - 1];
345 let text = &text[..text.rfind('\'').unwrap()];
346 if let Err(e) = unescape_char(text) {
347 errors.push(err_to_msg(e, Mode::Char));
348 }
349 }
350 CHAR
351 }
352 rustc_lexer::LiteralKind::Byte { terminated } => {
353 if !terminated {
354 no_end_quote('\'', "byte");
355 } else {
356 let text = &self.res.text[self.offset + 2..][..len - 2];
357 let text = &text[..text.rfind('\'').unwrap()];
358 if let Err(e) = unescape_byte(text) {
359 errors.push(err_to_msg(e, Mode::Byte));
360 }
361 }
362 BYTE
363 }
364 rustc_lexer::LiteralKind::Str { terminated } => {
365 if !terminated {
366 no_end_quote('"', "string");
367 } else {
368 let text = &self.res.text[self.offset + 1..][..len - 1];
369 let text = &text[..text.rfind('"').unwrap()];
370 unescape_str(text, |_, res| {
371 if let Err(e) = res {
372 errors.push(err_to_msg(e, Mode::Str));
373 }
374 });
375 }
376 STRING
377 }
378 rustc_lexer::LiteralKind::ByteStr { terminated } => {
379 if !terminated {
380 no_end_quote('"', "byte string");
381 } else {
382 let text = &self.res.text[self.offset + 2..][..len - 2];
383 let text = &text[..text.rfind('"').unwrap()];
384 unescape_byte_str(text, |_, res| {
385 if let Err(e) = res {
386 errors.push(err_to_msg(e, Mode::ByteStr));
387 }
388 });
389 }
390 BYTE_STRING
391 }
392 rustc_lexer::LiteralKind::CStr { terminated } => {
393 if !terminated {
394 no_end_quote('"', "C string")
395 } else {
396 let text = &self.res.text[self.offset + 2..][..len - 2];
397 let text = &text[..text.rfind('"').unwrap()];
398 unescape_c_str(text, |_, res| {
399 if let Err(e) = res {
400 errors.push(err_to_msg(e, Mode::CStr));
401 }
402 });
403 }
404 C_STRING
405 }
406 rustc_lexer::LiteralKind::RawStr { n_hashes } => {
407 if n_hashes.is_none() {
408 errors.push(invalid_raw_msg);
409 }
410 STRING
411 }
412 rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
413 if n_hashes.is_none() {
414 errors.push(invalid_raw_msg);
415 }
416 BYTE_STRING
417 }
418 rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
419 if n_hashes.is_none() {
420 errors.push(invalid_raw_msg);
421 }
422 C_STRING
423 }
424 };
425
426 self.push(syntax_kind, len, errors);
427 }
428}
429
430fn err_to_msg(error: EscapeError, mode: Mode) -> String {
431 match error {
432 EscapeError::ZeroChars => "empty character literal",
433 EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
434 EscapeError::LoneSlash => "",
435 EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
436 "unknown byte escape"
437 }
438 EscapeError::InvalidEscape => "unknown character escape",
439 EscapeError::BareCarriageReturn => "",
440 EscapeError::BareCarriageReturnInRawString => "",
441 EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
442 EscapeError::EscapeOnlyChar => "character constant must be escaped",
443 EscapeError::TooShortHexEscape => "numeric character escape is too short",
444 EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
445 EscapeError::OutOfRangeHexEscape => "out of range hex escape",
446 EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
447 EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
448 EscapeError::EmptyUnicodeEscape => "empty unicode escape",
449 EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
450 EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
451 EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
452 EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
453 EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
454 EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
455 EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
456 "non-ASCII character in byte literal"
457 }
458 EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
459 "non-ASCII character in byte string literal"
460 }
461 EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
462 EscapeError::NulInCStr => "null character in C string literal",
463 EscapeError::UnskippedWhitespaceWarning => "",
464 EscapeError::MultipleSkippedLinesWarning => "",
465 }
466 .into()
467}