1use std::ops;
12
13use rustc_literal_escaper::{
14 EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
15 unescape_str,
16};
17
18use crate::{
19 Edition,
20 SyntaxKind::{self, *},
21 T,
22};
23
24pub struct LexedStr<'a> {
25 text: &'a str,
26 kind: Vec<SyntaxKind>,
27 start: Vec<u32>,
28 error: Vec<LexError>,
29}
30
/// A single lexing error attached to one token.
struct LexError {
    /// Human-readable description of the problem.
    msg: String,
    /// Index of the offending token within the owning `LexedStr`.
    token: u32,
}
35
36impl<'a> LexedStr<'a> {
37 pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
38 let _p = tracing::info_span!("LexedStr::new").entered();
39 let mut conv = Converter::new(edition, text);
40 if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
41 conv.res.push(SHEBANG, conv.offset);
42 conv.offset = shebang_len;
43 };
44
45 while let Some(token) =
48 rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
49 {
50 let token_text = &text[conv.offset..][..token.len as usize];
51
52 conv.extend_token(&token.kind, token_text);
53 }
54
55 conv.finalize_with_eof()
56 }
57
58 pub fn single_token(edition: Edition, text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
59 if text.is_empty() {
60 return None;
61 }
62
63 let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
64 if token.len as usize != text.len() {
65 return None;
66 }
67
68 let mut conv = Converter::new(edition, text);
69 conv.extend_token(&token.kind, text);
70 match &*conv.res.kind {
71 [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
72 _ => None,
73 }
74 }
75
76 pub fn as_str(&self) -> &str {
77 self.text
78 }
79
80 pub fn len(&self) -> usize {
81 self.kind.len() - 1
82 }
83
84 pub fn is_empty(&self) -> bool {
85 self.len() == 0
86 }
87
88 pub fn kind(&self, i: usize) -> SyntaxKind {
89 assert!(i < self.len());
90 self.kind[i]
91 }
92
93 pub fn text(&self, i: usize) -> &str {
94 self.range_text(i..i + 1)
95 }
96
97 pub fn range_text(&self, r: ops::Range<usize>) -> &str {
98 assert!(r.start < r.end && r.end <= self.len());
99 let lo = self.start[r.start] as usize;
100 let hi = self.start[r.end] as usize;
101 &self.text[lo..hi]
102 }
103
104 pub fn text_range(&self, i: usize) -> ops::Range<usize> {
106 assert!(i < self.len());
107 let lo = self.start[i] as usize;
108 let hi = self.start[i + 1] as usize;
109 lo..hi
110 }
111 pub fn text_start(&self, i: usize) -> usize {
112 assert!(i <= self.len());
113 self.start[i] as usize
114 }
115 pub fn text_len(&self, i: usize) -> usize {
116 assert!(i < self.len());
117 let r = self.text_range(i);
118 r.end - r.start
119 }
120
121 pub fn error(&self, i: usize) -> Option<&str> {
122 assert!(i < self.len());
123 let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
124 Some(self.error[err].msg.as_str())
125 }
126
127 pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
128 self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
129 }
130
131 fn push(&mut self, kind: SyntaxKind, offset: usize) {
132 self.kind.push(kind);
133 self.start.push(offset as u32);
134 }
135}
136
137struct Converter<'a> {
138 res: LexedStr<'a>,
139 offset: usize,
140 edition: Edition,
141}
142
143impl<'a> Converter<'a> {
144 fn new(edition: Edition, text: &'a str) -> Self {
145 Self {
146 res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() },
147 offset: 0,
148 edition,
149 }
150 }
151
152 fn finalize_with_eof(mut self) -> LexedStr<'a> {
153 self.res.push(EOF, self.offset);
154 self.res
155 }
156
157 fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
158 self.res.push(kind, self.offset);
159 self.offset += len;
160
161 for msg in errors {
162 if !msg.is_empty() {
163 self.res.error.push(LexError { msg, token: self.res.len() as u32 });
164 }
165 }
166 }
167
168 fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, mut token_text: &str) {
169 let mut errors: Vec<String> = vec![];
174
175 let syntax_kind = {
176 match kind {
177 rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
178 rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
179 if !terminated {
180 errors.push(
181 "Missing trailing `*/` symbols to terminate the block comment".into(),
182 );
183 }
184 COMMENT
185 }
186
187 rustc_lexer::TokenKind::Frontmatter {
188 has_invalid_preceding_whitespace,
189 invalid_infostring,
190 } => {
191 if *has_invalid_preceding_whitespace {
192 errors.push("invalid preceding whitespace for frontmatter opening".into());
193 } else if *invalid_infostring {
194 errors.push("invalid infostring for frontmatter".into());
195 }
196 FRONTMATTER
197 }
198
199 rustc_lexer::TokenKind::Whitespace => WHITESPACE,
200
201 rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
202 rustc_lexer::TokenKind::Ident => {
203 SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
204 }
205 rustc_lexer::TokenKind::InvalidIdent => {
206 errors.push("Ident contains invalid characters".into());
207 IDENT
208 }
209
210 rustc_lexer::TokenKind::RawIdent => IDENT,
211
212 rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
213 errors.push("Invalid string literal (reserved syntax)".into());
215 ERROR
216 }
217 rustc_lexer::TokenKind::GuardedStrPrefix => {
218 token_text = &token_text[1..];
220 POUND
221 }
222
223 rustc_lexer::TokenKind::Literal { kind, .. } => {
224 self.extend_literal(token_text.len(), kind);
225 return;
226 }
227
228 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
229 if *starts_with_number {
230 errors.push("Lifetime name cannot start with a number".into());
231 }
232 LIFETIME_IDENT
233 }
234 rustc_lexer::TokenKind::UnknownPrefixLifetime => {
235 errors.push("Unknown lifetime prefix".into());
236 LIFETIME_IDENT
237 }
238 rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,
239
240 rustc_lexer::TokenKind::Semi => T![;],
241 rustc_lexer::TokenKind::Comma => T![,],
242 rustc_lexer::TokenKind::Dot => T![.],
243 rustc_lexer::TokenKind::OpenParen => T!['('],
244 rustc_lexer::TokenKind::CloseParen => T![')'],
245 rustc_lexer::TokenKind::OpenBrace => T!['{'],
246 rustc_lexer::TokenKind::CloseBrace => T!['}'],
247 rustc_lexer::TokenKind::OpenBracket => T!['['],
248 rustc_lexer::TokenKind::CloseBracket => T![']'],
249 rustc_lexer::TokenKind::At => T![@],
250 rustc_lexer::TokenKind::Pound => T![#],
251 rustc_lexer::TokenKind::Tilde => T![~],
252 rustc_lexer::TokenKind::Question => T![?],
253 rustc_lexer::TokenKind::Colon => T![:],
254 rustc_lexer::TokenKind::Dollar => T![$],
255 rustc_lexer::TokenKind::Eq => T![=],
256 rustc_lexer::TokenKind::Bang => T![!],
257 rustc_lexer::TokenKind::Lt => T![<],
258 rustc_lexer::TokenKind::Gt => T![>],
259 rustc_lexer::TokenKind::Minus => T![-],
260 rustc_lexer::TokenKind::And => T![&],
261 rustc_lexer::TokenKind::Or => T![|],
262 rustc_lexer::TokenKind::Plus => T![+],
263 rustc_lexer::TokenKind::Star => T![*],
264 rustc_lexer::TokenKind::Slash => T![/],
265 rustc_lexer::TokenKind::Caret => T![^],
266 rustc_lexer::TokenKind::Percent => T![%],
267 rustc_lexer::TokenKind::Unknown => ERROR,
268 rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
269 rustc_lexer::TokenKind::UnknownPrefix => {
270 errors.push("unknown literal prefix".into());
271 IDENT
272 }
273 rustc_lexer::TokenKind::Eof => EOF,
274 }
275 };
276
277 self.push(syntax_kind, token_text.len(), errors);
278 }
279
280 fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
281 let invalid_raw_msg = String::from("Invalid raw string literal");
282
283 let mut errors = vec![];
284 let mut no_end_quote = |c: char, kind: &str| {
285 errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
286 };
287
288 let syntax_kind = match *kind {
289 rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
290 if empty_int {
291 errors.push("Missing digits after the integer base prefix".into());
292 }
293 INT_NUMBER
294 }
295 rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
296 if empty_exponent {
297 errors.push("Missing digits after the exponent symbol".into());
298 }
299 FLOAT_NUMBER
300 }
301 rustc_lexer::LiteralKind::Char { terminated } => {
302 if !terminated {
303 no_end_quote('\'', "character");
304 } else {
305 let text = &self.res.text[self.offset + 1..][..len - 1];
306 let text = &text[..text.rfind('\'').unwrap()];
307 if let Err(e) = unescape_char(text) {
308 errors.push(err_to_msg(e, Mode::Char));
309 }
310 }
311 CHAR
312 }
313 rustc_lexer::LiteralKind::Byte { terminated } => {
314 if !terminated {
315 no_end_quote('\'', "byte");
316 } else {
317 let text = &self.res.text[self.offset + 2..][..len - 2];
318 let text = &text[..text.rfind('\'').unwrap()];
319 if let Err(e) = unescape_byte(text) {
320 errors.push(err_to_msg(e, Mode::Byte));
321 }
322 }
323 BYTE
324 }
325 rustc_lexer::LiteralKind::Str { terminated } => {
326 if !terminated {
327 no_end_quote('"', "string");
328 } else {
329 let text = &self.res.text[self.offset + 1..][..len - 1];
330 let text = &text[..text.rfind('"').unwrap()];
331 unescape_str(text, |_, res| {
332 if let Err(e) = res {
333 errors.push(err_to_msg(e, Mode::Str));
334 }
335 });
336 }
337 STRING
338 }
339 rustc_lexer::LiteralKind::ByteStr { terminated } => {
340 if !terminated {
341 no_end_quote('"', "byte string");
342 } else {
343 let text = &self.res.text[self.offset + 2..][..len - 2];
344 let text = &text[..text.rfind('"').unwrap()];
345 unescape_byte_str(text, |_, res| {
346 if let Err(e) = res {
347 errors.push(err_to_msg(e, Mode::ByteStr));
348 }
349 });
350 }
351 BYTE_STRING
352 }
353 rustc_lexer::LiteralKind::CStr { terminated } => {
354 if !terminated {
355 no_end_quote('"', "C string")
356 } else {
357 let text = &self.res.text[self.offset + 2..][..len - 2];
358 let text = &text[..text.rfind('"').unwrap()];
359 unescape_c_str(text, |_, res| {
360 if let Err(e) = res {
361 errors.push(err_to_msg(e, Mode::CStr));
362 }
363 });
364 }
365 C_STRING
366 }
367 rustc_lexer::LiteralKind::RawStr { n_hashes } => {
368 if n_hashes.is_none() {
369 errors.push(invalid_raw_msg);
370 }
371 STRING
372 }
373 rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
374 if n_hashes.is_none() {
375 errors.push(invalid_raw_msg);
376 }
377 BYTE_STRING
378 }
379 rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
380 if n_hashes.is_none() {
381 errors.push(invalid_raw_msg);
382 }
383 C_STRING
384 }
385 };
386
387 self.push(syntax_kind, len, errors);
388 }
389}
390
391fn err_to_msg(error: EscapeError, mode: Mode) -> String {
392 match error {
393 EscapeError::ZeroChars => "empty character literal",
394 EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
395 EscapeError::LoneSlash => "",
396 EscapeError::InvalidEscape if mode == Mode::Byte || mode == Mode::ByteStr => {
397 "unknown byte escape"
398 }
399 EscapeError::InvalidEscape => "unknown character escape",
400 EscapeError::BareCarriageReturn => "",
401 EscapeError::BareCarriageReturnInRawString => "",
402 EscapeError::EscapeOnlyChar if mode == Mode::Byte => "byte constant must be escaped",
403 EscapeError::EscapeOnlyChar => "character constant must be escaped",
404 EscapeError::TooShortHexEscape => "numeric character escape is too short",
405 EscapeError::InvalidCharInHexEscape => "invalid character in numeric character escape",
406 EscapeError::OutOfRangeHexEscape => "out of range hex escape",
407 EscapeError::NoBraceInUnicodeEscape => "incorrect unicode escape sequence",
408 EscapeError::InvalidCharInUnicodeEscape => "invalid character in unicode escape",
409 EscapeError::EmptyUnicodeEscape => "empty unicode escape",
410 EscapeError::UnclosedUnicodeEscape => "unterminated unicode escape",
411 EscapeError::LeadingUnderscoreUnicodeEscape => "invalid start of unicode escape",
412 EscapeError::OverlongUnicodeEscape => "overlong unicode escape",
413 EscapeError::LoneSurrogateUnicodeEscape => "invalid unicode character escape",
414 EscapeError::OutOfRangeUnicodeEscape => "invalid unicode character escape",
415 EscapeError::UnicodeEscapeInByte => "unicode escape in byte string",
416 EscapeError::NonAsciiCharInByte if mode == Mode::Byte => {
417 "non-ASCII character in byte literal"
418 }
419 EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => {
420 "non-ASCII character in byte string literal"
421 }
422 EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal",
423 EscapeError::NulInCStr => "null character in C string literal",
424 EscapeError::UnskippedWhitespaceWarning => "",
425 EscapeError::MultipleSkippedLinesWarning => "",
426 }
427 .into()
428}