1use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27 InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
// Internal lexer submodules: low-level character cursor, indentation
// bookkeeping, and f/t-string (interpolated string) context tracking.
mod cursor;
mod indentation;
mod interpolated_string;

/// Unicode byte-order mark; eaten at offset 0 by `Lexer::new` if present.
const BOM: char = '\u{feff}';
37
#[derive(Debug)]
pub struct Lexer<'src> {
    /// Source code to be lexed.
    source: &'src str,

    /// A pointer to the current character of the source code being lexed.
    cursor: Cursor<'src>,

    /// The kind of the current (most recently lexed) token.
    current_kind: TokenKind,

    /// The range of the current token.
    current_range: TextRange,

    /// The value of the current token, taken by `take_value`.
    current_value: TokenValue,

    /// Flags (prefix/quote/unclosed information) for the current token.
    current_flags: TokenFlags,

    /// Lexer state (start of line, after `=`, etc.).
    state: State,

    /// Current depth of open parentheses/brackets/braces.
    nesting: u32,

    /// Stack of indentation levels for the enclosing blocks.
    indentations: Indentations,
    // A dedent target still being unwound; `lex_token` emits one `Dedent`
    // per call until the stack matches this indentation again.
    pending_indentation: Option<Indentation>,

    /// Lexer mode (module, expression, IPython, ...).
    mode: Mode,

    /// Stack of active f/t-string contexts.
    interpolated_strings: InterpolatedStrings,

    /// Errors encountered while lexing; reported alongside the tokens.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Creates a lexer for `source` that starts lexing at `start_offset`.
    ///
    /// In [`Mode::ParenthesizedExpression`] the lexer behaves as if it were
    /// already inside one pair of parentheses (nesting 1, no logical-line
    /// handling); otherwise it starts at the beginning of a logical line.
    ///
    /// # Panics
    ///
    /// Panics if the source is larger than 4GB, because offsets are stored
    /// as `u32`.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        // A BOM is only skipped when lexing from the very start of the file;
        // otherwise jump straight to the requested offset.
        if start_offset == TextSize::new(0) {
            lexer.cursor.eat_char(BOM);
        } else {
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the most recently lexed token.
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the range of the most recently lexed token.
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags of the most recently lexed token.
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
    /// Takes ownership of the current token's value, leaving
    /// `TokenValue::None` in its place.
    pub(crate) fn take_value(&mut self) -> TokenValue {
        std::mem::take(&mut self.current_value)
    }
147
    /// Records `error`, points `current_range` at its location, and returns
    /// `TokenKind::Unknown` so the caller can surface an error token.
    fn push_error(&mut self, error: LexicalError) -> TokenKind {
        self.current_range = error.location();
        self.errors.push(error);
        TokenKind::Unknown
    }
155
    /// Lexes and returns the next token, resetting the per-token value and
    /// flags first.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        // For `Unknown` tokens, `push_error` already set `current_range` to
        // the error location; don't overwrite it.
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
169 fn lex_token(&mut self) -> TokenKind {
170 if let Some(interpolated_string) = self.interpolated_strings.current()
171 && !interpolated_string.is_in_interpolation(self.nesting)
172 {
173 if let Some(token) = self.lex_interpolated_string_middle_or_end() {
174 if token.is_interpolated_string_end() {
175 self.interpolated_strings.pop();
176 }
177 return token;
178 }
179 }
180 else if let Some(indentation) = self.pending_indentation.take() {
182 match self.indentations.current().try_compare(indentation) {
183 Ok(Ordering::Greater) => {
184 self.pending_indentation = Some(indentation);
185 if self.indentations.dedent_one(indentation).is_err() {
186 return self.push_error(LexicalError::new(
187 LexicalErrorType::IndentationError,
188 self.token_range(),
189 ));
190 }
191 return TokenKind::Dedent;
192 }
193 Ok(_) => {}
194 Err(_) => {
195 return self.push_error(LexicalError::new(
196 LexicalErrorType::IndentationError,
197 self.token_range(),
198 ));
199 }
200 }
201 }
202
203 if self.state.is_after_newline() {
204 if let Some(indentation) = self.eat_indentation() {
205 return indentation;
206 }
207 } else if let Err(error) = self.skip_whitespace() {
208 return self.push_error(error);
209 }
210
211 self.cursor.start_token();
213
214 if let Some(c) = self.cursor.bump() {
215 if c.is_ascii() {
216 self.consume_ascii_character(c)
217 } else if is_unicode_identifier_start(c) {
218 let identifier = self.lex_identifier(c);
219 self.state = State::Other;
220
221 identifier
222 } else {
223 self.push_error(LexicalError::new(
224 LexicalErrorType::UnrecognizedToken { tok: c },
225 self.token_range(),
226 ))
227 }
228 } else {
229 self.consume_end()
232 }
233 }
234
    /// Consumes the leading whitespace of a (potential) logical line and,
    /// when the line turns out to be non-empty, delegates to
    /// [`Self::handle_indentation`] to emit an `Indent`/`Dedent` token.
    /// Returns `None` for blank lines, comment-only lines, EOF, or when the
    /// indentation level is unchanged.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    // A backslash here is only valid as a line continuation.
                    self.cursor.bump();
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    // Indentation is measured afresh on the continued line.
                    indentation = Indentation::root();
                }
                '\x0C' => {
                    // A form feed resets the indentation measurement.
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Only a line that actually contains a token affects indentation;
        // lines holding just a comment or a line break are skipped.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            return self.handle_indentation(indentation);
        }

        None
    }
285
    /// Compares `indentation` against the top of the indentation stack and
    /// returns the resulting `Indent`/`Dedent` token, if any.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // Dedent: pop one level now and keep the target pending so that
            // `lex_token` can emit further `Dedent` tokens on later calls.
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                // Restart the token range before emitting the `Dedent`.
                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // Indent: push the new, deeper level.
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            // `try_compare` failed — presumably an ambiguous tab/space mix;
            // see `Indentation::try_compare` for the exact condition.
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
330
331 fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
332 loop {
333 match self.cursor.first() {
334 ' ' => {
335 self.cursor.bump();
336 }
337 '\t' => {
338 self.cursor.bump();
339 }
340 '\\' => {
341 self.cursor.bump();
342 if self.cursor.eat_char('\r') {
343 self.cursor.eat_char('\n');
344 } else if !self.cursor.eat_char('\n') {
345 return Err(LexicalError::new(
346 LexicalErrorType::LineContinuationError,
347 TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
348 ));
349 }
350 if self.cursor.is_eof() {
351 return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
352 }
353 }
354 '\x0C' => {
356 self.cursor.bump();
357 }
358 _ => break,
359 }
360 }
361
362 Ok(())
363 }
364
    /// Lexes a token whose first character is the already-consumed ASCII
    /// character `c`.
    ///
    /// Sets `self.state` to `Other` for every token produced through the
    /// `match`; arms that `return` early (comments, `=`, newlines, errors)
    /// manage the state themselves.
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments return early so `self.state` is left untouched.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // `AfterEqual` enables IPython escape commands directly
                    // after an assignment (see the `%`/`!` arm below).
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command right after `=` (e.g. `foo = !ls`).
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap())
            }

            // IPython escape command at the start of a logical line; a
            // two-character escape (e.g. `%%`) is preferred when it parses.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                // `saturating_sub` guards against unbalanced closers.
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f/t-string's own nesting level is a
                    // syntax error (it must be escaped as `}}`).
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // A `:` that starts an f/t-string format spec is always a
                // plain colon, even if followed by `=`.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                if self.cursor.first().is_ascii_digit() {
                    // A float literal like `.5`.
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // Only a logical `Newline` outside parentheses and on a
                // non-empty line; otherwise it's non-logical.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // Normalize `\r\n` to a single line break.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
616
    /// Lexes an identifier or keyword starting at `first` (already consumed),
    /// or — when `first` turns out to be a one- or two-character string
    /// prefix followed by a quote — a string/f-string/t-string literal.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        // Detect a string literal disguised as an identifier: a valid prefix
        // (set into `current_flags` by the `try_*_prefix` helpers) directly
        // followed by a quote character.
        let quote = match (first, self.cursor.first()) {
            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
                self.cursor.bump();
                quote
            }),
            (_, second) if is_quote(self.cursor.second()) => {
                self.try_double_char_prefix([first, second]).then(|| {
                    self.cursor.bump();
                    self.cursor.bump().unwrap()
                })
            }
            _ => None,
        };

        if let Some(quote) = quote {
            if self.current_flags.is_interpolated_string()
                && let Some(kind) = self.lex_interpolated_string_start(quote)
            {
                return kind;
            }

            return self.lex_string(quote);
        }

        // Track whether all characters stay ASCII so the common case can
        // skip NFKC normalization below.
        let mut is_ascii = first.is_ascii();
        self.cursor
            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

        if !is_ascii {
            // Non-ASCII identifiers are NFKC-normalized per the Python spec.
            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
            return TokenKind::Name;
        }

        // No keyword is longer than 8 characters ("continue", "nonlocal"),
        // so longer identifiers can't be keywords.
        if text.len() > 8 {
            self.current_value = TokenValue::Name(Name::new(text));
            return TokenKind::Name;
        }

        match text {
            "False" => TokenKind::False,
            "None" => TokenKind::None,
            "True" => TokenKind::True,
            "and" => TokenKind::And,
            "as" => TokenKind::As,
            "assert" => TokenKind::Assert,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "break" => TokenKind::Break,
            "case" => TokenKind::Case,
            "class" => TokenKind::Class,
            "continue" => TokenKind::Continue,
            "def" => TokenKind::Def,
            "del" => TokenKind::Del,
            "elif" => TokenKind::Elif,
            "else" => TokenKind::Else,
            "except" => TokenKind::Except,
            "finally" => TokenKind::Finally,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "global" => TokenKind::Global,
            "if" => TokenKind::If,
            "import" => TokenKind::Import,
            "in" => TokenKind::In,
            "is" => TokenKind::Is,
            "lambda" => TokenKind::Lambda,
            "match" => TokenKind::Match,
            "nonlocal" => TokenKind::Nonlocal,
            "not" => TokenKind::Not,
            "or" => TokenKind::Or,
            "pass" => TokenKind::Pass,
            "raise" => TokenKind::Raise,
            "return" => TokenKind::Return,
            "try" => TokenKind::Try,
            "type" => TokenKind::Type,
            "while" => TokenKind::While,
            "with" => TokenKind::With,
            "yield" => TokenKind::Yield,
            _ => {
                self.current_value = TokenValue::Name(Name::new(text));
                TokenKind::Name
            }
        }
    }
716
717 fn try_single_char_prefix(&mut self, first: char) -> bool {
720 match first {
721 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
722 't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
723 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
724 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
725 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
726 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
727 _ => return false,
728 }
729 true
730 }
731
732 fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
735 match value {
736 ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
737 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
738 }
739 ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
740 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
741 }
742 ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
743 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
744 }
745 ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
746 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
747 }
748 ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
749 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
750 }
751 ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
752 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
753 }
754 _ => return false,
755 }
756 true
757 }
758
    /// Called after an f/t-string prefix and its opening `quote` have been
    /// consumed. Pushes a new interpolated-string context and returns the
    /// matching start token, or `None` when the current flags don't form an
    /// interpolated string (see `InterpolatedStringContext::new`).
    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;

        let kind = ftcontext.kind();

        self.interpolated_strings.push(ftcontext);

        Some(kind.start_token())
    }
780
    /// Lexes the literal part of an f/t-string between interpolations, or the
    /// string's closing quote(s).
    ///
    /// Returns the end token when the closing quote is found immediately, a
    /// middle token for a non-empty literal part, or `None` when the literal
    /// part is empty (the caller then lexes the `{`/`}` token directly).
    /// Escaped braces (`{{`, `}}`) are collapsed into `normalized`.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // Check whether the string ends right here.
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // Accumulates the token text with `{{`/`}}` collapsed to `{`/`}`;
        // stays empty when no escaped brace occurs.
        let mut normalized = String::new();

        // Start of the source slice not yet copied into `normalized`.
        let mut last_offset = self.offset();

        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // Inside `\N{...}` the braces are part of the escape, not tokens.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    // Unwind to the nesting level where the string started.
                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump();
                    // Don't consume `{` or `}` after the backslash — they
                    // must still be emitted as tokens.
                    if matches!(self.cursor.first(), '{' | '}') {
                        continue;
                    } else if !interpolated_string.is_raw_string()
                        && self.cursor.eat_char2('N', '{')
                    {
                        in_named_unicode = true;
                        continue;
                    }
                    // Consume the escaped character (`\r\n` as one unit).
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        // Only the full triple-quote sequence terminates a
                        // triple-quoted string.
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        // `{{` escape: copy up to the first `{`, then skip
                        // the second one.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        // Closing brace of `\N{...}`.
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        // `}}` escape, handled like `{{` above.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
931
    /// Lexes a regular (non-interpolated) string literal; the opening `quote`
    /// has already been consumed.
    ///
    /// Uses `memchr` to scan for the closing quote. For single-quoted strings
    /// an unescaped line break terminates the literal with an
    /// `UnclosedStringError`; triple-quoted strings may span lines.
    fn lex_string(&mut self, quote: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more identical quotes mean a triple-quoted string.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let value_start = self.offset();

        let quote_byte = u8::try_from(quote).expect("char that fits in u8");
        let value_end = if self.current_flags.is_triple_quoted() {
            loop {
                let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
                    // No quote before EOF: unterminated string.
                    self.cursor.skip_to_end();

                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::UnclosedStringError,
                        self.token_range(),
                    ));
                    break self.offset();
                };

                // Count trailing backslashes to tell whether the quote is
                // escaped: an odd count means it is.
                let num_backslashes = self.cursor.rest().as_bytes()[..index]
                    .iter()
                    .rev()
                    .take_while(|&&c| c == b'\\')
                    .count();

                // Move past the quote we just found.
                self.cursor.skip_bytes(index + 1);

                if num_backslashes % 2 == 1 {
                    continue;
                }

                // Two more quotes complete the closing triple quote.
                if self.cursor.eat_char2(quote, quote) {
                    break self.offset() - TextSize::new(3);
                }
            }
        } else {
            loop {
                let Some(index) =
                    memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
                else {
                    self.cursor.skip_to_end();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::UnclosedStringError,
                        self.token_range(),
                    ));

                    break self.offset();
                };

                // As above: odd trailing-backslash count means escaped.
                let num_backslashes = self.cursor.rest().as_bytes()[..index]
                    .iter()
                    .rev()
                    .take_while(|&&c| c == b'\\')
                    .count();

                // Stop just before the found byte so we can inspect it.
                self.cursor.skip_bytes(index);

                let quote_or_newline = self.cursor.first();

                if num_backslashes % 2 == 1 {
                    // Escaped quote or escaped line break (continuation).
                    self.cursor.bump();
                    if quote_or_newline == '\r' {
                        self.cursor.eat_char('\n');
                    }
                    continue;
                }

                match quote_or_newline {
                    '\r' | '\n' => {
                        // Unescaped line break inside a single-quoted string.
                        self.current_flags |= TokenFlags::UNCLOSED_STRING;
                        self.push_error(LexicalError::new(
                            LexicalErrorType::UnclosedStringError,
                            self.token_range(),
                        ));
                        break self.offset();
                    }
                    ch if ch == quote => {
                        let value_end = self.offset();
                        self.cursor.bump();
                        break value_end;
                    }
                    _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
                }
            }
        };

        self.current_value = TokenValue::String(
            self.source[TextRange::new(value_start, value_end)]
                .to_string()
                .into_boxed_str(),
        );

        TokenKind::String
    }
1054
1055 fn lex_number(&mut self, first: char) -> TokenKind {
1057 if first == '0' {
1058 if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1059 self.lex_number_radix(Radix::Hex)
1060 } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1061 self.lex_number_radix(Radix::Octal)
1062 } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1063 self.lex_number_radix(Radix::Binary)
1064 } else {
1065 self.lex_decimal_number(first)
1066 }
1067 } else {
1068 self.lex_decimal_number(first)
1069 }
1070 }
1071
    /// Lexes a hex/octal/binary integer literal; the radix prefix (`0x`,
    /// `0o`, `0b`) has already been consumed.
    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(matches!(
            self.cursor.previous().to_ascii_lowercase(),
            'x' | 'o' | 'b'
        ));

        // Collect the digits, skipping digit-separator underscores.
        let mut number = LexedText::new(self.offset(), self.source);
        self.radix_run(&mut number, radix);

        // Full token text, used for error reporting by `Int::from_str_radix`.
        let token = &self.source[self.token_range()];

        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
            Ok(int) => int,
            Err(err) => {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                    self.token_range(),
                ));
            }
        };
        self.current_value = TokenValue::Int(value);
        TokenKind::Int
    }
1099
    /// Lexes a decimal number — integer, float, or complex (`j`/`J` suffix).
    /// `first_digit_or_dot` has already been consumed and is either a digit
    /// or `.` (for literals like `.5`).
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Remembered to reject integers with leading zeros (e.g. `077`).
        let start_is_zero = first_digit_or_dot == '0';

        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore directly after the decimal point is invalid.
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            false
        };

        // An `e`/`E` only starts an exponent when followed by an optionally
        // signed digit; otherwise it belongs to a following identifier.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // A `j`/`J` suffix makes it an imaginary (complex) literal.
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        // Leading zeros are only allowed for zero itself
                        // (`0`, `00`, ...); `077`-style literals are errors.
                        if start_is_zero && value.as_u8() != Some(0) {
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1201
1202 fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1206 loop {
1207 if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1208 number.push(c);
1209 }
1210 else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1212 self.cursor.bump();
1214 number.skip_char();
1215 } else {
1216 break;
1217 }
1218 }
1219 }
1220
1221 fn lex_comment(&mut self) -> TokenKind {
1223 #[cfg(debug_assertions)]
1224 debug_assert_eq!(self.cursor.previous(), '#');
1225
1226 let bytes = self.cursor.rest().as_bytes();
1227 let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1228 self.cursor.skip_bytes(offset);
1229
1230 TokenKind::Comment
1231 }
1232
    /// Lexes an IPython escape command (e.g. `%magic`, `!shell`, `obj?`),
    /// collecting the command text up to the end of the line.
    ///
    /// Trailing `?`/`??` turn the command into a help request
    /// (`IpyEscapeKind::Help`/`Help2`); question marks elsewhere are part of
    /// the command text.
    fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // A backslash before a line break is a continuation and
                    // is dropped together with the break; any other backslash
                    // is kept literally.
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Question marks only act as a help suffix when there are
                    // at most two, they follow a non-whitespace character,
                    // and they sit at the very end of the line. Otherwise
                    // they are literal command text.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // Strip the leading `?`s/spaces of an already-help
                        // command (e.g. `?foo?`).
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Keep the magic prefix as part of the help target.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                '\n' | '\r' | EOF_CHAR => {
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1350
    /// Handles the end of the source: reports unterminated f/t-strings and
    /// unbalanced parentheses, then emits a final `Newline` (if the last line
    /// wasn't terminated), any remaining `Dedent` tokens, and finally
    /// `EndOfFile`.
    fn consume_end(&mut self) -> TokenKind {
        // Any f/t-string still open at EOF is unterminated.
        while let Some(interpolated_string) = self.interpolated_strings.pop() {
            self.nesting = interpolated_string.nesting();
            self.push_error(LexicalError::new(
                LexicalErrorType::from_interpolated_string_error(
                    InterpolatedStringErrorType::UnterminatedString,
                    interpolated_string.kind(),
                ),
                self.token_range(),
            ));
        }

        // `ParenthesizedExpression` mode starts at nesting 1, so that level
        // is expected to remain open.
        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);

        if self.nesting > init_nesting {
            // Reset nesting so repeated calls don't re-report the error.
            self.nesting = 0;
            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
        }

        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            TokenKind::Newline
        }
        // One `Dedent` per remaining indentation level.
        else if self.indentations.dedent().is_some() {
            TokenKind::Dedent
        } else {
            TokenKind::EndOfFile
        }
    }
1389
    /// Re-lexes from `non_logical_newline_start` with one less nesting level,
    /// so that a newline previously emitted as `NonLogicalNewline` can become
    /// a logical `Newline`. Presumably used by the parser for error recovery
    /// when it decides an opening bracket was never closed — confirm against
    /// the caller.
    ///
    /// Returns `true` when the lexer position was moved back and a token was
    /// re-lexed.
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        if self.nesting == 0 {
            return false;
        }

        // Unconditionally drop one nesting level for the unclosed bracket.
        self.nesting -= 1;

        // Inside a triple-quoted f/t-string, newlines are part of the string;
        // re-lexing them as logical newlines would be wrong.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // Since we're moving back before the current token, a closing
        // bracket token must put the lexer back into its parenthesized
        // context (undoing the decrement above).
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1493
    /// Re-interprets the current unterminated string token as the closing
    /// quotes of the innermost interpolated (f/t-)string.
    ///
    /// In source like `f"{x"` the quote after `x` lexes as a new, unclosed
    /// string literal. If that "string" has no prefix, matches the enclosing
    /// interpolated string's quote character and triple-quoting, and is
    /// followed only by whitespace up to a newline or comment, it is treated
    /// as the interpolated string's terminator instead: the token is shrunk
    /// to just the quotes, retyped to `kind`'s end token, and the
    /// `UnclosedStringError` reported for it is dropped.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only meaningful while lexing inside an interpolated string.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // The token must look exactly like the enclosing string's closing
        // quotes: an unclosed, prefix-less string with the same quote style.
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Everything after the opening quotes up to the end of the line (or a
        // `#` comment) must be whitespace; any other character means this
        // really is the start of a string literal.
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            if !is_python_whitespace(c) {
                return;
            }
        }

        // Drop the `UnclosedStringError` that was pushed for this token.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the token to just the quote characters and retype it as the
        // end of the interpolated string.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // Restore the nesting level from before the interpolated string and
        // close it.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Reposition the cursor just past the (re-interpreted) closing quotes.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1557
1558 pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
1572 if matches!(self.current_kind, TokenKind::String)
1575 && self.current_flags.is_unclosed()
1576 && self.current_flags.prefix()
1577 == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
1578 {
1579 if self.errors.last().is_some_and(|error| {
1580 error.location() == self.current_range
1581 && matches!(error.error(), LexicalErrorType::UnclosedStringError)
1582 }) {
1583 self.errors.pop();
1584 }
1585
1586 self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
1587 self.current_kind = TokenKind::Name;
1588 self.current_value = TokenValue::Name(Name::new_static("r"));
1589 self.current_flags = TokenFlags::empty();
1590 self.cursor = Cursor::new(self.source);
1591 self.cursor.skip_bytes(self.current_range.end().to_usize());
1592 }
1593 }
1594
1595 #[inline]
1596 fn token_range(&self) -> TextRange {
1597 let end = self.offset();
1598 let len = self.cursor.token_len();
1599
1600 TextRange::at(end - len, len)
1601 }
1602
    /// The slice of the source text covered by the current token.
    #[inline]
    fn token_text(&self) -> &'src str {
        &self.source[self.token_range()]
    }
1607
    /// The cursor's absolute byte offset from the start of the source,
    /// computed as total length minus the text remaining in the cursor.
    #[expect(clippy::cast_possible_truncation)]
    #[inline]
    fn offset(&self) -> TextSize {
        // `Lexer::new` asserts the source fits in a `u32` (max 4GB), so this
        // cast cannot truncate.
        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
    }
1615
    /// The start offset of the token currently being lexed.
    #[inline]
    fn token_start(&self) -> TextSize {
        self.token_range().start()
    }
1620
    /// Captures the lexer's full state into a [`LexerCheckpoint`] that
    /// [`Lexer::rewind`] can later restore.
    pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            value: self.current_value.clone(),
            current_kind: self.current_kind,
            current_range: self.current_range,
            current_flags: self.current_flags,
            // Only the offset is stored; `rewind` rebuilds the cursor from it.
            cursor_offset: self.offset(),
            state: self.state,
            nesting: self.nesting,
            indentations_checkpoint: self.indentations.checkpoint(),
            pending_indentation: self.pending_indentation,
            interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
            // Errors are not cloned; `rewind` truncates back to this length.
            errors_position: self.errors.len(),
        }
    }
1637
    /// Restores the lexer's state to a previously taken [`LexerCheckpoint`].
    pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
        // Destructure exhaustively so a field added to `LexerCheckpoint` but
        // not restored here becomes a compile error.
        let LexerCheckpoint {
            value,
            current_kind,
            current_range,
            current_flags,
            cursor_offset,
            state,
            nesting,
            indentations_checkpoint,
            pending_indentation,
            interpolated_strings_checkpoint,
            errors_position,
        } = checkpoint;

        // The checkpoint holds only an offset: rebuild the cursor from the
        // start of the source and advance it to that position.
        let mut cursor = Cursor::new(self.source);
        cursor.skip_bytes(cursor_offset.to_usize());

        self.current_value = value;
        self.current_kind = current_kind;
        self.current_range = current_range;
        self.current_flags = current_flags;
        self.cursor = cursor;
        self.state = state;
        self.nesting = nesting;
        self.indentations.rewind(indentations_checkpoint);
        self.pending_indentation = pending_indentation;
        self.interpolated_strings
            .rewind(interpolated_strings_checkpoint);
        // Drop any errors reported after the checkpoint was taken.
        self.errors.truncate(errors_position);
    }
1671
    /// Consumes the lexer, returning all lexical errors it accumulated.
    pub fn finish(self) -> Vec<LexicalError> {
        self.errors
    }
1675}
1676
/// A snapshot of the [`Lexer`]'s state, created by [`Lexer::checkpoint`] and
/// restored by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    value: TokenValue,
    current_kind: TokenKind,
    current_range: TextRange,
    current_flags: TokenFlags,
    /// Absolute cursor offset; the cursor itself is rebuilt on rewind.
    cursor_offset: TextSize,
    state: State,
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    /// Length of the error vector at checkpoint time; rewind truncates to it.
    errors_position: usize,
}
1690
/// Tracks where the lexer is within the structure of a logical line, which
/// decides whether indentation and `Newline` tokens should be emitted next.
#[derive(Copy, Clone, Debug)]
enum State {
    /// Immediately after a logical newline (or at the start of the file).
    AfterNewline,

    /// Inside a logical line that already produced at least one token.
    NonEmptyLogicalLine,

    /// Immediately after an `=` token.
    AfterEqual,

    /// Any other position.
    Other,
}

impl State {
    /// `true` only directly after a logical newline.
    const fn is_after_newline(self) -> bool {
        match self {
            Self::AfterNewline => true,
            Self::NonEmptyLogicalLine | Self::AfterEqual | Self::Other => false,
        }
    }

    /// `true` when the lexer sits at a point where a new logical line may
    /// begin (after a newline, or on a line that is already complete).
    const fn is_new_logical_line(self) -> bool {
        match self {
            Self::AfterNewline | Self::NonEmptyLogicalLine => true,
            Self::AfterEqual | Self::Other => false,
        }
    }

    /// `true` only directly after an `=` token.
    const fn is_after_equal(self) -> bool {
        match self {
            Self::AfterEqual => true,
            Self::AfterNewline | Self::NonEmptyLogicalLine | Self::Other => false,
        }
    }
}
1719
/// The numeric base of an integer literal being lexed.
#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    /// The base as a number, e.g. for `u32::from_str_radix`-style parsing.
    const fn as_u32(self) -> u32 {
        match self {
            Self::Binary => 2,
            Self::Octal => 8,
            Self::Decimal => 10,
            Self::Hex => 16,
        }
    }

    /// Returns `true` if `c` is a valid digit in this base.
    const fn is_digit(self, c: char) -> bool {
        match self {
            Self::Binary => matches!(c, '0' | '1'),
            Self::Octal => matches!(c, '0'..='7'),
            Self::Decimal => matches!(c, '0'..='9'),
            Self::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'),
        }
    }
}
1747
/// Returns `true` if `c` is one of Python's string quote characters.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1751
/// Returns `true` if `c` is an ASCII character that can start an identifier
/// (an ASCII letter or underscore). Non-ASCII starts are handled separately
/// by `is_unicode_identifier_start`.
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
1755
/// Returns `true` if `c` can start an identifier according to Unicode's
/// `XID_Start` property (delegates to the `unicode-ident` crate).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
1761
1762fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1769 if c.is_ascii() {
1772 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1773 } else {
1774 *identifier_is_ascii_only = false;
1775 is_xid_continue(c)
1776 }
1777}
1778
/// Text accumulated while lexing, borrowed from the source for as long as it
/// matches the source byte-for-byte, and copied into an owned buffer once it
/// diverges (see `skip_char`).
enum LexedText<'a> {
    /// The text is identical to a range of the source; only the range grows.
    Source { source: &'a str, range: TextRange },
    /// The text differs from the source and lives in its own buffer.
    Owned(String),
}
1783
1784impl<'a> LexedText<'a> {
1785 fn new(start: TextSize, source: &'a str) -> Self {
1786 Self::Source {
1787 range: TextRange::empty(start),
1788 source,
1789 }
1790 }
1791
1792 fn push(&mut self, c: char) {
1793 match self {
1794 LexedText::Source { range, source } => {
1795 *range = range.add_end(c.text_len());
1796 debug_assert!(source[*range].ends_with(c));
1797 }
1798 LexedText::Owned(owned) => owned.push(c),
1799 }
1800 }
1801
1802 fn as_str<'b>(&'b self) -> &'b str
1803 where
1804 'b: 'a,
1805 {
1806 match self {
1807 LexedText::Source { range, source } => &source[*range],
1808 LexedText::Owned(owned) => owned,
1809 }
1810 }
1811
1812 fn skip_char(&mut self) {
1813 match self {
1814 LexedText::Source { range, source } => {
1815 *self = LexedText::Owned(source[*range].to_string());
1816 }
1817 LexedText::Owned(_) => {}
1818 }
1819 }
1820}
1821
/// Creates a [`Lexer`] for the given source code in the given mode, starting
/// at offset zero.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    Lexer::new(source, mode, TextSize::default())
}