1use std::cmp::Ordering;
10use std::str::FromStr;
11
12use unicode_ident::{is_xid_continue, is_xid_start};
13use unicode_normalization::UnicodeNormalization;
14
15use ruff_python_ast::name::Name;
16use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
17use ruff_python_ast::token::{TokenFlags, TokenKind};
18use ruff_python_ast::{Int, IpyEscapeKind, StringFlags};
19use ruff_python_trivia::is_python_whitespace;
20use ruff_text_size::{TextLen, TextRange, TextSize};
21
22use crate::Mode;
23use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
24use crate::lexer::cursor::{Cursor, EOF_CHAR};
25use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
26use crate::lexer::interpolated_string::{
27 InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
28};
29use crate::string::InterpolatedStringKind;
30use crate::token::TokenValue;
31
32mod cursor;
33mod indentation;
34mod interpolated_string;
35
/// The Unicode byte-order mark; skipped at the very start of the source, if present.
const BOM: char = '\u{feff}';
37
/// A lexer for Python source code that produces one token at a time.
#[derive(Debug)]
pub struct Lexer<'src> {
    /// The source text being lexed.
    source: &'src str,

    /// Character cursor over `source`.
    cursor: Cursor<'src>,

    /// Kind of the most recently lexed token.
    current_kind: TokenKind,

    /// Range of the most recently lexed token.
    current_range: TextRange,

    /// Value associated with the current token; taken via `take_value`.
    current_value: TokenValue,

    /// Flags describing the current token (string prefixes, quoting, ...).
    current_flags: TokenFlags,

    /// Lexer state that influences how the next token is lexed.
    state: State,

    /// Current nesting depth of open `(`, `[`, and `{` pairs.
    nesting: u32,

    /// Stack of indentation levels used to emit `Indent`/`Dedent` tokens.
    indentations: Indentations,
    /// A smaller indentation whose dedents still need to be emitted.
    pending_indentation: Option<Indentation>,

    /// The mode this lexer was created with.
    mode: Mode,

    /// Stack of f-/t-strings currently being lexed.
    interpolated_strings: InterpolatedStrings,

    /// Lexical errors encountered so far.
    errors: Vec<LexicalError>,
}
79
80impl<'src> Lexer<'src> {
    /// Creates a lexer for `source` in the given `mode`, starting at `start_offset`.
    ///
    /// `Mode::ParenthesizedExpression` starts with a nesting level of 1 so the
    /// input is lexed as if it were inside parentheses.
    pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
        // Offsets are stored as `u32` (`TextSize`), so the source must fit.
        assert!(
            u32::try_from(source.len()).is_ok(),
            "Lexer only supports files with a size up to 4GB"
        );

        let (state, nesting) = if mode == Mode::ParenthesizedExpression {
            (State::Other, 1)
        } else {
            (State::AfterNewline, 0)
        };

        let mut lexer = Lexer {
            source,
            cursor: Cursor::new(source),
            state,
            current_kind: TokenKind::EndOfFile,
            current_range: TextRange::empty(start_offset),
            current_value: TokenValue::None,
            current_flags: TokenFlags::empty(),
            nesting,
            indentations: Indentations::default(),
            pending_indentation: None,
            mode,
            interpolated_strings: InterpolatedStrings::default(),
            errors: Vec::new(),
        };

        if start_offset == TextSize::new(0) {
            // Skip a leading byte-order mark, if present.
            lexer.cursor.eat_char(BOM);
        } else {
            // Fast-forward the cursor to the requested start offset.
            lexer.cursor.skip_bytes(start_offset.to_usize());
        }

        lexer
    }
123
    /// Returns the kind of the most recently lexed token.
    pub(crate) fn current_kind(&self) -> TokenKind {
        self.current_kind
    }
128
    /// Returns the range of the most recently lexed token.
    pub(crate) fn current_range(&self) -> TextRange {
        self.current_range
    }
133
    /// Returns the flags of the most recently lexed token.
    pub(crate) fn current_flags(&self) -> TokenFlags {
        self.current_flags
    }
138
    /// Takes the value of the current token, replacing it with the default
    /// value. Calling this twice for the same token yields the default the
    /// second time.
    pub(crate) fn take_value(&mut self) -> TokenValue {
        std::mem::take(&mut self.current_value)
    }
147
    /// Records `error`, points the current range at the error location, and
    /// returns [`TokenKind::Unknown`] to stand in for the erroneous token.
    fn push_error(&mut self, error: LexicalError) -> TokenKind {
        self.current_range = error.location();
        self.errors.push(error);
        TokenKind::Unknown
    }
155
    /// Lexes the next token, updating the lexer's current kind, range, value,
    /// and flags, and returns the token's kind.
    pub fn next_token(&mut self) -> TokenKind {
        self.cursor.start_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();
        self.current_kind = self.lex_token();
        // `push_error` already set the range to the error location for
        // `Unknown` tokens; don't overwrite it with the token range.
        if !matches!(self.current_kind, TokenKind::Unknown) {
            self.current_range = self.token_range();
        }
        self.current_kind
    }
168
169 fn lex_token(&mut self) -> TokenKind {
170 if let Some(interpolated_string) = self.interpolated_strings.current() {
171 if !interpolated_string.is_in_interpolation(self.nesting) {
172 if let Some(token) = self.lex_interpolated_string_middle_or_end() {
173 if token.is_interpolated_string_end() {
174 self.interpolated_strings.pop();
175 }
176 return token;
177 }
178 }
179 }
180 else if let Some(indentation) = self.pending_indentation.take() {
182 match self.indentations.current().try_compare(indentation) {
183 Ok(Ordering::Greater) => {
184 self.pending_indentation = Some(indentation);
185 if self.indentations.dedent_one(indentation).is_err() {
186 return self.push_error(LexicalError::new(
187 LexicalErrorType::IndentationError,
188 self.token_range(),
189 ));
190 }
191 return TokenKind::Dedent;
192 }
193 Ok(_) => {}
194 Err(_) => {
195 return self.push_error(LexicalError::new(
196 LexicalErrorType::IndentationError,
197 self.token_range(),
198 ));
199 }
200 }
201 }
202
203 if self.state.is_after_newline() {
204 if let Some(indentation) = self.eat_indentation() {
205 return indentation;
206 }
207 } else {
208 if let Err(error) = self.skip_whitespace() {
209 return self.push_error(error);
210 }
211 }
212
213 self.cursor.start_token();
215
216 if let Some(c) = self.cursor.bump() {
217 if c.is_ascii() {
218 self.consume_ascii_character(c)
219 } else if is_unicode_identifier_start(c) {
220 let identifier = self.lex_identifier(c);
221 self.state = State::Other;
222
223 identifier
224 } else {
225 self.push_error(LexicalError::new(
226 LexicalErrorType::UnrecognizedToken { tok: c },
227 self.token_range(),
228 ))
229 }
230 } else {
231 self.consume_end()
234 }
235 }
236
    /// Consumes the indentation at the start of a logical line and returns an
    /// `Indent`/`Dedent` (or error) token if the indentation level changed.
    ///
    /// Returns `None` for blank lines, comment-only lines, EOF, or an
    /// unchanged indentation level.
    fn eat_indentation(&mut self) -> Option<TokenKind> {
        let mut indentation = Indentation::root();

        loop {
            match self.cursor.first() {
                ' ' => {
                    self.cursor.bump();
                    indentation = indentation.add_space();
                }
                '\t' => {
                    self.cursor.bump();
                    indentation = indentation.add_tab();
                }
                '\\' => {
                    // Explicit line continuation: the backslash must be
                    // immediately followed by a line ending.
                    self.cursor.bump();
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else if !self.cursor.eat_char('\n') {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::LineContinuationError,
                            TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
                        )));
                    }
                    if self.cursor.is_eof() {
                        return Some(self.push_error(LexicalError::new(
                            LexicalErrorType::Eof,
                            self.token_range(),
                        )));
                    }
                    if indentation != Indentation::root() {
                        self.cursor.eat_while(is_python_whitespace);
                    }
                }
                // A form feed resets the indentation for the line.
                '\x0C' => {
                    self.cursor.bump();
                    indentation = Indentation::root();
                }
                _ => break,
            }
        }

        // Only lines containing actual code participate in indentation; blank
        // lines, comment-only lines, and EOF leave the stack untouched.
        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
            self.state = State::NonEmptyLogicalLine;

            return self.handle_indentation(indentation);
        }

        None
    }
315
    /// Compares `indentation` with the top of the indentation stack and emits
    /// an `Indent`/`Dedent` token (or an error) if the level changed; returns
    /// `None` when the level is unchanged.
    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
        match self.indentations.current().try_compare(indentation) {
            // The line is less indented: dedent, keeping the target pending so
            // `lex_token` keeps dedenting until the levels match.
            Ok(Ordering::Greater) => {
                self.pending_indentation = Some(indentation);

                if self.indentations.dedent_one(indentation).is_err() {
                    return Some(self.push_error(LexicalError::new(
                        LexicalErrorType::IndentationError,
                        self.token_range(),
                    )));
                }

                // The dedent token itself covers no text.
                self.cursor.start_token();

                Some(TokenKind::Dedent)
            }

            Ok(Ordering::Equal) => None,

            // The line is more indented: a single indent.
            Ok(Ordering::Less) => {
                self.indentations.indent(indentation);
                Some(TokenKind::Indent)
            }
            // The indentations are not comparable — presumably inconsistent
            // tab/space use (TODO confirm against `Indentation::try_compare`).
            Err(_) => Some(self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                self.token_range(),
            ))),
        }
    }
360
361 fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
362 loop {
363 match self.cursor.first() {
364 ' ' => {
365 self.cursor.bump();
366 }
367 '\t' => {
368 self.cursor.bump();
369 }
370 '\\' => {
371 self.cursor.bump();
372 if self.cursor.eat_char('\r') {
373 self.cursor.eat_char('\n');
374 } else if !self.cursor.eat_char('\n') {
375 return Err(LexicalError::new(
376 LexicalErrorType::LineContinuationError,
377 TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
378 ));
379 }
380 if self.cursor.is_eof() {
381 return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
382 }
383 }
384 '\x0C' => {
386 self.cursor.bump();
387 }
388 _ => break,
389 }
390 }
391
392 Ok(())
393 }
394
    /// Lexes a token that starts with the already-consumed ASCII character `c`.
    ///
    /// Sets `state` to `State::Other` on the way out unless a branch returns
    /// early with its own state handling.
    fn consume_ascii_character(&mut self, c: char) -> TokenKind {
        let token = match c {
            c if is_ascii_identifier_start(c) => self.lex_identifier(c),
            '0'..='9' => self.lex_number(c),
            // Comments return directly so they don't reset the state.
            '#' => return self.lex_comment(),
            '\'' | '"' => self.lex_string(c),
            '=' => {
                if self.cursor.eat_char('=') {
                    TokenKind::EqEqual
                } else {
                    // `=` keeps its own state so IPython escape commands can be
                    // recognized on the right-hand side of an assignment.
                    self.state = State::AfterEqual;
                    return TokenKind::Equal;
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PlusEqual
                } else {
                    TokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    TokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleStarEqual
                    } else {
                        TokenKind::DoubleStar
                    }
                } else {
                    TokenKind::Star
                }
            }

            // IPython escape command after `=` at the top level, e.g. `x = !ls`.
            c @ ('%' | '!')
                if self.mode == Mode::Ipython
                    && self.state.is_after_equal()
                    && self.nesting == 0 =>
            {
                self.lex_ipython_escape_command(
                    IpyEscapeKind::try_from(c).unwrap(),
                    IpyEscapeLexContext::Assignment,
                )
            }

            // IPython escape command at the start of a logical line.
            c @ ('%' | '!' | '?' | '/' | ';' | ',')
                if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
            {
                // Prefer the two-character escape kind (e.g. `%%`) when valid.
                let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) {
                    self.cursor.bump();
                    kind
                } else {
                    IpyEscapeKind::try_from(c).unwrap()
                };

                self.lex_ipython_escape_command(kind, IpyEscapeLexContext::LogicalLineStart)
            }

            '?' if self.mode == Mode::Ipython => TokenKind::Question,

            '/' => {
                if self.cursor.eat_char('=') {
                    TokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        TokenKind::DoubleSlashEqual
                    } else {
                        TokenKind::DoubleSlash
                    }
                } else {
                    TokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    TokenKind::PercentEqual
                } else {
                    TokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    TokenKind::VbarEqual
                } else {
                    TokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    TokenKind::CircumflexEqual
                } else {
                    TokenKind::CircumFlex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AmperEqual
                } else {
                    TokenKind::Amper
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    TokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    TokenKind::Rarrow
                } else {
                    TokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    TokenKind::AtEqual
                } else {
                    TokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    TokenKind::NotEqual
                } else {
                    TokenKind::Exclamation
                }
            }
            '~' => TokenKind::Tilde,
            '(' => {
                self.nesting += 1;
                TokenKind::Lpar
            }
            ')' => {
                // Saturating: unbalanced closers must not underflow.
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rpar
            }
            '[' => {
                self.nesting += 1;
                TokenKind::Lsqb
            }
            ']' => {
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rsqb
            }
            '{' => {
                self.nesting += 1;
                TokenKind::Lbrace
            }
            '}' => {
                if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                    // A lone `}` at the f/t-string's own nesting level is an
                    // error (literal braces are written `}}`).
                    if interpolated_string.nesting() == self.nesting {
                        let error_type = LexicalErrorType::from_interpolated_string_error(
                            InterpolatedStringErrorType::SingleRbrace,
                            interpolated_string.kind(),
                        );
                        return self.push_error(LexicalError::new(error_type, self.token_range()));
                    }
                    interpolated_string.try_end_format_spec(self.nesting);
                }
                self.nesting = self.nesting.saturating_sub(1);
                TokenKind::Rbrace
            }
            ':' => {
                // Inside an f/t-string interpolation, `:` may start a format
                // spec; in that case `:=` must not become a walrus token.
                if self
                    .interpolated_strings
                    .current_mut()
                    .is_some_and(|interpolated_string| {
                        interpolated_string.try_start_format_spec(self.nesting)
                    })
                {
                    TokenKind::Colon
                } else if self.cursor.eat_char('=') {
                    TokenKind::ColonEqual
                } else {
                    TokenKind::Colon
                }
            }
            ';' => TokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        TokenKind::LeftShiftEqual
                    } else {
                        TokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::LessEqual
                } else {
                    TokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        TokenKind::RightShiftEqual
                    } else {
                        TokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::Greater
                }
            }
            ',' => TokenKind::Comma,
            '.' => {
                // `.5` is a float; `...` is an ellipsis; otherwise a dot.
                if self.cursor.first().is_ascii_digit() {
                    self.lex_decimal_number('.')
                } else if self.cursor.eat_char2('.', '.') {
                    TokenKind::Ellipsis
                } else {
                    TokenKind::Dot
                }
            }
            '\n' => {
                // A newline is only logical at nesting level 0 on a non-empty
                // logical line; otherwise it's non-logical.
                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }
            '\r' => {
                // `\r\n` counts as a single newline.
                self.cursor.eat_char('\n');

                return if self.nesting == 0 && !self.state.is_new_logical_line() {
                    self.state = State::AfterNewline;
                    TokenKind::Newline
                } else {
                    if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
                        interpolated_string.try_end_format_spec(self.nesting);
                    }
                    TokenKind::NonLogicalNewline
                };
            }

            _ => {
                self.state = State::Other;

                return self.push_error(LexicalError::new(
                    LexicalErrorType::UnrecognizedToken { tok: c },
                    self.token_range(),
                ));
            }
        };

        self.state = State::Other;

        token
    }
649
    /// Lexes an identifier or keyword starting with the already-consumed
    /// `first` character. Also detects string prefixes (`f"`, `rb'`, ...) and
    /// hands off to the string lexers.
    fn lex_identifier(&mut self, first: char) -> TokenKind {
        // If a quote follows a valid one- or two-character prefix, consume the
        // prefix (recording its flags) and the opening quote.
        let quote = match (first, self.cursor.first()) {
            (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
                self.cursor.bump();
                quote
            }),
            (_, second) if is_quote(self.cursor.second()) => {
                self.try_double_char_prefix([first, second]).then(|| {
                    self.cursor.bump();
                    self.cursor.bump().unwrap()
                })
            }
            _ => None,
        };

        if let Some(quote) = quote {
            if self.current_flags.is_interpolated_string() {
                if let Some(kind) = self.lex_interpolated_string_start(quote) {
                    return kind;
                }
            }

            return self.lex_string(quote);
        }

        // Flipped to `false` by the continuation predicate as soon as a
        // non-ASCII character is seen.
        let mut is_ascii = first.is_ascii();
        self.cursor
            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

        if !is_ascii {
            // Non-ASCII identifiers are NFKC-normalized.
            self.current_value = TokenValue::Name(text.nfkc().collect::<Name>());
            return TokenKind::Name;
        }

        // Fast path: no entry in the keyword table below is longer than 8 bytes.
        if text.len() > 8 {
            self.current_value = TokenValue::Name(Name::new(text));
            return TokenKind::Name;
        }

        match text {
            "False" => TokenKind::False,
            "None" => TokenKind::None,
            "True" => TokenKind::True,
            "and" => TokenKind::And,
            "as" => TokenKind::As,
            "assert" => TokenKind::Assert,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "break" => TokenKind::Break,
            "case" => TokenKind::Case,
            "class" => TokenKind::Class,
            "continue" => TokenKind::Continue,
            "def" => TokenKind::Def,
            "del" => TokenKind::Del,
            "elif" => TokenKind::Elif,
            "else" => TokenKind::Else,
            "except" => TokenKind::Except,
            "finally" => TokenKind::Finally,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "global" => TokenKind::Global,
            "if" => TokenKind::If,
            "import" => TokenKind::Import,
            "in" => TokenKind::In,
            "is" => TokenKind::Is,
            "lazy" => TokenKind::Lazy,
            "lambda" => TokenKind::Lambda,
            "match" => TokenKind::Match,
            "nonlocal" => TokenKind::Nonlocal,
            "not" => TokenKind::Not,
            "or" => TokenKind::Or,
            "pass" => TokenKind::Pass,
            "raise" => TokenKind::Raise,
            "return" => TokenKind::Return,
            "try" => TokenKind::Try,
            "type" => TokenKind::Type,
            "while" => TokenKind::While,
            "with" => TokenKind::With,
            "yield" => TokenKind::Yield,
            _ => {
                self.current_value = TokenValue::Name(Name::new(text));
                TokenKind::Name
            }
        }
    }
750
751 fn try_single_char_prefix(&mut self, first: char) -> bool {
754 match first {
755 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
756 't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
757 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
758 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
759 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
760 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
761 _ => return false,
762 }
763 true
764 }
765
766 fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
769 match value {
770 ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
771 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
772 }
773 ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
774 self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
775 }
776 ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
777 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
778 }
779 ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
780 self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
781 }
782 ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
783 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
784 }
785 ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
786 self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
787 }
788 _ => return false,
789 }
790 true
791 }
792
    /// Lexes the start of an f/t-string, assuming its prefix and the first
    /// `quote` character were already consumed.
    ///
    /// Returns `None` when `InterpolatedStringContext::new` rejects the
    /// current flags; the caller then lexes a regular string instead.
    fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
        #[cfg(debug_assertions)]
        debug_assert_eq!(self.cursor.previous(), quote);

        if quote == '"' {
            self.current_flags |= TokenFlags::DOUBLE_QUOTES;
        }

        // Two more of the same quote make it triple-quoted.
        if self.cursor.eat_char2(quote, quote) {
            self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
        }

        let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;

        let kind = ftcontext.kind();

        self.interpolated_strings.push(ftcontext);

        Some(kind.start_token())
    }
814
    /// Lexes the literal middle part or the closing quotes of the innermost
    /// f/t-string.
    ///
    /// Returns `None` when the middle part is empty (e.g. the string starts
    /// directly with `{`); the caller then lexes the interpolation contents.
    fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
        let interpolated_string = self.interpolated_strings.current().unwrap();
        let string_kind = interpolated_string.kind();
        let interpolated_flags = interpolated_string.flags();

        // First check whether we're already at the closing quote(s).
        if interpolated_string.is_triple_quoted() {
            let quote_char = interpolated_string.quote_char();
            if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
                self.current_flags = interpolated_string.flags();
                return Some(string_kind.end_token());
            }
        } else if self.cursor.eat_char(interpolated_string.quote_char()) {
            self.current_flags = interpolated_string.flags();
            return Some(string_kind.end_token());
        }

        // Accumulates the value with `{{`/`}}` collapsed to a single brace;
        // stays empty while no doubled brace has been seen.
        let mut normalized = String::new();

        // Start of the source text not yet copied into `normalized`.
        let mut last_offset = self.offset();

        let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);

        // Tracks a `\N{...}` named-unicode escape, whose `}` must not be
        // treated as an interpolation delimiter.
        let mut in_named_unicode = false;

        loop {
            match self.cursor.first() {
                // End of file before the closing quote(s).
                EOF_CHAR if self.cursor.is_eof() => {
                    let error = if interpolated_string.is_triple_quoted() {
                        InterpolatedStringErrorType::UnterminatedTripleQuotedString
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                // A bare newline terminates (with an error) a single-quoted
                // f/t-string.
                '\n' | '\r' if !interpolated_string.is_triple_quoted() => {
                    let error_type = if in_format_spec {
                        InterpolatedStringErrorType::NewlineInFormatSpec
                    } else {
                        InterpolatedStringErrorType::UnterminatedString
                    };

                    self.nesting = interpolated_string.nesting();
                    self.interpolated_strings.pop();
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;

                    self.push_error(LexicalError::new(
                        LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
                        self.token_range(),
                    ));

                    break;
                }
                '\\' => {
                    self.cursor.bump();
                    // `\{` / `\}`: leave the brace for the next iteration so
                    // the brace arms below decide how to treat it.
                    if matches!(self.cursor.first(), '{' | '}') {
                        continue;
                    } else if !interpolated_string.is_raw_string() {
                        if self.cursor.eat_char2('N', '{') {
                            in_named_unicode = true;
                            continue;
                        }
                    }
                    // Consume the escaped character (a `\r\n` pair as one).
                    if self.cursor.eat_char('\r') {
                        self.cursor.eat_char('\n');
                    } else {
                        self.cursor.bump();
                    }
                }
                quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
                    if let Some(triple_quotes) = interpolated_string.triple_quotes() {
                        // Only a full triple quote ends the string.
                        if self.cursor.rest().starts_with(triple_quotes) {
                            break;
                        }
                        self.cursor.bump();
                    } else {
                        break;
                    }
                }
                '{' => {
                    if self.cursor.second() == '{' && !in_format_spec {
                        // `{{` collapses to `{`: copy through the first brace,
                        // then skip the second.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        // Start of an interpolation: end the middle token here.
                        break;
                    }
                }
                '}' => {
                    if in_named_unicode {
                        in_named_unicode = false;
                        self.cursor.bump();
                    } else if self.cursor.second() == '}' && !in_format_spec {
                        // `}}` collapses to `}`, mirroring the `{{` case.
                        self.cursor.bump();
                        normalized
                            .push_str(&self.source[TextRange::new(last_offset, self.offset())]);
                        self.cursor.bump();
                        last_offset = self.offset();
                    } else {
                        break;
                    }
                }
                _ => {
                    self.cursor.bump();
                }
            }
        }
        let range = self.token_range();
        if range.is_empty() {
            return None;
        }

        // Avoid the allocation when no brace needed collapsing.
        let value = if normalized.is_empty() {
            self.source[range].to_string()
        } else {
            normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
            normalized
        };

        self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());

        self.current_flags = interpolated_flags;
        Some(string_kind.middle_token())
    }
965
966 fn lex_string(&mut self, quote: char) -> TokenKind {
968 #[cfg(debug_assertions)]
969 debug_assert_eq!(self.cursor.previous(), quote);
970
971 if quote == '"' {
972 self.current_flags |= TokenFlags::DOUBLE_QUOTES;
973 }
974
975 if self.cursor.eat_char2(quote, quote) {
978 self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
979 }
980
981 let value_start = self.offset();
982
983 let quote_byte = u8::try_from(quote).expect("char that fits in u8");
984 let value_end = if self.current_flags.is_triple_quoted() {
985 loop {
988 let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
989 self.cursor.skip_to_end();
990
991 self.current_flags |= TokenFlags::UNCLOSED_STRING;
992 self.push_error(LexicalError::new(
993 LexicalErrorType::UnclosedStringError,
994 self.token_range(),
995 ));
996 break self.offset();
997 };
998
999 let num_backslashes = self.cursor.rest().as_bytes()[..index]
1002 .iter()
1003 .rev()
1004 .take_while(|&&c| c == b'\\')
1005 .count();
1006
1007 self.cursor.skip_bytes(index + 1);
1009
1010 if num_backslashes % 2 == 1 {
1012 continue;
1013 }
1014
1015 if self.cursor.eat_char2(quote, quote) {
1017 break self.offset() - TextSize::new(3);
1018 }
1019 }
1020 } else {
1021 loop {
1024 let Some(index) =
1025 memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
1026 else {
1027 self.cursor.skip_to_end();
1028 self.current_flags |= TokenFlags::UNCLOSED_STRING;
1029
1030 self.push_error(LexicalError::new(
1031 LexicalErrorType::UnclosedStringError,
1032 self.token_range(),
1033 ));
1034
1035 break self.offset();
1036 };
1037
1038 let num_backslashes = self.cursor.rest().as_bytes()[..index]
1041 .iter()
1042 .rev()
1043 .take_while(|&&c| c == b'\\')
1044 .count();
1045
1046 self.cursor.skip_bytes(index);
1048
1049 let quote_or_newline = self.cursor.first();
1051
1052 if num_backslashes % 2 == 1 {
1054 self.cursor.bump();
1055 if quote_or_newline == '\r' {
1056 self.cursor.eat_char('\n');
1057 }
1058 continue;
1059 }
1060
1061 match quote_or_newline {
1062 '\r' | '\n' => {
1063 self.current_flags |= TokenFlags::UNCLOSED_STRING;
1064 self.push_error(LexicalError::new(
1065 LexicalErrorType::UnclosedStringError,
1066 self.token_range(),
1067 ));
1068 break self.offset();
1069 }
1070 ch if ch == quote => {
1071 let value_end = self.offset();
1072 self.cursor.bump();
1073 break value_end;
1074 }
1075 _ => unreachable!("memchr2 returned an index that is not a quote or a newline"),
1076 }
1077 }
1078 };
1079
1080 self.current_value = TokenValue::String(
1081 self.source[TextRange::new(value_start, value_end)]
1082 .to_string()
1083 .into_boxed_str(),
1084 );
1085
1086 TokenKind::String
1087 }
1088
1089 fn lex_number(&mut self, first: char) -> TokenKind {
1091 if first == '0' {
1092 if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
1093 self.lex_number_radix(Radix::Hex)
1094 } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
1095 self.lex_number_radix(Radix::Octal)
1096 } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
1097 self.lex_number_radix(Radix::Binary)
1098 } else {
1099 self.lex_decimal_number(first)
1100 }
1101 } else {
1102 self.lex_decimal_number(first)
1103 }
1104 }
1105
    /// Lexes an integer literal in a non-decimal `radix`, assuming the
    /// `0x`/`0o`/`0b` prefix was already consumed.
    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(matches!(
            self.cursor.previous().to_ascii_lowercase(),
            'x' | 'o' | 'b'
        ));

        // Collects the digits with underscore separators stripped.
        let mut number = LexedText::new(self.offset(), self.source);
        self.radix_run(&mut number, radix);

        // The full token text (including the prefix), passed along for
        // error reporting.
        let token = &self.source[self.token_range()];

        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
            Ok(int) => int,
            Err(err) => {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                    self.token_range(),
                ));
            }
        };
        self.current_value = TokenValue::Int(value);
        TokenKind::Int
    }
1133
    /// Lexes a decimal number — int, float, or complex — where
    /// `first_digit_or_dot` was already consumed (`.` for literals like `.5`).
    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
        #[cfg(debug_assertions)]
        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
        // Decimal integers may not have leading zeros (except `0` itself);
        // remembered here and checked once the literal is known to be an int.
        let start_is_zero = first_digit_or_dot == '0';

        // Collects the digits with underscore separators stripped.
        let mut number = LexedText::new(self.token_start(), self.source);
        if first_digit_or_dot != '.' {
            number.push(first_digit_or_dot);
            self.radix_run(&mut number, Radix::Decimal);
        }

        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
            number.push('.');

            // An underscore may not directly follow the decimal point.
            if self.cursor.eat_char('_') {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()),
                    TextRange::new(self.offset() - TextSize::new(1), self.offset()),
                ));
            }

            self.radix_run(&mut number, Radix::Decimal);
            true
        } else {
            false
        };

        // An `e`/`E` only counts as an exponent when followed by (optionally
        // signed) digits; otherwise it is left for the next token.
        let is_float = match self.cursor.rest().as_bytes() {
            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
                number.push(self.cursor.bump().unwrap());

                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
                    number.push(sign);
                }

                self.radix_run(&mut number, Radix::Decimal);

                true
            }
            _ => is_float,
        };

        if is_float {
            let Ok(value) = f64::from_str(number.as_str()) else {
                return self.push_error(LexicalError::new(
                    LexicalErrorType::OtherError(
                        "Invalid decimal literal".to_string().into_boxed_str(),
                    ),
                    self.token_range(),
                ));
            };

            // A `j`/`J` suffix makes the literal imaginary (complex).
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                self.current_value = TokenValue::Complex {
                    real: 0.0,
                    imag: value,
                };
                TokenKind::Complex
            } else {
                self.current_value = TokenValue::Float(value);
                TokenKind::Float
            }
        } else {
            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                let imag = f64::from_str(number.as_str()).unwrap();
                self.current_value = TokenValue::Complex { real: 0.0, imag };
                TokenKind::Complex
            } else {
                let value = match Int::from_str(number.as_str()) {
                    Ok(value) => {
                        // A literal starting with `0` must be exactly zero.
                        if start_is_zero && value.as_u8() != Some(0) {
                            return self.push_error(LexicalError::new(
                                LexicalErrorType::OtherError(
                                    "Invalid decimal integer literal"
                                        .to_string()
                                        .into_boxed_str(),
                                ),
                                self.token_range(),
                            ));
                        }
                        value
                    }
                    Err(err) => {
                        return self.push_error(LexicalError::new(
                            LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
                            self.token_range(),
                        ));
                    }
                };
                self.current_value = TokenValue::Int(value);
                TokenKind::Int
            }
        }
    }
1235
1236 fn radix_run(&mut self, number: &mut LexedText, radix: Radix) {
1240 loop {
1241 if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
1242 number.push(c);
1243 }
1244 else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
1246 self.cursor.bump();
1248 number.skip_char();
1249 } else {
1250 break;
1251 }
1252 }
1253 }
1254
1255 fn lex_comment(&mut self) -> TokenKind {
1257 #[cfg(debug_assertions)]
1258 debug_assert_eq!(self.cursor.previous(), '#');
1259
1260 let bytes = self.cursor.rest().as_bytes();
1261 let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len());
1262 self.cursor.skip_bytes(offset);
1263
1264 TokenKind::Comment
1265 }
1266
    /// Lexes an IPython escape command (`%magic`, `!shell`, `?help`, ...),
    /// consuming the rest of the line (honoring backslash line continuations).
    fn lex_ipython_escape_command(
        &mut self,
        escape_kind: IpyEscapeKind,
        context: IpyEscapeLexContext,
    ) -> TokenKind {
        let mut value = String::new();

        loop {
            match self.cursor.first() {
                '\\' => {
                    // A backslash directly before a line ending continues the
                    // command on the next line; the line ending is dropped.
                    if self.cursor.second() == '\r' {
                        self.cursor.bump();
                        self.cursor.bump();
                        self.cursor.eat_char('\n');
                        continue;
                    } else if self.cursor.second() == '\n' {
                        self.cursor.bump();
                        self.cursor.bump();
                        continue;
                    }

                    self.cursor.bump();
                    value.push('\\');
                }
                '?' => {
                    self.cursor.bump();
                    let mut question_count = 1u32;
                    while self.cursor.eat_char('?') {
                        question_count += 1;
                    }

                    // Trailing `?`/`??` (help-end syntax) is only recognized
                    // for magic/help commands in contexts that allow it;
                    // otherwise the question marks are ordinary characters.
                    if !context.allows_help_end()
                        || !matches!(
                            escape_kind,
                            IpyEscapeKind::Magic
                                | IpyEscapeKind::Magic2
                                | IpyEscapeKind::Help
                                | IpyEscapeKind::Help2
                        )
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    // Help-end syntax requires at most two question marks that
                    // trail a non-whitespace character at the end of the line.
                    if question_count > 2
                        || value.chars().last().is_none_or(is_python_whitespace)
                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
                    {
                        value.reserve(question_count as usize);
                        for _ in 0..question_count {
                            value.push('?');
                        }
                        continue;
                    }

                    if escape_kind.is_help() {
                        // Drop leading spaces and question marks from the
                        // help target.
                        value = value.trim_start_matches([' ', '?']).to_string();
                    } else if escape_kind.is_magic() {
                        // Re-attach the magic prefix so the value round-trips.
                        value.insert_str(0, escape_kind.as_str());
                    }

                    let kind = match question_count {
                        1 => IpyEscapeKind::Help,
                        2 => IpyEscapeKind::Help2,
                        _ => unreachable!("`question_count` is always 1 or 2"),
                    };

                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                // The command ends at the line ending or EOF.
                '\n' | '\r' | EOF_CHAR => {
                    self.current_value = TokenValue::IpyEscapeCommand {
                        kind: escape_kind,
                        value: value.into_boxed_str(),
                    };

                    return TokenKind::IpyEscapeCommand;
                }
                c => {
                    self.cursor.bump();
                    value.push(c);
                }
            }
        }
    }
1409
    /// Handles the end of the source: reports unterminated interpolated
    /// strings and unclosed parentheses, then emits a final `Newline`, any
    /// remaining `Dedent`s, and finally `EndOfFile`.
    fn consume_end(&mut self) -> TokenKind {
        // Every f/t-string still on the stack is unterminated.
        while let Some(interpolated_string) = self.interpolated_strings.pop() {
            self.nesting = interpolated_string.nesting();
            self.push_error(LexicalError::new(
                LexicalErrorType::from_interpolated_string_error(
                    InterpolatedStringErrorType::UnterminatedString,
                    interpolated_string.kind(),
                ),
                self.token_range(),
            ));
        }

        // `ParenthesizedExpression` mode legitimately starts at nesting 1.
        let init_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);

        if self.nesting > init_nesting {
            // Reset so the unclosed-parenthesis error is reported only once.
            self.nesting = 0;
            return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
        }

        // Emit a trailing newline if the last line wasn't terminated.
        if !self.state.is_new_logical_line() {
            self.state = State::AfterNewline;
            TokenKind::Newline
        }
        // Then unwind the indentation stack, one dedent per call.
        else if self.indentations.dedent().is_some() {
            TokenKind::Dedent
        } else {
            TokenKind::EndOfFile
        }
    }
1448
    /// Re-lexes the current token assuming it is outside any parentheses.
    ///
    /// When the caller (presumably the parser, recovering from a missing
    /// closing bracket — confirm with callers) decides the lexer should not
    /// have been in a parenthesized context, this drops one nesting level and,
    /// if a non-logical newline was seen at `non_logical_newline_start`, moves
    /// the cursor back there and lexes the next token again so it is produced
    /// as a logical `Newline`.
    ///
    /// Returns `true` if a token was actually re-lexed.
    pub(crate) fn re_lex_logical_token(
        &mut self,
        non_logical_newline_start: Option<TextSize>,
    ) -> bool {
        // Nothing to undo if we aren't inside brackets at all.
        if self.nesting == 0 {
            return false;
        }

        self.nesting -= 1;

        // Inside a triple-quoted f/t-string, newlines are part of the string
        // itself, so rewinding to a newline would be wrong.
        if self.current_flags.is_triple_quoted_interpolated_string() {
            return false;
        }

        // Without a recorded non-logical newline there is no position to
        // rewind to.
        let Some(new_position) = non_logical_newline_start else {
            return false;
        };

        // A closing bracket as the current token had already decremented the
        // nesting once; compensate so the final level is consistent.
        if matches!(
            self.current_kind,
            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
        ) {
            self.nesting += 1;
        }

        // Rebuild the cursor at the newline and lex the next token from there.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(new_position.to_usize());
        self.state = State::Other;
        self.next_token();
        true
    }
1552
    /// Re-interprets an unclosed plain string token inside an f/t-string
    /// interpolation element as the *closing* quotes of the enclosing
    /// interpolated string of the given `kind`.
    ///
    /// This only fires when the current token is an unclosed, prefix-less
    /// string whose quote character and triple-quotedness match the enclosing
    /// interpolated string, and when everything after its opening quotes up to
    /// the end of the line (or a `#`) is whitespace. In that case the token is
    /// shrunk to just the quotes, turned into the string-end token, any
    /// "unclosed string" error reported for it is retracted, and the cursor is
    /// rewound to directly after the quotes.
    pub(crate) fn re_lex_string_token_in_interpolation_element(
        &mut self,
        kind: InterpolatedStringKind,
    ) {
        // Only meaningful while an interpolated string is being lexed.
        let Some(interpolated_string) = self.interpolated_strings.current() else {
            return;
        };

        let current_string_flags = self.current_flags().as_any_string_flags();

        // The token must plausibly be the terminator of the enclosing string:
        // unclosed, no prefix, same quote character and same quoting style.
        if !matches!(self.current_kind, TokenKind::String)
            || !self.current_flags.is_unclosed()
            || current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
            || current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
            || current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
        {
            return;
        }

        // Slice starting right after the token's opening quotes.
        let first_line = &self.source
            [(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];

        // Only whitespace may follow the quotes on this line (a comment or
        // line break ends the check); any other character means the token
        // really was a string and must not be reinterpreted.
        for c in first_line.chars() {
            if matches!(c, '\n' | '\r' | '#') {
                break;
            }

            if !is_python_whitespace(c) {
                return;
            }
        }

        // Retract the "unclosed string" error reported for this token, if any.
        if self.errors.last().is_some_and(|error| {
            error.location() == self.current_range
                && matches!(error.error(), LexicalErrorType::UnclosedStringError)
        }) {
            self.errors.pop();
        }

        // Shrink the token to just the quotes and turn it into the
        // end-of-interpolated-string token.
        self.current_range =
            TextRange::at(self.current_range.start(), self.current_flags.quote_len());
        self.current_kind = kind.end_token();
        self.current_value = TokenValue::None;
        self.current_flags = TokenFlags::empty();

        // The enclosing interpolated string is now finished: restore its
        // nesting level and pop it.
        self.nesting = interpolated_string.nesting();
        self.interpolated_strings.pop();

        // Rewind the cursor to directly after the closing quotes.
        self.cursor = Cursor::new(self.source);
        self.cursor.skip_bytes(self.current_range.end().to_usize());
    }
1616
1617 pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
1631 if matches!(self.current_kind, TokenKind::String)
1634 && self.current_flags.is_unclosed()
1635 && self.current_flags.prefix()
1636 == AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false })
1637 {
1638 if self.errors.last().is_some_and(|error| {
1639 error.location() == self.current_range
1640 && matches!(error.error(), LexicalErrorType::UnclosedStringError)
1641 }) {
1642 self.errors.pop();
1643 }
1644
1645 self.current_range = TextRange::at(self.current_range.start(), 'r'.text_len());
1646 self.current_kind = TokenKind::Name;
1647 self.current_value = TokenValue::Name(Name::new_static("r"));
1648 self.current_flags = TokenFlags::empty();
1649 self.cursor = Cursor::new(self.source);
1650 self.cursor.skip_bytes(self.current_range.end().to_usize());
1651 }
1652 }
1653
1654 #[inline]
1655 fn token_range(&self) -> TextRange {
1656 let end = self.offset();
1657 let len = self.cursor.token_len();
1658
1659 TextRange::at(end - len, len)
1660 }
1661
1662 #[inline]
1663 fn token_text(&self) -> &'src str {
1664 &self.source[self.token_range()]
1665 }
1666
    /// Returns the lexer's current byte offset into the source.
    // The cast is sound: `Lexer::new` asserts the source fits in a `u32`.
    #[expect(clippy::cast_possible_truncation)]
    #[inline]
    fn offset(&self) -> TextSize {
        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
    }
1674
1675 #[inline]
1676 fn token_start(&self) -> TextSize {
1677 self.token_range().start()
1678 }
1679
    /// Captures the lexer's full mutable state so that [`Lexer::rewind`]
    /// can restore it later.
    pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
        LexerCheckpoint {
            value: self.current_value.clone(),
            current_kind: self.current_kind,
            current_range: self.current_range,
            current_flags: self.current_flags,
            // Only the offset is stored; `rewind` rebuilds the cursor from it.
            cursor_offset: self.offset(),
            state: self.state,
            nesting: self.nesting,
            indentations_checkpoint: self.indentations.checkpoint(),
            pending_indentation: self.pending_indentation,
            interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
            // Errors recorded after this index are discarded on rewind.
            errors_position: self.errors.len(),
        }
    }
1696
1697 pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
1699 let LexerCheckpoint {
1700 value,
1701 current_kind,
1702 current_range,
1703 current_flags,
1704 cursor_offset,
1705 state,
1706 nesting,
1707 indentations_checkpoint,
1708 pending_indentation,
1709 interpolated_strings_checkpoint,
1710 errors_position,
1711 } = checkpoint;
1712
1713 let mut cursor = Cursor::new(self.source);
1714 cursor.skip_bytes(cursor_offset.to_usize());
1716
1717 self.current_value = value;
1718 self.current_kind = current_kind;
1719 self.current_range = current_range;
1720 self.current_flags = current_flags;
1721 self.cursor = cursor;
1722 self.state = state;
1723 self.nesting = nesting;
1724 self.indentations.rewind(indentations_checkpoint);
1725 self.pending_indentation = pending_indentation;
1726 self.interpolated_strings
1727 .rewind(interpolated_strings_checkpoint);
1728 self.errors.truncate(errors_position);
1729 }
1730
    /// Consumes the lexer and returns all lexical errors it encountered.
    pub fn finish(self) -> Vec<LexicalError> {
        self.errors
    }
1734}
1735
/// A snapshot of the lexer's mutable state, created by [`Lexer::checkpoint`]
/// and consumed by [`Lexer::rewind`].
pub(crate) struct LexerCheckpoint {
    /// Value of the current token.
    value: TokenValue,
    /// Kind of the current token.
    current_kind: TokenKind,
    /// Source range of the current token.
    current_range: TextRange,
    /// Flags of the current token.
    current_flags: TokenFlags,
    /// Byte offset of the cursor; the cursor itself is rebuilt on rewind.
    cursor_offset: TextSize,
    state: State,
    /// Open-bracket nesting level.
    nesting: u32,
    indentations_checkpoint: IndentationsCheckpoint,
    pending_indentation: Option<Indentation>,
    interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
    /// Length of the error vector; later errors are truncated on rewind.
    errors_position: usize,
}
1749
/// The lexer's position within the line structure of the source.
#[derive(Copy, Clone, Debug)]
enum State {
    /// At the start of the file or directly after a `Newline` token
    /// (the initial state for `Mode::Module`; see `Lexer::new`).
    AfterNewline,

    /// At the start of a new logical line, presumably after its indentation
    /// has been handled — confirm against `next_token`.
    NonEmptyLogicalLine,

    /// Directly after an `=` token; apparently used to recognize IPython
    /// escape commands on an assignment's right-hand side (see
    /// `IpyEscapeLexContext::Assignment`) — confirm with callers.
    AfterEqual,

    /// Anywhere else inside a logical line.
    Other,
}
1764
1765impl State {
1766 const fn is_after_newline(self) -> bool {
1767 matches!(self, State::AfterNewline)
1768 }
1769
1770 const fn is_new_logical_line(self) -> bool {
1771 matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
1772 }
1773
1774 const fn is_after_equal(self) -> bool {
1775 matches!(self, State::AfterEqual)
1776 }
1777}
1778
/// Where an IPython escape command (`%`, `!`, `?`, ...) is being lexed.
#[derive(Copy, Clone, Debug)]
enum IpyEscapeLexContext {
    /// On the right-hand side of an assignment (e.g. `pwd = !pwd`).
    Assignment,
    /// At the start of a logical line.
    LogicalLineStart,
}
1784
1785impl IpyEscapeLexContext {
1786 const fn allows_help_end(self) -> bool {
1787 matches!(self, Self::LogicalLineStart)
1788 }
1789}
1790
/// The numeric base of an integer literal.
#[derive(Copy, Clone, Debug)]
enum Radix {
    /// Base 2 (`0b` literals).
    Binary,
    /// Base 8 (`0o` literals).
    Octal,
    /// Base 10 (plain literals).
    Decimal,
    /// Base 16 (`0x` literals).
    Hex,
}
1798
1799impl Radix {
1800 const fn as_u32(self) -> u32 {
1801 match self {
1802 Radix::Binary => 2,
1803 Radix::Octal => 8,
1804 Radix::Decimal => 10,
1805 Radix::Hex => 16,
1806 }
1807 }
1808
1809 const fn is_digit(self, c: char) -> bool {
1810 match self {
1811 Radix::Binary => matches!(c, '0'..='1'),
1812 Radix::Octal => matches!(c, '0'..='7'),
1813 Radix::Decimal => c.is_ascii_digit(),
1814 Radix::Hex => c.is_ascii_hexdigit(),
1815 }
1816 }
1817}
1818
/// Returns `true` for a character that can delimit a Python string literal.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
1822
/// Returns `true` if `c` can start an identifier using only ASCII characters.
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
1826
/// Returns `true` if `c` can start a (non-ASCII) identifier, i.e. has the
/// Unicode `XID_Start` property (via the `unicode_ident` crate).
fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
}
1832
1833fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
1840 if c.is_ascii() {
1843 matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
1844 } else {
1845 *identifier_is_ascii_only = false;
1846 is_xid_continue(c)
1847 }
1848}
1849
/// Text accumulated during lexing that borrows from the source for as long as
/// possible and only switches to an owned buffer once a character is skipped.
enum LexedText<'a> {
    /// The text is still a contiguous slice of the original source.
    Source { source: &'a str, range: TextRange },
    /// The text diverged from the source (see `skip_char`) and was copied
    /// into an owned buffer.
    Owned(String),
}
1854
impl<'a> LexedText<'a> {
    /// Creates an empty text anchored at offset `start` in `source`.
    fn new(start: TextSize, source: &'a str) -> Self {
        Self::Source {
            range: TextRange::empty(start),
            source,
        }
    }

    /// Appends `c`. In borrowed mode `c` must be the character immediately
    /// following the current range in the source (checked in debug builds);
    /// the range is just widened instead of copying.
    fn push(&mut self, c: char) {
        match self {
            LexedText::Source { range, source } => {
                *range = range.add_end(c.text_len());
                debug_assert!(source[*range].ends_with(c));
            }
            LexedText::Owned(owned) => owned.push(c),
        }
    }

    /// Returns the accumulated text.
    // The `'b: 'a` bound lets the borrowed variant return the full `'a` slice.
    fn as_str<'b>(&'b self) -> &'b str
    where
        'b: 'a,
    {
        match self {
            LexedText::Source { range, source } => &source[*range],
            LexedText::Owned(owned) => owned,
        }
    }

    /// Marks the upcoming source character as skipped: switches to an owned
    /// copy of what was accumulated so far, so later `push` calls append
    /// without the skipped character. No-op if already owned.
    fn skip_char(&mut self) {
        match self {
            LexedText::Source { range, source } => {
                *self = LexedText::Owned(source[*range].to_string());
            }
            LexedText::Owned(_) => {}
        }
    }
}
1892
/// Creates a [`Lexer`] for `source` in the given `mode`, starting at offset 0.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
    Lexer::new(source, mode, TextSize::default())
}
1897
1898#[cfg(test)]
1899mod tests {
1900 use std::fmt::Write;
1901
1902 use insta::assert_snapshot;
1903
1904 use super::*;
1905
1906 const WINDOWS_EOL: &str = "\r\n";
1907 const MAC_EOL: &str = "\r";
1908 const UNIX_EOL: &str = "\n";
1909
    /// A single lexed token captured for snapshot comparison.
    struct TestToken {
        // The token's kind.
        kind: TokenKind,
        // The associated value; `TokenValue::None` for most tokens.
        value: TokenValue,
        // The source range the token covers.
        range: TextRange,
        // Token flags (e.g. string prefix/quote info); often empty.
        flags: TokenFlags,
    }
1917
1918 impl std::fmt::Debug for TestToken {
1919 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1920 let mut tuple = f.debug_tuple("");
1921 let mut tuple = if matches!(self.value, TokenValue::None) {
1922 tuple.field(&self.kind)
1923 } else {
1924 tuple.field(&self.value)
1925 };
1926 tuple = tuple.field(&self.range);
1927 if self.flags.is_empty() {
1928 tuple.finish()
1929 } else {
1930 tuple.field(&self.flags).finish()
1931 }
1932 }
1933 }
1934
    /// Everything the lexer produced for one test input: the token stream and
    /// all reported errors.
    struct LexerOutput {
        tokens: Vec<TestToken>,
        errors: Vec<LexicalError>,
    }
1939
    impl std::fmt::Display for LexerOutput {
        // Renders tokens (and errors, when present) as markdown sections with
        // fenced code blocks — this is what the insta snapshots contain.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            writeln!(f, "## Tokens")?;
            writeln!(f, "```\n{:#?}\n```", self.tokens)?;
            if !self.errors.is_empty() {
                writeln!(f, "## Errors")?;
                writeln!(f, "```\n{:#?}\n```", self.errors)?;
            }
            Ok(())
        }
    }
1951
1952 fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
1953 let mut lexer = Lexer::new(source, mode, start_offset);
1954 let mut tokens = Vec::new();
1955 loop {
1956 let kind = lexer.next_token();
1957 if kind.is_eof() {
1958 break;
1959 }
1960 tokens.push(TestToken {
1961 kind,
1962 value: lexer.take_value(),
1963 range: lexer.current_range(),
1964 flags: lexer.current_flags(),
1965 });
1966 }
1967 LexerOutput {
1968 tokens,
1969 errors: lexer.finish(),
1970 }
1971 }
1972
    /// Lexes `source`, panicking (with all errors and the source text) if the
    /// lexer reported any error.
    #[track_caller]
    fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
        let output = lex(source, mode, start_offset);

        if !output.errors.is_empty() {
            // Aggregate every error into one message for easier debugging.
            let mut message = "Unexpected lexical errors for a valid source:\n".to_string();
            for error in &output.errors {
                writeln!(&mut message, "{error:?}").unwrap();
            }
            writeln!(&mut message, "Source:\n{source}").unwrap();
            panic!("{message}");
        }

        output
    }
1988
    /// Lexes `source`, asserting that at least one lexical error was reported.
    #[track_caller]
    fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
        let output = lex(source, mode, TextSize::default());

        assert!(
            !output.errors.is_empty(),
            "Expected lexer to generate at least one error for the following source:\n{source}"
        );

        output
    }
2000
    /// Lexes `source` in `Mode::Module`, expecting no errors.
    #[track_caller]
    fn lex_source(source: &str) -> LexerOutput {
        lex_valid(source, Mode::Module, TextSize::default())
    }

    /// Lexes `source` in `Mode::Module` starting at `start_offset`, expecting
    /// no errors.
    #[track_caller]
    fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
        lex_valid(source, Mode::Module, start_offset)
    }

    /// Lexes `source` in `Mode::Ipython` (Jupyter), expecting no errors.
    #[track_caller]
    fn lex_jupyter_source(source: &str) -> LexerOutput {
        lex_valid(source, Mode::Ipython, TextSize::default())
    }
2015
2016 #[test]
2017 fn bom() {
2018 let source = "\u{feff}x = 1";
2019 assert_snapshot!(lex_source(source));
2020 }
2021
2022 #[test]
2023 fn bom_with_offset() {
2024 let source = "\u{feff}x + y + z";
2025 assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
2026 }
2027
2028 #[test]
2029 fn bom_with_offset_edge() {
2030 let source = "\u{feff}x + y + z";
2033 assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
2034 }
2035
2036 fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
2037 let source = format!("%matplotlib \\{eol} --inline");
2038 lex_jupyter_source(&source)
2039 }
2040
2041 #[test]
2042 fn test_ipython_escape_command_line_continuation_unix_eol() {
2043 assert_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
2044 }
2045
2046 #[test]
2047 fn test_ipython_escape_command_line_continuation_mac_eol() {
2048 assert_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
2049 }
2050
2051 #[test]
2052 fn test_ipython_escape_command_line_continuation_windows_eol() {
2053 assert_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
2054 }
2055
2056 fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> LexerOutput {
2057 let source = format!("%matplotlib \\{eol}");
2058 lex_jupyter_source(&source)
2059 }
2060
2061 #[test]
2062 fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
2063 assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
2064 UNIX_EOL
2065 ));
2066 }
2067
2068 #[test]
2069 fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
2070 assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
2071 MAC_EOL
2072 ));
2073 }
2074
2075 #[test]
2076 fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
2077 assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
2078 WINDOWS_EOL
2079 ));
2080 }
2081
2082 #[test]
2083 fn test_empty_ipython_escape_command() {
2084 let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
2085 assert_snapshot!(lex_jupyter_source(source));
2086 }
2087
2088 #[test]
2089 fn test_ipython_escape_command() {
2090 let source = r"
2091?foo
2092??foo
2093%timeit a = b
2094%timeit a % 3
2095%matplotlib \
2096 --inline
2097!pwd \
2098 && ls -a | sed 's/^/\\ /'
2099!!cd /Users/foo/Library/Application\ Support/
2100/foo 1 2
2101,foo 1 2
2102;foo 1 2
2103!ls
2104"
2105 .trim();
2106 assert_snapshot!(lex_jupyter_source(source));
2107 }
2108
2109 #[test]
2110 fn test_ipython_help_end_escape_command() {
2111 let source = r"
2112?foo?
2113?? foo?
2114?? foo ?
2115?foo??
2116??foo??
2117???foo?
2118???foo??
2119??foo???
2120???foo???
2121?? \
2122 foo?
2123?? \
2124?
2125????
2126%foo?
2127%foo??
2128%%foo???
2129!pwd?"
2130 .trim();
2131 assert_snapshot!(lex_jupyter_source(source));
2132 }
2133
2134 #[test]
2135 fn test_ipython_escape_command_indentation() {
2136 let source = r"
2137if True:
2138 %matplotlib \
2139 --inline"
2140 .trim();
2141 assert_snapshot!(lex_jupyter_source(source));
2142 }
2143
2144 #[test]
2145 fn test_ipython_escape_command_assignment() {
2146 let source = r"
2147pwd = !pwd
2148foo = %timeit a = b
2149bar = %timeit a % 3
2150baz = %matplotlib \
2151 inline
2152qux = %foo?
2153quux = !pwd?"
2154 .trim();
2155 assert_snapshot!(lex_jupyter_source(source));
2156 }
2157
2158 fn assert_no_ipython_escape_command(tokens: &[TestToken]) {
2159 for token in tokens {
2160 if matches!(token.kind, TokenKind::IpyEscapeCommand) {
2161 panic!("Unexpected escape command token at {:?}", token.range)
2162 }
2163 }
2164 }
2165
2166 #[test]
2167 fn test_ipython_escape_command_not_an_assignment() {
2168 let source = r"
2169# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
2170foo = /func
2171foo = ;func
2172foo = ,func
2173
2174(foo == %timeit a = b)
2175(foo := %timeit a = b)
2176def f(arg=%timeit a = b):
2177 pass"
2178 .trim();
2179 let output = lex(source, Mode::Ipython, TextSize::default());
2180 assert!(output.errors.is_empty());
2181 assert_no_ipython_escape_command(&output.tokens);
2182 }
2183
2184 #[test]
2185 fn test_numbers() {
2186 let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA 0x995DC9BBDF1939FA995DC9BBDF1939FA";
2187 assert_snapshot!(lex_source(source));
2188 }
2189
2190 #[test]
2191 fn test_invalid_leading_zero_small() {
2192 let source = "025";
2193 assert_snapshot!(lex_invalid(source, Mode::Module));
2194 }
2195
2196 #[test]
2197 fn test_invalid_leading_zero_big() {
2198 let source =
2199 "0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
2200 assert_snapshot!(lex_invalid(source, Mode::Module));
2201 }
2202
2203 #[test]
2204 fn test_line_comment_long() {
2205 let source = "99232 # foo".to_string();
2206 assert_snapshot!(lex_source(&source));
2207 }
2208
2209 #[test]
2210 fn test_line_comment_whitespace() {
2211 let source = "99232 # ".to_string();
2212 assert_snapshot!(lex_source(&source));
2213 }
2214
2215 #[test]
2216 fn test_line_comment_single_whitespace() {
2217 let source = "99232 # ".to_string();
2218 assert_snapshot!(lex_source(&source));
2219 }
2220
2221 #[test]
2222 fn test_line_comment_empty() {
2223 let source = "99232 #".to_string();
2224 assert_snapshot!(lex_source(&source));
2225 }
2226
2227 fn comment_until_eol(eol: &str) -> LexerOutput {
2228 let source = format!("123 # Foo{eol}456");
2229 lex_source(&source)
2230 }
2231
2232 #[test]
2233 fn test_comment_until_unix_eol() {
2234 assert_snapshot!(comment_until_eol(UNIX_EOL));
2235 }
2236
2237 #[test]
2238 fn test_comment_until_mac_eol() {
2239 assert_snapshot!(comment_until_eol(MAC_EOL));
2240 }
2241
2242 #[test]
2243 fn test_comment_until_windows_eol() {
2244 assert_snapshot!(comment_until_eol(WINDOWS_EOL));
2245 }
2246
2247 #[test]
2248 fn test_assignment() {
2249 let source = r"a_variable = 99 + 2-0";
2250 assert_snapshot!(lex_source(source));
2251 }
2252
2253 fn indentation_with_eol(eol: &str) -> LexerOutput {
2254 let source = format!("def foo():{eol} return 99{eol}{eol}");
2255 lex_source(&source)
2256 }
2257
2258 #[test]
2259 fn test_indentation_with_unix_eol() {
2260 assert_snapshot!(indentation_with_eol(UNIX_EOL));
2261 }
2262
2263 #[test]
2264 fn test_indentation_with_mac_eol() {
2265 assert_snapshot!(indentation_with_eol(MAC_EOL));
2266 }
2267
2268 #[test]
2269 fn test_indentation_with_windows_eol() {
2270 assert_snapshot!(indentation_with_eol(WINDOWS_EOL));
2271 }
2272
2273 fn double_dedent_with_eol(eol: &str) -> LexerOutput {
2274 let source = format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}");
2275 lex_source(&source)
2276 }
2277
2278 #[test]
2279 fn test_double_dedent_with_unix_eol() {
2280 assert_snapshot!(double_dedent_with_eol(UNIX_EOL));
2281 }
2282
2283 #[test]
2284 fn test_double_dedent_with_mac_eol() {
2285 assert_snapshot!(double_dedent_with_eol(MAC_EOL));
2286 }
2287
2288 #[test]
2289 fn test_double_dedent_with_windows_eol() {
2290 assert_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
2291 }
2292
2293 fn double_dedent_with_tabs_eol(eol: &str) -> LexerOutput {
2294 let source = format!("def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}");
2295 lex_source(&source)
2296 }
2297
2298 #[test]
2299 fn test_double_dedent_with_tabs_unix_eol() {
2300 assert_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
2301 }
2302
2303 #[test]
2304 fn test_double_dedent_with_tabs_mac_eol() {
2305 assert_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
2306 }
2307
2308 #[test]
2309 fn test_double_dedent_with_tabs_windows_eol() {
2310 assert_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
2311 }
2312
2313 #[test]
2314 fn dedent_after_whitespace() {
2315 let source = "\
2316if first:
2317 if second:
2318 pass
2319 foo
2320";
2321 assert_snapshot!(lex_source(source));
2322 }
2323
2324 fn newline_in_brackets_eol(eol: &str) -> LexerOutput {
2325 let source = r"x = [
2326
2327 1,2
2328,(3,
23294,
2330), {
23315,
23326,\
23337}]
2334"
2335 .replace('\n', eol);
2336 lex_source(&source)
2337 }
2338
2339 #[test]
2340 fn test_newline_in_brackets_unix_eol() {
2341 assert_snapshot!(newline_in_brackets_eol(UNIX_EOL));
2342 }
2343
2344 #[test]
2345 fn test_newline_in_brackets_mac_eol() {
2346 assert_snapshot!(newline_in_brackets_eol(MAC_EOL));
2347 }
2348
2349 #[test]
2350 fn test_newline_in_brackets_windows_eol() {
2351 assert_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
2352 }
2353
2354 #[test]
2355 fn test_non_logical_newline_in_string_continuation() {
2356 let source = r"(
2357 'a'
2358 'b'
2359
2360 'c' \
2361 'd'
2362)";
2363 assert_snapshot!(lex_source(source));
2364 }
2365
2366 #[test]
2367 fn test_logical_newline_line_comment() {
2368 let source = "#Hello\n#World\n";
2369 assert_snapshot!(lex_source(source));
2370 }
2371
2372 #[test]
2373 fn test_operators() {
2374 let source = "//////=/ /";
2375 assert_snapshot!(lex_source(source));
2376 }
2377
2378 #[test]
2379 fn test_string() {
2380 let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
2381 assert_snapshot!(lex_source(source));
2382 }
2383
2384 fn string_continuation_with_eol(eol: &str) -> LexerOutput {
2385 let source = format!("\"abc\\{eol}def\"");
2386 lex_source(&source)
2387 }
2388
2389 #[test]
2390 fn test_string_continuation_with_unix_eol() {
2391 assert_snapshot!(string_continuation_with_eol(UNIX_EOL));
2392 }
2393
2394 #[test]
2395 fn test_string_continuation_with_mac_eol() {
2396 assert_snapshot!(string_continuation_with_eol(MAC_EOL));
2397 }
2398
2399 #[test]
2400 fn test_string_continuation_with_windows_eol() {
2401 assert_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
2402 }
2403
2404 #[test]
2405 fn test_escape_unicode_name() {
2406 let source = r#""\N{EN SPACE}""#;
2407 assert_snapshot!(lex_source(source));
2408 }
2409
2410 fn get_tokens_only(source: &str) -> Vec<TokenKind> {
2411 let output = lex(source, Mode::Module, TextSize::default());
2412 assert!(output.errors.is_empty());
2413 output.tokens.into_iter().map(|token| token.kind).collect()
2414 }
2415
2416 #[test]
2417 fn test_nfkc_normalization() {
2418 let source1 = "𝒞 = 500";
2419 let source2 = "C = 500";
2420 assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
2421 }
2422
2423 fn triple_quoted_eol(eol: &str) -> LexerOutput {
2424 let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
2425 lex_source(&source)
2426 }
2427
2428 #[test]
2429 fn test_triple_quoted_unix_eol() {
2430 assert_snapshot!(triple_quoted_eol(UNIX_EOL));
2431 }
2432
2433 #[test]
2434 fn test_triple_quoted_mac_eol() {
2435 assert_snapshot!(triple_quoted_eol(MAC_EOL));
2436 }
2437
2438 #[test]
2439 fn test_triple_quoted_windows_eol() {
2440 assert_snapshot!(triple_quoted_eol(WINDOWS_EOL));
2441 }
2442
2443 fn line_continuation_at_eof_after_newline(eol: &str) -> LexerOutput {
2444 let source = format!(r"\{eol}");
2445 lex_invalid(&source, Mode::Module)
2446 }
2447
2448 #[test]
2449 fn test_line_continuation_at_eof_after_newline_unix_eol() {
2450 assert_snapshot!(line_continuation_at_eof_after_newline(UNIX_EOL));
2451 }
2452
2453 #[test]
2454 fn test_line_continuation_at_eof_after_newline_mac_eol() {
2455 assert_snapshot!(line_continuation_at_eof_after_newline(MAC_EOL));
2456 }
2457
2458 #[test]
2459 fn test_line_continuation_at_eof_after_newline_windows_eol() {
2460 assert_snapshot!(line_continuation_at_eof_after_newline(WINDOWS_EOL));
2461 }
2462
2463 fn line_continuation_at_eof(eol: &str) -> LexerOutput {
2464 let source = format!(r"1, \{eol}");
2465 lex_invalid(&source, Mode::Module)
2466 }
2467
2468 #[test]
2469 fn test_line_continuation_at_eof_unix_eol() {
2470 assert_snapshot!(line_continuation_at_eof(UNIX_EOL));
2471 }
2472
2473 #[test]
2474 fn test_line_continuation_at_eof_mac_eol() {
2475 assert_snapshot!(line_continuation_at_eof(MAC_EOL));
2476 }
2477
2478 #[test]
2479 fn test_line_continuation_at_eof_windows_eol() {
2480 assert_snapshot!(line_continuation_at_eof(WINDOWS_EOL));
2481 }
2482
2483 #[test]
2486 fn test_infinite_loop() {
2487 let source = "[1";
2488 lex_invalid(source, Mode::Module);
2489 }
2490
2491 #[test]
2493 fn test_emoji_identifier() {
2494 let source = "🐦";
2495 assert_snapshot!(lex_invalid(source, Mode::Module));
2496 }
2497
    // NOTE(review): the function name has a typo (`tet_` should be `test_`).
    // It still runs — `#[test]` is what registers it, not the name — but
    // renaming it would also require renaming the matching insta snapshot
    // file, so the rename is left to a dedicated change.
    // Dedents to an indentation level (2 spaces) that matches no enclosing
    // level (0 or 4), which must produce a lexical error.
    #[test]
    fn tet_too_low_dedent() {
        let source = "if True:
    pass
  pass";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }
2505
2506 #[test]
2507 fn test_empty_fstrings() {
2508 let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
2509 assert_snapshot!(lex_source(source));
2510 }
2511
2512 #[test]
2513 fn test_fstring_prefix() {
2514 let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
2515 assert_snapshot!(lex_source(source));
2516 }
2517
2518 #[test]
2519 fn test_fstring() {
2520 let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
2521 assert_snapshot!(lex_source(source));
2522 }
2523
2524 #[test]
2525 fn test_fstring_parentheses() {
2526 let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
2527 assert_snapshot!(lex_source(source));
2528 }
2529
2530 fn fstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
2531 let source = format!(r"f'text \{eol} more text'");
2532 lex_source(&source)
2533 }
2534
2535 #[test]
2536 fn test_fstring_single_quote_escape_unix_eol() {
2537 assert_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
2538 }
2539
2540 #[test]
2541 fn test_fstring_single_quote_escape_mac_eol() {
2542 assert_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
2543 }
2544
2545 #[test]
2546 fn test_fstring_single_quote_escape_windows_eol() {
2547 assert_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
2548 }
2549
2550 #[test]
2551 fn test_fstring_escape() {
2552 let source = r#"f"\{x:\"\{x}} \"\"\
2553 end""#;
2554 assert_snapshot!(lex_source(source));
2555 }
2556
2557 #[test]
2558 fn test_fstring_escape_braces() {
2559 let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
2560 assert_snapshot!(lex_source(source));
2561 }
2562
2563 #[test]
2564 fn test_fstring_escape_raw() {
2565 let source = r#"rf"\{x:\"\{x}} \"\"\
2566 end""#;
2567 assert_snapshot!(lex_source(source));
2568 }
2569
2570 #[test]
2571 fn test_fstring_named_unicode() {
2572 let source = r#"f"\N{BULLET} normal \Nope \N""#;
2573 assert_snapshot!(lex_source(source));
2574 }
2575
2576 #[test]
2577 fn test_fstring_named_unicode_raw() {
2578 let source = r#"rf"\N{BULLET} normal""#;
2579 assert_snapshot!(lex_source(source));
2580 }
2581
2582 #[test]
2583 fn test_fstring_with_named_expression() {
2584 let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
2585 assert_snapshot!(lex_source(source));
2586 }
2587
2588 #[test]
2589 fn test_fstring_with_format_spec() {
2590 let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
2591 assert_snapshot!(lex_source(source));
2592 }
2593
2594 #[test]
2595 fn test_fstring_with_multiline_format_spec() {
2596 let source = r"f'''__{
2599 x:d
2600}__'''
2601f'''__{
2602 x:a
2603 b
2604 c
2605}__'''
2606";
2607 assert_snapshot!(lex_source(source));
2608 }
2609
2610 #[test]
2611 fn test_fstring_newline_format_spec() {
2612 let source = r"
2613f'__{
2614 x:d
2615}__'
2616f'__{
2617 x:a
2618 b
2619}__'
2620";
2621 assert_snapshot!(lex_invalid(source, Mode::Module));
2622 }
2623
2624 #[test]
2625 fn test_fstring_conversion() {
2626 let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
2627 assert_snapshot!(lex_source(source));
2628 }
2629
2630 #[test]
2631 fn test_fstring_nested() {
2632 let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
2633 assert_snapshot!(lex_source(source));
2634 }
2635
2636 #[test]
2637 fn test_fstring_expression_multiline() {
2638 let source = r#"f"first {
2639 x
2640 *
2641 y
2642} second""#;
2643 assert_snapshot!(lex_source(source));
2644 }
2645
2646 #[test]
2647 fn test_fstring_multiline() {
2648 let source = r#"f"""
2649hello
2650 world
2651""" f'''
2652 world
2653hello
2654''' f"some {f"""multiline
2655allowed {x}"""} string""#;
2656 assert_snapshot!(lex_source(source));
2657 }
2658
2659 #[test]
2660 fn test_fstring_comments() {
2661 let source = r#"f"""
2662# not a comment { # comment {
2663 x
2664} # not a comment
2665""""#;
2666 assert_snapshot!(lex_source(source));
2667 }
2668
2669 #[test]
2670 fn test_fstring_with_ipy_escape_command() {
2671 let source = r#"f"foo {!pwd} bar""#;
2672 assert_snapshot!(lex_source(source));
2673 }
2674
2675 #[test]
2676 fn test_fstring_with_lambda_expression() {
2677 let source = r#"
2678f"{lambda x:{x}}"
2679f"{(lambda x:{x})}"
2680"#
2681 .trim();
2682 assert_snapshot!(lex_source(source));
2683 }
2684
2685 #[test]
2686 fn test_fstring_with_nul_char() {
2687 let source = r"f'\0'";
2688 assert_snapshot!(lex_source(source));
2689 }
2690
2691 #[test]
2692 fn test_empty_tstrings() {
2693 let source = r#"t"" "" t"" t'' '' t"""""" t''''''"#;
2694 assert_snapshot!(lex_source(source));
2695 }
2696
2697 #[test]
2698 fn test_tstring_prefix() {
2699 let source = r#"t"" t"" rt"" rt"" Rt"" Rt"" tr"" Tr"" tR"" TR"""#;
2700 assert_snapshot!(lex_source(source));
2701 }
2702
2703 #[test]
2704 fn test_tstring() {
2705 let source = r#"t"normal {foo} {{another}} {bar} {{{three}}}""#;
2706 assert_snapshot!(lex_source(source));
2707 }
2708
2709 #[test]
2710 fn test_tstring_parentheses() {
2711 let source = r#"t"{}" t"{{}}" t" {}" t"{{{}}}" t"{{{{}}}}" t" {} {{}} {{{}}} {{{{}}}} ""#;
2712 assert_snapshot!(lex_source(source));
2713 }
2714
2715 fn tstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
2716 let source = format!(r"t'text \{eol} more text'");
2717 lex_source(&source)
2718 }
2719
2720 #[test]
2721 fn test_tstring_single_quote_escape_unix_eol() {
2722 assert_snapshot!(tstring_single_quote_escape_eol(UNIX_EOL));
2723 }
2724
2725 #[test]
2726 fn test_tstring_single_quote_escape_mac_eol() {
2727 assert_snapshot!(tstring_single_quote_escape_eol(MAC_EOL));
2728 }
2729
2730 #[test]
2731 fn test_tstring_single_quote_escape_windows_eol() {
2732 assert_snapshot!(tstring_single_quote_escape_eol(WINDOWS_EOL));
2733 }
2734
2735 #[test]
2736 fn test_tstring_escape() {
2737 let source = r#"t"\{x:\"\{x}} \"\"\
2738 end""#;
2739 assert_snapshot!(lex_source(source));
2740 }
2741
2742 #[test]
2743 fn test_tstring_escape_braces() {
2744 let source = r"t'\{foo}' t'\\{foo}' t'\{{foo}}' t'\\{{foo}}'";
2745 assert_snapshot!(lex_source(source));
2746 }
2747
2748 #[test]
2749 fn test_tstring_escape_raw() {
2750 let source = r#"rt"\{x:\"\{x}} \"\"\
2751 end""#;
2752 assert_snapshot!(lex_source(source));
2753 }
2754
2755 #[test]
2756 fn test_tstring_named_unicode() {
2757 let source = r#"t"\N{BULLET} normal \Nope \N""#;
2758 assert_snapshot!(lex_source(source));
2759 }
2760
2761 #[test]
2762 fn test_tstring_named_unicode_raw() {
2763 let source = r#"rt"\N{BULLET} normal""#;
2764 assert_snapshot!(lex_source(source));
2765 }
2766
2767 #[test]
2768 fn test_tstring_with_named_expression() {
2769 let source = r#"t"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
2770 assert_snapshot!(lex_source(source));
2771 }
2772
2773 #[test]
2774 fn test_tstring_with_format_spec() {
2775 let source = r#"t"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
2776 assert_snapshot!(lex_source(source));
2777 }
2778
2779 #[test]
2780 fn test_tstring_with_multiline_format_spec() {
2781 let source = r"t'''__{
2784 x:d
2785}__'''
2786t'''__{
2787 x:a
2788 b
2789 c
2790}__'''
2791";
2792 assert_snapshot!(lex_source(source));
2793 }
2794
    #[test]
    fn test_tstring_newline_format_spec() {
        // A newline inside the format spec of a *single*-quoted t-string is
        // invalid; lexed via `lex_invalid` so the reported errors are part of
        // the snapshot.
        let source = r"
t'__{
    x:d
}__'
t'__{
    x:a
        b
}__'
";
        assert_snapshot!(lex_invalid(source, Mode::Module));
    }
2808
2809 #[test]
2810 fn test_tstring_conversion() {
2811 let source = r#"t"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
2812 assert_snapshot!(lex_source(source));
2813 }
2814
2815 #[test]
2816 fn test_tstring_nested() {
2817 let source = r#"t"foo {t"bar {x + t"{wow}"}"} baz" t'foo {t'bar'} some {t"another"}'"#;
2818 assert_snapshot!(lex_source(source));
2819 }
2820
    #[test]
    fn test_tstring_expression_multiline() {
        // An interpolation expression may span multiple physical lines.
        let source = r#"t"first {
    x
        *
            y
} second""#;
        assert_snapshot!(lex_source(source));
    }
2830
    #[test]
    fn test_tstring_multiline() {
        // Triple-quoted t-strings spanning lines, and a triple-quoted
        // t-string nested inside a single-quoted one's interpolation.
        let source = r#"t"""
hello
    world
""" t'''
    world
hello
''' t"some {t"""multiline
allowed {x}"""} string""#;
        assert_snapshot!(lex_source(source));
    }
2843
    #[test]
    fn test_tstring_comments() {
        // `#` in the literal part of a t-string is plain text; only inside an
        // interpolation expression does it start a comment.
        let source = r#"t"""
# not a comment { # comment {
    x
} # not a comment
""""#;
        assert_snapshot!(lex_source(source));
    }
2853
2854 #[test]
2855 fn test_tstring_with_ipy_escape_command() {
2856 let source = r#"t"foo {!pwd} bar""#;
2857 assert_snapshot!(lex_source(source));
2858 }
2859
    #[test]
    fn test_tstring_with_lambda_expression() {
        // An unparenthesized lambda in an interpolation: its `:` collides
        // with the format-spec separator, unlike the parenthesized form on
        // the second line.
        let source = r#"
t"{lambda x:{x}}"
t"{(lambda x:{x})}"
"#
        .trim();
        assert_snapshot!(lex_source(source));
    }
2869
2870 #[test]
2871 fn test_tstring_with_nul_char() {
2872 let source = r"t'\0'";
2873 assert_snapshot!(lex_source(source));
2874 }
2875
2876 #[test]
2877 fn test_nested_t_and_fstring() {
2878 let source = r#"t"foo {f"bar {x + t"{wow}"}"} baz" f'foo {t'bar'!r} some {f"another"}'"#;
2879 assert_snapshot!(lex_source(source));
2880 }
2881
    #[test]
    fn test_match_softkeyword_in_notebook() {
        // The `match`/`case` soft keywords, lexed through the Jupyter
        // (notebook) entry point.
        let source = r"match foo:
    case bar:
        pass";
        assert_snapshot!(lex_jupyter_source(source));
    }
2889
2890 fn lex_fstring_error(source: &str) -> InterpolatedStringErrorType {
2891 let output = lex(source, Mode::Module, TextSize::default());
2892 match output
2893 .errors
2894 .into_iter()
2895 .next()
2896 .expect("lexer should give at least one error")
2897 .into_error()
2898 {
2899 LexicalErrorType::FStringError(error) => error,
2900 err => panic!("Expected FStringError: {err:?}"),
2901 }
2902 }
2903
2904 #[test]
2905 fn test_fstring_error() {
2906 use InterpolatedStringErrorType::{
2907 SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
2908 };
2909
2910 assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
2911 assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
2912 assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
2913 assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
2914 assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
2915 assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
2916 assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
2917 assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
2918
2919 assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
2920 assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);
2921
2922 assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
2923 assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
2924 assert_eq!(
2925 lex_fstring_error(r#"f"""""#),
2926 UnterminatedTripleQuotedString
2927 );
2928 assert_eq!(
2929 lex_fstring_error(r#"f""""""#),
2930 UnterminatedTripleQuotedString
2931 );
2932 }
2933
2934 fn lex_tstring_error(source: &str) -> InterpolatedStringErrorType {
2935 let output = lex(source, Mode::Module, TextSize::default());
2936 match output
2937 .errors
2938 .into_iter()
2939 .next()
2940 .expect("lexer should give at least one error")
2941 .into_error()
2942 {
2943 LexicalErrorType::TStringError(error) => error,
2944 err => panic!("Expected TStringError: {err:?}"),
2945 }
2946 }
2947
    #[test]
    fn lex_fstring_unclosed() {
        // An f-string that reaches end-of-input before its closing quote: the
        // lexer still emits `FStringStart` and the middle text, then reports
        // `FStringError(UnterminatedString)` over the unterminated content.
        let source = r#"f"hello"#;

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
    ## Tokens
    ```
    [
        (
            FStringStart,
            0..2,
            TokenFlags(
                DOUBLE_QUOTES | F_STRING,
            ),
        ),
        (
            InterpolatedStringMiddle(
                "hello",
            ),
            2..7,
            TokenFlags(
                DOUBLE_QUOTES | F_STRING,
            ),
        ),
        (
            Newline,
            7..7,
        ),
    ]
    ```
    ## Errors
    ```
    [
        LexicalError {
            error: FStringError(
                UnterminatedString,
            ),
            location: 2..7,
        },
    ]
    ```
    "#);
    }
2991
    #[test]
    fn lex_fstring_missing_brace() {
        // `{` opened but never closed before the f-string ends: the trailing
        // quote is consumed as the start of a nested, unclosed string
        // literal, producing both `UnclosedStringError` and an f-string
        // `UnterminatedString` error.
        let source = "f'{'";

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
    ## Tokens
    ```
    [
        (
            FStringStart,
            0..2,
            TokenFlags(
                F_STRING,
            ),
        ),
        (
            Lbrace,
            2..3,
        ),
        (
            String(
                "",
            ),
            3..4,
            TokenFlags(
                UNCLOSED_STRING,
            ),
        ),
        (
            Newline,
            4..4,
        ),
    ]
    ```
    ## Errors
    ```
    [
        LexicalError {
            error: UnclosedStringError,
            location: 3..4,
        },
        LexicalError {
            error: FStringError(
                UnterminatedString,
            ),
            location: 4..4,
        },
    ]
    ```
    "#);
    }
3043
    #[test]
    fn lex_fstring_missing_brace_after_format_spec() {
        // After `!`, the `r"` is lexed as the start of a raw string literal
        // (note `RAW_STRING_LOWERCASE | UNCLOSED_STRING` on the `String`
        // token), so both an unclosed-string error and an f-string
        // `UnterminatedString` error are reported.
        let source = r#"f"{foo!r""#;

        assert_snapshot!(lex_invalid(source, Mode::Module), @r#"
    ## Tokens
    ```
    [
        (
            FStringStart,
            0..2,
            TokenFlags(
                DOUBLE_QUOTES | F_STRING,
            ),
        ),
        (
            Lbrace,
            2..3,
        ),
        (
            Name(
                Name("foo"),
            ),
            3..6,
        ),
        (
            Exclamation,
            6..7,
        ),
        (
            String(
                "",
            ),
            7..9,
            TokenFlags(
                DOUBLE_QUOTES | RAW_STRING_LOWERCASE | UNCLOSED_STRING,
            ),
        ),
        (
            Newline,
            9..9,
        ),
    ]
    ```
    ## Errors
    ```
    [
        LexicalError {
            error: UnclosedStringError,
            location: 7..9,
        },
        LexicalError {
            error: FStringError(
                UnterminatedString,
            ),
            location: 9..9,
        },
    ]
    ```
    "#);
    }
3105
3106 #[test]
3107 fn test_tstring_error() {
3108 use InterpolatedStringErrorType::{
3109 SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
3110 };
3111
3112 assert_eq!(lex_tstring_error("t'}'"), SingleRbrace);
3113 assert_eq!(lex_tstring_error("t'{{}'"), SingleRbrace);
3114 assert_eq!(lex_tstring_error("t'{{}}}'"), SingleRbrace);
3115 assert_eq!(lex_tstring_error("t'foo}'"), SingleRbrace);
3116 assert_eq!(lex_tstring_error(r"t'\u007b}'"), SingleRbrace);
3117 assert_eq!(lex_tstring_error("t'{a:b}}'"), SingleRbrace);
3118 assert_eq!(lex_tstring_error("t'{3:}}>10}'"), SingleRbrace);
3119 assert_eq!(lex_tstring_error(r"t'\{foo}\}'"), SingleRbrace);
3120
3121 assert_eq!(lex_tstring_error(r#"t""#), UnterminatedString);
3122 assert_eq!(lex_tstring_error(r"t'"), UnterminatedString);
3123
3124 assert_eq!(lex_tstring_error(r#"t""""#), UnterminatedTripleQuotedString);
3125 assert_eq!(lex_tstring_error(r"t'''"), UnterminatedTripleQuotedString);
3126 assert_eq!(
3127 lex_tstring_error(r#"t"""""#),
3128 UnterminatedTripleQuotedString
3129 );
3130 assert_eq!(
3131 lex_tstring_error(r#"t""""""#),
3132 UnterminatedTripleQuotedString
3133 );
3134 }
3135
    #[test]
    fn backslash_continuation_indentation() {
        // The indentation of lines joined by `\` continuations must not
        // produce extra indent/dedent tokens.
        let source = r"if True:
    1
    \
        2
    \
3
else:
    pass
"
        .to_string();
        assert_snapshot!(lex_source(&source));
    }
3154
    #[test]
    fn backslash_continuation_at_root() {
        // `\` continuations starting at column 0 and directly after `else:`.
        let source = r"if True:
\
    1
    if True:
\
        2
else:\
    3
"
        .to_string();
        assert_snapshot!(lex_source(&source));
    }
3173
    #[test]
    fn multiple_backslash_continuation() {
        // Several consecutive `\` continuations join into one logical line.
        let source = r"if True:
    1
    \
    \
    \
    \
    2
"
        .to_string();
        assert_snapshot!(lex_source(&source));
    }
3190
    #[test]
    fn backslash_continuation_mismatch_indentation() {
        // The continued line's indentation matches no open indentation level,
        // so lexing reports an error (hence `lex_invalid`).
        let source = r"if True:
    1
    \
  2
"
        .to_string();
        assert_snapshot!(lex_invalid(&source, Mode::Module));
    }
3202}