1mod string_types;
60
61use regex::Regex;
62use std::cell::RefCell;
63use std::cmp::Ordering;
64use std::convert::TryInto;
65use std::fmt::Debug;
66use std::fmt::Formatter;
67use std::rc::Rc;
68
69use crate::tokenizer::core::string_types::FTStringType;
70use crate::tokenizer::{
71 core::string_types::{FTStringNode, StringQuoteChar, StringQuoteSize},
72 operators::OPERATOR_RE,
73 text_position::{TextPosition, TextPositionSnapshot},
74 whitespace_parser::State as WhitespaceState,
75};
76
77const MAX_INDENT: usize = 100;
81
82const MAX_CHAR: char = '\u{10ffff}';
85
86thread_local! {
87 static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
88 static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
89 static STRING_PREFIX_RE: Regex =
90 Regex::new(r"\A(?i)(u|[bf]r|r[bft]|r|b|f|t)").expect("regex");
91 static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
92 Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
93 static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
94 static DECIMAL_TAIL_RE: Regex =
95 Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
96 static HEXADECIMAL_TAIL_RE: Regex =
97 Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
98 static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
99 static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
100
101 static UNICODE_IDENTIFIER_RE: Regex =
105 Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
106}
107
108#[derive(Debug, Eq, PartialEq, Copy, Clone)]
109pub enum TokType {
110 String,
111 Name,
112 Number,
113 Op,
114 Newline,
115 Indent,
116 Dedent,
117 Async,
118 Await,
119 FStringStart,
120 FStringString,
121 FStringEnd,
122 TStringStart,
123 TStringString,
124 TStringEnd,
125 EndMarker,
126}
127
128#[derive(Debug, thiserror::Error, Eq, PartialEq)]
129pub enum TokError<'t> {
130 #[error("inconsistent mixing of tabs and spaces")]
131 TabSpace,
132 #[error("too many indentation levels")]
133 TooDeep,
134 #[error("no matching outer block for dedent")]
135 Dedent,
136 #[error("unexpected characters after a line continuation")]
137 LineContinuation,
138 #[error("unexpected end of file after a line continuation")]
139 LineContinuationEof,
140 #[error("{0:?} is not a valid identifier")]
141 BadIdentifier(&'t str),
142 #[error("invalid decimal literal")]
143 BadDecimal,
144 #[error(
145 "{}{}",
146 "leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal ",
147 "integers"
148 )]
149 BadDecimalLeadingZeros,
150 #[error("invalid hexadecimal literal")]
151 BadHexadecimal,
152 #[error("invalid octal literal")]
153 BadOctal,
154 #[error("invalid digit {0:?} in octal literal")]
155 BadOctalDigit(char),
156 #[error("invalid binary literal")]
157 BadBinary,
158 #[error("invalid digit {0:?} in binary literal")]
159 BadBinaryDigit(char),
160 #[error("unterminated string literal")]
161 UnterminatedString,
162 #[error("unterminated triple-quoted string literal")]
163 UnterminatedTripleQuotedString,
164 #[error("unmatched {0:?}")]
165 UnmatchedClosingParen(char),
166 #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?}")]
167 MismatchedClosingParen(char, char),
168 #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?} on line {2:}")]
169 MismatchedClosingParenOnLine(char, char, usize),
170 #[error("{0:?} is not a valid character in this position")]
171 BadCharacter(char),
172}
173
174#[derive(Clone)]
176pub struct TokState<'t> {
177 pub text_pos: TextPosition<'t>,
181 pub start_pos: TextPositionSnapshot,
183 done: bool,
185 tab_size: usize,
187 alt_tab_size: usize,
189 indent_stack: Vec<usize>,
193 alt_indent_stack: Vec<usize>,
195 at_bol: bool,
197 pub bol_width: usize,
200 blank_line: bool,
202 pending_indents: i32,
205 paren_stack: Vec<(char, usize)>,
209 cont_line: bool,
211
212 async_hacks: bool,
214 async_def: bool,
216 async_def_indent: usize,
218 async_def_nl: bool,
220
221 split_ftstring: bool,
230 ftstring_stack: Vec<FTStringNode>,
231
232 missing_nl_before_eof: bool,
233}
234
235pub struct TokConfig {
236 pub async_hacks: bool,
240 pub split_ftstring: bool,
241 }
244
245fn is_digit<C: Into<Option<char>>>(ch: C) -> bool {
246 matches!(ch.into(), Some('0'..='9'))
247}
248
249#[derive(Debug)]
250enum NumberState {
251 StartDigit,
252 Fraction,
253 Exponent,
254 Imaginary,
255}
256
257impl<'t> TokState<'t> {
258 pub fn new(text: &'t str, config: &TokConfig) -> Self {
259 let text_pos = TextPosition::new(text);
260 let start_pos = (&text_pos).into();
261 Self {
262 text_pos,
263 start_pos,
264 done: false,
265 tab_size: 8,
266 alt_tab_size: 1,
267 indent_stack: Vec::new(),
268 alt_indent_stack: Vec::new(),
269 at_bol: true,
270 bol_width: 0,
271 blank_line: false,
272 pending_indents: 0,
273 paren_stack: Vec::new(),
274 cont_line: false,
275 async_hacks: config.async_hacks,
276 async_def: false,
277 async_def_indent: 0,
278 async_def_nl: false,
279 split_ftstring: config.split_ftstring,
280 ftstring_stack: Vec::new(),
281 missing_nl_before_eof: text.is_empty() || text.as_bytes()[text.len() - 1] != b'\n',
282 }
283 }
284
285 pub fn is_parenthesized(&self) -> bool {
286 !self.paren_stack.is_empty()
287 }
288
289 fn next_inner(&mut self) -> Result<TokType, TokError<'t>> {
292 if self.split_ftstring {
293 if let Some(tos) = self.ftstring_stack.last() {
294 if !tos.is_in_expr() {
295 self.start_pos = (&self.text_pos).into();
296 let is_in_format_spec = tos.is_in_format_spec();
297 let is_raw_string = tos.is_raw_string;
298 if let Some(tok) =
299 self.maybe_consume_ftstring_string(is_in_format_spec, is_raw_string)?
300 {
301 return Ok(tok);
302 }
303 if let Some(tok) = self.maybe_consume_ftstring_end() {
304 return Ok(tok);
305 }
306 }
307 }
308 }
309
310 self.consume_bol_whitespace()?;
313
314 if let Some(t) = self.process_pending_indents() {
316 self.start_pos = (&self.text_pos).into();
317 return Ok(t);
318 }
319
320 self.maybe_close_async_def();
321
322 'again: loop {
323 SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
325
326 if self.text_pos.peek() == Some('#') {
328 ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
329 }
331
332 self.start_pos = (&self.text_pos).into();
334
335 return match self.text_pos.peek() {
336 None => {
338 if self.missing_nl_before_eof && !self.blank_line {
339 self.at_bol = true;
340 self.missing_nl_before_eof = false;
341 Ok(TokType::Newline)
342 } else {
343 let hanging_indents = self.indent_stack.len() as i32;
344 if self.pending_indents == 0 && hanging_indents != 0 {
345 self.pending_indents = -hanging_indents;
348 self.indent_stack.clear();
349 self.alt_indent_stack.clear();
350 self.missing_nl_before_eof = false;
351 }
352 if let Some(t) = self.process_pending_indents() {
353 Ok(t)
354 } else {
355 Ok(TokType::EndMarker)
356 }
357 }
358 }
359
360 Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR) => {
362 self.consume_identifier_or_prefixed_string()
363 }
364
365 Some('\n') => {
367 self.text_pos.next();
368 self.at_bol = true;
369 if self.split_ftstring
370 && self
371 .ftstring_stack
372 .last()
373 .map(|node| node.allow_multiline())
374 == Some(false)
375 {
376 Err(TokError::UnterminatedString)
377 } else if self.blank_line || !self.paren_stack.is_empty() {
378 self.next_inner()
381 } else {
382 self.cont_line = false;
383 if self.async_def {
384 self.async_def_nl = true;
385 }
386 Ok(TokType::Newline)
387 }
388 }
389
390 Some('.') if self.text_pos.consume("...") => {
392 return Ok(TokType::Op);
393 }
394
395 Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
397 self.consume_number(NumberState::Fraction)
398 }
399
400 Some('.') => {
402 self.text_pos.next();
403 Ok(TokType::Op)
404 }
405
406 Some('0'..='9') => self.consume_number(NumberState::StartDigit),
408
409 Some('\'') | Some('"') => self.consume_string(),
411
412 Some('\\') => {
414 self.text_pos.next();
415 if let Some('\n') = self.text_pos.next() {
416 if self.text_pos.peek() == None {
417 Err(TokError::LineContinuationEof)
418 } else {
419 self.cont_line = true;
420 continue 'again;
422 }
423 } else {
424 Err(TokError::LineContinuation)
425 }
426 }
427
428 Some(ch @ '(') | Some(ch @ '[') | Some(ch @ '{') => {
429 self.text_pos.next();
430 if let Some(tos) = self.ftstring_stack.last_mut() {
431 tos.open_parentheses();
432 }
433 self.paren_stack.push((ch, self.text_pos.line_number()));
434 Ok(TokType::Op)
435 }
436
437 Some(closing @ ')') | Some(closing @ ']') | Some(closing @ '}') => {
438 self.text_pos.next();
439 if let Some(tos) = self.ftstring_stack.last_mut() {
440 tos.close_parentheses();
441 }
442 if let Some((opening, line_number)) = self.paren_stack.pop() {
443 match (opening, closing) {
444 ('(', ')') | ('[', ']') | ('{', '}') => Ok(TokType::Op),
445 _ => {
446 if line_number != self.text_pos.line_number() {
447 Err(TokError::MismatchedClosingParenOnLine(
448 opening,
449 closing,
450 line_number,
451 ))
452 } else {
453 Err(TokError::MismatchedClosingParen(opening, closing))
454 }
455 }
456 }
457 } else {
458 Err(TokError::UnmatchedClosingParen(closing))
459 }
460 }
461
462 Some(':')
463 if self
464 .ftstring_stack
465 .last()
466 .map(|tos| tos.parentheses_count - tos.format_spec_count == 1)
467 .unwrap_or(false) =>
468 {
469 let tos = self
475 .ftstring_stack
476 .last_mut()
477 .expect("ftstring_stack is not empty");
478 tos.format_spec_count += 1;
479 self.text_pos.next();
480 Ok(TokType::Op)
481 }
482
483 Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
485
486 Some(ch) => Err(TokError::BadCharacter(ch)),
490 };
491 }
492 }
493
494 fn consume_bol_whitespace(&mut self) -> Result<(), TokError<'t>> {
497 self.blank_line = false;
498 if !self.at_bol {
499 return Ok(());
500 }
501
502 let mut col = 0; let mut altcol = 0; self.at_bol = false;
505 self.bol_width = 0;
506
507 loop {
509 match self.text_pos.peek() {
510 Some(' ') => {
511 col += 1;
512 altcol += 1;
513 self.bol_width += 1;
514 self.text_pos.next();
515 }
516 Some('\t') => {
517 col = (col / self.tab_size + 1) * self.tab_size;
520 altcol = (altcol / self.alt_tab_size + 1) * self.alt_tab_size;
522 self.bol_width += 1;
523 self.text_pos.next();
524 }
525 Some('\x0c') => {
527 col = 0;
528 altcol = 0;
529 self.bol_width += 1;
530 self.text_pos.next();
531 }
532 _ => {
533 break;
534 }
535 }
536 }
537
538 self.blank_line = matches!(
542 self.text_pos.peek(),
543 Some('#') | Some('\n') | Some('\\') | None
544 );
545
546 if self.blank_line || !self.paren_stack.is_empty() {
547 return Ok(());
548 }
549
550 let prev_col = self.indent_stack.last().unwrap_or(&0);
551 match col.cmp(prev_col) {
552 Ordering::Equal => {
553 if altcol != *self.alt_indent_stack.last().unwrap_or(&0) {
555 return Err(TokError::TabSpace);
556 }
557 }
558 Ordering::Greater => {
559 if self.indent_stack.len() + 1 >= MAX_INDENT {
562 return Err(TokError::TooDeep);
563 }
564 if altcol <= *self.alt_indent_stack.last().unwrap_or(&0) {
567 return Err(TokError::TabSpace);
568 }
569 if self.text_pos.peek().is_some() {
571 self.pending_indents += 1;
572 self.indent_stack.push(col);
573 self.alt_indent_stack.push(altcol);
574 }
575 }
576 Ordering::Less => {
577 while matches!(self.indent_stack.last(), Some(&ind_cols) if col < ind_cols) {
580 self.pending_indents -= 1;
581 self.indent_stack.pop();
582 self.alt_indent_stack.pop();
583 }
584 if col != *self.indent_stack.last().unwrap_or(&0) {
585 return Err(TokError::Dedent);
586 }
587 if altcol != *self.alt_indent_stack.last().unwrap_or(&0) {
588 return Err(TokError::TabSpace);
589 }
590 }
591 }
592
593 Ok(())
594 }
595
596 fn process_pending_indents(&mut self) -> Option<TokType> {
597 if self.pending_indents != 0 {
598 if self.pending_indents < 0 {
599 self.pending_indents += 1;
600 Some(TokType::Dedent)
601 } else {
602 self.pending_indents -= 1;
603 Some(TokType::Indent)
604 }
605 } else {
606 None
607 }
608 }
609
610 fn maybe_close_async_def(&mut self) {
611 if self.async_def
613 && !self.blank_line
614 && self.paren_stack.is_empty()
622 && self.async_def_nl
624 && self.async_def_indent >= self.indent_stack.len()
626 {
627 self.async_def = false;
628 self.async_def_indent = 0;
629 self.async_def_nl = false;
630 }
631 }
632
633 fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
634 if STRING_PREFIX_RE.with(|r| self.text_pos.consume(r)) {
636 if let Some('"') | Some('\'') = self.text_pos.peek() {
637 if self.split_ftstring {
639 let res = match self
640 .text_pos
641 .slice_from_start_pos(&self.start_pos)
642 .chars()
643 .find(|c| matches!(c, 'f' | 'F' | 't' | 'T'))
644 {
645 Some('f' | 'F') => Some(FTStringType::FString),
646 Some('t' | 'T') => Some(FTStringType::TString),
647 _ => None,
648 };
649 if let Some(str_type) = res {
650 return self.consume_prefixed_string_start(str_type);
652 }
653 }
654 return self.consume_string();
655 }
656 } else {
657 let first_ch = self.text_pos.next();
659 debug_assert!(matches!(
660 first_ch,
661 Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
662 ));
663 }
664 POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
665 let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
666 if !verify_identifier(identifier_str) {
667 return Err(TokError::BadIdentifier(identifier_str));
669 }
670
671 let allow_async = !self.async_hacks || self.async_def;
672 match (identifier_str, allow_async) {
673 ("async", true) => Ok(TokType::Async),
674 ("await", true) => Ok(TokType::Await),
675 ("async", false) => {
676 let mut lookahead_state = self.clone();
680 if lookahead_state.next_inner() == Ok(TokType::Name)
681 && lookahead_state
682 .text_pos
683 .slice_from_start_pos(&lookahead_state.start_pos)
684 == "def"
685 {
686 self.async_def = true;
687 self.async_def_indent = self.indent_stack.len();
688 Ok(TokType::Async)
689 } else {
690 Ok(TokType::Name)
691 }
692 }
693 _ => Ok(TokType::Name),
694 }
695 }
696
697 fn consume_number(&mut self, state: NumberState) -> Result<TokType, TokError<'t>> {
698 match state {
701 NumberState::StartDigit => {
702 let start_digit_ch = self.text_pos.peek();
703 debug_assert!(is_digit(start_digit_ch));
704
705 if start_digit_ch == Some('0') {
706 self.text_pos.next();
707 match self.text_pos.peek() {
708 Some('x') | Some('X') => {
709 self.text_pos.next();
710 if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
711 || self.text_pos.peek() == Some('_')
712 {
713 Err(TokError::BadHexadecimal)
714 } else {
715 Ok(TokType::Number)
716 }
717 }
718 Some('o') | Some('O') => {
719 self.text_pos.next();
720 if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
721 || self.text_pos.peek() == Some('_')
722 {
723 return Err(TokError::BadOctal);
724 }
725 if let Some(next_ch) = self.text_pos.peek() {
726 if is_digit(next_ch) {
727 return Err(TokError::BadOctalDigit(next_ch));
728 }
729 }
730 Ok(TokType::Number)
731 }
732 Some('b') | Some('B') => {
733 self.text_pos.next();
734 if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
735 || self.text_pos.peek() == Some('_')
736 {
737 return Err(TokError::BadBinary);
738 }
739 if let Some(next_ch) = self.text_pos.peek() {
740 if is_digit(next_ch) {
741 return Err(TokError::BadBinaryDigit(next_ch));
742 }
743 }
744 Ok(TokType::Number)
745 }
746 _ => {
747 let mut nonzero = false;
748 loop {
750 if self.text_pos.peek() == Some('_') {
751 self.text_pos.next();
752 if !is_digit(self.text_pos.peek()) {
753 return Err(TokError::BadDecimal);
754 }
755 }
756 if self.text_pos.peek() != Some('0') {
757 break;
758 }
759 self.text_pos.next();
760 }
761 if is_digit(self.text_pos.peek()) {
762 nonzero = true;
763 self.consume_decimal_tail()?;
764 }
765 if self.text_pos.peek() == Some('.') {
766 self.consume_number(NumberState::Fraction)
767 } else if let Some('e') | Some('E') = self.text_pos.peek() {
768 self.consume_number(NumberState::Exponent)
769 } else if let Some('j') | Some('J') = self.text_pos.peek() {
770 self.consume_number(NumberState::Imaginary)
771 } else if nonzero {
772 Err(TokError::BadDecimalLeadingZeros)
773 } else {
774 Ok(TokType::Number)
775 }
776 }
777 }
778 } else {
779 self.consume_decimal_tail()?;
780 if self.text_pos.peek() == Some('.') {
781 self.consume_number(NumberState::Fraction)
782 } else if let Some('e') | Some('E') = self.text_pos.peek() {
783 self.consume_number(NumberState::Exponent)
784 } else if let Some('j') | Some('J') = self.text_pos.peek() {
785 self.consume_number(NumberState::Imaginary)
786 } else {
787 Ok(TokType::Number)
788 }
789 }
790 }
791 NumberState::Fraction => {
792 let dot_ch = self.text_pos.next();
793 debug_assert!(dot_ch == Some('.'));
794
795 if is_digit(self.text_pos.peek()) {
796 self.consume_decimal_tail()?;
797 }
798 if let Some('e') | Some('E') = self.text_pos.peek() {
799 self.consume_number(NumberState::Exponent)
800 } else if let Some('j') | Some('J') = self.text_pos.peek() {
801 self.consume_number(NumberState::Imaginary)
802 } else {
803 Ok(TokType::Number)
804 }
805 }
806 NumberState::Exponent => {
807 let e_ch = self.text_pos.next();
808 debug_assert!(matches!(e_ch, Some('e') | Some('E')));
809
810 if let Some('+') | Some('-') = self.text_pos.peek() {
811 self.text_pos.next();
812 if !is_digit(self.text_pos.peek()) {
813 return Err(TokError::BadDecimal);
814 }
815 } else if !is_digit(self.text_pos.peek()) {
816 self.text_pos.backup_no_newline();
818 return Ok(TokType::Number);
819 }
820 self.consume_decimal_tail()?;
821 if let Some('j') | Some('J') = self.text_pos.peek() {
822 self.consume_number(NumberState::Imaginary)
823 } else {
824 Ok(TokType::Number)
825 }
826 }
827 NumberState::Imaginary => {
828 let j_ch = self.text_pos.next();
829 debug_assert!(matches!(j_ch, Some('j') | Some('J')));
830
831 Ok(TokType::Number)
832 }
833 }
834 }
835
836 fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
838 let result = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
839 debug_assert!(result, "try_decimal_tail was called on a non-digit char");
842 if self.text_pos.peek() == Some('_') {
843 Err(TokError::BadDecimal)
844 } else {
845 Ok(())
846 }
847 }
848
849 fn consume_open_quote(&mut self) -> (StringQuoteChar, StringQuoteSize) {
850 let quote_char: StringQuoteChar = self
851 .text_pos
852 .peek()
853 .try_into()
854 .expect("the next character must be a quote when calling consume_open_quote");
855 let triple_quote_pattern = quote_char.triple_str();
856 let quote_size = if self.text_pos.consume(triple_quote_pattern) {
857 StringQuoteSize::Triple
858 } else {
859 self.text_pos.next(); StringQuoteSize::Single
861 };
862 (quote_char, quote_size)
863 }
864
865 fn consume_string(&mut self) -> Result<TokType, TokError<'t>> {
866 let (quote_char, quote_size) = self.consume_open_quote();
869 let quote_raw = quote_char.into();
870
871 let mut end_quote_size: usize = 0;
872 let quote_usize: usize = quote_size.into();
873 while end_quote_size != quote_usize {
874 match (self.text_pos.next(), quote_size) {
875 (None, StringQuoteSize::Triple) => {
876 return Err(TokError::UnterminatedTripleQuotedString);
877 }
878 (None, StringQuoteSize::Single) | (Some('\n'), StringQuoteSize::Single) => {
879 return Err(TokError::UnterminatedString);
880 }
881 (ch @ Some('\''), _) | (ch @ Some('"'), _) if ch == Some(quote_raw) => {
882 end_quote_size += 1;
883 }
884 (Some(ch), _) => {
885 end_quote_size = 0;
886 if ch == '\\' {
887 self.text_pos.next();
889 }
890 }
891 }
892 }
893
894 Ok(TokType::String)
895 }
896
897 fn consume_prefixed_string_start(
898 &mut self,
899 str_type: FTStringType,
900 ) -> Result<TokType, TokError<'t>> {
901 let (quote_char, quote_size) = self.consume_open_quote();
903 let is_raw_string = self
904 .text_pos
905 .slice_from_start_pos(&self.start_pos)
906 .contains(&['r', 'R'][..]);
907 self.ftstring_stack.push(FTStringNode::new(
908 quote_char,
909 quote_size,
910 is_raw_string,
911 str_type.clone(),
912 ));
913
914 match str_type {
915 FTStringType::FString => Ok(TokType::FStringStart),
916 FTStringType::TString => Ok(TokType::TStringStart),
917 }
918 }
919
920 fn maybe_consume_ftstring_string(
921 &mut self,
922 is_in_format_spec: bool,
923 is_raw_string: bool,
924 ) -> Result<Option<TokType>, TokError<'t>> {
925 let allow_multiline = self
926 .ftstring_stack
927 .last()
928 .map(|node| node.allow_multiline())
929 == Some(true);
930 let str_type = self
931 .ftstring_stack
932 .last()
933 .map(|node| node.string_type.clone());
934 let mut in_named_unicode: bool = false;
935 let mut ok_result = Ok(None); 'outer: loop {
937 match (self.text_pos.peek(), allow_multiline) {
938 (None, true) => {
939 return Err(TokError::UnterminatedTripleQuotedString);
940 }
941 (None, false) | (Some('\n'), false) => {
942 return Err(TokError::UnterminatedString);
943 }
944 (ch @ Some('\''), _) | (ch @ Some('"'), _) => {
945 if let Some(node) = self.ftstring_stack.last() {
947 if ch == Some(node.quote_char.into()) {
948 match node.quote_size {
949 StringQuoteSize::Single => {
950 break 'outer;
951 }
952 StringQuoteSize::Triple => {
953 if self.text_pos.matches(node.quote_char.triple_str()) {
954 break 'outer;
955 }
956 }
957 }
958 }
959 }
960 self.text_pos.next();
961 }
962 (Some('\\'), _) if !is_raw_string => {
963 self.text_pos.next();
964 if is_in_format_spec {
965 if let Some('{') | Some('}') = self.text_pos.peek() {
966 } else {
969 self.text_pos.next();
971 }
972 } else if let Some(
973 '\n'
974 | '\\'
975 | '\''
976 | '"'
977 | 'a'
978 | 'b'
979 | 'f'
980 | 'n'
981 | 'r'
982 | 't'
983 | 'v'
984 | 'x'
985 | '0'..='9'
986 | 'N'
987 | 'u'
988 | 'U',
989 ) = self.text_pos.peek()
990 {
991 let next_ch = self.text_pos.next();
993 if let Some('N') = next_ch {
995 if let Some('{') = self.text_pos.peek() {
997 in_named_unicode = true;
998 self.text_pos.next();
999 }
1000 }
1001 }
1002 }
1003 (Some('\\'), _) if is_raw_string => {
1004 self.text_pos.next();
1005 if let Some('"' | '\'' | '\\') = self.text_pos.peek() {
1007 self.text_pos.next();
1008 }
1009 }
1010 (Some('{'), _) => {
1011 if is_in_format_spec {
1012 break 'outer;
1014 }
1015 let consumed_double = self.text_pos.consume("{{");
1016 if !consumed_double {
1017 break 'outer;
1018 }
1019 }
1020 (Some('}'), _) => {
1021 if in_named_unicode {
1022 in_named_unicode = false;
1023 self.text_pos.next();
1024 } else if is_in_format_spec {
1025 break 'outer;
1027 } else if !self.text_pos.consume("}}") {
1028 return Err(TokError::UnmatchedClosingParen('}'));
1029 }
1030 }
1031 _ => {
1032 self.text_pos.next();
1033 }
1034 }
1035 ok_result = match str_type {
1036 Some(FTStringType::FString) => Ok(Some(TokType::FStringString)),
1037 Some(FTStringType::TString) => Ok(Some(TokType::TStringString)),
1038 None => unreachable!("We should always have a string type"),
1039 };
1040 }
1041 ok_result
1042 }
1043
1044 fn maybe_consume_ftstring_end(&mut self) -> Option<TokType> {
1045 let ch = self.text_pos.peek();
1046 if let Some(node) = self.ftstring_stack.last() {
1047 if ch == Some(node.quote_char.into()) {
1048 if node.quote_size == StringQuoteSize::Triple {
1049 self.text_pos.consume(node.quote_char.triple_str());
1050 } else {
1051 self.text_pos.next(); }
1053 let tok_type = match node.string_type {
1054 FTStringType::FString => TokType::FStringEnd,
1055 FTStringType::TString => TokType::TStringEnd,
1056 };
1057 self.ftstring_stack.pop();
1058 return Some(tok_type);
1059 }
1060 }
1061 None
1062 }
1063}
1064
1065impl<'t> Iterator for TokState<'t> {
1066 type Item = Result<TokType, TokError<'t>>;
1067
1068 fn next(&mut self) -> Option<Result<TokType, TokError<'t>>> {
1070 if self.done {
1072 None
1073 } else {
1074 match self.next_inner() {
1075 Err(err) => {
1076 self.done = true;
1077 Some(Err(err))
1078 }
1079 Ok(TokType::EndMarker) => {
1080 self.done = true;
1081 Some(Ok(TokType::EndMarker))
1082 }
1083 Ok(t) => Some(Ok(t)),
1084 }
1085 }
1086 }
1087}
1088
1089fn verify_identifier(name: &str) -> bool {
1093 name.is_ascii() || UNICODE_IDENTIFIER_RE.with(|r| r.is_match(name))
1097}
1098
1099#[derive(Clone)]
1100pub struct Token<'a> {
1101 pub r#type: TokType,
1102 pub string: &'a str,
1103 pub start_pos: TextPositionSnapshot,
1104 pub end_pos: TextPositionSnapshot,
1105 pub whitespace_before: Rc<RefCell<WhitespaceState<'a>>>,
1106 pub whitespace_after: Rc<RefCell<WhitespaceState<'a>>>,
1107 pub relative_indent: Option<&'a str>,
1108}
1109
1110impl<'a> Debug for Token<'a> {
1111 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
1112 write!(
1113 f,
1114 "Token({:?}, {}, start={:?}, end={:?}, relative_indent={:?}, ws_before={:?}, ws_after={:?}",
1115 self.r#type, self.string, self.start_pos, self.end_pos, self.relative_indent, self.whitespace_before, self.whitespace_after
1116 )
1117 }
1118}
1119
1120impl<'a> PartialEq for Token<'a> {
1122 fn eq(&self, _other: &Self) -> bool {
1123 true
1124 }
1125}
1126
1127impl<'a> Eq for Token<'a> {}
1128
1129pub struct TokenIterator<'a> {
1130 previous_whitespace: Option<Rc<RefCell<WhitespaceState<'a>>>>,
1131 core_state: TokState<'a>,
1132 absolute_indents: Vec<&'a str>,
1133}
1134
1135impl<'a> TokenIterator<'a> {
1136 pub fn new(module_text: &'a str, config: &TokConfig) -> Self {
1137 Self {
1138 previous_whitespace: None,
1139 absolute_indents: vec![],
1140 core_state: TokState::new(module_text, config),
1141 }
1142 }
1143}
1144
1145impl<'a> Iterator for TokenIterator<'a> {
1146 type Item = Result<Token<'a>, TokError<'a>>;
1147
1148 fn next(&mut self) -> Option<Self::Item> {
1149 let next = self.core_state.next();
1150 next.as_ref()?;
1151 Some((|| {
1152 let tok_type = next.unwrap()?;
1153 let relative_indent = match tok_type {
1154 TokType::Indent => {
1155 let end_idx = self.core_state.text_pos.byte_idx();
1156 let start_idx = end_idx - self.core_state.bol_width;
1157 let absolute_indent = &self.core_state.text_pos.text()[start_idx..end_idx];
1158 let relative_indent =
1159 if let Some(prev_absolute_indent) = self.absolute_indents.last() {
1160 if let Some(ri) = absolute_indent.strip_prefix(prev_absolute_indent) {
1161 ri
1162 } else {
1163 return Err(TokError::Dedent);
1165 }
1166 } else {
1167 absolute_indent
1169 };
1170 self.absolute_indents.push(absolute_indent);
1171 if let Some(ws) = self.previous_whitespace.as_mut() {
1173 ws.borrow_mut().absolute_indent = absolute_indent;
1174 }
1175 Some(relative_indent)
1176 }
1177 TokType::Dedent => {
1178 self.absolute_indents.pop();
1179 if let Some(ws) = self.previous_whitespace.as_mut() {
1181 ws.borrow_mut().absolute_indent =
1182 self.absolute_indents.last().unwrap_or(&"");
1183 }
1184 None
1185 }
1186 _ => None,
1187 };
1188 let text_pos = &self.core_state.text_pos;
1189 let whitespace_before = self.previous_whitespace.clone().unwrap_or_default();
1190 let whitespace_after = match tok_type {
1191 TokType::Indent | TokType::Dedent | TokType::EndMarker => whitespace_before.clone(),
1192 _ => Rc::new(RefCell::new(WhitespaceState {
1193 line: text_pos.line_number(),
1194 column: text_pos.char_column_number(),
1195 column_byte: text_pos.byte_column_number(),
1196 byte_offset: text_pos.byte_idx(),
1197 absolute_indent: self.absolute_indents.last().unwrap_or(&""),
1198 is_parenthesized: self.core_state.is_parenthesized(),
1199 })),
1200 };
1201 self.previous_whitespace = Some(whitespace_after.clone());
1202
1203 Ok(Token {
1204 r#type: tok_type,
1205 string: text_pos.slice_from_start_pos(&self.core_state.start_pos),
1206 start_pos: self.core_state.start_pos.clone(),
1207 end_pos: text_pos.into(),
1208 whitespace_after: whitespace_after.clone(),
1209 whitespace_before: whitespace_before.clone(),
1210 relative_indent,
1211 })
1212 })())
1213 }
1214}