rustpython_parser/
lexer.rs

1//! This module takes care of lexing Python source text.
2//!
3//! This means source code is scanned and translated into separate tokens. The rules
4//! governing what is and is not a valid token are defined in the Python reference
5//! guide section on [Lexical analysis].
6//!
7//! The primary function in this module is [`lex`], which takes a string slice
8//! and returns an iterator over the tokens in the source code. The tokens are currently returned
9//! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
10//! start and end [`TextSize`] and a [`Tok`] denoting the token.
11//!
12//! # Example
13//!
14//! ```
15//! use rustpython_parser::{lexer::lex, Tok, Mode, StringKind};
16//!
17//! let source = "x = 'RustPython'";
18//! let tokens = lex(source, Mode::Module)
19//!     .map(|tok| tok.expect("Failed to lex"))
20//!     .collect::<Vec<_>>();
21//!
22//! for (token, range) in tokens {
23//!     println!(
24//!         "{token:?}@{range:?}",
25//!     );
26//! }
27//! ```
28//!
29//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
30use crate::{
31    ast::bigint::BigInt,
32    soft_keywords::SoftKeywordTransformer,
33    string::FStringErrorType,
34    text_size::{TextLen, TextRange, TextSize},
35    token::{StringKind, Tok},
36    Mode,
37};
38use log::trace;
39use num_traits::{Num, Zero};
40use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr};
41use unic_emoji_char::is_emoji_presentation;
42use unic_ucd_ident::{is_xid_continue, is_xid_start};
43
// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
// track of the number of tabs and spaces at the current level.
//
// Tabs and spaces are counted separately because a tab's visual width is
// unknown; see `compare_strict` for how two levels are ordered.
#[derive(Clone, Copy, PartialEq, Debug, Default)]
struct IndentationLevel {
    // Number of tab characters in this level's leading whitespace.
    tabs: u32,
    // Number of space characters in this level's leading whitespace.
    spaces: u32,
}
51
52impl IndentationLevel {
53    fn compare_strict(
54        &self,
55        other: &IndentationLevel,
56        location: TextSize,
57    ) -> Result<Ordering, LexicalError> {
58        // We only know for sure that we're smaller or bigger if tabs
59        // and spaces both differ in the same direction. Otherwise we're
60        // dependent on the size of tabs.
61        match self.tabs.cmp(&other.tabs) {
62            Ordering::Less => {
63                if self.spaces <= other.spaces {
64                    Ok(Ordering::Less)
65                } else {
66                    Err(LexicalError {
67                        location,
68                        error: LexicalErrorType::TabError,
69                    })
70                }
71            }
72            Ordering::Greater => {
73                if self.spaces >= other.spaces {
74                    Ok(Ordering::Greater)
75                } else {
76                    Err(LexicalError {
77                        location,
78                        error: LexicalErrorType::TabError,
79                    })
80                }
81            }
82            Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)),
83        }
84    }
85}
86
// The indentations stack is used to keep track of the current indentation level.
// Similar to the CPython implementation, the Indentations stack always has at
// least one level which is never popped. See Reference 2.1.8.
#[derive(Debug)]
struct Indentations {
    // Invariant: always contains at least one element (the base level);
    // `Default` seeds it and `pop` refuses to remove the last entry.
    indent_stack: Vec<IndentationLevel>,
}
94
95impl Indentations {
96    fn is_empty(&self) -> bool {
97        self.indent_stack.len() == 1
98    }
99
100    fn push(&mut self, indent: IndentationLevel) {
101        self.indent_stack.push(indent);
102    }
103
104    fn pop(&mut self) -> Option<IndentationLevel> {
105        if self.is_empty() {
106            return None;
107        }
108        self.indent_stack.pop()
109    }
110
111    fn current(&self) -> &IndentationLevel {
112        self.indent_stack
113            .last()
114            .expect("Indentations must have at least one level")
115    }
116}
117
impl Default for Indentations {
    fn default() -> Self {
        // Seed the stack with the zero-width base level that is never popped,
        // establishing the struct's non-empty invariant.
        Self {
            indent_stack: vec![IndentationLevel::default()],
        }
    }
}
125
// A CharWindow is a sliding window over an iterator of chars. It is used to
// allow for look-ahead when scanning tokens from the source code.
struct CharWindow<T: Iterator<Item = char>, const N: usize> {
    // The underlying character stream.
    source: T,
    // The next N characters; `slide` shifts left and appends the newest
    // character at the end. `None` marks positions at/past end of input.
    window: [Option<char>; N],
}
132
133impl<T, const N: usize> CharWindow<T, N>
134where
135    T: Iterator<Item = char>,
136{
137    fn new(source: T) -> Self {
138        Self {
139            source,
140            window: [None; N],
141        }
142    }
143
144    fn slide(&mut self) -> Option<char> {
145        self.window.rotate_left(1);
146        let next = self.source.next();
147        *self.window.last_mut().expect("never empty") = next;
148        next
149    }
150}
151
impl<T, const N: usize, Idx> Index<Idx> for CharWindow<T, N>
where
    T: Iterator<Item = char>,
    Idx: SliceIndex<[Option<char>]>,
{
    type Output = Idx::Output;

    // Allow indexing the window like a slice, e.g. `window[0]` or
    // `window[..2]`. Panics on out-of-bounds, as slice indexing does.
    fn index(&self, index: Idx) -> &Self::Output {
        &self.window[index]
    }
}
163
/// A lexer for Python source code.
pub struct Lexer<T: Iterator<Item = char>> {
    // Contains the source code to be lexed, with three characters of look-ahead.
    window: CharWindow<T, 3>,
    // Are we at the beginning of a line?
    at_begin_of_line: bool,
    // Amount of parenthesis; while non-zero, indentation tracking is
    // suspended (see `handle_indentations`).
    nesting: usize,
    // Indentation levels.
    indentations: Indentations,
    // Pending list of tokens to be returned, handed out in FIFO order.
    pending: Vec<Spanned>,
    // The current location (offset from the start of the source).
    location: TextSize,
}
179
// generated in build.rs, in gen_phf()
/// A map of keywords to their tokens.
///
/// Consulted by `lex_identifier` to distinguish keywords from ordinary
/// names; the perfect-hash table is generated at build time.
pub static KEYWORDS: phf::Map<&'static str, Tok> =
    include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
184
/// Contains a Token along with its `range`.
pub type Spanned = (Tok, TextRange);
/// The result of lexing a token: either a [`Spanned`] token or a [`LexicalError`].
pub type LexResult = Result<Spanned, LexicalError>;
189
/// Create a new lexer from a source string.
///
/// Tokenization starts at offset zero; use [`lex_starts_at`] to begin at a
/// different offset.
///
/// # Examples
///
/// ```
/// use rustpython_parser::{Mode, lexer::lex};
///
/// let source = "def hello(): return 'world'";
/// let lexer = lex(source, Mode::Module);
///
/// for token in lexer {
///    println!("{:?}", token);
/// }
/// ```
#[inline]
pub fn lex(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
    lex_starts_at(source, mode, TextSize::default())
}
208
/// Create a new lexer from a source string, starting at a given location.
/// You probably want to use [`lex`] instead.
///
/// The returned lexer is wrapped in a [`SoftKeywordTransformer`] constructed
/// with `mode`, which post-processes the raw token stream.
pub fn lex_starts_at(
    source: &str,
    mode: Mode,
    start_offset: TextSize,
) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
    SoftKeywordTransformer::new(Lexer::new(source.chars(), start_offset), mode)
}
218
219impl<T> Lexer<T>
220where
221    T: Iterator<Item = char>,
222{
    /// Create a new lexer from T and a starting location. You probably want to use
    /// [`lex`] instead.
    pub fn new(input: T, start: TextSize) -> Self {
        let mut lxr = Lexer {
            at_begin_of_line: true,
            nesting: 0,
            indentations: Indentations::default(),
            // Usually we have less than 5 tokens pending.
            pending: Vec::with_capacity(5),
            location: start,
            window: CharWindow::new(input),
        };
        // Fill the window.
        lxr.window.slide();
        lxr.window.slide();
        lxr.window.slide();
        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
        // spell-checker:ignore feff
        // Skip a leading UTF-8 BOM, advancing `location` past it so token
        // ranges stay aligned with the original source text.
        if let Some('\u{feff}') = lxr.window[0] {
            lxr.window.slide();
            lxr.location += '\u{feff}'.text_len();
        }
        lxr
    }
247
    /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
    fn lex_identifier(&mut self) -> LexResult {
        // Detect potential string like rb'' b'' f'' u'' r'': a one- or
        // two-character prefix immediately followed by a quote hands off
        // to the string lexer with the decoded `StringKind`.
        match self.window[..3] {
            [Some(c), Some('"' | '\''), ..] => {
                if let Ok(kind) = StringKind::try_from(c) {
                    return self.lex_string(kind);
                }
            }
            [Some(c1), Some(c2), Some('"' | '\'')] => {
                if let Ok(kind) = StringKind::try_from([c1, c2]) {
                    return self.lex_string(kind);
                }
            }
            _ => {}
        };

        // Consume identifier-continuation characters. The caller has already
        // verified the first character is a valid identifier start.
        let start_pos = self.get_pos();
        let mut name = String::with_capacity(8);
        while self.is_identifier_continuation() {
            name.push(self.next_char().unwrap());
        }
        let end_pos = self.get_pos();

        // Keywords take precedence over plain names.
        if let Some(tok) = KEYWORDS.get(&name) {
            Ok((tok.clone(), TextRange::new(start_pos, end_pos)))
        } else {
            Ok((Tok::Name { name }, TextRange::new(start_pos, end_pos)))
        }
    }
278
279    /// Numeric lexing. The feast can start!
280    fn lex_number(&mut self) -> LexResult {
281        let start_pos = self.get_pos();
282        match self.window[..2] {
283            [Some('0'), Some('x' | 'X')] => {
284                // Hex! (0xdeadbeef)
285                self.next_char();
286                self.next_char();
287                self.lex_number_radix(start_pos, 16)
288            }
289            [Some('0'), Some('o' | 'O')] => {
290                // Octal style! (0o377)
291                self.next_char();
292                self.next_char();
293                self.lex_number_radix(start_pos, 8)
294            }
295            [Some('0'), Some('b' | 'B')] => {
296                // Binary! (0b_1110_0101)
297                self.next_char();
298                self.next_char();
299                self.lex_number_radix(start_pos, 2)
300            }
301            _ => self.lex_normal_number(),
302        }
303    }
304
    /// Lex a hex/octal/decimal/binary number without a decimal point.
    ///
    /// Assumes any radix prefix (`0x`, `0o`, `0b`) has already been consumed;
    /// `start_pos` is the position before that prefix so the token range
    /// spans the whole literal.
    fn lex_number_radix(&mut self, start_pos: TextSize, radix: u32) -> LexResult {
        let value_text = self.radix_run(radix);
        let end_pos = self.get_pos();
        // An empty or otherwise unparsable digit run surfaces as a lex error.
        let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError {
            error: LexicalErrorType::OtherError(format!("{e:?}")),
            location: start_pos,
        })?;
        Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
    }
315
    /// Lex a normal number, that is, no octal, hex or binary number.
    ///
    /// Handles decimal integers, floats (with optional fraction and/or
    /// exponent), and imaginary literals (`j`/`J` suffix).
    fn lex_normal_number(&mut self) -> LexResult {
        let start_pos = self.get_pos();
        // Remember whether the literal starts with '0': non-zero decimal
        // integers with leading zeros are rejected below (e.g. `0123`).
        let start_is_zero = self.window[0] == Some('0');
        // Normal number:
        let mut value_text = self.radix_run(10);

        // If float:
        if self.window[0] == Some('.') || self.at_exponent() {
            // Take '.':
            if self.window[0] == Some('.') {
                // An underscore may not directly follow the decimal point.
                if self.window[1] == Some('_') {
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                        location: self.get_pos(),
                    });
                }
                value_text.push(self.next_char().unwrap());
                value_text.push_str(&self.radix_run(10));
            }

            // 1e6 for example:
            if let Some('e' | 'E') = self.window[0] {
                // An underscore may not directly follow the exponent marker.
                if self.window[1] == Some('_') {
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                        location: self.get_pos(),
                    });
                }
                // Normalize to a lowercase 'e' for parsing.
                value_text.push(self.next_char().unwrap().to_ascii_lowercase());
                // Optional +/-
                if matches!(self.window[0], Some('-' | '+')) {
                    // An underscore may not directly follow the sign.
                    if self.window[1] == Some('_') {
                        return Err(LexicalError {
                            error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
                            location: self.get_pos(),
                        });
                    }
                    value_text.push(self.next_char().unwrap());
                }

                value_text.push_str(&self.radix_run(10));
            }

            let value = f64::from_str(&value_text).map_err(|_| LexicalError {
                error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
                location: self.get_pos(),
            })?;

            // Parse trailing 'j':
            if matches!(self.window[0], Some('j' | 'J')) {
                self.next_char();
                let end_pos = self.get_pos();
                Ok((
                    Tok::Complex {
                        real: 0.0,
                        imag: value,
                    },
                    TextRange::new(start_pos, end_pos),
                ))
            } else {
                let end_pos = self.get_pos();
                Ok((Tok::Float { value }, TextRange::new(start_pos, end_pos)))
            }
        } else {
            // Parse trailing 'j':
            if matches!(self.window[0], Some('j' | 'J')) {
                self.next_char();
                let end_pos = self.get_pos();
                // The collected digits are a plain decimal run, which is
                // always a valid f64, so this unwrap cannot fail.
                let imag = f64::from_str(&value_text).unwrap();
                Ok((
                    Tok::Complex { real: 0.0, imag },
                    TextRange::new(start_pos, end_pos),
                ))
            } else {
                let end_pos = self.get_pos();
                let value = value_text.parse::<BigInt>().unwrap();
                if start_is_zero && !value.is_zero() {
                    // leading zeros in decimal integer literals are not permitted
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
                        location: self.get_pos(),
                    });
                }
                Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
            }
        }
    }
404
405    /// Consume a sequence of numbers with the given radix,
406    /// the digits can be decorated with underscores
407    /// like this: '1_2_3_4' == '1234'
408    fn radix_run(&mut self, radix: u32) -> String {
409        let mut value_text = String::new();
410
411        loop {
412            if let Some(c) = self.take_number(radix) {
413                value_text.push(c);
414            } else if self.window[0] == Some('_')
415                && Lexer::<T>::is_digit_of_radix(self.window[1], radix)
416            {
417                self.next_char();
418            } else {
419                break;
420            }
421        }
422        value_text
423    }
424
    /// Consume a single character with the given radix.
    ///
    /// Returns `Some(digit)` and advances if the next character is a valid
    /// digit for `radix`; otherwise returns `None` and consumes nothing.
    fn take_number(&mut self, radix: u32) -> Option<char> {
        let take_char = Lexer::<T>::is_digit_of_radix(self.window[0], radix);

        take_char.then(|| self.next_char().unwrap())
    }
431
432    /// Test if a digit is of a certain radix.
433    fn is_digit_of_radix(c: Option<char>, radix: u32) -> bool {
434        match radix {
435            2 => matches!(c, Some('0'..='1')),
436            8 => matches!(c, Some('0'..='7')),
437            10 => matches!(c, Some('0'..='9')),
438            16 => matches!(c, Some('0'..='9') | Some('a'..='f') | Some('A'..='F')),
439            other => unimplemented!("Radix not implemented: {}", other),
440        }
441    }
442
443    /// Test if we face '[eE][-+]?[0-9]+'
444    fn at_exponent(&self) -> bool {
445        match self.window[..2] {
446            [Some('e' | 'E'), Some('+' | '-')] => matches!(self.window[2], Some('0'..='9')),
447            [Some('e' | 'E'), Some('0'..='9')] => true,
448            _ => false,
449        }
450    }
451
452    /// Lex a single comment.
453    #[cfg(feature = "full-lexer")]
454    fn lex_comment(&mut self) -> LexResult {
455        let start_pos = self.get_pos();
456        let mut value = String::new();
457        loop {
458            match self.window[0] {
459                Some('\n' | '\r') | None => {
460                    let end_pos = self.get_pos();
461                    return Ok((Tok::Comment(value), TextRange::new(start_pos, end_pos)));
462                }
463                Some(_) => {}
464            }
465            value.push(self.next_char().unwrap());
466        }
467    }
468
    /// Lex a comment and push the resulting token onto the pending queue
    /// (full-lexer builds only).
    #[cfg(feature = "full-lexer")]
    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
        let comment = self.lex_comment()?;
        self.emit(comment);
        Ok(())
    }
475
476    /// Discard comment if full-lexer is not enabled.
477    #[cfg(not(feature = "full-lexer"))]
478    fn lex_comment(&mut self) {
479        loop {
480            match self.window[0] {
481                Some('\n' | '\r') | None => {
482                    return;
483                }
484                Some(_) => {}
485            }
486            self.next_char().unwrap();
487        }
488    }
489
    /// Skip over a comment without emitting a token (non-full-lexer builds);
    /// mirrors the full-lexer variant's signature so callers are identical.
    #[cfg(not(feature = "full-lexer"))]
    #[inline]
    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
        self.lex_comment();
        Ok(())
    }
496
    /// Lex a string literal.
    ///
    /// Expects the window to be positioned at the (possibly prefixed) opening
    /// quote. Escape sequences are stored verbatim (backslash plus following
    /// character) rather than interpreted here.
    fn lex_string(&mut self, kind: StringKind) -> LexResult {
        let start_pos = self.get_pos();
        // Skip over the prefix characters (e.g. `r`, `b`, `rb`), if any.
        for _ in 0..u32::from(kind.prefix_len()) {
            self.next_char();
        }
        let quote_char = self.next_char().unwrap();
        let mut string_content = String::with_capacity(5);

        // If the next two characters are also the quote character, then we have a triple-quoted
        // string; consume those two characters and ensure that we require a triple-quote to close
        let triple_quoted = if self.window[..2] == [Some(quote_char); 2] {
            self.next_char();
            self.next_char();
            true
        } else {
            false
        };

        loop {
            match self.next_char() {
                Some(c) => {
                    if c == '\\' {
                        // Keep the escape sequence as-is; a backslash at end
                        // of input falls through to the EOF checks below.
                        if let Some(next_c) = self.next_char() {
                            string_content.push('\\');
                            string_content.push(next_c);
                            continue;
                        }
                    }
                    // A bare newline terminates only single-quoted strings.
                    if c == '\n' && !triple_quoted {
                        return Err(LexicalError {
                            error: LexicalErrorType::OtherError(
                                "EOL while scanning string literal".to_owned(),
                            ),
                            location: self.get_pos(),
                        });
                    }

                    if c == quote_char {
                        if triple_quoted {
                            // Look ahead at the next two characters; if we have two more
                            // quote_chars, it's the end of the string; consume the remaining
                            // closing quotes and break the loop
                            if self.window[..2] == [Some(quote_char); 2] {
                                self.next_char();
                                self.next_char();
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    string_content.push(c);
                }
                None => {
                    // Input ended before the closing quote.
                    return Err(LexicalError {
                        error: if triple_quoted {
                            LexicalErrorType::Eof
                        } else {
                            LexicalErrorType::StringError
                        },
                        location: self.get_pos(),
                    });
                }
            }
        }
        let end_pos = self.get_pos();
        let tok = Tok::String {
            value: string_content,
            kind,
            triple_quoted,
        };
        Ok((tok, TextRange::new(start_pos, end_pos)))
    }
571
572    // Checks if the character c is a valid starting character as described
573    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
574    fn is_identifier_start(&self, c: char) -> bool {
575        match c {
576            'a'..='z' | 'A'..='Z' | '_' => true,
577            _ => is_xid_start(c),
578        }
579    }
580
581    // Checks if the character c is a valid continuation character as described
582    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
583    fn is_identifier_continuation(&self) -> bool {
584        match self.window[0] {
585            Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
586            Some(c) => is_xid_continue(c),
587            _ => false,
588        }
589    }
590
    // This is the main entry point. Call this function to retrieve the next token.
    // This function is used by the iterator implementation.
    fn inner_next(&mut self) -> LexResult {
        // top loop, keep on processing, until we have something pending.
        while self.pending.is_empty() {
            // Detect indentation levels
            if self.at_begin_of_line {
                self.handle_indentations()?;
            }

            self.consume_normal()?;
        }

        // Hand out the oldest pending token (FIFO). `pending` is tiny
        // (usually fewer than 5 entries), so `remove(0)` is cheap in practice.
        Ok(self.pending.remove(0))
    }
606
    // Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
    //
    // Comments, form feeds, and blank lines reset the count; on a blank line
    // the full-lexer build additionally emits a `NonLogicalNewline` token.
    fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
        // Determine indentation:
        let mut spaces: u32 = 0;
        let mut tabs: u32 = 0;
        loop {
            match self.window[0] {
                Some(' ') => {
                    /*
                    if tabs != 0 {
                        // Don't allow spaces after tabs as part of indentation.
                        // This is technically stricter than python3 but spaces after
                        // tabs is even more insane than mixing spaces and tabs.
                        return Some(Err(LexicalError {
                            error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()),
                            location: self.get_pos(),
                        }));
                    }
                    */
                    self.next_char();
                    spaces += 1;
                }
                Some('\t') => {
                    if spaces != 0 {
                        // Don't allow tabs after spaces as part of indentation.
                        // This is technically stricter than python3 but spaces before
                        // tabs is even more insane than mixing spaces and tabs.
                        return Err(LexicalError {
                            error: LexicalErrorType::TabsAfterSpaces,
                            location: self.get_pos(),
                        });
                    }
                    self.next_char();
                    tabs += 1;
                }
                Some('#') => {
                    // A comment-only line contributes no indentation.
                    self.lex_and_emit_comment()?;
                    spaces = 0;
                    tabs = 0;
                }
                Some('\x0C') => {
                    // Form feed character!
                    // Reset indentation for the Emacs user.
                    self.next_char();
                    spaces = 0;
                    tabs = 0;
                }
                Some('\n' | '\r') => {
                    // Empty line!
                    #[cfg(feature = "full-lexer")]
                    let tok_start = self.get_pos();
                    self.next_char();
                    #[cfg(feature = "full-lexer")]
                    let tok_end = self.get_pos();
                    #[cfg(feature = "full-lexer")]
                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
                    spaces = 0;
                    tabs = 0;
                }
                None => {
                    // End of input: report zero indentation so the caller
                    // dedents all remaining open levels.
                    spaces = 0;
                    tabs = 0;
                    break;
                }
                _ => {
                    // First real character of the line reached.
                    self.at_begin_of_line = false;
                    break;
                }
            }
        }

        Ok(IndentationLevel { tabs, spaces })
    }
680
    // Push/pop indents/dedents based on the current indentation level.
    //
    // While inside parenthesised constructs (`nesting != 0`), indentation is
    // not significant and no Indent/Dedent tokens are emitted.
    fn handle_indentations(&mut self) -> Result<(), LexicalError> {
        let indentation_level = self.eat_indentation()?;

        if self.nesting != 0 {
            return Ok(());
        }

        // Determine indent or dedent:
        let current_indentation = self.indentations.current();
        let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?;
        match ordering {
            Ordering::Equal => {
                // Same same
            }
            Ordering::Greater => {
                // New indentation level:
                self.indentations.push(indentation_level);
                let tok_pos = self.get_pos();
                // The Indent token's range covers the leading whitespace itself.
                self.emit((
                    Tok::Indent,
                    TextRange::new(
                        tok_pos
                            - TextSize::new(indentation_level.spaces)
                            - TextSize::new(indentation_level.tabs),
                        tok_pos,
                    ),
                ));
            }
            Ordering::Less => {
                // One or more dedentations
                // Pop off other levels until col is found:

                loop {
                    let current_indentation = self.indentations.current();
                    let ordering =
                        indentation_level.compare_strict(current_indentation, self.get_pos())?;
                    match ordering {
                        Ordering::Less => {
                            self.indentations.pop();
                            let tok_pos = self.get_pos();
                            self.emit((Tok::Dedent, TextRange::empty(tok_pos)));
                        }
                        Ordering::Equal => {
                            // We arrived at proper level of indentation.
                            break;
                        }
                        Ordering::Greater => {
                            // Dedented to a level that was never pushed.
                            return Err(LexicalError {
                                error: LexicalErrorType::IndentationError,
                                location: self.get_pos(),
                            });
                        }
                    }
                }
            }
        }

        Ok(())
    }
741
    // Take a look at the next character, if any, and decide upon the next steps.
    //
    // At end of input this emits, in order: a synthetic Newline if the last
    // line was unterminated, one Dedent per open indentation level, and a
    // final EndOfFile token.
    fn consume_normal(&mut self) -> Result<(), LexicalError> {
        if let Some(c) = self.window[0] {
            // Identifiers are the most common case.
            if self.is_identifier_start(c) {
                let identifier = self.lex_identifier()?;
                self.emit(identifier);
            } else {
                self.consume_character(c)?;
            }
        } else {
            // We reached end of file.
            let tok_pos = self.get_pos();

            // First of all, we need all nestings to be finished.
            if self.nesting > 0 {
                return Err(LexicalError {
                    error: LexicalErrorType::Eof,
                    location: tok_pos,
                });
            }

            // Next, insert a trailing newline, if required.
            if !self.at_begin_of_line {
                self.at_begin_of_line = true;
                self.emit((Tok::Newline, TextRange::empty(tok_pos)));
            }

            // Next, flush the indentation stack to zero.
            while !self.indentations.is_empty() {
                self.indentations.pop();
                self.emit((Tok::Dedent, TextRange::empty(tok_pos)));
            }

            self.emit((Tok::EndOfFile, TextRange::empty(tok_pos)));
        }

        Ok(())
    }
781
782    // Dispatch based on the given character.
783    fn consume_character(&mut self, c: char) -> Result<(), LexicalError> {
784        match c {
785            '0'..='9' => {
786                let number = self.lex_number()?;
787                self.emit(number);
788            }
789            '#' => {
790                self.lex_and_emit_comment()?;
791            }
792            '"' | '\'' => {
793                let string = self.lex_string(StringKind::String)?;
794                self.emit(string);
795            }
796            '=' => {
797                let tok_start = self.get_pos();
798                self.next_char();
799                match self.window[0] {
800                    Some('=') => {
801                        self.next_char();
802                        let tok_end = self.get_pos();
803                        self.emit((Tok::EqEqual, TextRange::new(tok_start, tok_end)));
804                    }
805                    _ => {
806                        let tok_end = self.get_pos();
807                        self.emit((Tok::Equal, TextRange::new(tok_start, tok_end)));
808                    }
809                }
810            }
811            '+' => {
812                let tok_start = self.get_pos();
813                self.next_char();
814                if let Some('=') = self.window[0] {
815                    self.next_char();
816                    let tok_end = self.get_pos();
817                    self.emit((Tok::PlusEqual, TextRange::new(tok_start, tok_end)));
818                } else {
819                    let tok_end = self.get_pos();
820                    self.emit((Tok::Plus, TextRange::new(tok_start, tok_end)));
821                }
822            }
823            '*' => {
824                let tok_start = self.get_pos();
825                self.next_char();
826                match self.window[0] {
827                    Some('=') => {
828                        self.next_char();
829                        let tok_end = self.get_pos();
830                        self.emit((Tok::StarEqual, TextRange::new(tok_start, tok_end)));
831                    }
832                    Some('*') => {
833                        self.next_char();
834                        match self.window[0] {
835                            Some('=') => {
836                                self.next_char();
837                                let tok_end = self.get_pos();
838                                self.emit((
839                                    Tok::DoubleStarEqual,
840                                    TextRange::new(tok_start, tok_end),
841                                ));
842                            }
843                            _ => {
844                                let tok_end = self.get_pos();
845                                self.emit((Tok::DoubleStar, TextRange::new(tok_start, tok_end)));
846                            }
847                        }
848                    }
849                    _ => {
850                        let tok_end = self.get_pos();
851                        self.emit((Tok::Star, TextRange::new(tok_start, tok_end)));
852                    }
853                }
854            }
855            '/' => {
856                let tok_start = self.get_pos();
857                self.next_char();
858                match self.window[0] {
859                    Some('=') => {
860                        self.next_char();
861                        let tok_end = self.get_pos();
862                        self.emit((Tok::SlashEqual, TextRange::new(tok_start, tok_end)));
863                    }
864                    Some('/') => {
865                        self.next_char();
866                        match self.window[0] {
867                            Some('=') => {
868                                self.next_char();
869                                let tok_end = self.get_pos();
870                                self.emit((
871                                    Tok::DoubleSlashEqual,
872                                    TextRange::new(tok_start, tok_end),
873                                ));
874                            }
875                            _ => {
876                                let tok_end = self.get_pos();
877                                self.emit((Tok::DoubleSlash, TextRange::new(tok_start, tok_end)));
878                            }
879                        }
880                    }
881                    _ => {
882                        let tok_end = self.get_pos();
883                        self.emit((Tok::Slash, TextRange::new(tok_start, tok_end)));
884                    }
885                }
886            }
887            '%' => {
888                let tok_start = self.get_pos();
889                self.next_char();
890                if let Some('=') = self.window[0] {
891                    self.next_char();
892                    let tok_end = self.get_pos();
893                    self.emit((Tok::PercentEqual, TextRange::new(tok_start, tok_end)));
894                } else {
895                    let tok_end = self.get_pos();
896                    self.emit((Tok::Percent, TextRange::new(tok_start, tok_end)));
897                }
898            }
899            '|' => {
900                let tok_start = self.get_pos();
901                self.next_char();
902                if let Some('=') = self.window[0] {
903                    self.next_char();
904                    let tok_end = self.get_pos();
905                    self.emit((Tok::VbarEqual, TextRange::new(tok_start, tok_end)));
906                } else {
907                    let tok_end = self.get_pos();
908                    self.emit((Tok::Vbar, TextRange::new(tok_start, tok_end)));
909                }
910            }
911            '^' => {
912                let tok_start = self.get_pos();
913                self.next_char();
914                if let Some('=') = self.window[0] {
915                    self.next_char();
916                    let tok_end = self.get_pos();
917                    self.emit((Tok::CircumflexEqual, TextRange::new(tok_start, tok_end)));
918                } else {
919                    let tok_end = self.get_pos();
920                    self.emit((Tok::CircumFlex, TextRange::new(tok_start, tok_end)));
921                }
922            }
923            '&' => {
924                let tok_start = self.get_pos();
925                self.next_char();
926                if let Some('=') = self.window[0] {
927                    self.next_char();
928                    let tok_end = self.get_pos();
929                    self.emit((Tok::AmperEqual, TextRange::new(tok_start, tok_end)));
930                } else {
931                    let tok_end = self.get_pos();
932                    self.emit((Tok::Amper, TextRange::new(tok_start, tok_end)));
933                }
934            }
935            '-' => {
936                let tok_start = self.get_pos();
937                self.next_char();
938                match self.window[0] {
939                    Some('=') => {
940                        self.next_char();
941                        let tok_end = self.get_pos();
942                        self.emit((Tok::MinusEqual, TextRange::new(tok_start, tok_end)));
943                    }
944                    Some('>') => {
945                        self.next_char();
946                        let tok_end = self.get_pos();
947                        self.emit((Tok::Rarrow, TextRange::new(tok_start, tok_end)));
948                    }
949                    _ => {
950                        let tok_end = self.get_pos();
951                        self.emit((Tok::Minus, TextRange::new(tok_start, tok_end)));
952                    }
953                }
954            }
955            '@' => {
956                let tok_start = self.get_pos();
957                self.next_char();
958                if let Some('=') = self.window[0] {
959                    self.next_char();
960                    let tok_end = self.get_pos();
961                    self.emit((Tok::AtEqual, TextRange::new(tok_start, tok_end)));
962                } else {
963                    let tok_end = self.get_pos();
964                    self.emit((Tok::At, TextRange::new(tok_start, tok_end)));
965                }
966            }
967            '!' => {
968                let tok_start = self.get_pos();
969                self.next_char();
970                if let Some('=') = self.window[0] {
971                    self.next_char();
972                    let tok_end = self.get_pos();
973                    self.emit((Tok::NotEqual, TextRange::new(tok_start, tok_end)));
974                } else {
975                    return Err(LexicalError {
976                        error: LexicalErrorType::UnrecognizedToken { tok: '!' },
977                        location: tok_start,
978                    });
979                }
980            }
981            '~' => {
982                self.eat_single_char(Tok::Tilde);
983            }
984            '(' => {
985                self.eat_single_char(Tok::Lpar);
986                self.nesting += 1;
987            }
988            ')' => {
989                self.eat_single_char(Tok::Rpar);
990                if self.nesting == 0 {
991                    return Err(LexicalError {
992                        error: LexicalErrorType::NestingError,
993                        location: self.get_pos(),
994                    });
995                }
996                self.nesting -= 1;
997            }
998            '[' => {
999                self.eat_single_char(Tok::Lsqb);
1000                self.nesting += 1;
1001            }
1002            ']' => {
1003                self.eat_single_char(Tok::Rsqb);
1004                if self.nesting == 0 {
1005                    return Err(LexicalError {
1006                        error: LexicalErrorType::NestingError,
1007                        location: self.get_pos(),
1008                    });
1009                }
1010                self.nesting -= 1;
1011            }
1012            '{' => {
1013                self.eat_single_char(Tok::Lbrace);
1014                self.nesting += 1;
1015            }
1016            '}' => {
1017                self.eat_single_char(Tok::Rbrace);
1018                if self.nesting == 0 {
1019                    return Err(LexicalError {
1020                        error: LexicalErrorType::NestingError,
1021                        location: self.get_pos(),
1022                    });
1023                }
1024                self.nesting -= 1;
1025            }
1026            ':' => {
1027                let tok_start = self.get_pos();
1028                self.next_char();
1029                if let Some('=') = self.window[0] {
1030                    self.next_char();
1031                    let tok_end = self.get_pos();
1032                    self.emit((Tok::ColonEqual, TextRange::new(tok_start, tok_end)));
1033                } else {
1034                    let tok_end = self.get_pos();
1035                    self.emit((Tok::Colon, TextRange::new(tok_start, tok_end)));
1036                }
1037            }
1038            ';' => {
1039                self.eat_single_char(Tok::Semi);
1040            }
1041            '<' => {
1042                let tok_start = self.get_pos();
1043                self.next_char();
1044                match self.window[0] {
1045                    Some('<') => {
1046                        self.next_char();
1047                        match self.window[0] {
1048                            Some('=') => {
1049                                self.next_char();
1050                                let tok_end = self.get_pos();
1051                                self.emit((
1052                                    Tok::LeftShiftEqual,
1053                                    TextRange::new(tok_start, tok_end),
1054                                ));
1055                            }
1056                            _ => {
1057                                let tok_end = self.get_pos();
1058                                self.emit((Tok::LeftShift, TextRange::new(tok_start, tok_end)));
1059                            }
1060                        }
1061                    }
1062                    Some('=') => {
1063                        self.next_char();
1064                        let tok_end = self.get_pos();
1065                        self.emit((Tok::LessEqual, TextRange::new(tok_start, tok_end)));
1066                    }
1067                    _ => {
1068                        let tok_end = self.get_pos();
1069                        self.emit((Tok::Less, TextRange::new(tok_start, tok_end)));
1070                    }
1071                }
1072            }
1073            '>' => {
1074                let tok_start = self.get_pos();
1075                self.next_char();
1076                match self.window[0] {
1077                    Some('>') => {
1078                        self.next_char();
1079                        match self.window[0] {
1080                            Some('=') => {
1081                                self.next_char();
1082                                let tok_end = self.get_pos();
1083                                self.emit((
1084                                    Tok::RightShiftEqual,
1085                                    TextRange::new(tok_start, tok_end),
1086                                ));
1087                            }
1088                            _ => {
1089                                let tok_end = self.get_pos();
1090                                self.emit((Tok::RightShift, TextRange::new(tok_start, tok_end)));
1091                            }
1092                        }
1093                    }
1094                    Some('=') => {
1095                        self.next_char();
1096                        let tok_end = self.get_pos();
1097                        self.emit((Tok::GreaterEqual, TextRange::new(tok_start, tok_end)));
1098                    }
1099                    _ => {
1100                        let tok_end = self.get_pos();
1101                        self.emit((Tok::Greater, TextRange::new(tok_start, tok_end)));
1102                    }
1103                }
1104            }
1105            ',' => {
1106                self.eat_single_char(Tok::Comma);
1107            }
1108            '.' => {
1109                if let Some('0'..='9') = self.window[1] {
1110                    let number = self.lex_number()?;
1111                    self.emit(number);
1112                } else {
1113                    let tok_start = self.get_pos();
1114                    self.next_char();
1115                    if self.window[..2] == [Some('.'); 2] {
1116                        self.next_char();
1117                        self.next_char();
1118                        let tok_end = self.get_pos();
1119                        self.emit((Tok::Ellipsis, TextRange::new(tok_start, tok_end)));
1120                    } else {
1121                        let tok_end = self.get_pos();
1122                        self.emit((Tok::Dot, TextRange::new(tok_start, tok_end)));
1123                    }
1124                }
1125            }
1126            '\n' | '\r' => {
1127                let tok_start = self.get_pos();
1128                self.next_char();
1129                let tok_end = self.get_pos();
1130
1131                // Depending on the nesting level, we emit a logical or
1132                // non-logical newline:
1133                if self.nesting == 0 {
1134                    self.at_begin_of_line = true;
1135                    self.emit((Tok::Newline, TextRange::new(tok_start, tok_end)));
1136                } else {
1137                    #[cfg(feature = "full-lexer")]
1138                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
1139                }
1140            }
1141            ' ' | '\t' | '\x0C' => {
1142                // Skip white-spaces
1143                self.next_char();
1144                while let Some(' ' | '\t' | '\x0C') = self.window[0] {
1145                    self.next_char();
1146                }
1147            }
1148            '\\' => {
1149                self.next_char();
1150                match self.window[0] {
1151                    Some('\n' | '\r') => {
1152                        self.next_char();
1153                    }
1154                    _ => {
1155                        return Err(LexicalError {
1156                            error: LexicalErrorType::LineContinuationError,
1157                            location: self.get_pos(),
1158                        });
1159                    }
1160                }
1161
1162                if self.window[0].is_none() {
1163                    return Err(LexicalError {
1164                        error: LexicalErrorType::Eof,
1165                        location: self.get_pos(),
1166                    });
1167                }
1168            }
1169            _ => {
1170                if is_emoji_presentation(c) {
1171                    let tok_start = self.get_pos();
1172                    self.next_char();
1173                    let tok_end = self.get_pos();
1174                    self.emit((
1175                        Tok::Name {
1176                            name: c.to_string(),
1177                        },
1178                        TextRange::new(tok_start, tok_end),
1179                    ));
1180                } else {
1181                    let c = self.next_char();
1182                    return Err(LexicalError {
1183                        error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
1184                        location: self.get_pos(),
1185                    });
1186                }
1187            }
1188        }
1189
1190        Ok(())
1191    }
1192
1193    // Used by single character tokens to advance the window and emit the correct token.
1194    fn eat_single_char(&mut self, ty: Tok) {
1195        let tok_start = self.get_pos();
1196        self.next_char().unwrap_or_else(|| unsafe {
1197            // SAFETY: eat_single_char has been called only after a character has been read
1198            // from the window, so the window is guaranteed to be non-empty.
1199            std::hint::unreachable_unchecked()
1200        });
1201        let tok_end = self.get_pos();
1202        self.emit((ty, TextRange::new(tok_start, tok_end)));
1203    }
1204
1205    // Helper function to go to the next character coming up.
1206    fn next_char(&mut self) -> Option<char> {
1207        let mut c = self.window[0];
1208        self.window.slide();
1209        match c {
1210            Some('\r') => {
1211                if self.window[0] == Some('\n') {
1212                    self.location += TextSize::from(1);
1213                    self.window.slide();
1214                }
1215
1216                self.location += TextSize::from(1);
1217                c = Some('\n');
1218            }
1219            #[allow(unused_variables)]
1220            Some(c) => {
1221                self.location += c.text_len();
1222            }
1223            _ => {}
1224        }
1225        c
1226    }
1227
    // Helper function to retrieve the current position.
    //
    // Returns the lexer's current offset into the source text; used to mark
    // token start/end positions.
    fn get_pos(&self) -> TextSize {
        self.location
    }
1232
    // Helper function to emit a lexed token to the queue of tokens.
    //
    // Tokens are buffered in `self.pending` rather than returned directly,
    // presumably because a single lexing step can queue several tokens
    // (e.g. multiple dedents) — confirm against `inner_next`.
    fn emit(&mut self, spanned: Spanned) {
        self.pending.push(spanned);
    }
1237}
1238
1239// Implement iterator pattern for Lexer.
1240// Calling the next element in the iterator will yield the next lexical
1241// token.
1242impl<T> Iterator for Lexer<T>
1243where
1244    T: Iterator<Item = char>,
1245{
1246    type Item = LexResult;
1247
1248    fn next(&mut self) -> Option<Self::Item> {
1249        let token = self.inner_next();
1250        trace!(
1251            "Lex token {:?}, nesting={:?}, indent stack: {:?}",
1252            token,
1253            self.nesting,
1254            self.indentations,
1255        );
1256
1257        match token {
1258            Ok((Tok::EndOfFile, _)) => None,
1259            r => Some(r),
1260        }
1261    }
1262}
1263
/// Represents an error that occurs during lexing and is
/// returned by the `parse_*` functions in the iterator in the
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, PartialEq)]
pub struct LexicalError {
    /// The type of error that occurred.
    pub error: LexicalErrorType,
    /// The location of the error.
    pub location: TextSize,
}

impl LexicalError {
    /// Creates a new `LexicalError` with the given error type and location.
    pub fn new(error: LexicalErrorType, location: TextSize) -> Self {
        Self { error, location }
    }
}
1283
/// Represents the different types of errors that can occur during lexing.
///
/// The user-facing message for each variant is produced by the
/// [`std::fmt::Display`] implementation below.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
    // TODO: Can probably be removed, the places it is used seem to be able
    // to use the `UnicodeError` variant instead.
    #[doc(hidden)]
    StringError,
    // TODO: Should take a start/end position to report.
    /// Decoding of a unicode escape sequence in a string literal failed.
    UnicodeError,
    /// The nesting of brackets/braces/parentheses is not balanced.
    NestingError,
    /// The indentation is not consistent.
    IndentationError,
    /// Inconsistent use of tabs and spaces.
    TabError,
    /// Encountered a tab after a space.
    TabsAfterSpaces,
    /// A non-default argument follows a default argument.
    DefaultArgumentError,
    /// A duplicate argument was found in a function definition.
    /// The payload is the offending argument's name.
    DuplicateArgumentError(String),
    /// A positional argument follows a keyword argument.
    PositionalArgumentError,
    /// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
    UnpackedArgumentError,
    /// A keyword argument was repeated.
    /// The payload is the repeated keyword's name.
    DuplicateKeywordArgumentError(String),
    /// An unrecognized token was encountered.
    UnrecognizedToken { tok: char },
    /// An f-string error containing the [`FStringErrorType`].
    FStringError(FStringErrorType),
    /// An unexpected character was encountered after a line continuation.
    LineContinuationError,
    /// An unexpected end of file was encountered.
    Eof,
    /// An unexpected error occurred.
    /// The payload is a free-form message rendered verbatim.
    OtherError(String),
}
1323
1324impl std::fmt::Display for LexicalErrorType {
1325    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1326        match self {
1327            LexicalErrorType::StringError => write!(f, "Got unexpected string"),
1328            LexicalErrorType::FStringError(error) => write!(f, "f-string: {error}"),
1329            LexicalErrorType::UnicodeError => write!(f, "Got unexpected unicode"),
1330            LexicalErrorType::NestingError => write!(f, "Got unexpected nesting"),
1331            LexicalErrorType::IndentationError => {
1332                write!(f, "unindent does not match any outer indentation level")
1333            }
1334            LexicalErrorType::TabError => {
1335                write!(f, "inconsistent use of tabs and spaces in indentation")
1336            }
1337            LexicalErrorType::TabsAfterSpaces => {
1338                write!(f, "Tabs not allowed as part of indentation after spaces")
1339            }
1340            LexicalErrorType::DefaultArgumentError => {
1341                write!(f, "non-default argument follows default argument")
1342            }
1343            LexicalErrorType::DuplicateArgumentError(arg_name) => {
1344                write!(f, "duplicate argument '{arg_name}' in function definition")
1345            }
1346            LexicalErrorType::DuplicateKeywordArgumentError(arg_name) => {
1347                write!(f, "keyword argument repeated: {arg_name}")
1348            }
1349            LexicalErrorType::PositionalArgumentError => {
1350                write!(f, "positional argument follows keyword argument")
1351            }
1352            LexicalErrorType::UnpackedArgumentError => {
1353                write!(
1354                    f,
1355                    "iterable argument unpacking follows keyword argument unpacking"
1356                )
1357            }
1358            LexicalErrorType::UnrecognizedToken { tok } => {
1359                write!(f, "Got unexpected token {tok}")
1360            }
1361            LexicalErrorType::LineContinuationError => {
1362                write!(f, "unexpected character after line continuation character")
1363            }
1364            LexicalErrorType::Eof => write!(f, "unexpected EOF while parsing"),
1365            LexicalErrorType::OtherError(msg) => write!(f, "{msg}"),
1366        }
1367    }
1368}
1369
1370#[cfg(test)]
1371mod tests {
1372    use super::*;
1373    use crate::ast::bigint::BigInt;
1374
    // Line terminators exercised by the EOL-parameterized test macros below.
    const WINDOWS_EOL: &str = "\r\n";
    const MAC_EOL: &str = "\r";
    const UNIX_EOL: &str = "\n";
1378
1379    pub fn lex_source(source: &str) -> Vec<Tok> {
1380        let lexer = lex(source, Mode::Module);
1381        lexer.map(|x| x.unwrap().0).collect()
1382    }
1383
1384    fn str_tok(s: &str) -> Tok {
1385        Tok::String {
1386            value: s.to_owned(),
1387            kind: StringKind::String,
1388            triple_quoted: false,
1389        }
1390    }
1391
1392    fn raw_str_tok(s: &str) -> Tok {
1393        Tok::String {
1394            value: s.to_owned(),
1395            kind: StringKind::RawString,
1396            triple_quoted: false,
1397        }
1398    }
1399
    // Covers every numeric literal form in one source line: hex, octal, binary,
    // decimal ints (including underscore separators), floats, exponent
    // notation, and imaginary (complex) literals.
    #[test]
    fn test_numbers() {
        let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Int {
                    value: BigInt::from(47),
                },
                Tok::Int {
                    value: BigInt::from(10)
                },
                Tok::Int {
                    value: BigInt::from(13),
                },
                Tok::Int {
                    value: BigInt::from(0),
                },
                Tok::Int {
                    value: BigInt::from(123),
                },
                Tok::Int {
                    value: BigInt::from(1234567890),
                },
                Tok::Float { value: 0.2 },
                Tok::Float { value: 100.0 },
                Tok::Float { value: 2100.0 },
                Tok::Complex {
                    real: 0.0,
                    imag: 2.0,
                },
                Tok::Complex {
                    real: 0.0,
                    imag: 2.2,
                },
                Tok::Newline,
            ]
        );
    }
1440
    // Generates one test per comment body verifying that a trailing `#` comment
    // is lexed into a single `Tok::Comment` that preserves its exact text.
    // Note: despite the name, `$eol` here is the comment text after `# `, not a
    // line ending. Comment tokens require the `full-lexer` feature.
    macro_rules! test_line_comment {
        ($($name:ident: $eol:expr,)*) => {
            $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!(r"99232  # {}", $eol);
                let tokens = lex_source(&source);
                assert_eq!(tokens, vec![Tok::Int { value: BigInt::from(99232) }, Tok::Comment(format!("# {}", $eol)), Tok::Newline]);
            }
            )*
        }
    }

    test_line_comment! {
        test_line_comment_long: " foo",
        test_line_comment_whitespace: "  ",
        test_line_comment_single_whitespace: " ",
        test_line_comment_empty: "",
    }
1461
    // Generates one test per line-ending style verifying that a comment stops
    // at the line terminator: the terminator itself is lexed as `Tok::Newline`
    // and is not part of the `Tok::Comment` text, and lexing resumes normally
    // on the next line.
    macro_rules! test_comment_until_eol {
        ($($name:ident: $eol:expr,)*) => {
            $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!("123  # Foo{}456", $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::Int { value: BigInt::from(123) },
                        Tok::Comment("# Foo".to_string()),
                        Tok::Newline,
                        Tok::Int { value: BigInt::from(456) },
                        Tok::Newline,
                    ]
                )
            }
            )*
        }
    }

    test_comment_until_eol! {
        test_comment_until_windows_eol: WINDOWS_EOL,
        test_comment_until_mac_eol: MAC_EOL,
        test_comment_until_unix_eol: UNIX_EOL,
    }
1490
    // Verifies operator tokenization in an assignment, including operators with
    // no surrounding whitespace (`2-0` must still split into Int, Minus, Int).
    #[test]
    fn test_assignment() {
        let source = r"a_variable = 99 + 2-0";
        let tokens = lex_source(source);
        assert_eq!(
            tokens,
            vec![
                Tok::Name {
                    name: String::from("a_variable"),
                },
                Tok::Equal,
                Tok::Int {
                    value: BigInt::from(99)
                },
                Tok::Plus,
                Tok::Int {
                    value: BigInt::from(2)
                },
                Tok::Minus,
                Tok::Int {
                    value: BigInt::from(0)
                },
                Tok::Newline,
            ]
        );
    }
1517
    // Generates one test per line-ending style checking basic indentation
    // handling: a single indented block produces one `Indent`/`Dedent` pair,
    // and the trailing blank line is a `NonLogicalNewline`.
    macro_rules! test_indentation_with_eol {
        ($($name:ident: $eol:expr,)*) => {
            $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!("def foo():{}   return 99{}{}", $eol, $eol, $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::Def,
                        Tok::Name {
                            name: String::from("foo"),
                        },
                        Tok::Lpar,
                        Tok::Rpar,
                        Tok::Colon,
                        Tok::Newline,
                        Tok::Indent,
                        Tok::Return,
                        Tok::Int { value: BigInt::from(99) },
                        Tok::Newline,
                        Tok::NonLogicalNewline,
                        Tok::Dedent,
                    ]
                );
            }
            )*
        };
    }

    test_indentation_with_eol! {
        test_indentation_windows_eol: WINDOWS_EOL,
        test_indentation_mac_eol: MAC_EOL,
        test_indentation_unix_eol: UNIX_EOL,
    }
1555
    // Generates one test per line-ending style checking that closing two nested
    // space-indented blocks at once emits two consecutive `Dedent` tokens.
    macro_rules! test_double_dedent_with_eol {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!("def foo():{} if x:{}{}  return 99{}{}", $eol, $eol, $eol, $eol, $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::Def,
                        Tok::Name {
                            name: String::from("foo"),
                        },
                        Tok::Lpar,
                        Tok::Rpar,
                        Tok::Colon,
                        Tok::Newline,
                        Tok::Indent,
                        Tok::If,
                        Tok::Name {
                            name: String::from("x"),
                        },
                        Tok::Colon,
                        Tok::Newline,
                        Tok::NonLogicalNewline,
                        Tok::Indent,
                        Tok::Return,
                        Tok::Int { value: BigInt::from(99) },
                        Tok::Newline,
                        Tok::NonLogicalNewline,
                        Tok::Dedent,
                        Tok::Dedent,
                    ]
                );
            }
        )*
        }
    }
1596
    // Same double-dedent scenario as `test_double_dedent_with_eol`, but the
    // blocks are indented with a tab and a tab+space respectively, exercising
    // tab-based indentation comparison.
    macro_rules! test_double_dedent_with_tabs {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::Def,
                        Tok::Name {
                            name: String::from("foo"),
                        },
                        Tok::Lpar,
                        Tok::Rpar,
                        Tok::Colon,
                        Tok::Newline,
                        Tok::Indent,
                        Tok::If,
                        Tok::Name {
                            name: String::from("x"),
                        },
                        Tok::Colon,
                        Tok::Newline,
                        Tok::NonLogicalNewline,
                        Tok::Indent,
                        Tok::Return,
                        Tok::Int { value: BigInt::from(99) },
                        Tok::Newline,
                        Tok::NonLogicalNewline,
                        Tok::Dedent,
                        Tok::Dedent,
                    ]
                );
            }
        )*
        }
    }
1637
    // Instantiate the space-indented double-dedent test for each EOL convention.
    test_double_dedent_with_eol! {
        test_double_dedent_windows_eol: WINDOWS_EOL,
        test_double_dedent_mac_eol: MAC_EOL,
        test_double_dedent_unix_eol: UNIX_EOL,
    }
1643
    // Instantiate the tab-indented double-dedent test for each EOL convention.
    test_double_dedent_with_tabs! {
        test_double_dedent_tabs_windows_eol: WINDOWS_EOL,
        test_double_dedent_tabs_mac_eol: MAC_EOL,
        test_double_dedent_tabs_unix_eol: UNIX_EOL,
    }
1649
    // Generates one test per EOL style checking that newlines inside (), [] and {}
    // are emitted as NonLogicalNewline (no Indent/Dedent tracking), and that a
    // backslash continuation inside brackets suppresses even the non-logical one.
    macro_rules! test_newline_in_brackets {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            #[cfg(feature = "full-lexer")]
            fn $name() {
                // Nested brackets with blank lines, odd indentation, and one
                // trailing-backslash continuation (between 6, and 7); the raw
                // string's "\n"s are rewritten to the EOL under test.
                let source = r"x = [

    1,2
,(3,
4,
), {
5,
6,\
7}]
".replace("\n", $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::Name {
                            name: String::from("x"),
                        },
                        Tok::Equal,
                        Tok::Lsqb,
                        Tok::NonLogicalNewline,
                        Tok::NonLogicalNewline,
                        Tok::Int { value: BigInt::from(1) },
                        Tok::Comma,
                        Tok::Int { value: BigInt::from(2) },
                        Tok::NonLogicalNewline,
                        Tok::Comma,
                        Tok::Lpar,
                        Tok::Int { value: BigInt::from(3) },
                        Tok::Comma,
                        Tok::NonLogicalNewline,
                        Tok::Int { value: BigInt::from(4) },
                        Tok::Comma,
                        Tok::NonLogicalNewline,
                        Tok::Rpar,
                        Tok::Comma,
                        Tok::Lbrace,
                        Tok::NonLogicalNewline,
                        Tok::Int { value: BigInt::from(5) },
                        Tok::Comma,
                        Tok::NonLogicalNewline,
                        Tok::Int { value: BigInt::from(6) },
                        Tok::Comma,
                        // Continuation here - no NonLogicalNewline.
                        Tok::Int { value: BigInt::from(7) },
                        Tok::Rbrace,
                        Tok::Rsqb,
                        Tok::Newline,
                    ]
                );
            }
        )*
        };
    }
1709
    // Instantiate the brackets/newline test for each EOL convention.
    test_newline_in_brackets! {
        test_newline_in_brackets_windows_eol: WINDOWS_EOL,
        test_newline_in_brackets_mac_eol: MAC_EOL,
        test_newline_in_brackets_unix_eol: UNIX_EOL,
    }
1715
1716    #[test]
1717    #[cfg(feature = "full-lexer")]
1718    fn test_non_logical_newline_in_string_continuation() {
1719        let source = r"(
1720    'a'
1721    'b'
1722
1723    'c' \
1724    'd'
1725)";
1726        let tokens = lex_source(source);
1727        assert_eq!(
1728            tokens,
1729            vec![
1730                Tok::Lpar,
1731                Tok::NonLogicalNewline,
1732                str_tok("a"),
1733                Tok::NonLogicalNewline,
1734                str_tok("b"),
1735                Tok::NonLogicalNewline,
1736                Tok::NonLogicalNewline,
1737                str_tok("c"),
1738                str_tok("d"),
1739                Tok::NonLogicalNewline,
1740                Tok::Rpar,
1741                Tok::Newline,
1742            ]
1743        );
1744    }
1745
1746    #[test]
1747    #[cfg(feature = "full-lexer")]
1748    fn test_logical_newline_line_comment() {
1749        let source = "#Hello\n#World\n";
1750        let tokens = lex_source(source);
1751        assert_eq!(
1752            tokens,
1753            vec![
1754                Tok::Comment("#Hello".to_owned()),
1755                Tok::NonLogicalNewline,
1756                Tok::Comment("#World".to_owned()),
1757                Tok::NonLogicalNewline,
1758            ]
1759        );
1760    }
1761
1762    #[test]
1763    fn test_operators() {
1764        let source = "//////=/ /";
1765        let tokens = lex_source(source);
1766        assert_eq!(
1767            tokens,
1768            vec![
1769                Tok::DoubleSlash,
1770                Tok::DoubleSlash,
1771                Tok::DoubleSlashEqual,
1772                Tok::Slash,
1773                Tok::Slash,
1774                Tok::Newline,
1775            ]
1776        );
1777    }
1778
1779    #[test]
1780    fn test_string() {
1781        let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
1782        let tokens = lex_source(source);
1783        assert_eq!(
1784            tokens,
1785            vec![
1786                str_tok("double"),
1787                str_tok("single"),
1788                str_tok(r"can\'t"),
1789                str_tok(r#"\\\""#),
1790                str_tok(r"\t\r\n"),
1791                str_tok(r"\g"),
1792                raw_str_tok(r"raw\'"),
1793                str_tok(r"\420"),
1794                str_tok(r"\200\0a"),
1795                Tok::Newline,
1796            ]
1797        );
1798    }
1799
    // Generates one test per EOL style checking a backslash line continuation
    // inside a single-quoted string. The expected token value contains "\\\n"
    // regardless of $eol: the lexer normalizes the continuation's EOL to '\n'.
    macro_rules! test_string_continuation {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let source = format!("\"abc\\{}def\"", $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        str_tok("abc\\\ndef"),
                        Tok::Newline,
                    ]
                )
            }
        )*
        }
    }
1818
    // Instantiate the in-string continuation test for each EOL convention.
    test_string_continuation! {
        test_string_continuation_windows_eol: WINDOWS_EOL,
        test_string_continuation_mac_eol: MAC_EOL,
        test_string_continuation_unix_eol: UNIX_EOL,
    }
1824
1825    #[test]
1826    fn test_escape_unicode_name() {
1827        let source = r#""\N{EN SPACE}""#;
1828        let tokens = lex_source(source);
1829        assert_eq!(tokens, vec![str_tok(r"\N{EN SPACE}"), Tok::Newline])
1830    }
1831
    // Generates one test per EOL style for triple-quoted strings. The expected
    // value uses '\n' regardless of $eol: line endings inside a triple-quoted
    // string are normalized to '\n' in the token value.
    macro_rules! test_triple_quoted {
        ($($name:ident: $eol:expr,)*) => {
        $(
            #[test]
            fn $name() {
                let source = format!("\"\"\"{0} test string{0} \"\"\"", $eol);
                let tokens = lex_source(&source);
                assert_eq!(
                    tokens,
                    vec![
                        Tok::String {
                            value: "\n test string\n ".to_owned(),
                            kind: StringKind::String,
                            triple_quoted: true,
                        },
                        Tok::Newline,
                    ]
                )
            }
        )*
        }
    }
1854
    // Instantiate the triple-quoted string test for each EOL convention.
    test_triple_quoted! {
        test_triple_quoted_windows_eol: WINDOWS_EOL,
        test_triple_quoted_mac_eol: MAC_EOL,
        test_triple_quoted_unix_eol: UNIX_EOL,
    }
1860}