// python_syntax/lexer.rs
1use std::{collections::HashMap, convert::TryInto};
2
3use num_bigint::BigUint;
4use regex::{CaptureLocations, Regex};
5
6use crate::{
7    error::{Error, ErrorKind},
8    source::Location,
9    token::{self, Token},
10};
11
// Tab stop width used by `Lexer::consume` when expanding tabs to compute
// the (column-based) indentation level.
const TABSIZE: usize = 8;

// One lexer step: (start location, token, end location) on success, or a
// lexical error. This is the iterator item type of `Lexer`.
pub type LexResult = Result<(Location, Token, Location), Error>;

// Error produced while decoding escape sequences in a string/bytes literal.
struct ParseStringError {
    // Byte offset of the offending data within the literal body.
    offset: usize,
}
19
20impl ParseStringError {
21    fn new(offset: usize) -> ParseStringError {
22        ParseStringError { offset }
23    }
24
25    fn offset(&self) -> usize {
26        self.offset
27    }
28}
29
/// Streaming tokenizer over Python source text; yields `LexResult` items.
pub struct Lexer<'a> {
    // Remaining, not-yet-consumed input text.
    input: &'a str,
    // Set after EOF or a fatal error; the iterator then yields `None`.
    exhausted: bool,
    // Current position; the column counts tabs expanded to TABSIZE stops.
    location: Location,
    // Stack of indents with and without expanded tabs
    indents: Vec<(usize, usize)>,
    // Column without expanded tabs
    alt_col: usize,
    // Pending DEDENT tokens
    pending: usize,
    // Parentheses nesting level
    parens: isize,
    // Are we in the new logical line?
    newline: bool,
    // Reusable capture buffer for TOKEN_RE matches.
    caps: CaptureLocations,
}
46
47impl<'a> Iterator for Lexer<'a> {
48    type Item = LexResult;
49
50    fn next(&mut self) -> Option<Self::Item> {
51        if !self.exhausted {
52            Some(self.get_token())
53        } else {
54            None
55        }
56    }
57}
58
59lazy_static::lazy_static! {
60    static ref TOKEN_RE: Regex = {
61        fn escape_keys<V>(map: &HashMap<&str, V>) -> String {
62            let mut tokens = map.keys().collect::<Vec<_>>();
63            tokens.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse());
64            tokens
65                .iter()
66                .map(|x| regex::escape(x))
67                .collect::<Vec<_>>()
68                .join("|")
69        }
70        let pat = format!(
71            r#"^(?x:
72            (  # 1
73                [\ \t\f]+
74                | \\ (?: \r\n | [\r\n] )
75                | \# .*
76            )
77            | ( \r\n | [\r\n] )  # 2
78            | ( $ ) # 3
79            | (  # 4
80                (?:
81                    (?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
82                    | [0-9] (?: _? [0-9] )* \.?
83                ) [eE] [+-]? [0-9] (?: _? [0-9] )*
84                | (?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
85                | [0-9] (?: _? [0-9] )* \.
86            ) ( [jJ] )?  # 5
87            | ( [0-9] (?: _? [0-9] )* [jJ] )  # 6
88            | (  # 7
89                (?: 0[bB] (?: _? [01] )+ )
90                | (?: 0[oO] (?: _? [0-7] )+ )
91                | (?: 0[xX] (?: _? [0-9a-fA-F] )+ )
92                | (?: [1-9] (?: _? [0-9] )* | 0+ (?: _? 0 )* )
93            )
94            | ( (?-i: rf | fr | rb | br | [rfbu] )? )  # 8
95            (?:
96                ''' ( (?s: [^\\] | \\. )*? ) '''  # 9
97                | """ ( (?s: [^\\] | \\. )*? ) """  # 10
98                | ' ( (?s: [^\\\n'] | \\. )*? ) '  # 11
99                | " ( (?s: [^\\\n"] | \\. )*? ) "  # 12
100                | ( ['"] | ''' | """ )  # 13
101            )
102            | ( {symbols} )  # 14
103            | ( {keywords} ) \b  # 15
104            | ( [\p{{XID_Start}}_] \p{{XID_Continue}}* )  # 16
105            )"#,
106            symbols = escape_keys(&token::SYMBOLS),
107            keywords = escape_keys(&token::KEYWORDS),
108        );
109        Regex::new(&pat).unwrap()
110    };
111
112    static ref BYTES_RE: Regex = Regex::new(
113        r#"(?x) \\ (?:
114        ( [\\'"abfnrtv] )
115        | ( [1-7]{1,3} )
116        | ( [0-9a-fA-F]{2} )
117        | \n
118        )"#).unwrap();
119    static ref STR_RE: Regex = Regex::new(
120        &[BYTES_RE.as_str(),
121        r#"| \\ (?:
122        N \{ (.*) \}
123        | u ( [0-9a-fA-F]{4} )
124        | U ( [0-9a-fA-F]{8} )
125        )"#].concat()).unwrap();
126}
127
128impl<'a> Lexer<'a> {
129    pub fn new(input: &str) -> Lexer {
130        Lexer {
131            input,
132            exhausted: false,
133            location: Location::new(1, 1),
134            indents: vec![(1, 1)],
135            alt_col: 1,
136            pending: 0,
137            parens: 0,
138            newline: true,
139            caps: TOKEN_RE.capture_locations(),
140        }
141    }
142
143    fn consume(&mut self, nbytes: usize) -> &str {
144        let (consumed, rest) = self.input.split_at(nbytes);
145        self.input = rest;
146        let mut bytes = consumed.bytes().peekable();
147        while let Some(c) = bytes.next() {
148            match c {
149                b'\r' => {
150                    if bytes.peek() == Some(&&b'\n') {
151                        continue;
152                    } else {
153                        self.location = Location::new(self.location.line + 1, 1);
154                        self.alt_col = 1;
155                    }
156                }
157                b'\n' => {
158                    self.location = Location::new(self.location.line + 1, 1);
159                    self.alt_col = 1;
160                }
161                b'\t' => {
162                    self.location = Location::new(
163                        self.location.line,
164                        self.location.column + TABSIZE - ((self.location.column - 1) % TABSIZE),
165                    );
166                    self.alt_col += 1;
167                }
168                _ => {
169                    self.location = Location::new(self.location.line, self.location.column + 1);
170                    self.alt_col += 1;
171                }
172            }
173        }
174        consumed
175    }
176
177    fn matches(&self, nth: usize) -> bool {
178        self.caps.get(nth).is_some()
179    }
180
181    fn get_token(&mut self) -> LexResult {
182        if self.pending > 0 {
183            // Process pending DEDENT
184            self.pending -= 1;
185            let loc = Location::new(self.location.line, 0);
186            return Ok((loc, Token::Dedent, loc));
187        }
188
189        if TOKEN_RE.captures_read(&mut self.caps, self.input).is_none() {
190            self.exhausted = true;
191            let c = self.input.chars().next().unwrap();
192            return Err(Error::new(ErrorKind::UnexpectedCharacter(c), self.location));
193        }
194
195        if self.newline && !(self.matches(1) || self.matches(2)) {
196            // Not a blank line, handle indentation
197            let col = self.location.column;
198            let (last, alt_last) = *self.indents.last().unwrap();
199            let emit_loc = Location::new(self.location.line, col);
200            if col == last {
201                // No change
202                if self.alt_col != alt_last {
203                    // "Fix" alt_col so we can recover from error
204                    self.alt_col = alt_last;
205                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
206                }
207            } else if col > last {
208                // Indent
209                if self.alt_col <= alt_last {
210                    self.alt_col = alt_last;
211                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
212                }
213                self.indents.push((col, self.alt_col));
214                return Ok((emit_loc, Token::Indent, emit_loc));
215            } else {
216                // Dedent
217                while self.indents.len() > 1 && col < self.indents.last().unwrap().0 {
218                    self.pending += 1;
219                    self.indents.pop();
220                }
221                let (last, alt_last) = *self.indents.last().unwrap();
222                if col != last {
223                    self.exhausted = true;
224                    return Err(Error::new(ErrorKind::UnmatchedDedent, emit_loc));
225                }
226                if self.alt_col != alt_last {
227                    self.alt_col = alt_last;
228                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
229                }
230                return self.get_token();
231            }
232        }
233
234        let start = self.location;
235        let match_len = self.caps.get(0).unwrap().1;
236        let lex = &self.input[..match_len];
237        self.consume(match_len);
238        let end = self.location;
239
240        if self.matches(1) {
241            // Skip whitespace, line continuations and comments
242            return self.get_token();
243        } else if self.matches(2) {
244            // Newline
245            if !self.newline && self.parens == 0 {
246                self.newline = true;
247                return Ok((start, Token::Newline, start));
248            } else {
249                // Blank line / implicitly joined lines
250                return self.get_token();
251            }
252        }
253
254        self.newline = false;
255
256        if self.matches(3) {
257            // EOF
258            self.exhausted = true;
259            Ok((start, Token::Eof, start))
260        } else if self.matches(4) && !self.matches(5) {
261            // Float
262            Ok((start, Token::Float(Self::parse_float(lex).unwrap()), end))
263        } else if self.matches(5) || self.matches(6) {
264            // Imaginary (float)
265            let lex = &lex[..lex.len() - 1]; // Strip trailing j
266            Ok((
267                start,
268                Token::Imaginary(Self::parse_float(lex).unwrap()),
269                end,
270            ))
271        } else if self.matches(7) {
272            // Integer
273            Ok((start, Token::Integer(Self::parse_int(lex).unwrap()), end))
274        } else if self.matches(8) {
275            // String
276            if self.matches(13) {
277                // Unterminated
278                self.exhausted = true;
279                return Err(Error::new(ErrorKind::UnterminatedString, start));
280            }
281            let prefix = self
282                .caps
283                .get(8)
284                .map(|(start, end)| lex[start..end].to_ascii_lowercase());
285            let raw = prefix.as_ref().map_or(false, |x| x.contains('r'));
286            let formatted = prefix.as_ref().map_or(false, |x| x.contains('f'));
287            let bytes = prefix.as_ref().map_or(false, |x| x.contains('b'));
288            let value = self
289                .caps
290                .get(9)
291                .or_else(|| self.caps.get(10))
292                .or_else(|| self.caps.get(11))
293                .or_else(|| self.caps.get(12))
294                .map(|(start, end)| &lex[start..end])
295                .unwrap();
296            if bytes {
297                let value = if !raw {
298                    Self::parse_bytes(value).map_err(|e| {
299                        Error::new(ErrorKind::NonAsciiBytes { offset: e.offset() }, start)
300                    })?
301                } else {
302                    value.as_bytes().to_vec()
303                };
304                Ok((start, Token::Bytes(value), end))
305            } else {
306                let value = if !raw {
307                    Self::parse_str(value).map_err(|e| {
308                        Error::new(ErrorKind::UnicodeDecode { offset: e.offset() }, start)
309                    })?
310                } else {
311                    value.to_string()
312                };
313                if formatted {
314                    Ok((start, Token::FormattedString(value), end))
315                } else {
316                    Ok((start, Token::String(value), end))
317                }
318            }
319        } else if self.matches(14) {
320            // Operator or delimiter
321            let tok = token::SYMBOLS.get(lex).unwrap();
322            match tok {
323                Token::ParenOpen | Token::BracketOpen | Token::BraceOpen => self.parens += 1,
324                Token::ParenClose | Token::BracketClose | Token::BraceClose => self.parens -= 1,
325                _ => (),
326            }
327            Ok((start, tok.clone(), end))
328        } else if self.matches(15) {
329            // Keyword
330            let tok = token::KEYWORDS.get(lex).unwrap();
331            Ok((start, tok.clone(), end))
332        } else if self.matches(16) {
333            // Identifier
334            Ok((start, Token::Name(lex.into()), end))
335        } else {
336            unreachable!()
337        }
338    }
339
340    fn parse_bytes(mut s: &str) -> Result<Vec<u8>, ParseStringError> {
341        if let Some(offset) = s.bytes().position(|c| !c.is_ascii()) {
342            return Err(ParseStringError::new(offset));
343        }
344
345        let mut result = vec![];
346        let mut caps = BYTES_RE.capture_locations();
347
348        while let Some(cap) = BYTES_RE.captures_read(&mut caps, s) {
349            result.extend(s[..cap.start()].bytes());
350            s = &s[cap.end()..];
351            let escape = &cap.as_str()[1..];
352            let value = if caps.get(1).is_some() {
353                Self::ascii_escape(escape.as_bytes()[0])
354            } else if caps.get(2).is_some() {
355                u8::from_str_radix(escape, 8).unwrap()
356            } else if caps.get(3).is_some() {
357                u8::from_str_radix(escape, 16).unwrap()
358            } else {
359                // Line continuation
360                continue;
361            };
362            result.push(value);
363        }
364        result.extend(s.bytes());
365
366        Ok(result)
367    }
368
369    fn parse_str(mut s: &str) -> Result<String, ParseStringError> {
370        let mut result = String::new();
371        let mut caps = STR_RE.capture_locations();
372        let mut offset = 0;
373
374        while let Some(cap) = STR_RE.captures_read(&mut caps, s) {
375            result.push_str(&s[..cap.start()]);
376            s = &s[cap.end()..];
377            let escape = &cap.as_str()[1..];
378            let value = if caps.get(1).is_some() {
379                Self::ascii_escape(escape.as_bytes()[0]) as char
380            } else if caps.get(2).is_some() {
381                u8::from_str_radix(escape, 8).unwrap() as char
382            } else if caps.get(3).is_some() {
383                u8::from_str_radix(escape, 16).unwrap() as char
384            } else if caps.get(4).is_some() {
385                unicode_names2::character(&escape[2..escape.len() - 1])
386                    .ok_or_else(|| ParseStringError::new(offset))?
387            } else if caps.get(5).is_some() || caps.get(6).is_some() {
388                u32::from_str_radix(&escape[1..], 16)
389                    .unwrap()
390                    .try_into()
391                    .map_err(|_| ParseStringError::new(offset))?
392            } else {
393                // Line continuation
394                offset += 2;
395                continue;
396            };
397            result.push(value);
398            offset += cap.end();
399        }
400        result.push_str(s);
401
402        Ok(result)
403    }
404
405    fn ascii_escape(c: u8) -> u8 {
406        match c {
407            b'n' => b'\n',
408            b'r' => b'\r',
409            b't' => b'\t',
410            b'a' => b'\x07',
411            b'b' => b'\x08',
412            b'f' => b'\x0c',
413            b'v' => b'\x0b',
414            c => c,
415        }
416    }
417
418    fn parse_int(s: &str) -> Option<BigUint> {
419        let mut b = s.as_bytes();
420        let radix = match b.get(1) {
421            Some(b'b') | Some(b'B') => 2,
422            Some(b'o') | Some(b'O') => 8,
423            Some(b'x') | Some(b'X') => 16,
424            _ => 10,
425        };
426        if radix != 10 {
427            b = &b[2..];
428        }
429        if b[0] == b'_' {
430            // BigUint::parse_bytes rejects leading underscore
431            b = &b[1..];
432        }
433        BigUint::parse_bytes(b, radix)
434    }
435
436    fn parse_float(s: &str) -> Option<f64> {
437        if !s.contains('_') {
438            s.parse().ok()
439        } else {
440            s.replace('_', "").parse().ok()
441        }
442    }
443}
444
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token::*;

    // Lex the whole input, keeping only the token (or error kind) per step.
    fn lex(i: &str) -> Vec<Result<Token, ErrorKind>> {
        Lexer::new(i)
            .map(|x| x.map(|x| x.1).map_err(|e| e.kind))
            .collect()
    }

    // Assert that the input lexes to exactly one token followed by EOF.
    macro_rules! assert_lex {
        ($l:expr, $r:expr) => {{
            assert_eq!(lex($l), vec![Ok($r), Ok(Eof)])
        }};
    }

    #[test]
    fn test_lex_indent() {
        // Basic INDENT/DEDENT pairing around nested lines.
        assert_eq!(
            lex("a\n b\nc"),
            vec![
                Ok(Name("a".into())),
                Ok(Newline),
                Ok(Indent),
                Ok(Name("b".into())),
                Ok(Newline),
                Ok(Dedent),
                Ok(Name("c".into())),
                Ok(Eof)
            ]
        );
        // Dedent to a column that matches no enclosing level is an error.
        assert_eq!(
            lex("  a\n b"),
            vec![
                Ok(Indent),
                Ok(Name("a".into())),
                Ok(Newline),
                Err(ErrorKind::UnmatchedDedent),
            ]
        );
        // Same expanded column but different raw column: mixed tabs/spaces,
        // after which the lexer recovers and keeps going.
        assert_eq!(
            lex("  \ta\n\tb"),
            vec![
                Ok(Indent),
                Ok(Name("a".into())),
                Ok(Newline),
                Err(ErrorKind::MixedTabsAndSpaces),
                Ok(Name("b".into())),
                Ok(Eof)
            ]
        );
    }

    #[test]
    fn test_lex_newline() {
        // \n, \r\n and bare \r all end a logical line; blank lines emit
        // no NEWLINE token.
        assert_eq!(
            lex("a\n\r\nb\r"),
            vec![
                Ok(Name("a".into())),
                Ok(Newline),
                Ok(Name("b".into())),
                Ok(Newline),
                Ok(Eof),
            ]
        );
    }

    #[test]
    fn test_lex_integer() {
        assert_lex!("1234567890", Integer(1234567890u64.into()));
        assert_lex!("0b10", Integer(0b10u64.into()));
        assert_lex!("0o12345670", Integer(0o12345670u64.into()));
        assert_lex!("0x123456789abcdef0", Integer(0x123456789abcdef0u64.into()));
        assert_lex!("1_2_3_4_5", Integer(12345u64.into()));
        assert_lex!("0x_1_2_3", Integer(0x1_2_3u64.into()));
        assert_lex!("000", Integer(0u64.into()));
    }

    #[test]
    fn test_lex_float() {
        assert_lex!("3.14", Float(3.14));
        assert_lex!("10.", Float(10.));
        assert_lex!(".001", Float(0.001));
        assert_lex!("1e100", Float(1e100));
        assert_lex!("3.14e-10", Float(3.14e-10));
        assert_lex!("0e0", Float(0e0));
        assert_lex!("3.14_15_93", Float(3.14_15_93));
        assert_lex!("0.1e+10", Float(0.1e+10));
        assert_lex!("1.e1", Float(1.0e1));
    }

    #[test]
    fn lex_string() {
        assert_lex!("'foo'", String("foo".into()));
        assert_lex!("'foo\\tbar'", String("foo\tbar".into()));
        assert_lex!("'\\x'", String("\\x".into()));
        assert_lex!("'\\N{SNOWMAN}'", String("☃".into()));
        // \N{...} is not decoded in bytes literals.
        assert_lex!("b'\\N{SNOWMAN}'", Bytes(b"\\N{SNOWMAN}".to_vec()));
        assert_lex!("'\\\\\\'\"'", String("\\'\"".into()));
        // Backslash-newline inside a string is a line continuation.
        assert_lex!("'a \\\nb'", String("a b".into()));
        assert_lex!("'''a \nb\\\nc'''", String("a \nbc".into()));
        // Raw strings keep escapes verbatim.
        assert_lex!("r'\\t\\\n\\''", String("\\t\\\n\\'".into()))
    }
}