rgo/lexer/
mod.rs

1//! # Lexer
2//!
3//! A `Lexer` parses a source string into a list of tokens, which may later be used to construct an
4//! Abstract Syntax Tree.
5//!
6//! ## Notes
7//!
8//! - We want meaningful errors from the start. That means printing the line and column number on
9//! error, returning `Result`s instead of panicking (later on, we may use unwinding to speed up
10//! lexical analysis in non-erroneous cases).
11//!
12//! - It is unclear whether we should operator on Unicode `char`, or plain bytes `u8`. `char`s are
13//! more convenient to display and offer a clean API; bytes are (most likely) faster to work with.
14//!
15//! - I'm not sure what the best way to store tokens is. A slice into the original source, an
16//! interned string...? Probably an interned string, this is what rustc uses and it speeds up
17//! comparisons, which are going to be very frequent. Probably reduces allocations, too - and we're
18//! allocating a _lot_. We'd have to benchmark to be sure.
19
20use std::iter::Iterator;
21pub use token::*;
22
23#[cfg(test)]
24mod test;
25
26pub struct Lexer<'src> {
27    /// Byte offset from the start of the source string.
28    offset: usize,
29    /// The source string.
30    src: &'src str,
31    /// The last character to be read.
32    current_char: Option<char>,
33    /// The kind of token we read last. Used for automatic semicolon insertion.
34    last_token_kind: Option<TokenKind>,
35}
36
37impl<'src> Lexer<'src> {
38    /// Create a new Lexer from the given source string.
39    pub fn new(s: &str) -> Lexer {
40        // Initialize the lexer with the first character of the source string.
41        let first_char = s.chars().next();
42
43        Lexer {
44            src: s,
45            offset: 0,
46            current_char: first_char,
47            last_token_kind: None,
48        }
49    }
50
51    /// 'eat' one character.
52    /// This is a _very_ hot function.
53    fn bump(&mut self) {
54        self.offset += self.current_char.unwrap().len_utf8();
55
56        if self.offset < self.src.len() {
57            let ch = char_at(&self.src, self.offset);
58            self.current_char = Some(ch);
59        } else {
60            self.current_char = None;
61        }
62    }
63
64    /// Return the next character **without** bumping.
65    /// Useful for lookahead.
66    fn next_char(&self) -> Option<char> {
67        let next_offset = self.offset + 1;
68        if next_offset < self.src.len() {
69            let ch = char_at(&self.src, next_offset);
70            Some(ch)
71        } else {
72            None
73        }
74    }
75
76    /// Scan a number literal (integer or float).
77    // FIXME: ONLY supports integers for now.
78    fn scan_number(&mut self) -> Token {
79        // Integer literal grammar:
80        //
81        // int_lit     = decimal_lit | octal_lit | hex_lit .
82        // decimal_lit = ( "1" … "9" ) { decimal_digit } .
83        // octal_lit   = "0" { octal_digit } .
84        // hex_lit     = "0" ( "x" | "X" ) hex_digit { hex_digit } .
85
86        let start = self.offset;
87
88        // If we have a hexadecimal, treat it specially.
89        if self.current_char == Some('0') &&
90           (self.next_char() == Some('x') || self.next_char() == Some('x')) {
91            self.bump();
92            self.bump();
93
94            while let Some(c) = self.current_char {
95                if c.is_digit(16) {
96                    self.bump();
97                } else {
98                    break;
99                }
100            }
101
102            return Token {
103                value: Some(self.src[start..self.offset].into()),
104                kind: TokenKind::Hex,
105            };
106        }
107
108        let has_leading_zero = self.current_char == Some('0');
109        let mut had_e = false;
110        let mut had_dot = false;
111
112        'outer: while let Some(c) = self.current_char {
113            if c.is_digit(10) {
114                self.bump();
115            } else if !had_e && (c == 'e' || c == 'E') {
116                self.bump();
117                had_e = true;
118
119                if self.current_char == Some('+') || self.current_char == Some('-') {
120                    self.bump();
121                }
122            } else if !had_e && !had_dot && c == '.' {
123                self.bump();
124                had_dot = true;
125            } else if c == 'i' {
126                self.bump();
127
128                return Token {
129                    value: Some(self.src[start..self.offset].into()),
130                    kind: TokenKind::Imaginary,
131                };
132            } else {
133                break;
134            }
135        }
136
137        let s = &self.src[start..self.offset];
138
139        let kind = if had_e || had_dot {
140            TokenKind::Float
141        } else if has_leading_zero {
142            TokenKind::Octal
143        } else {
144            TokenKind::Decimal
145        };
146
147        Token {
148            value: Some(s.into()),
149            kind: kind,
150        }
151    }
152
153    /// Skip whitespace and comments, returning whether at least one newline was encountered.
154    fn skip_whitespace_and_comments(&mut self) -> bool {
155        let mut contains_newline = false;
156
157        while let Some(c) = self.current_char {
158            if c == '\n' {
159                contains_newline = true;
160            }
161
162            // Are we at the start of a general comment (`/* ... */`)?
163            if c == '/' && self.next_char() == Some('*') {
164                // Skip the '/*'.
165                self.bump();
166                self.bump();
167
168                // Skip the comment body.
169                while let Some(c) = self.current_char {
170                    if c == '*' && self.next_char() == Some('/') {
171                        break;
172                    } else {
173                        self.bump();
174                    }
175                }
176
177                // Skip the '*/'.
178                self.bump();
179                self.bump();
180
181                // Resume whitespace skipping.
182                continue;
183
184            } else if c == '/' && self.next_char() == Some('/') {
185                while let Some(c) = self.current_char {
186                    if c == '\n' {
187                        break;
188                    } else {
189                        self.bump();
190                    }
191                }
192
193                // Resume whitespace skipping.
194                // Since we have not bumped past the newline character,
195                // the next iteration of the loop will catch it.
196                continue;
197            }
198
199            if c.is_whitespace() {
200                self.bump();
201            } else {
202                break;
203            }
204        }
205
206
207        contains_newline
208    }
209
210    fn scan_ident(&mut self) -> &str {
211        let start = self.offset;
212
213        while let Some(c) = self.current_char {
214            if can_continue_identifier(c) {
215                self.bump();
216            } else {
217                break;
218            }
219        }
220
221        &self.src[start..self.offset]
222    }
223
224    fn scan_ident_or_keyword(&mut self) -> Token {
225        let ident = self.scan_ident();
226        let mut value = None;
227
228        use token::TokenKind::*;
229
230        let kind = match &*ident {
231            "break" => Break,
232            "case" => Case,
233            "chan" => Chan,
234            "const" => Const,
235            "continue" => Continue,
236            "default" => Default,
237            "defer" => Defer,
238            "else" => Else,
239            "fallthrough" => Fallthrough,
240            "for" => For,
241            "func" => Func,
242            "go" => Go,
243            "goto" => Goto,
244            "if" => If,
245            "import" => Import,
246            "interface" => Interface,
247            "map" => Map,
248            "package" => Package,
249            "range" => Range,
250            "return" => Return,
251            "select" => Select,
252            "struct" => Struct,
253            "switch" => Switch,
254            "type" => Type,
255            "var" => Var,
256            // XXX(perf): unnecessary alloc.
257            _ => {
258                value = Some(ident.into());
259                TokenKind::Ident
260            }
261        };
262
263        Token {
264            kind: kind,
265            value: value,
266        }
267    }
268
269    /// Return the next token, if any.
270    fn next_token_inner(&mut self) -> Option<Token> {
271        // Whitespace and comment handling.
272        let contains_newline = self.skip_whitespace_and_comments();
273
274        // Automatic semicolon insertion in the simplest case (newline + token that may terminate a
275        // statement).
276        //
277        // The Go Spec also says that a semicolon may be omitted before a closing ")" or "}".
278        // This case is _not_ handled by the lexer, but by the parser, as it requires too much
279        // context.
280        if contains_newline && may_terminate_statement(self.last_token_kind) {
281            return Some(Token {
282                kind: TokenKind::Semicolon,
283                value: None,
284            });
285        }
286
287        // Check for EOF after whitespace handling.
288        let c = match self.current_char {
289            Some(c) => c,
290            None => return None,
291        };
292
293        let kind = match c {
294            // Single-character tokens.
295            '(' => {
296                self.bump();
297                TokenKind::LParen
298            }
299            ')' => {
300                self.bump();
301                TokenKind::RParen
302            }
303            '{' => {
304                self.bump();
305                TokenKind::LBrace
306            }
307            '}' => {
308                self.bump();
309                TokenKind::RBrace
310            }
311            '[' => {
312                self.bump();
313                TokenKind::LBracket
314            }
315            ']' => {
316                self.bump();
317                TokenKind::RBracket
318            }
319            ',' => {
320                self.bump();
321                TokenKind::Comma
322            }
323            ';' => {
324                self.bump();
325                TokenKind::Semicolon
326            }
327            // More complex tokens.
328            '.' => {
329                if self.next_char().map(|x| x.is_digit(10)) == Some(true) {
330                    return Some(self.scan_number());
331                }
332
333                self.bump();
334
335                // Look for an ellipsis ('...').
336                if self.current_char == Some('.') && self.next_char() == Some('.') {
337                    self.bump();
338                    self.bump();
339                    TokenKind::Ellipsis
340                } else {
341                    TokenKind::Dot
342                }
343            }
344            ':' => {
345                self.bump();
346
347                if self.current_char == Some('=') {
348                    self.bump();
349                    TokenKind::ColonAssign
350                } else {
351                    TokenKind::Colon
352                }
353            }
354            '=' => {
355                self.bump();
356
357                if self.current_char == Some('=') {
358                    self.bump();
359                    TokenKind::Equals
360                } else {
361                    TokenKind::Assign
362                }
363            }
364            '+' => {
365                self.bump();
366
367                match self.current_char {
368                    Some('+') => {
369                        self.bump();
370                        TokenKind::Increment
371                    }
372                    Some('=') => {
373                        self.bump();
374                        TokenKind::PlusAssign
375                    }
376                    _ => TokenKind::Plus,
377                }
378            }
379            '-' => {
380                self.bump();
381
382                match self.current_char {
383                    Some('-') => {
384                        self.bump();
385                        TokenKind::Decrement
386                    }
387                    Some('=') => {
388                        self.bump();
389                        TokenKind::MinusAssign
390                    }
391                    _ => TokenKind::Minus,
392                }
393            }
394            '*' => {
395                self.bump();
396
397                match self.current_char {
398                    Some('=') => {
399                        self.bump();
400                        TokenKind::StarAssign
401                    }
402                    _ => TokenKind::Star,
403                }
404            }
405            '/' => {
406                self.bump();
407
408                match self.current_char {
409                    Some('=') => {
410                        self.bump();
411                        TokenKind::SlashAssign
412                    }
413                    _ => TokenKind::Slash,
414                }
415            }
416            '<' => {
417                self.bump();
418
419                match self.current_char {
420                    Some('<') => {
421                        self.bump();
422                        match self.current_char {
423                            Some('=') => {
424                                self.bump();
425                                TokenKind::LshiftAssign
426                            }
427                            _ => TokenKind::Lshift,
428                        }
429                    }
430                    Some('=') => {
431                        self.bump();
432                        TokenKind::LessThanOrEqual
433                    }
434                    Some('-') => {
435                        self.bump();
436                        TokenKind::Arrow
437                    }
438                    _ => TokenKind::LessThan,
439                }
440            }
441            '>' => {
442                self.bump();
443
444                match self.current_char {
445                    Some('>') => {
446                        self.bump();
447                        match self.current_char {
448                            Some('=') => {
449                                self.bump();
450                                TokenKind::RshiftAssign
451                            }
452                            _ => TokenKind::Rshift,
453                        }
454                    }
455                    Some('=') => {
456                        self.bump();
457                        TokenKind::GreaterThanOrEqual
458                    }
459                    _ => TokenKind::GreaterThan,
460                }
461            }
462            '|' => {
463                self.bump();
464
465                match self.current_char {
466                    Some('|') => {
467                        self.bump();
468                        TokenKind::OrOr
469                    }
470                    Some('=') => {
471                        self.bump();
472                        TokenKind::OrAssign
473                    }
474                    _ => TokenKind::Or,
475                }
476            }
477            '&' => {
478                self.bump();
479
480                match self.current_char {
481                    Some('&') => {
482                        self.bump();
483                        TokenKind::AndAnd
484                    }
485                    Some('=') => {
486                        self.bump();
487                        TokenKind::AndAssign
488                    }
489                    Some('^') => {
490                        self.bump();
491                        match self.current_char {
492                            Some('=') => {
493                                self.bump();
494                                TokenKind::BitClearAssign
495                            }
496                            _ => TokenKind::BitClear,
497                        }
498                    }
499                    _ => TokenKind::And,
500                }
501            }
502            '!' => {
503                self.bump();
504
505                match self.current_char {
506                    Some('=') => {
507                        self.bump();
508                        TokenKind::NotEqual
509                    }
510                    _ => TokenKind::Not,
511                }
512            }
513            '^' => {
514                self.bump();
515
516                match self.current_char {
517                    Some('=') => {
518                        self.bump();
519                        TokenKind::CaretAssign
520                    }
521                    _ => TokenKind::Caret,
522                }
523            }
524            '%' => {
525                self.bump();
526
527                match self.current_char {
528                    Some('=') => {
529                        self.bump();
530                        TokenKind::PercentAssign
531                    }
532                    _ => TokenKind::Percent,
533                }
534            }
535            // Scan integer.
536            c if c.is_digit(10) => return Some(self.scan_number()),
537            c if can_start_identifier(c) => return Some(self.scan_ident_or_keyword()),
538            // Start of _interpreted_ string literal.
539            '"' => return Some(self.scan_interpreted_str_lit()),
540            '`' => return Some(self.scan_raw_str_lit()),
541            '\'' => return Some(self.scan_rune_lit()),
542            c => panic!("unexpected start of token: '{}'", c),
543        };
544
545        Some(Token {
546            kind: kind,
547            value: None,
548        })
549    }
550
551    // XXX: add some validity checking.
552    fn scan_rune_lit(&mut self) -> Token {
553        self.bump();
554
555        let start = self.offset;
556
557        while let Some(c) = self.current_char {
558            // If we encounter a backslash escape, we just skip past the '\' and the
559            // following character.
560            if c == '\\' {
561                self.bump();
562                self.bump();
563            } else if c == '\'' {
564                break;
565            } else {
566                self.bump();
567            }
568        }
569
570        let s = &self.src[start..self.offset];
571
572        // Skip the quote _after_ slicing so that it isn't included
573        // in the slice.
574        self.bump();
575
576        Token {
577            value: Some(s.into()),
578            kind: TokenKind::Rune,
579        }
580    }
581
582    fn scan_interpreted_str_lit(&mut self) -> Token {
583        self.bump();
584        let start = self.offset;
585
586        while let Some(c) = self.current_char {
587            // If we encounter a backslash escape, we just skip past the '\' and the
588            // following character.
589            if c == '\\' {
590                self.bump();
591                self.bump();
592            } else if c == '"' {
593                break;
594            } else {
595                self.bump();
596            }
597        }
598
599        let s = &self.src[start..self.offset];
600
601        // Skip the quote _after_ slicing so that it isn't included
602        // in the slice.
603        self.bump();
604        // XXX(perf): alloc.
605
606        Token {
607            value: Some(s.into()),
608            kind: TokenKind::Str,
609        }
610    }
611
612    // XXX: review and test.
613    fn scan_raw_str_lit(&mut self) -> Token {
614        // Bump past the opening backtrick.
615        self.bump();
616        let start = self.offset;
617
618        while let Some(c) = self.current_char {
619            // Raw strings are pretty simple, because we don't have to handle escapes.
620            if c == '`' {
621                break;
622            } else {
623                self.bump();
624            }
625        }
626
627        let s = &self.src[start..self.offset];
628
629        // Skip the backtick _after_ slicing so that it isn't included
630        // in the slice.
631        self.bump();
632        // XXX(perf): alloc.
633
634        Token {
635            value: Some(s.into()),
636            kind: TokenKind::StrRaw,
637        }
638    }
639}
640
641impl<'src> Iterator for Lexer<'src> {
642    type Item = TokenAndSpan;
643
644    fn next(&mut self) -> Option<TokenAndSpan> {
645        let start = self.offset as u32;
646        let t = self.next_token_inner();
647        self.last_token_kind = t.as_ref().map(|t| t.kind);
648
649        t.map(|t| {
650            TokenAndSpan {
651                token: t,
652                span: Span {
653                    start: start,
654                    end: self.offset as u32,
655                },
656            }
657        })
658    }
659}
660
661/// Convenience function to collect all the tokens from a string.
662pub fn tokenize(s: &str) -> Vec<TokenAndSpan> {
663    let lexer = Lexer::new(s);
664    lexer.collect()
665}
666
667
668// =====
669// Utility functions.
670//
671// XXX(perf): expensive checks on Unicode chars (is_alphabetic(), is_numeric()) in these functions.
672// =====
673
674fn can_start_identifier(c: char) -> bool {
675    c.is_alphabetic() || c == '_'
676}
677
678fn can_continue_identifier(c: char) -> bool {
679    c.is_alphabetic() || c.is_numeric() || c == '_'
680}
681
682fn char_at(s: &str, byte: usize) -> char {
683    s[byte..].chars().next().unwrap()
684}
685
686// For automatic semicolon insertion.
687fn may_terminate_statement(t: Option<TokenKind>) -> bool {
688    // A non-existent token may not terminate a line.
689    let t = match t {
690        Some(t) => t,
691        None => return false,
692    };
693
694    // From the Go spec:
695    //
696    // When the input is broken into tokens, a semicolon is automatically inserted into the
697    // token stream immediately after a line's final token if that token is:
698    // - an identifier
699    // - an integer, floating-point, imaginary, rune, or string literal
700    // - one of the keywords break, continue, fallthrough, or return
701    // - one of the operators and delimiters ++, --, ), ], or }
702    use token::TokenKind::*;
703    match t {
704        Ident => true,
705        Increment | Decrement | Break | Continue | Fallthrough | Return | RParen | RBracket |
706        RBrace => true,
707        t if t.is_literal() => true,
708        _ => false,
709    }
710}