rgo/lexer/mod.rs
1//! # Lexer
2//!
3//! A `Lexer` parses a source string into a list of tokens, which may later be used to construct an
4//! Abstract Syntax Tree.
5//!
6//! ## Notes
7//!
8//! - We want meaningful errors from the start. That means printing the line and column number on
9//! error, returning `Result`s instead of panicking (later on, we may use unwinding to speed up
10//! lexical analysis in non-erroneous cases).
11//!
//! - It is unclear whether we should operate on Unicode `char`, or plain bytes `u8`. `char`s are
13//! more convenient to display and offer a clean API; bytes are (most likely) faster to work with.
14//!
15//! - I'm not sure what the best way to store tokens is. A slice into the original source, an
16//! interned string...? Probably an interned string, this is what rustc uses and it speeds up
17//! comparisons, which are going to be very frequent. Probably reduces allocations, too - and we're
18//! allocating a _lot_. We'd have to benchmark to be sure.
19
20use std::iter::Iterator;
21pub use token::*;
22
23#[cfg(test)]
24mod test;
25
/// A lexer for Go source code.
///
/// A `Lexer` is an `Iterator` yielding `TokenAndSpan`s until the source
/// string is exhausted.
pub struct Lexer<'src> {
    /// Byte offset from the start of the source string.
    offset: usize,
    /// The source string.
    src: &'src str,
    /// The last character to be read; `None` once the whole source has been consumed.
    current_char: Option<char>,
    /// The kind of token we read last. Used for automatic semicolon insertion.
    last_token_kind: Option<TokenKind>,
}
36
37impl<'src> Lexer<'src> {
38 /// Create a new Lexer from the given source string.
39 pub fn new(s: &str) -> Lexer {
40 // Initialize the lexer with the first character of the source string.
41 let first_char = s.chars().next();
42
43 Lexer {
44 src: s,
45 offset: 0,
46 current_char: first_char,
47 last_token_kind: None,
48 }
49 }
50
51 /// 'eat' one character.
52 /// This is a _very_ hot function.
53 fn bump(&mut self) {
54 self.offset += self.current_char.unwrap().len_utf8();
55
56 if self.offset < self.src.len() {
57 let ch = char_at(&self.src, self.offset);
58 self.current_char = Some(ch);
59 } else {
60 self.current_char = None;
61 }
62 }
63
64 /// Return the next character **without** bumping.
65 /// Useful for lookahead.
66 fn next_char(&self) -> Option<char> {
67 let next_offset = self.offset + 1;
68 if next_offset < self.src.len() {
69 let ch = char_at(&self.src, next_offset);
70 Some(ch)
71 } else {
72 None
73 }
74 }
75
76 /// Scan a number literal (integer or float).
77 // FIXME: ONLY supports integers for now.
78 fn scan_number(&mut self) -> Token {
79 // Integer literal grammar:
80 //
81 // int_lit = decimal_lit | octal_lit | hex_lit .
82 // decimal_lit = ( "1" … "9" ) { decimal_digit } .
83 // octal_lit = "0" { octal_digit } .
84 // hex_lit = "0" ( "x" | "X" ) hex_digit { hex_digit } .
85
86 let start = self.offset;
87
88 // If we have a hexadecimal, treat it specially.
89 if self.current_char == Some('0') &&
90 (self.next_char() == Some('x') || self.next_char() == Some('x')) {
91 self.bump();
92 self.bump();
93
94 while let Some(c) = self.current_char {
95 if c.is_digit(16) {
96 self.bump();
97 } else {
98 break;
99 }
100 }
101
102 return Token {
103 value: Some(self.src[start..self.offset].into()),
104 kind: TokenKind::Hex,
105 };
106 }
107
108 let has_leading_zero = self.current_char == Some('0');
109 let mut had_e = false;
110 let mut had_dot = false;
111
112 'outer: while let Some(c) = self.current_char {
113 if c.is_digit(10) {
114 self.bump();
115 } else if !had_e && (c == 'e' || c == 'E') {
116 self.bump();
117 had_e = true;
118
119 if self.current_char == Some('+') || self.current_char == Some('-') {
120 self.bump();
121 }
122 } else if !had_e && !had_dot && c == '.' {
123 self.bump();
124 had_dot = true;
125 } else if c == 'i' {
126 self.bump();
127
128 return Token {
129 value: Some(self.src[start..self.offset].into()),
130 kind: TokenKind::Imaginary,
131 };
132 } else {
133 break;
134 }
135 }
136
137 let s = &self.src[start..self.offset];
138
139 let kind = if had_e || had_dot {
140 TokenKind::Float
141 } else if has_leading_zero {
142 TokenKind::Octal
143 } else {
144 TokenKind::Decimal
145 };
146
147 Token {
148 value: Some(s.into()),
149 kind: kind,
150 }
151 }
152
153 /// Skip whitespace and comments, returning whether at least one newline was encountered.
154 fn skip_whitespace_and_comments(&mut self) -> bool {
155 let mut contains_newline = false;
156
157 while let Some(c) = self.current_char {
158 if c == '\n' {
159 contains_newline = true;
160 }
161
162 // Are we at the start of a general comment (`/* ... */`)?
163 if c == '/' && self.next_char() == Some('*') {
164 // Skip the '/*'.
165 self.bump();
166 self.bump();
167
168 // Skip the comment body.
169 while let Some(c) = self.current_char {
170 if c == '*' && self.next_char() == Some('/') {
171 break;
172 } else {
173 self.bump();
174 }
175 }
176
177 // Skip the '*/'.
178 self.bump();
179 self.bump();
180
181 // Resume whitespace skipping.
182 continue;
183
184 } else if c == '/' && self.next_char() == Some('/') {
185 while let Some(c) = self.current_char {
186 if c == '\n' {
187 break;
188 } else {
189 self.bump();
190 }
191 }
192
193 // Resume whitespace skipping.
194 // Since we have not bumped past the newline character,
195 // the next iteration of the loop will catch it.
196 continue;
197 }
198
199 if c.is_whitespace() {
200 self.bump();
201 } else {
202 break;
203 }
204 }
205
206
207 contains_newline
208 }
209
210 fn scan_ident(&mut self) -> &str {
211 let start = self.offset;
212
213 while let Some(c) = self.current_char {
214 if can_continue_identifier(c) {
215 self.bump();
216 } else {
217 break;
218 }
219 }
220
221 &self.src[start..self.offset]
222 }
223
224 fn scan_ident_or_keyword(&mut self) -> Token {
225 let ident = self.scan_ident();
226 let mut value = None;
227
228 use token::TokenKind::*;
229
230 let kind = match &*ident {
231 "break" => Break,
232 "case" => Case,
233 "chan" => Chan,
234 "const" => Const,
235 "continue" => Continue,
236 "default" => Default,
237 "defer" => Defer,
238 "else" => Else,
239 "fallthrough" => Fallthrough,
240 "for" => For,
241 "func" => Func,
242 "go" => Go,
243 "goto" => Goto,
244 "if" => If,
245 "import" => Import,
246 "interface" => Interface,
247 "map" => Map,
248 "package" => Package,
249 "range" => Range,
250 "return" => Return,
251 "select" => Select,
252 "struct" => Struct,
253 "switch" => Switch,
254 "type" => Type,
255 "var" => Var,
256 // XXX(perf): unnecessary alloc.
257 _ => {
258 value = Some(ident.into());
259 TokenKind::Ident
260 }
261 };
262
263 Token {
264 kind: kind,
265 value: value,
266 }
267 }
268
269 /// Return the next token, if any.
270 fn next_token_inner(&mut self) -> Option<Token> {
271 // Whitespace and comment handling.
272 let contains_newline = self.skip_whitespace_and_comments();
273
274 // Automatic semicolon insertion in the simplest case (newline + token that may terminate a
275 // statement).
276 //
277 // The Go Spec also says that a semicolon may be omitted before a closing ")" or "}".
278 // This case is _not_ handled by the lexer, but by the parser, as it requires too much
279 // context.
280 if contains_newline && may_terminate_statement(self.last_token_kind) {
281 return Some(Token {
282 kind: TokenKind::Semicolon,
283 value: None,
284 });
285 }
286
287 // Check for EOF after whitespace handling.
288 let c = match self.current_char {
289 Some(c) => c,
290 None => return None,
291 };
292
293 let kind = match c {
294 // Single-character tokens.
295 '(' => {
296 self.bump();
297 TokenKind::LParen
298 }
299 ')' => {
300 self.bump();
301 TokenKind::RParen
302 }
303 '{' => {
304 self.bump();
305 TokenKind::LBrace
306 }
307 '}' => {
308 self.bump();
309 TokenKind::RBrace
310 }
311 '[' => {
312 self.bump();
313 TokenKind::LBracket
314 }
315 ']' => {
316 self.bump();
317 TokenKind::RBracket
318 }
319 ',' => {
320 self.bump();
321 TokenKind::Comma
322 }
323 ';' => {
324 self.bump();
325 TokenKind::Semicolon
326 }
327 // More complex tokens.
328 '.' => {
329 if self.next_char().map(|x| x.is_digit(10)) == Some(true) {
330 return Some(self.scan_number());
331 }
332
333 self.bump();
334
335 // Look for an ellipsis ('...').
336 if self.current_char == Some('.') && self.next_char() == Some('.') {
337 self.bump();
338 self.bump();
339 TokenKind::Ellipsis
340 } else {
341 TokenKind::Dot
342 }
343 }
344 ':' => {
345 self.bump();
346
347 if self.current_char == Some('=') {
348 self.bump();
349 TokenKind::ColonAssign
350 } else {
351 TokenKind::Colon
352 }
353 }
354 '=' => {
355 self.bump();
356
357 if self.current_char == Some('=') {
358 self.bump();
359 TokenKind::Equals
360 } else {
361 TokenKind::Assign
362 }
363 }
364 '+' => {
365 self.bump();
366
367 match self.current_char {
368 Some('+') => {
369 self.bump();
370 TokenKind::Increment
371 }
372 Some('=') => {
373 self.bump();
374 TokenKind::PlusAssign
375 }
376 _ => TokenKind::Plus,
377 }
378 }
379 '-' => {
380 self.bump();
381
382 match self.current_char {
383 Some('-') => {
384 self.bump();
385 TokenKind::Decrement
386 }
387 Some('=') => {
388 self.bump();
389 TokenKind::MinusAssign
390 }
391 _ => TokenKind::Minus,
392 }
393 }
394 '*' => {
395 self.bump();
396
397 match self.current_char {
398 Some('=') => {
399 self.bump();
400 TokenKind::StarAssign
401 }
402 _ => TokenKind::Star,
403 }
404 }
405 '/' => {
406 self.bump();
407
408 match self.current_char {
409 Some('=') => {
410 self.bump();
411 TokenKind::SlashAssign
412 }
413 _ => TokenKind::Slash,
414 }
415 }
416 '<' => {
417 self.bump();
418
419 match self.current_char {
420 Some('<') => {
421 self.bump();
422 match self.current_char {
423 Some('=') => {
424 self.bump();
425 TokenKind::LshiftAssign
426 }
427 _ => TokenKind::Lshift,
428 }
429 }
430 Some('=') => {
431 self.bump();
432 TokenKind::LessThanOrEqual
433 }
434 Some('-') => {
435 self.bump();
436 TokenKind::Arrow
437 }
438 _ => TokenKind::LessThan,
439 }
440 }
441 '>' => {
442 self.bump();
443
444 match self.current_char {
445 Some('>') => {
446 self.bump();
447 match self.current_char {
448 Some('=') => {
449 self.bump();
450 TokenKind::RshiftAssign
451 }
452 _ => TokenKind::Rshift,
453 }
454 }
455 Some('=') => {
456 self.bump();
457 TokenKind::GreaterThanOrEqual
458 }
459 _ => TokenKind::GreaterThan,
460 }
461 }
462 '|' => {
463 self.bump();
464
465 match self.current_char {
466 Some('|') => {
467 self.bump();
468 TokenKind::OrOr
469 }
470 Some('=') => {
471 self.bump();
472 TokenKind::OrAssign
473 }
474 _ => TokenKind::Or,
475 }
476 }
477 '&' => {
478 self.bump();
479
480 match self.current_char {
481 Some('&') => {
482 self.bump();
483 TokenKind::AndAnd
484 }
485 Some('=') => {
486 self.bump();
487 TokenKind::AndAssign
488 }
489 Some('^') => {
490 self.bump();
491 match self.current_char {
492 Some('=') => {
493 self.bump();
494 TokenKind::BitClearAssign
495 }
496 _ => TokenKind::BitClear,
497 }
498 }
499 _ => TokenKind::And,
500 }
501 }
502 '!' => {
503 self.bump();
504
505 match self.current_char {
506 Some('=') => {
507 self.bump();
508 TokenKind::NotEqual
509 }
510 _ => TokenKind::Not,
511 }
512 }
513 '^' => {
514 self.bump();
515
516 match self.current_char {
517 Some('=') => {
518 self.bump();
519 TokenKind::CaretAssign
520 }
521 _ => TokenKind::Caret,
522 }
523 }
524 '%' => {
525 self.bump();
526
527 match self.current_char {
528 Some('=') => {
529 self.bump();
530 TokenKind::PercentAssign
531 }
532 _ => TokenKind::Percent,
533 }
534 }
535 // Scan integer.
536 c if c.is_digit(10) => return Some(self.scan_number()),
537 c if can_start_identifier(c) => return Some(self.scan_ident_or_keyword()),
538 // Start of _interpreted_ string literal.
539 '"' => return Some(self.scan_interpreted_str_lit()),
540 '`' => return Some(self.scan_raw_str_lit()),
541 '\'' => return Some(self.scan_rune_lit()),
542 c => panic!("unexpected start of token: '{}'", c),
543 };
544
545 Some(Token {
546 kind: kind,
547 value: None,
548 })
549 }
550
551 // XXX: add some validity checking.
552 fn scan_rune_lit(&mut self) -> Token {
553 self.bump();
554
555 let start = self.offset;
556
557 while let Some(c) = self.current_char {
558 // If we encounter a backslash escape, we just skip past the '\' and the
559 // following character.
560 if c == '\\' {
561 self.bump();
562 self.bump();
563 } else if c == '\'' {
564 break;
565 } else {
566 self.bump();
567 }
568 }
569
570 let s = &self.src[start..self.offset];
571
572 // Skip the quote _after_ slicing so that it isn't included
573 // in the slice.
574 self.bump();
575
576 Token {
577 value: Some(s.into()),
578 kind: TokenKind::Rune,
579 }
580 }
581
582 fn scan_interpreted_str_lit(&mut self) -> Token {
583 self.bump();
584 let start = self.offset;
585
586 while let Some(c) = self.current_char {
587 // If we encounter a backslash escape, we just skip past the '\' and the
588 // following character.
589 if c == '\\' {
590 self.bump();
591 self.bump();
592 } else if c == '"' {
593 break;
594 } else {
595 self.bump();
596 }
597 }
598
599 let s = &self.src[start..self.offset];
600
601 // Skip the quote _after_ slicing so that it isn't included
602 // in the slice.
603 self.bump();
604 // XXX(perf): alloc.
605
606 Token {
607 value: Some(s.into()),
608 kind: TokenKind::Str,
609 }
610 }
611
612 // XXX: review and test.
613 fn scan_raw_str_lit(&mut self) -> Token {
614 // Bump past the opening backtrick.
615 self.bump();
616 let start = self.offset;
617
618 while let Some(c) = self.current_char {
619 // Raw strings are pretty simple, because we don't have to handle escapes.
620 if c == '`' {
621 break;
622 } else {
623 self.bump();
624 }
625 }
626
627 let s = &self.src[start..self.offset];
628
629 // Skip the backtick _after_ slicing so that it isn't included
630 // in the slice.
631 self.bump();
632 // XXX(perf): alloc.
633
634 Token {
635 value: Some(s.into()),
636 kind: TokenKind::StrRaw,
637 }
638 }
639}
640
641impl<'src> Iterator for Lexer<'src> {
642 type Item = TokenAndSpan;
643
644 fn next(&mut self) -> Option<TokenAndSpan> {
645 let start = self.offset as u32;
646 let t = self.next_token_inner();
647 self.last_token_kind = t.as_ref().map(|t| t.kind);
648
649 t.map(|t| {
650 TokenAndSpan {
651 token: t,
652 span: Span {
653 start: start,
654 end: self.offset as u32,
655 },
656 }
657 })
658 }
659}
660
661/// Convenience function to collect all the tokens from a string.
662pub fn tokenize(s: &str) -> Vec<TokenAndSpan> {
663 let lexer = Lexer::new(s);
664 lexer.collect()
665}
666
667
668// =====
669// Utility functions.
670//
671// XXX(perf): expensive checks on Unicode chars (is_alphabetic(), is_numeric()) in these functions.
672// =====
673
// An identifier may begin with an underscore or any alphabetic character.
fn can_start_identifier(c: char) -> bool {
    c == '_' || c.is_alphabetic()
}
677
// After the first character, digits are permitted as well.
fn can_continue_identifier(c: char) -> bool {
    c == '_' || c.is_alphabetic() || c.is_numeric()
}
681
// Decode the character starting at the given byte offset.
// Panics if `byte` is out of bounds or not on a character boundary,
// exactly as the slice operation itself would.
fn char_at(s: &str, byte: usize) -> char {
    let tail = &s[byte..];
    tail.chars().next().unwrap()
}
685
686// For automatic semicolon insertion.
687fn may_terminate_statement(t: Option<TokenKind>) -> bool {
688 // A non-existent token may not terminate a line.
689 let t = match t {
690 Some(t) => t,
691 None => return false,
692 };
693
694 // From the Go spec:
695 //
696 // When the input is broken into tokens, a semicolon is automatically inserted into the
697 // token stream immediately after a line's final token if that token is:
698 // - an identifier
699 // - an integer, floating-point, imaginary, rune, or string literal
700 // - one of the keywords break, continue, fallthrough, or return
701 // - one of the operators and delimiters ++, --, ), ], or }
702 use token::TokenKind::*;
703 match t {
704 Ident => true,
705 Increment | Decrement | Break | Continue | Fallthrough | Return | RParen | RBracket |
706 RBrace => true,
707 t if t.is_literal() => true,
708 _ => false,
709 }
710}