lexer/
lib.rs

1use crate::token::{lookup_identifier, Span, Token, TokenKind};
2
3mod lexer_test;
4pub mod token;
5
6pub struct Lexer<'a> {
7    input: &'a str,
8    position: usize,
9    read_position: usize,
10    ch: char,
11}
12
13impl<'a> Lexer<'a> {
14    pub fn new(input: &'a str) -> Self {
15        let mut l = Lexer { input, position: 0, read_position: 0, ch: 0 as char };
16
17        l.read_char();
18        return l;
19    }
20
21    fn read_char(&mut self) {
22        if self.read_position >= self.input.len() {
23            self.ch = 0 as char
24        } else {
25            if let Some(ch) = self.input.chars().nth(self.read_position) {
26                self.ch = ch;
27            } else {
28                panic!("read out of range")
29            }
30        }
31
32        self.position = self.read_position;
33        self.read_position += 1;
34    }
35
36    fn peek_char(&self) -> char {
37        if self.read_position >= self.input.len() {
38            0 as char
39        } else {
40            if let Some(ch) = self.input.chars().nth(self.read_position) {
41                ch
42            } else {
43                panic!("read out of range")
44            }
45        }
46    }
47
48    pub fn next_token(&mut self) -> Token {
49        // println!("self ch {}, position {} read_position {}", self.ch, self.position, self.read_position);
50        // Skip any whitespace and successive line comments before producing a token.
51        self.skip_ignorable();
52        let t = match self.ch {
53            '=' => {
54                if self.peek_char() == '=' {
55                    self.read_char();
56                    TokenKind::EQ
57                } else {
58                    TokenKind::ASSIGN
59                }
60            }
61            ';' => TokenKind::SEMICOLON,
62            '(' => TokenKind::LPAREN,
63            ')' => TokenKind::RPAREN,
64            ',' => TokenKind::COMMA,
65            '+' => TokenKind::PLUS,
66            '-' => TokenKind::MINUS,
67            '!' => {
68                if self.peek_char() == '=' {
69                    self.read_char();
70                    TokenKind::NotEq
71                } else {
72                    TokenKind::BANG
73                }
74            }
75            '*' => TokenKind::ASTERISK,
76            '/' => TokenKind::SLASH,
77            '<' => TokenKind::LT,
78            '>' => TokenKind::GT,
79            '{' => TokenKind::LBRACE,
80            '}' => TokenKind::RBRACE,
81            '[' => TokenKind::LBRACKET,
82            ':' => TokenKind::COLON,
83            ']' => TokenKind::RBRACKET,
84            '\u{0}' => TokenKind::EOF,
85            '"' => {
86                let (start, end, string) = self.read_string();
87                return Token { span: Span { start, end }, kind: TokenKind::STRING(string) };
88            }
89            _ => {
90                if is_letter(self.ch) {
91                    let (start, end, identifier) = self.read_identifier();
92                    return Token {
93                        span: Span { start, end },
94                        kind: lookup_identifier(&identifier),
95                    };
96                } else if is_digit(self.ch) {
97                    let (start, end, num) = self.read_number();
98                    return Token { span: Span { start, end }, kind: TokenKind::INT(num) };
99                } else {
100                    TokenKind::ILLEGAL
101                }
102            }
103        };
104
105        self.read_char();
106        return Token {
107            span: Span { start: self.position - 1, end: self.read_position - 1 },
108            kind: t,
109        };
110    }
111
112    fn skip_whitespace(&mut self) {
113        while self.ch.is_ascii_whitespace() {
114            self.read_char();
115        }
116    }
117
118    fn skip_ignorable(&mut self) {
119        loop {
120            self.skip_whitespace();
121            if self.ch == '/' && self.peek_char() == '/' {
122                self.skip_comments();
123                // Continue the loop, in case there are more comments or whitespace
124                continue;
125            }
126            break;
127        }
128    }
129
130    fn skip_comments(&mut self) {
131        if self.ch == '/' && self.peek_char() == '/' {
132            self.read_char();
133            self.read_char();
134            loop {
135                self.read_char();
136                if self.ch == '\n' || self.ch == '\u{0}' {
137                    // consume the comments end
138                    if self.ch == '\n' {
139                        self.read_char();
140                    }
141                    break;
142                }
143            }
144        }
145    }
146
147    fn read_identifier(&mut self) -> (usize, usize, String) {
148        let pos = self.position;
149        while is_letter(self.ch) {
150            self.read_char();
151        }
152
153        let x = self.input[pos..self.position].to_string();
154        return (pos, self.position, x);
155    }
156
157    fn read_number(&mut self) -> (usize, usize, i64) {
158        let pos = self.position;
159        while is_digit(self.ch) {
160            self.read_char();
161        }
162
163        let x = self.input[pos..self.position].parse().unwrap();
164
165        return (pos, self.position, x);
166    }
167
168    fn read_string(&mut self) -> (usize, usize, String) {
169        let pos = self.position + 1;
170        loop {
171            self.read_char();
172            if self.ch == '"' || self.ch == '\u{0}' {
173                break;
174            }
175        }
176
177        let x = self.input[pos..self.position].to_string();
178
179        // consume the end "
180        if self.ch == '"' {
181            self.read_char();
182        }
183        return (pos - 1, self.position, x);
184    }
185}
186
/// Reports whether `c` may appear in an identifier: an ASCII letter
/// (`a`-`z`, `A`-`Z`) or an underscore.
fn is_letter(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
190
/// Reports whether `c` is one of the ASCII decimal digits `0` through `9`.
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}