lexer/
lib.rs

1use crate::token::{lookup_identifier, Span, Token, TokenKind};
2
3mod lexer_test;
4pub mod token;
5
/// A hand-written lexer that walks a source string one character at a time.
///
/// Construct it with [`Lexer::new`] and pull tokens with
/// [`Lexer::next_token`].
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// Offset into `input` of the character currently held in `ch`.
    position: usize,
    /// Offset into `input` of the next character to be read.
    read_position: usize,
    /// The character under examination; NUL (`'\0'`) once input is exhausted.
    ch: char,
}
12
13impl<'a> Lexer<'a> {
14    pub fn new(input: &'a str) -> Self {
15        let mut l = Lexer { input, position: 0, read_position: 0, ch: 0 as char };
16
17        l.read_char();
18        return l;
19    }
20
21    fn read_char(&mut self) {
22        if self.read_position >= self.input.len() {
23            self.ch = 0 as char
24        } else {
25            if let Some(ch) = self.input.chars().nth(self.read_position) {
26                self.ch = ch;
27            } else {
28                panic!("read out of range")
29            }
30        }
31
32        self.position = self.read_position;
33        self.read_position += 1;
34    }
35
36    fn peek_char(&self) -> char {
37        if self.read_position >= self.input.len() {
38            0 as char
39        } else {
40            if let Some(ch) = self.input.chars().nth(self.read_position) {
41                ch
42            } else {
43                panic!("read out of range")
44            }
45        }
46    }
47
48    pub fn next_token(&mut self) -> Token {
49        // println!("self ch {}, position {} read_position {}", self.ch, self.position, self.read_position);
50        self.skip_whitespace();
51        self.skip_comments();
52        let t = match self.ch {
53            '=' => {
54                if self.peek_char() == '=' {
55                    self.read_char();
56                    TokenKind::EQ
57                } else {
58                    TokenKind::ASSIGN
59                }
60            }
61            ';' => TokenKind::SEMICOLON,
62            '(' => TokenKind::LPAREN,
63            ')' => TokenKind::RPAREN,
64            ',' => TokenKind::COMMA,
65            '+' => TokenKind::PLUS,
66            '-' => TokenKind::MINUS,
67            '!' => {
68                if self.peek_char() == '=' {
69                    self.read_char();
70                    TokenKind::NotEq
71                } else {
72                    TokenKind::BANG
73                }
74            }
75            '*' => TokenKind::ASTERISK,
76            '/' => TokenKind::SLASH,
77            '<' => TokenKind::LT,
78            '>' => TokenKind::GT,
79            '{' => TokenKind::LBRACE,
80            '}' => TokenKind::RBRACE,
81            '[' => TokenKind::LBRACKET,
82            ':' => TokenKind::COLON,
83            ']' => TokenKind::RBRACKET,
84            '\u{0}' => TokenKind::EOF,
85            '"' => {
86                let (start, end, string) = self.read_string();
87                return Token { span: Span { start, end }, kind: TokenKind::STRING(string) };
88            }
89            _ => {
90                if is_letter(self.ch) {
91                    let (start, end, identifier) = self.read_identifier();
92                    return Token {
93                        span: Span { start, end },
94                        kind: lookup_identifier(&identifier),
95                    };
96                } else if is_digit(self.ch) {
97                    let (start, end, num) = self.read_number();
98                    return Token { span: Span { start, end }, kind: TokenKind::INT(num) };
99                } else {
100                    TokenKind::ILLEGAL
101                }
102            }
103        };
104
105        self.read_char();
106        return Token {
107            span: Span { start: self.position - 1, end: self.read_position - 1 },
108            kind: t,
109        };
110    }
111
112    fn skip_whitespace(&mut self) {
113        while self.ch.is_ascii_whitespace() {
114            self.read_char();
115        }
116    }
117
118    fn skip_comments(&mut self) {
119        if self.ch == '/' && self.peek_char() == '/' {
120            self.read_char();
121            self.read_char();
122            loop {
123                self.read_char();
124                if self.ch == '\n' || self.ch == '\u{0}' {
125                    // consume the comments end
126                    if self.ch == '\n' {
127                        self.read_char();
128                    }
129                    break;
130                }
131            }
132        }
133    }
134
135    fn read_identifier(&mut self) -> (usize, usize, String) {
136        let pos = self.position;
137        while is_letter(self.ch) {
138            self.read_char();
139        }
140
141        let x = self.input[pos..self.position].to_string();
142        return (pos, self.position, x);
143    }
144
145    fn read_number(&mut self) -> (usize, usize, i64) {
146        let pos = self.position;
147        while is_digit(self.ch) {
148            self.read_char();
149        }
150
151        let x = self.input[pos..self.position].parse().unwrap();
152
153        return (pos, self.position, x);
154    }
155
156    fn read_string(&mut self) -> (usize, usize, String) {
157        let pos = self.position + 1;
158        loop {
159            self.read_char();
160            if self.ch == '"' || self.ch == '\u{0}' {
161                break;
162            }
163        }
164
165        let x = self.input[pos..self.position].to_string();
166
167        // consume the end "
168        if self.ch == '"' {
169            self.read_char();
170        }
171        return (pos - 1, self.position, x);
172    }
173}
174
/// Reports whether `c` may appear in an identifier: an ASCII letter or `_`.
fn is_letter(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
178
/// Reports whether `c` is an ASCII decimal digit (`0` through `9`).
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}