kravl_parser/syntax/
lexer.rs

1use syntax::tokens::{
2    TokenType,
3    Token,
4    BinOp,
5};
6
7/* A sadly OOP approach on a lexer.
8 * Potentially improved using a peekable iterator.
9 */
10
pub struct Lexer {
    tokens:       Vec<Token>,  // token stream built by `tokenize` (or supplied via `from`)
    lines:        u32,         // line counter while tokenizing; incremented per source line
    start:        usize,       // offset in the current line where the in-progress token begins
    pos:          usize,       // scan cursor within the current line
    top:          usize,       // traversal cursor into `tokens` (next/previous/current_token)
}
18
19impl Lexer {
20    pub fn new() -> Lexer {
21        Lexer {
22            tokens: Vec::new(),
23            lines:  0,
24            start:  0,
25            pos:    0,
26            top:    0,
27        }
28    }
29
30    pub fn from(tokens: Vec<Token>) -> Lexer {
31        Lexer {
32            tokens: tokens,
33            lines:  0,
34            start:  0,
35            pos:    0,
36            top:    0,
37        }
38    }
39
40    pub fn get_tokens(&self) -> &Vec<Token> {
41        &self.tokens
42    }
43
44    pub fn reset(&mut self) {
45        self.tokens = Vec::new();
46        self.lines = 0;
47        self.start = 0;
48        self.pos   = 0;
49        self.top   = 0;
50    }
51
52    fn push_token(&mut self, token_type: TokenType, line: &str) {
53        self.tokens.push(Token::new(
54            token_type,
55
56            String::from(&line[self.start .. self.pos]),
57            self.lines,
58            self.pos as u32,
59        ));
60
61        self.start = self.pos;
62    }
63
64    fn look(&self, line: &str, offset: usize) -> char {
65        match line.chars().nth(self.pos + offset) {
66            Some(v) => v,
67            None    => ' ',
68        }
69    }
70
71    fn skip_whitespace(&mut self, line: &str) {
72        while self.look(line, 0) == ' ' && self.pos < line.len() - 1 {
73            self.pos   += 1;
74            self.start += 1;
75        }
76    }
77
78    pub fn bin_op(v: &str) -> Option<(BinOp, u8)> {
79        match v {
80            "*"  => Some((BinOp::Mul, 1)),
81            "/"  => Some((BinOp::Div, 1)),
82            "+"  => Some((BinOp::Plus, 2)),
83            "-"  => Some((BinOp::Minus, 2)),
84            "=="  => Some((BinOp::Equal, 4)),
85            "~=" => Some((BinOp::NotEqual, 4)),
86            "<"  => Some((BinOp::Lt, 4)),
87            ">"  => Some((BinOp::Gt, 4)),
88            "<=" => Some((BinOp::GtEqual, 4)),
89            ">=" => Some((BinOp::LtEqual, 4)),
90            _    => None,
91        }
92    }
93
94    fn keyword(&mut self, line: &str) -> Option<TokenType> {
95        match &line[self.start .. self.pos] {
96            "define" => Some(TokenType::Definition),
97            "lambda" => Some(TokenType::Lambda),
98            "if"     => Some(TokenType::If),
99            "else"   => Some(TokenType::Else),
100            "return" => Some(TokenType::Return),
101            "true"   => Some(TokenType::True),
102            "false"  => Some(TokenType::False),
103            "do"     => Some(TokenType::Do),
104            "end"    => Some(TokenType::End),
105            _        => None
106        }
107    }
108
109    fn is_bin_op(&mut self, line: &str) -> bool {
110        let mut is_bin_op = false;
111        
112        let mut offset = 2;
113        while self.pos + offset >= line.len() {
114            offset -= 1;
115        }
116
117        while offset > 0 && !is_bin_op {
118            match Lexer::bin_op(&line[self.start .. self.pos + offset]) {
119                Some(_) => is_bin_op = true,
120                None => ()
121            }
122            offset -= 1;
123        }
124
125        self.pos += offset;
126        is_bin_op
127    }
128    
129    pub fn next_token(&mut self) -> bool {
130        if self.top < self.tokens.len() {
131            self.top += 1;
132            return true
133        }
134        false
135    }
136
137    pub fn previous_token(&mut self) -> bool {
138        if self.top != 0 {
139            self.top -= 1;
140            return true
141        }
142        false
143    }
144
145    pub fn tokens_remaining(&self) -> usize {
146        self.tokens.len() - self.top
147    }
148
149    pub fn current_token(&self) -> &Token {
150        if self.top > self.tokens.len() - 1 {
151            return &self.tokens[self.tokens.len() - 1]
152        }
153        &self.tokens[self.top]
154    }
155
156    pub fn current_token_content(&self) -> String {
157        self.current_token().content.clone()
158    }
159
160    pub fn match_current_token(&self, t: TokenType) -> Result<&Token, String> {
161        match self.current_token().token_type == t {
162            true  => Ok(self.current_token()),
163            false => Err(format!(
164                "expected {:?} but found {:?}", t, self.current_token()
165            ))
166        }
167    }
168
169    fn push_move(&mut self, t: TokenType, line: &str) {
170        self.pos += 1;
171        self.push_token(t, line);
172    }
173
174    pub fn tokenize(&mut self, source: String) -> Result<(), String> {
175
176        fn identifier_valid(c: char) -> bool {
177            c.is_alphabetic() || c == '_' 
178                              || c == '?'
179                              || c == '!'
180                              || c.is_digit(10)
181        }
182
183        for line in source.lines() {
184            self.lines += 1;
185            self.start  = 0;
186            self.pos    = 0;
187
188            while self.pos < line.len() {
189                self.skip_whitespace(line);
190
191                let chr  = self.look(line, 0);
192
193                if chr == '"' || chr == '\'' {
194                    let del = chr;
195
196                    self.start += 1;
197                    self.pos   += 1;
198
199                    while self.look(line, 0) != del {
200                        self.pos += 1;
201                    }
202
203                    self.push_token(TokenType::Text, line);
204
205                    self.start += 1;
206                    self.pos   += 1;
207
208                    continue
209                }
210
211                if chr.is_alphabetic() {
212                    while identifier_valid(self.look(line, 0)) {
213                        self.pos += 1;
214                    }
215
216                    match self.keyword(line) {
217                        Some(t) => self.push_token(t, line),
218                        None    => self.push_token(TokenType::Identifier, line),
219                    }
220
221                    continue
222                }
223
224                let peek = self.look(line, 1);
225
226                if chr.is_digit(10) ||
227                   chr == '.' && peek.is_digit(10) ||
228                   chr == '-' && peek.is_digit(10) {
229
230                    if chr == '-' {
231                        self.pos += 1;
232                    }
233
234                    while self.look(line, 0).is_digit(10) {
235                        self.pos += 1;
236                    }
237
238                    if self.look(line, 0) == '.' && self.look(line, 1).is_digit(10) {
239                        self.pos += 1;
240                        while self.look(line, 0).is_digit(10) {
241                            self.pos += 1;
242                        }
243                        self.push_token(TokenType::Float, line);
244                        continue;
245                    }
246                    self.push_token(TokenType::Integer, line);
247                    continue;
248                }
249
250                if chr == '-' && self.look(line, 1) == '>'  {
251                    self.pos += 2;
252                    self.push_token(TokenType::Arrow, line);
253 
254                    continue
255                }
256
257                if self.is_bin_op(line) {
258                    self.pos += 1;
259                    self.push_token(TokenType::BinOp, line);
260
261                    continue
262                }
263
264                match chr {
265                    '=' => {
266                        self.push_move(TokenType::Assign, line);
267                        continue
268                    }
269
270                    '(' => {
271                        self.push_move(TokenType::LParen, line);
272                        continue
273                    }
274
275                    ')' => {
276                        self.push_move(TokenType::RParen, line);
277                        continue
278                    }
279
280                    '[' => {
281                        self.push_move(TokenType::LBracket, line);
282                        continue
283                    }
284
285                    ']' => {
286                        self.push_move(TokenType::RBracket, line);
287                        continue
288                    }
289
290                    '{' => {
291                        self.push_move(TokenType::LBrace, line);
292                        continue
293                    }
294
295                    '}' => {
296                        self.push_move(TokenType::RBrace, line);
297                        continue
298                    }
299
300                    ':' => {
301                        self.push_move(TokenType::Colon, line);
302                        continue
303                    }
304
305                    ',' => {
306                        self.push_move(TokenType::Comma, line);
307                        continue
308                    }
309
310                    '.' => {
311                        self.push_move(TokenType::Period, line);
312                        continue
313                    }
314
315                    ';' => {
316                        self.push_move(TokenType::Semicolon, line);
317                        continue
318                    }
319
320                    ' '  => break,
321                    '\0' => break,
322                    '\n' => break,
323
324                    _   => {
325                        panic!("fucked symbol: {}, line: {} col: {}",
326                                &line[self.start .. line.len()],
327                                self.lines, self.start)
328                    },
329                }
330            }
331        }
332    
333        Ok(())
334    }
335}