// forth_lexer/parser.rs

1use std::{iter::Peekable, str::Chars};
2
3use nom::AsChar;
4
5use crate::token::{Data, Token};
6
/// Errors produced during lexing. Currently variant-less — lexing cannot
/// fail today — but the enum keeps `next_token`'s `Result` signature open
/// for future error cases.
pub enum LexError {}
/// A streaming lexer over a Forth source string.
#[derive(Debug)]
pub struct Lexer<'a> {
    // Index of the current character `ch`. NOTE(review): positions are
    // counted per character but used as byte offsets when slicing `raw` —
    // this is only correct for ASCII input; confirm inputs are ASCII.
    position: usize,
    // Index of the next character to be read.
    read_position: usize,
    // The character currently under examination; '\0' marks end of input.
    ch: char,
    // The full input, kept so token text can be sliced out of it.
    raw: &'a str,
    // Character iterator over the same input, driven by `read_char`.
    input: Peekable<Chars<'a>>,
}
16
17impl<'a> Lexer<'a> {
18    pub fn new(input: &'a str) -> Lexer<'a> {
19        let mut lex = Lexer {
20            position: 0,
21            read_position: 0,
22            ch: '0',
23            input: input.chars().peekable(),
24            raw: input,
25        };
26        lex.read_char();
27
28        lex
29    }
30
31    pub fn reset(&mut self) {
32        self.position = 0;
33        self.read_position = 0;
34        self.ch = '\0';
35    }
36
    /// Returns an empty `Data` span anchored at the current position,
    /// used as a zero-width placeholder (e.g. for the `Eof` token).
    pub fn here(&self) -> Data<'a> {
        Data {
            start: self.position,
            end: self.position,
            value: "",
        }
    }
44
45    pub fn next_token(&mut self) -> Result<Token<'a>, LexError> {
46        self.skip_whitespace();
47
48        let tok = match self.ch {
49            ':' => Token::Colon(self.read_single_char_token()),
50            ';' => Token::Semicolon(self.read_single_char_token()),
51            '%' => self.try_parse_number_with_prefix(|c| c.is_digit(2)),
52            '&' => self.try_parse_number_with_prefix(|c| c == 'x' || c.is_digit(8)),
53            '$' => self.try_parse_number_with_prefix(|c| c.is_hex_digit()),
54            '\'' => self.parse_quote_or_word(),
55            '0' => self.try_parse_number_with_prefix(|c| c == 'x' || c.is_hex_digit()),
56            '0'..='9' => {
57                let ident = self.read_number();
58                Token::Number(ident)
59            }
60            '\\' => {
61                if self.peek_char().is_whitespace() {
62                    let comment = self.read_comment_to('\n');
63                    Token::Comment(comment)
64                } else {
65                    let ident = self.read_ident();
66                    Token::Word(ident)
67                }
68            }
69            '(' => {
70                if self.peek_char().is_whitespace() {
71                    let comment = self.read_comment_to(')');
72                    // Stack comments contain '--' to denote stack effects
73                    if comment.value.contains("--") {
74                        Token::StackComment(comment)
75                    } else {
76                        Token::Comment(comment)
77                    }
78                } else {
79                    let ident = self.read_ident();
80                    Token::Word(ident)
81                }
82            }
83            '\0' => {
84                let mut dat = self.here();
85                dat.value = "\0";
86                self.read_char();
87                Token::Eof(dat)
88            }
89            _ => {
90                let ident = self.read_ident();
91                Token::Word(ident)
92            }
93        };
94
95        Ok(tok)
96    }
97
98    fn read_char(&mut self) {
99        self.ch = match self.input.peek() {
100            Some(ch) => *ch,
101            None => '\0',
102        };
103
104        self.input.next();
105
106        self.position = self.read_position;
107        self.read_position += 1;
108    }
109
110    fn try_parse_number_with_prefix(&mut self, validator: fn(char) -> bool) -> Token<'a> {
111        if validator(self.peek_char()) {
112            Token::Number(self.read_number())
113        } else {
114            Token::Word(self.read_ident())
115        }
116    }
117
    /// Lexes a token that starts with a single quote: either a character
    /// literal like `'A'` or a quoted word like `'foo`.
    fn parse_quote_or_word(&mut self) -> Token<'a> {
        let begin = self.position;
        let next = self.peek_char();

        // A bare quote followed by whitespace is just the word "'".
        if next.is_whitespace() {
            return Token::Word(self.read_ident());
        }

        self.read_char(); // consume character after quote

        if self.peek_char() == '\'' {
            // Character literal like 'A'
            self.read_char(); // consume closing quote
            let number = Data {
                start: begin,
                // NOTE(review): char-counted positions are used as byte
                // offsets into `raw` — assumes ASCII input; a multi-byte
                // character between the quotes would mis-slice. TODO confirm.
                end: self.position + 1,
                value: &self.raw[begin..(self.position + 1)],
            };
            self.read_char(); // move past
            return Token::Number(number);
        }

        // Quoted word
        let mut word = self.read_ident();
        // Widen the span back to include the leading quote.
        word.start = begin;
        word.value = &self.raw[begin..word.end];
        Token::Word(word)
    }
146
147    fn read_single_char_token(&mut self) -> Data<'a> {
148        let start = self.position;
149        self.read_char();
150        Data {
151            start,
152            end: start + 1,
153            value: &self.raw[start..start + 1],
154        }
155    }
156
157    fn peek_char(&mut self) -> char {
158        match self.input.peek() {
159            Some(ch) => *ch,
160            None => '\0',
161        }
162    }
163
164    fn skip_whitespace(&mut self) {
165        while self.ch.is_ascii_whitespace() {
166            self.read_char();
167        }
168    }
169
170    fn read_comment_to(&mut self, to: char) -> Data<'a> {
171        let start = self.position;
172        while self.ch != to && self.ch != '\0' {
173            self.read_char();
174        }
175        if to == ')' {
176            self.read_char();
177        }
178
179        Data {
180            start,
181            end: self.position,
182            value: &self.raw[start..self.position],
183        }
184    }
185
186    fn read_ident(&mut self) -> Data<'a> {
187        let start = self.position;
188        while !self.ch.is_whitespace() && self.ch != '\0' {
189            self.read_char();
190        }
191        Data {
192            start,
193            end: self.position,
194            value: &self.raw[start..self.position],
195        }
196    }
197
198    fn read_number(&mut self) -> Data<'a> {
199        let start = self.position;
200        //TODO: parse legal forth numbers
201        while self.ch.is_hex_digit()
202            || self.ch == '_'
203            || self.ch == '&'
204            || self.ch == '%'
205            || self.ch == 'x'
206            || self.ch == '$'
207        {
208            self.read_char();
209        }
210        Data {
211            start,
212            end: self.position,
213            value: &self.raw[start..self.position],
214        }
215    }
216
217    pub fn parse(&mut self) -> Vec<Token<'a>> {
218        let mut tokens = vec![];
219        #[allow(irrefutable_let_patterns)]
220        while let Ok(tok) = self.next_token() {
221            match tok {
222                Token::Eof(_) => {
223                    break;
224                }
225                _ => {
226                    tokens.push(tok.clone());
227                }
228            }
229        }
230        tokens
231    }
232}
233
#[cfg(test)]
mod tests {
    //! Lexer unit tests. Expected `Data::new(start, end, value)` spans are
    //! character offsets into the source, with `value` the exact slice.
    use super::*;
    use Token::*;

    #[test]
    fn test_parse_proper_def() {
        // A full word definition: colon, name, stack comment, body,
        // line comment, semicolon.
        let mut lexer = Lexer::new(": add1 ( n -- n )\n  1 + \\ adds one\n;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 1, ":")),
            Word(Data::new(2, 6, "add1")),
            StackComment(Data::new(7, 17, "( n -- n )")),
            Number(Data::new(20, 21, "1")),
            Word(Data::new(22, 23, "+")),
            Comment(Data::new(24, 34, "\\ adds one")),
            Semicolon(Data::new(35, 36, ";")),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_simple_def() {
        let mut lexer = Lexer::new(": add1 1 + ;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 1, ":")),
            Word(Data::new(2, 6, "add1")),
            Number(Data::new(7, 8, "1")),
            Word(Data::new(9, 10, "+")),
            Semicolon(Data::new(11, 12, ";")),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_words_and_comments() {
        // Mixes line comments ('\') and paren comments without '--'.
        let mut lexer = Lexer::new("word \\ this is a comment\nword2 ( and this ) word3");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "word")),
            Comment(Data::new(5, 24, "\\ this is a comment")),
            Word(Data::new(25, 30, "word2")),
            Comment(Data::new(31, 43, "( and this )")),
            Word(Data::new(44, 49, "word3")),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_words_on_lines() {
        // An explicit trailing '\0' should still terminate cleanly.
        let mut lexer = Lexer::new("some\nwords here\0");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "some")),
            Word(Data::new(5, 10, "words")),
            Word(Data::new(11, 15, "here")),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_literal() {
        let mut lexer = Lexer::new("12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 2, "12"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_oct() {
        // '&' prefixes octal literals.
        let mut lexer = Lexer::new("&12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "&12"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_bin() {
        // '%' prefixes binary literals.
        let mut lexer = Lexer::new("%0100101");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 8, "%0100101"))];
        assert_eq!(tokens, expected);
    }

    #[test]
    #[ignore]
    fn test_parse_number_bin_only_valid() {
        //TODO: but ill formed will also parse
        //      %12345 is not a binary number
        let mut lexer = Lexer::new("%12345");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 6, "%12345"))];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_parse_number_hex() {
        // '$' prefixes hex literals; both cases accepted.
        let mut lexer = Lexer::new("$FfAaDd");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 7, "$FfAaDd"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_0xhex() {
        let mut lexer = Lexer::new("0xFE");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 4, "0xFE"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_char() {
        // Character literals lex as numbers.
        let mut lexer = Lexer::new("'c'");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "'c'"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_stack_comment() {
        // A paren comment containing '--' is a stack-effect comment.
        let mut lexer = Lexer::new("( n1 n2 -- n3 )");
        let tokens = lexer.parse();
        let expected = vec![StackComment(Data::new(0, 15, "( n1 n2 -- n3 )"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_regular_comment() {
        let mut lexer = Lexer::new("( this is just a comment )");
        let tokens = lexer.parse();
        let expected = vec![Comment(Data::new(0, 26, "( this is just a comment )"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_stack_comment_complex() {
        let mut lexer = Lexer::new("( addr len -- addr' len' flag )");
        let tokens = lexer.parse();
        let expected = vec![StackComment(Data::new(
            0,
            31,
            "( addr len -- addr' len' flag )",
        ))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_line_comment() {
        let mut lexer = Lexer::new("\\ this is a line comment");
        let tokens = lexer.parse();
        let expected = vec![Comment(Data::new(0, 24, "\\ this is a line comment"))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_word() {
        let mut lexer = Lexer::new("word");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 4, "word"))];
        assert_eq!(tokens, expected)
    }

    #[cfg(feature = "ropey")]
    #[test]
    fn test_to_ropey() {
        // Token spans can index into a ropey::Rope built from the same text.
        let progn = "word1 word2 word3";
        let rope = ropey::Rope::from_str(progn);
        let mut lexer = Lexer::new(progn);
        let tokens = lexer.parse();
        let word2 = if let Some(Token::Word(word)) = tokens.get(1) {
            word.to_owned()
        } else {
            Data::default()
        };
        let x = rope.slice(&word2);
        assert_eq!("word2", word2.value);
        assert_eq!(word2.value, x);
    }
}