// forth_lexer/parser.rs

1use std::{iter::Peekable, str::Chars};
2
3use nom::AsChar;
4
5use crate::token::{Data, Token};
6
/// Errors produced by the lexer.
///
/// Currently uninhabited: `next_token` never fails. The variant-less enum
/// keeps the `Result`-based interface open for future lexing errors.
/// `Debug` is derived so callers can `unwrap`/`expect`/log the `Result`.
#[derive(Debug)]
pub enum LexError {}

/// A cursor-based lexer over Forth source text.
///
/// NOTE(review): `position`/`read_position` advance one per `char` but are
/// used as byte offsets when slicing `raw` — correct only for ASCII input;
/// confirm multi-byte UTF-8 input is out of scope.
#[derive(Debug)]
pub struct Lexer<'a> {
    // Index of the character currently held in `ch`.
    position: usize,
    // Index of the next character to be read.
    read_position: usize,
    // Character under the cursor; '\0' once input is exhausted.
    ch: char,
    // The complete input, kept for zero-copy slicing into token values.
    raw: &'a str,
    // Streaming iterator over `raw` with one character of lookahead.
    input: Peekable<Chars<'a>>,
}
16
17impl<'a> Lexer<'a> {
18    pub fn new(input: &'a str) -> Lexer<'a> {
19        let mut lex = Lexer {
20            position: 0,
21            read_position: 0,
22            ch: '0',
23            input: input.chars().peekable(),
24            raw: input,
25        };
26        lex.read_char();
27
28        lex
29    }
30
31    pub fn reset(&mut self) {
32        self.position = 0;
33        self.read_position = 0;
34        self.ch = '\0';
35    }
36
37    pub fn here(&self) -> Data<'a> {
38        Data {
39            start: self.position,
40            end: self.position,
41            value: "",
42        }
43    }
44
45    pub fn next_token(&mut self) -> Result<Token<'a>, LexError> {
46        self.skip_whitespace();
47
48        let tok = match self.ch {
49            ':' => Token::Colon(self.read_single_char_token()),
50            ';' => Token::Semicolon(self.read_single_char_token()),
51            '%' => self.try_parse_number_with_prefix(|c| c.is_digit(2)),
52            '&' => self.try_parse_number_with_prefix(|c| c == 'x' || c.is_digit(8)),
53            '$' => self.try_parse_number_with_prefix(|c| c.is_hex_digit()),
54            '\'' => self.parse_quote_or_word(),
55            '0' => self.try_parse_number_with_prefix(|c| c == 'x' || c.is_hex_digit()),
56            '0'..='9' => {
57                let ident = self.read_number();
58                Token::Number(ident)
59            }
60            '\\' => {
61                if self.peek_char().is_whitespace() {
62                    let comment = self.read_comment_to('\n');
63                    Token::Comment(comment)
64                } else {
65                    let ident = self.read_ident();
66                    Token::Word(ident)
67                }
68            }
69            '(' => {
70                if self.peek_char().is_whitespace() {
71                    let comment = self.read_comment_to(')');
72                    // Stack comments contain '--' to denote stack effects
73                    if comment.value.contains("--") {
74                        Token::StackComment(comment)
75                    } else {
76                        Token::Comment(comment)
77                    }
78                } else {
79                    let ident = self.read_ident();
80                    Token::Word(ident)
81                }
82            }
83            '\0' => {
84                let mut dat = self.here();
85                dat.value = "\0";
86                self.read_char();
87                Token::Eof(dat)
88            }
89            _ => {
90                let ident = self.read_ident();
91                Token::Word(ident)
92            }
93        };
94
95        Ok(tok)
96    }
97
98    fn read_char(&mut self) {
99        self.ch = match self.input.peek() {
100            Some(ch) => *ch,
101            None => '\0',
102        };
103
104        self.input.next();
105
106        self.position = self.read_position;
107        self.read_position += 1;
108    }
109
110    fn try_parse_number_with_prefix(&mut self, validator: fn(char) -> bool) -> Token<'a> {
111        if validator(self.peek_char()) {
112            Token::Number(self.read_number())
113        } else {
114            Token::Word(self.read_ident())
115        }
116    }
117
118    fn parse_quote_or_word(&mut self) -> Token<'a> {
119        let begin = self.position;
120        let next = self.peek_char();
121
122        if next.is_whitespace() {
123            return Token::Word(self.read_ident());
124        }
125
126        self.read_char(); // consume character after quote
127
128        if self.peek_char() == '\'' {
129            // Character literal like 'A'
130            self.read_char(); // consume closing quote
131            let number = Data {
132                start: begin,
133                end: self.position + 1,
134                value: &self.raw[begin..(self.position + 1)],
135            };
136            self.read_char(); // move past
137            return Token::Number(number);
138        }
139
140        // Quoted word
141        let mut word = self.read_ident();
142        word.start = begin;
143        word.value = &self.raw[begin..word.end];
144        Token::Word(word)
145    }
146
147    fn read_single_char_token(&mut self) -> Data<'a> {
148        let start = self.position;
149        self.read_char();
150        Data {
151            start,
152            end: start + 1,
153            value: &self.raw[start..start + 1],
154        }
155    }
156
157    fn peek_char(&mut self) -> char {
158        match self.input.peek() {
159            Some(ch) => *ch,
160            None => '\0',
161        }
162    }
163
164    fn skip_whitespace(&mut self) {
165        while self.ch.is_ascii_whitespace() {
166            self.read_char();
167        }
168    }
169
170    fn read_comment_to(&mut self, to: char) -> Data<'a> {
171        let start = self.position;
172        while self.ch != to && self.ch != '\0' {
173            self.read_char();
174        }
175        if to == ')' {
176            self.read_char();
177        }
178
179        let end = self.position.min(self.raw.len());
180        Data {
181            start,
182            end,
183            value: &self.raw[start..end],
184        }
185    }
186
187    fn read_ident(&mut self) -> Data<'a> {
188        let start = self.position;
189        while !self.ch.is_whitespace() && self.ch != '\0' {
190            self.read_char();
191        }
192        let end = self.position.min(self.raw.len());
193        Data {
194            start,
195            end,
196            value: &self.raw[start..end],
197        }
198    }
199
200    fn read_number(&mut self) -> Data<'a> {
201        let start = self.position;
202        //TODO: parse legal forth numbers
203        while self.ch.is_hex_digit()
204            || self.ch == '_'
205            || self.ch == '&'
206            || self.ch == '%'
207            || self.ch == 'x'
208            || self.ch == '$'
209        {
210            self.read_char();
211        }
212        let end = self.position.min(self.raw.len());
213        Data {
214            start,
215            end,
216            value: &self.raw[start..end],
217        }
218    }
219
220    pub fn parse(&mut self) -> Vec<Token<'a>> {
221        let mut tokens = vec![];
222        #[allow(irrefutable_let_patterns)]
223        while let Ok(tok) = self.next_token() {
224            match tok {
225                Token::Eof(_) => {
226                    break;
227                }
228                _ => {
229                    tokens.push(tok.clone());
230                }
231            }
232        }
233        tokens
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;
    use Token::*;

    // Expected `Data` spans below are (start, end, slice) byte offsets into
    // the input string; they pin the lexer's exact position bookkeeping.

    // A full colon definition: colon, name, stack comment, body words,
    // trailing line comment, semicolon.
    #[test]
    fn test_parse_proper_def() {
        let mut lexer = Lexer::new(": add1 ( n -- n )\n  1 + \\ adds one\n;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 1, ":")),
            Word(Data::new(2, 6, "add1")),
            StackComment(Data::new(7, 17, "( n -- n )")),
            Number(Data::new(20, 21, "1")),
            Word(Data::new(22, 23, "+")),
            Comment(Data::new(24, 34, "\\ adds one")),
            Semicolon(Data::new(35, 36, ";")),
        ];
        assert_eq!(tokens, expected)
    }

    // Minimal colon definition without any comments.
    #[test]
    fn test_parse_simple_def() {
        let mut lexer = Lexer::new(": add1 1 + ;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 1, ":")),
            Word(Data::new(2, 6, "add1")),
            Number(Data::new(7, 8, "1")),
            Word(Data::new(9, 10, "+")),
            Semicolon(Data::new(11, 12, ";")),
        ];
        assert_eq!(tokens, expected)
    }

    // Line comment ends at '\n' (excluded); paren comment includes ')'.
    // "( and this )" has no "--" so it is a plain Comment, not StackComment.
    #[test]
    fn test_parse_words_and_comments() {
        let mut lexer = Lexer::new("word \\ this is a comment\nword2 ( and this ) word3");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "word")),
            Comment(Data::new(5, 24, "\\ this is a comment")),
            Word(Data::new(25, 30, "word2")),
            Comment(Data::new(31, 43, "( and this )")),
            Word(Data::new(44, 49, "word3")),
        ];
        assert_eq!(tokens, expected)
    }

    // Newlines separate words; an explicit NUL terminates lexing.
    #[test]
    fn test_parse_words_on_lines() {
        let mut lexer = Lexer::new("some\nwords here\0");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "some")),
            Word(Data::new(5, 10, "words")),
            Word(Data::new(11, 15, "here")),
        ];
        assert_eq!(tokens, expected)
    }

    // Plain decimal literal.
    #[test]
    fn test_parse_number_literal() {
        let mut lexer = Lexer::new("12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 2, "12"))];
        assert_eq!(tokens, expected)
    }

    // '&' prefix: octal literal, prefix included in the span.
    #[test]
    fn test_parse_number_oct() {
        let mut lexer = Lexer::new("&12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "&12"))];
        assert_eq!(tokens, expected)
    }

    // '%' prefix: binary literal.
    #[test]
    fn test_parse_number_bin() {
        let mut lexer = Lexer::new("%0100101");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 8, "%0100101"))];
        assert_eq!(tokens, expected);
    }

    // Known gap: the lexer only validates the first digit after the prefix,
    // so ill-formed binary literals still lex as numbers.
    #[test]
    #[ignore]
    fn test_parse_number_bin_only_valid() {
        //TODO: but ill formed will also parse
        //      %12345 is not a binary number
        let mut lexer = Lexer::new("%12345");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 6, "%12345"))];
        assert_eq!(tokens, expected);
    }

    // '$' prefix: hex literal, mixed case accepted.
    #[test]
    fn test_parse_number_hex() {
        let mut lexer = Lexer::new("$FfAaDd");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 7, "$FfAaDd"))];
        assert_eq!(tokens, expected)
    }

    // C-style 0x hex literal.
    #[test]
    fn test_parse_number_0xhex() {
        let mut lexer = Lexer::new("0xFE");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 4, "0xFE"))];
        assert_eq!(tokens, expected)
    }

    // Character literal 'c' lexes as a Number spanning all three chars.
    #[test]
    fn test_parse_number_char() {
        let mut lexer = Lexer::new("'c'");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "'c'"))];
        assert_eq!(tokens, expected)
    }

    // Paren comment containing "--" is classified as a StackComment.
    #[test]
    fn test_parse_stack_comment() {
        let mut lexer = Lexer::new("( n1 n2 -- n3 )");
        let tokens = lexer.parse();
        let expected = vec![StackComment(Data::new(0, 15, "( n1 n2 -- n3 )"))];
        assert_eq!(tokens, expected)
    }

    // Paren comment without "--" stays a plain Comment.
    #[test]
    fn test_parse_regular_comment() {
        let mut lexer = Lexer::new("( this is just a comment )");
        let tokens = lexer.parse();
        let expected = vec![Comment(Data::new(0, 26, "( this is just a comment )"))];
        assert_eq!(tokens, expected)
    }

    // Stack comment with primes and a flag still classified correctly.
    #[test]
    fn test_parse_stack_comment_complex() {
        let mut lexer = Lexer::new("( addr len -- addr' len' flag )");
        let tokens = lexer.parse();
        let expected = vec![StackComment(Data::new(
            0,
            31,
            "( addr len -- addr' len' flag )",
        ))];
        assert_eq!(tokens, expected)
    }

    // Backslash line comment running to end of input (no trailing newline).
    #[test]
    fn test_parse_line_comment() {
        let mut lexer = Lexer::new("\\ this is a line comment");
        let tokens = lexer.parse();
        let expected = vec![Comment(Data::new(0, 24, "\\ this is a line comment"))];
        assert_eq!(tokens, expected)
    }

    // Anything unrecognized falls back to a Word.
    #[test]
    fn test_parse_number_word() {
        let mut lexer = Lexer::new("word");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 4, "word"))];
        assert_eq!(tokens, expected)
    }

    // Optional ropey integration: a lexed Data span can slice a Rope
    // directly (Data presumably implements the required range trait —
    // defined in crate::token, not visible here).
    #[cfg(feature = "ropey")]
    #[test]
    fn test_to_ropey() {
        let progn = "word1 word2 word3";
        let rope = ropey::Rope::from_str(progn);
        let mut lexer = Lexer::new(progn);
        let tokens = lexer.parse();
        let word2 = if let Some(Token::Word(word)) = tokens.get(1) {
            word.to_owned()
        } else {
            Data::default()
        };
        let x = rope.slice(&word2);
        assert_eq!("word2", word2.value);
        assert_eq!(word2.value, x);
    }
}
417}