// forth_lexer/parser.rs

use std::{iter::Peekable, str::Chars};

use nom::AsChar;

use crate::token::{Data, Token};
/// Errors that can occur while lexing.
///
/// Currently uninhabited: `next_token` always returns `Ok`, so the
/// `Result` in its signature exists only for future error reporting.
pub enum LexError {}
/// Hand-rolled lexer over a Forth source string.
#[derive(Debug)]
pub struct Lexer<'a> {
    // Index (in chars) of the character currently held in `ch`.
    position: usize,
    // Index (in chars) of the next character to be read.
    read_position: usize,
    // Current character; '\0' is the end-of-input sentinel.
    ch: char,
    // Remaining input characters.
    input: Peekable<Chars<'a>>,
}
15
16impl<'a> Lexer<'a> {
17    pub fn new(input: &'a str) -> Lexer<'a> {
18        let mut lex = Lexer {
19            position: 0,
20            read_position: 0,
21            ch: '0',
22            input: input.chars().peekable(),
23        };
24        lex.read_char();
25
26        lex
27    }
28
    /// Resets the cursor bookkeeping to the start of the input.
    ///
    /// NOTE(review): this only rewinds `position`/`read_position`/`ch`;
    /// the underlying `input` iterator has already been consumed and is
    /// NOT restored, so lexing cannot actually restart from the beginning
    /// after calling this — confirm intended usage.
    pub fn reset(&mut self) {
        self.position = 0;
        self.read_position = 0;
        self.ch = '\0';
    }
34
35    pub fn here<T>(&self) -> Data<T>
36    where
37        T: Default,
38    {
39        Data {
40            start: self.position,
41            end: self.position,
42            value: T::default(),
43        }
44    }
45
46    pub fn next_token(&mut self) -> Result<Token, LexError> {
47        self.skip_whitespace();
48
49        let tok = match self.ch {
50            ':' => {
51                let mut dat = self.here::<char>();
52                dat.value = ':';
53                Token::Colon(dat)
54            }
55            ';' => {
56                let mut dat = self.here::<char>();
57                dat.value = ';';
58                dat.end = dat.start + 1;
59                Token::Semicolon(dat)
60            }
61            //TODO: comments
62            //TODO: strings
63            '%' => {
64                if self.peek_char().is_digit(2) {
65                    let ident = self.read_number();
66                    Token::Number(ident)
67                } else {
68                    let ident = self.read_ident();
69                    Token::Word(ident)
70                }
71            }
72            '&' => {
73                if self.peek_char() == 'x' || self.peek_char().is_digit(8) {
74                    let ident = self.read_number();
75                    Token::Number(ident)
76                } else {
77                    let ident = self.read_ident();
78                    Token::Word(ident)
79                }
80            }
81            '$' => {
82                if self.peek_char().is_hex_digit() {
83                    let ident = self.read_number();
84                    Token::Number(ident)
85                } else {
86                    let ident = self.read_ident();
87                    Token::Word(ident)
88                }
89            }
90            '\'' => {
91                if !self.peek_char().is_whitespace() {
92                    self.read_char();
93                    if self.peek_char() == '\'' {
94                        let num = self.ch;
95                        self.read_char();
96                        let number = Data::<String> {
97                            start: self.position - 2,
98                            end: self.position + 1,
99                            value: format!("'{}'", num),
100                        };
101                        Token::Number(number)
102                    } else {
103                        let mut ident = self.read_ident();
104                        ident.start -= 1;
105                        ident.value = format!("{}{}", "'", ident.value);
106                        Token::Word(ident)
107                    }
108                } else {
109                    let ident = self.read_ident();
110                    Token::Word(ident)
111                }
112            }
113            '0' => {
114                if self.peek_char() == 'x' || self.peek_char().is_hex_digit() {
115                    let ident = self.read_number();
116                    Token::Number(ident)
117                } else {
118                    let ident = self.read_ident();
119                    Token::Word(ident)
120                }
121            }
122            '0'..='9' => {
123                let ident = self.read_number();
124                Token::Number(ident)
125            }
126            '\\' => {
127                if self.peek_char().is_whitespace() {
128                    let comment = self.read_comment_to('\n');
129                    Token::Comment(comment)
130                } else {
131                    let ident = self.read_ident();
132                    Token::Word(ident)
133                }
134            }
135            '(' => {
136                if self.peek_char().is_whitespace() {
137                    let comment = self.read_comment_to(')');
138                    Token::Comment(comment)
139                } else {
140                    let ident = self.read_ident();
141                    Token::Word(ident)
142                }
143            }
144            '\0' => {
145                let mut dat = self.here::<char>();
146                dat.value = '\0';
147                Token::Eof(dat)
148            }
149            _ => {
150                let ident = self.read_ident();
151                Token::Word(ident)
152            }
153        };
154
155        self.read_char();
156        Ok(tok)
157    }
158
159    fn read_char(&mut self) {
160        self.ch = match self.input.peek() {
161            Some(ch) => *ch,
162            None => '\0',
163        };
164
165        self.input.next();
166
167        self.position = self.read_position;
168        self.read_position += 1;
169    }
170
171    fn peek_char(&mut self) -> char {
172        match self.input.peek() {
173            Some(ch) => *ch,
174            None => '\0',
175        }
176    }
177
178    fn skip_whitespace(&mut self) {
179        while self.ch.is_ascii_whitespace() {
180            self.read_char();
181        }
182    }
183
184    fn read_comment_to(&mut self, to: char) -> Data<String> {
185        let start = self.position;
186        let mut value = String::new();
187        while self.ch != to {
188            value.push(self.ch);
189            self.read_char();
190        }
191        if to == ')' {
192            value.push(self.ch);
193            self.read_char();
194        }
195
196        Data::<String> {
197            start,
198            end: self.position,
199            value,
200        }
201    }
202
203    fn read_ident(&mut self) -> Data<String> {
204        let start = self.position;
205        let mut value = String::new();
206        while !self.ch.is_whitespace() && self.ch != '\0' {
207            value.push(self.ch);
208            self.read_char();
209        }
210        Data::<String> {
211            start,
212            end: self.position,
213            value,
214        }
215    }
216
217    fn read_number(&mut self) -> Data<String> {
218        let start = self.position;
219        let mut value = String::new();
220        //TODO: parse legal forth numbers
221        while self.ch.is_hex_digit()
222            || self.ch == '_'
223            || self.ch == '&'
224            || self.ch == '%'
225            || self.ch == 'x'
226            || self.ch == '$'
227        {
228            value.push(self.ch);
229            self.read_char();
230        }
231        Data::<String> {
232            start,
233            end: self.position,
234            value,
235        }
236    }
237
238    pub fn parse(&mut self) -> Vec<Token> {
239        let mut tokens = vec![];
240        while let Ok(tok) = self.next_token() {
241            match tok {
242                Token::Eof(_) => {
243                    break;
244                }
245                _ => {
246                    tokens.push(tok);
247                }
248            }
249        }
250        tokens
251    }
252}
253
#[cfg(test)]
mod tests {
    use super::*;
    use Token::*;

    // Positions in the expected Data values are character offsets into
    // the input; for words/numbers/comments `end` is one past the last
    // consumed character. Note the pinned asymmetry: Colon has
    // start == end while Semicolon has end == start + 1.

    #[test]
    fn test_parse_proper_def() {
        let mut lexer = Lexer::new(": add1 ( n -- n )\n  1 + \\ adds one\n;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 0, ':')),
            Word(Data::new(2, 6, "add1".into())),
            Comment(Data::new(7, 17, "( n -- n )".into())),
            Number(Data::new(20, 21, "1".into())),
            Word(Data::new(22, 23, "+".into())),
            Comment(Data::new(24, 34, "\\ adds one".into())),
            Semicolon(Data::new(35, 36, ';')),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_simple_def() {
        let mut lexer = Lexer::new(": add1 1 + ;");
        let tokens = lexer.parse();
        let expected = vec![
            Colon(Data::new(0, 0, ':')),
            Word(Data::new(2, 6, "add1".into())),
            Number(Data::new(7, 8, "1".into())),
            Word(Data::new(9, 10, "+".into())),
            Semicolon(Data::new(11, 12, ';')),
        ];
        assert_eq!(tokens, expected)
    }

    // Paren comments keep the closing ')'; backslash comments stop
    // before the newline.
    #[test]
    fn test_parse_words_and_comments() {
        let mut lexer = Lexer::new("word \\ this is a comment\nword2 ( and this ) word3");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "word".into())),
            Comment(Data::new(5, 24, "\\ this is a comment".into())),
            Word(Data::new(25, 30, "word2".into())),
            Comment(Data::new(31, 43, "( and this )".into())),
            Word(Data::new(44, 49, "word3".into())),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_words_on_lines() {
        let mut lexer = Lexer::new("some\nwords here\0");
        let tokens = lexer.parse();
        let expected = vec![
            Word(Data::new(0, 4, "some".into())),
            Word(Data::new(5, 10, "words".into())),
            Word(Data::new(11, 15, "here".into())),
        ];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_literal() {
        let mut lexer = Lexer::new("12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 2, "12".into()))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_oct() {
        let mut lexer = Lexer::new("&12");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "&12".into()))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_bin() {
        let mut lexer = Lexer::new("%0100101");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 8, "%0100101".into()))];
        assert_eq!(tokens, expected);
    }

    // Documents a known hole: read_number does not validate digits
    // against the radix, so "%12345" still lexes as a Number today.
    #[test]
    #[ignore]
    fn test_parse_number_bin_only_valid() {
        //TODO: but ill formed will also parse
        //      %12345 is not a binary number
        let mut lexer = Lexer::new("%12345");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 6, "%12345".into()))];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_parse_number_hex() {
        let mut lexer = Lexer::new("$FfAaDd");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 7, "$FfAaDd".into()))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_0xhex() {
        let mut lexer = Lexer::new("0xFE");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 4, "0xFE".into()))];
        assert_eq!(tokens, expected)
    }

    // Char literals span quote-to-quote inclusive.
    #[test]
    fn test_parse_number_char() {
        let mut lexer = Lexer::new("'c'");
        let tokens = lexer.parse();
        let expected = vec![Number(Data::new(0, 3, "'c'".into()))];
        assert_eq!(tokens, expected)
    }

    #[test]
    fn test_parse_number_word() {
        let mut lexer = Lexer::new("word");
        let tokens = lexer.parse();
        let expected = vec![Word(Data::new(0, 4, "word".into()))];
        assert_eq!(tokens, expected)
    }

    // Verifies Data spans can index a ropey Rope (feature-gated).
    #[cfg(feature = "ropey")]
    #[test]
    fn test_to_ropey() {
        let progn = "word1 word2 word3";
        let rope = ropey::Rope::from_str(progn);
        let mut lexer = Lexer::new(progn);
        let tokens = lexer.parse();
        let word2 = if let Some(Token::Word(word)) = tokens.get(1) {
            word.to_owned()
        } else {
            Data::<String>::default()
        };
        let x = rope.slice(&word2);
        assert_eq!("word2", word2.value);
        assert_eq!(word2.value, x);
    }
}