garden_lang_parser/
lex.rs

use std::path::{Path, PathBuf};
use std::rc::Rc;

use lazy_static::lazy_static;
use line_numbers::LinePositions;
use regex::Regex;

use crate::diagnostics::ErrorMessage;
use crate::position::Position;
use crate::{msgcode, msgtext, ParseError};

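// All three regexes are anchored with `^`, so they only ever match at the
// start of the remaining input (the current lexing offset).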
lazy_static! {
    pub(crate) static ref INTEGER_RE: Regex = Regex::new(r"^-?[0-9]+").unwrap();
    pub(crate) static ref STRING_RE: Regex = Regex::new(r#"^"(\\"|[^"])*""#).unwrap();
    pub(crate) static ref SYMBOL_RE: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*").unwrap();
}

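/// A single token of source code, with its position and any comments
/// immediately before it.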
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    pub position: Position,
    pub text: &'a str,
    /// Comments before this token.
    pub preceding_comments: Vec<(Position, &'a str)>,
}

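/// A sequence of lexed tokens with a movable cursor, tracking the file they
/// were lexed from.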
#[derive(Debug, Clone)]
pub struct TokenStream<'a> {
    pub(crate) path: Rc<PathBuf>,
    tokens: Vec<Token<'a>>,
    /// The index of our current position in the underlying vec.
    pub(crate) idx: usize,
    /// Comments after the last token in the file.
    pub trailing_comments: Vec<(Position, &'a str)>,
}

impl<'a> TokenStream<'a> {
    pub(crate) fn is_empty(&self) -> bool {
        self.tokens.get(self.idx).is_none()
    }

    pub fn pop(&mut self) -> Option<Token<'a>> {
        match self.tokens.get(self.idx) {
            Some(token) => {
                self.idx += 1;
                Some(token.clone())
            }
            None => None,
        }
    }

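    /// Step back one token, so the next `pop` returns the token we just
    /// consumed. Panics if nothing has been popped yet.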
    pub(crate) fn unpop(&mut self) {
        assert!(self.idx > 0);
        self.idx -= 1;
    }

    pub(crate) fn peek(&self) -> Option<Token<'a>> {
        self.tokens.get(self.idx).cloned()
    }

    pub(crate) fn peek_two(&self) -> Option<(Token<'a>, Token<'a>)> {
        match (self.tokens.get(self.idx), self.tokens.get(self.idx + 1)) {
            (Some(token1), Some(token2)) => Some((token1.clone(), token2.clone())),
            _ => None,
        }
    }

    pub(crate) fn prev(&self) -> Option<Token<'a>> {
        if self.idx == 0 {
            return None;
        }

        self.tokens.get(self.idx - 1).cloned()
    }
}

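/// Lex `s` between byte offsets `offset` and `end_offset`, recording token
/// positions relative to `path`. Lexing continues after invalid input, so
/// callers get a best-effort token stream alongside any errors.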
pub(crate) fn lex_between<'a>(
    path: &Path,
    s: &'a str,
    offset: usize,
    end_offset: usize,
) -> (TokenStream<'a>, Vec<ParseError>) {
    assert!(end_offset <= s.len());

    let path = Rc::new(path.to_owned());
    let lp = LinePositions::from(s);
    let mut tokens: Vec<Token<'a>> = vec![];
    let mut errors: Vec<ParseError> = vec![];

    let mut preceding_comments = vec![];
    let mut offset = offset;

    // Skip shebang if present at the beginning of the file.
    if offset == 0 && s.starts_with('#') {
        offset = s.find('\n').unwrap_or(s.len());
    }


    'outer: while offset < end_offset {
        let s = &s[offset..];

        // Skip over comments.
        if s.starts_with("//") {
            let (line_number, column) = lp.from_offset(offset);
            if let Some(i) = s.find('\n') {
                preceding_comments.push((
                    Position {
                        start_offset: offset,
                        end_offset: offset + i,
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + i,
                        path: path.clone(),
                    },
                    // Drop the leading `//` but keep the trailing newline.
                    &s["//".len()..i + 1],
                ));
                offset += i + 1;
            } else {
                // Comment at EOF without a trailing newline.
                preceding_comments.push((
                    Position {
                        start_offset: offset,
                        end_offset: offset + s.len(),
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + s.len(),
                        path: path.clone(),
                    },
                    &s["//".len()..],
                ));
                offset += s.len();
            }
            continue;
        }

        // Skip over other whitespace.
        let Some(first_char) = s.chars().next() else {
            break;
        };
        if first_char.is_whitespace() {
            if first_char == '\n' && !preceding_comments.is_empty() {
                // A comment consumes its own trailing newline, so a newline
                // here means a blank line: the pending comments aren't
                // touching the next token, so don't attach them to it.
                preceding_comments = vec![];
            }
            // Advance by the character's full UTF-8 width, so multi-byte
            // whitespace doesn't leave `offset` mid-character.
            offset += first_char.len_utf8();
            continue;
        }

        for token_str in ["==", "!=", ">=", "<=", "&&", "||", "=>", "+=", "-=", "**"] {
            if s.starts_with(token_str) {
                let (line_number, column) = lp.from_offset(offset);

                tokens.push(Token {
                    position: Position {
                        start_offset: offset,
                        end_offset: offset + token_str.len(),
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + token_str.len(),
                        path: path.clone(),
                    },
                    text: &s[0..token_str.len()],
                    preceding_comments,
                });
                preceding_comments = vec![];

                offset += token_str.len();
                continue 'outer;
            }
        }

        // Match integers before binary operators, so -1 is treated as
        // a single integer literal, not the token `-` followed by `1`.
        if let Some(integer_match) = INTEGER_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + integer_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + integer_match.end(),
                    path: path.clone(),
                },
                text: integer_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += integer_match.end();
            continue;
        }

        for token_char in [
            '+', '-', '*', '/', '%', '^', '(', ')', '{', '}', '=', ',', '<', '>', '[', ']', '.',
            ':',
        ] {
            if s.starts_with(token_char) {
                let (line_number, column) = lp.from_offset(offset);

                tokens.push(Token {
                    position: Position {
                        start_offset: offset,
                        end_offset: offset + 1,
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + 1,
                        path: path.clone(),
                    },
                    text: &s[0..1],
                    preceding_comments,
                });
                preceding_comments = vec![];

                offset += 1;
                continue 'outer;
            }
        }
        if let Some(string_match) = STRING_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + string_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + string_match.end(),
                    path: path.clone(),
                },
                text: string_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += string_match.end();
        } else if let Some(variable_match) = SYMBOL_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + variable_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + variable_match.end(),
                    path: path.clone(),
                },
                text: variable_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += variable_match.end();
        } else {
            let (line_number, column) = lp.from_offset(offset);

            // Take the whole character (which may be multi-byte), so we
            // never slice in the middle of a UTF-8 sequence.
            let char_len = first_char.len_utf8();
            errors.push(ParseError::Invalid {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + char_len,
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + 1,
                    path: path.clone(),
                },
                message: ErrorMessage(vec![
                    msgtext!("Unrecognized syntax "),
                    msgcode!("{}", &s[0..char_len]),
                ]),
                additional: vec![],
            });

            offset += char_len;
        }
    }

    (
        TokenStream {
            path: path.clone(),
            tokens,
            idx: 0,
            trailing_comments: preceding_comments,
        },
        errors,
    )
}

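/// Lex the whole of `s`. Convenience wrapper around `lex_between`.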
pub fn lex<'a>(path: &Path, s: &'a str) -> (TokenStream<'a>, Vec<ParseError>) {
    lex_between(path, s, 0, s.len())
}

#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    #[test]
    fn test_lex_no_offset() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 0,
                    end_offset: 1,
                    line_number: 0,
                    end_line_number: 0,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![],
            })
        );
    }

    #[test]
    fn test_lex_with_offset() {
        let tokens = lex(&PathBuf::from("__test.gdn"), " a").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 1,
                    end_offset: 2,
                    line_number: 0,
                    end_line_number: 0,
                    column: 1,
                    end_column: 2,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "a",
                preceding_comments: vec![],
            })
        );
    }

    #[test]
    fn test_lex_spaces() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), "1 + 2")
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec!["1", "+", "2"]
        );
    }

    #[test]
    fn test_lex_no_spaces() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), "1+2")
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec!["1", "+", "2"]
        );
    }

    #[test]
    fn test_lex_comment() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "// 2\n1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 5,
                    end_offset: 6,
                    line_number: 1,
                    end_line_number: 1,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![(
                    Position {
                        start_offset: 0,
                        end_offset: 4,
                        line_number: 0,
                        end_line_number: 0,
                        column: 0,
                        end_column: 4,
                        path: PathBuf::from("__test.gdn").into()
                    },
                    " 2\n"
                )],
            })
        );
    }

    #[test]
    fn test_lex_comment_not_touching() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "// 2\n\n1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 6,
                    end_offset: 7,
                    line_number: 2,
                    end_line_number: 2,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![],
            })
        );
    }
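
    // A minimal sketch of lex_between with a nonzero start offset: lexing
    // starts at byte 2 of "1 + 2", so the leading "1" is never seen.
    #[test]
    fn test_lex_between_start_offset() {
        let src = "1 + 2";
        let tokens = lex_between(&PathBuf::from("__test.gdn"), src, 2, src.len()).0;
        assert_eq!(
            tokens.tokens.iter().map(|token| token.text).collect::<Vec<_>>(),
            vec!["+", "2"]
        );
    }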

    #[test]
    fn test_lex_comment_leading_newline() {
        assert!(lex(&PathBuf::from("__test.gdn"), "\n// 2").0.is_empty());
    }

    #[test]
    fn test_lex_standalone_comment() {
        assert!(lex(&PathBuf::from("__test.gdn"), "// foo").0.is_empty());
    }
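
    // A small sketch exercising the cursor API: `pop` consumes a token,
    // `peek` does not, and `unpop` rewinds by one.
    #[test]
    fn test_pop_unpop_peek() {
        let mut tokens = lex(&PathBuf::from("__test.gdn"), "1 + 2").0;
        assert_eq!(tokens.pop().map(|token| token.text), Some("1"));
        assert_eq!(tokens.peek().map(|token| token.text), Some("+"));
        tokens.unpop();
        assert_eq!(tokens.peek().map(|token| token.text), Some("1"));
    }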
}