use std::path::{Path, PathBuf};
use std::rc::Rc;

use lazy_static::lazy_static;
use line_numbers::LinePositions;
use regex::Regex;

use crate::diagnostics::ErrorMessage;
use crate::position::Position;
use crate::{msgcode, msgtext, ParseError};

lazy_static! {
    pub(crate) static ref INTEGER_RE: Regex = Regex::new(r"^-?[0-9]+").unwrap();
    pub(crate) static ref STRING_RE: Regex = Regex::new(r#"^"(\\"|[^"])*""#).unwrap();
    pub(crate) static ref SYMBOL_RE: Regex = Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*").unwrap();
}

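/// A single lexed token: its position in the source file, its text,
/// and any `//` comments that immediately preceded it.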
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    pub position: Position,
    pub text: &'a str,
    pub preceding_comments: Vec<(Position, &'a str)>,
}

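/// A vector of tokens along with a cursor (`idx`), so the parser can
/// peek at and pop tokens without re-lexing.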
#[derive(Debug, Clone)]
pub struct TokenStream<'a> {
    pub(crate) path: Rc<PathBuf>,
    tokens: Vec<Token<'a>>,
    pub(crate) idx: usize,
    /// Comments after the last token, which therefore aren't attached
    /// to any token.
    pub trailing_comments: Vec<(Position, &'a str)>,
}

impl<'a> TokenStream<'a> {
    pub(crate) fn is_empty(&self) -> bool {
        self.tokens.get(self.idx).is_none()
    }

    pub fn pop(&mut self) -> Option<Token<'a>> {
        match self.tokens.get(self.idx) {
            Some(token) => {
                self.idx += 1;
                Some(token.clone())
            }
            None => None,
        }
    }

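    /// Move the cursor back one token, so the most recently popped
    /// token can be read again.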
    pub(crate) fn unpop(&mut self) {
        assert!(self.idx > 0);
        self.idx -= 1;
    }

    pub(crate) fn peek(&self) -> Option<Token<'a>> {
        self.tokens.get(self.idx).cloned()
    }

    pub(crate) fn peek_two(&self) -> Option<(Token<'a>, Token<'a>)> {
        match (self.tokens.get(self.idx), self.tokens.get(self.idx + 1)) {
            (Some(token1), Some(token2)) => Some((token1.clone(), token2.clone())),
            _ => None,
        }
    }

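    /// The most recently popped token, if any.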
    pub(crate) fn prev(&self) -> Option<Token<'a>> {
        if self.idx == 0 {
            return None;
        }

        self.tokens.get(self.idx - 1).cloned()
    }
}

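/// Lex the part of `s` between `offset` and `end_offset`, returning
/// the tokens found along with any lexing errors. Comments are
/// attached to the token that follows them; comments after the last
/// token end up in `trailing_comments`.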
pub(crate) fn lex_between<'a>(
    path: &Path,
    s: &'a str,
    offset: usize,
    end_offset: usize,
) -> (TokenStream<'a>, Vec<ParseError>) {
    assert!(end_offset <= s.len());

    let path = Rc::new(path.to_owned());
    let lp = LinePositions::from(s);
    let mut tokens: Vec<Token<'a>> = vec![];
    let mut errors: Vec<ParseError> = vec![];

    let mut preceding_comments = vec![];
    let mut offset = offset;

    // Skip the shebang line, if the file starts with one.
    if offset == 0 && s.starts_with('#') {
        offset = s.find('\n').unwrap_or(s.len());
    }

    'outer: while offset < end_offset {
        let s = &s[offset..];

        // Comments are stored (without their `//` prefix) so they can
        // be attached to the token that follows them.
        if s.starts_with("//") {
            let (line_number, column) = lp.from_offset(offset);
            if let Some(i) = s.find('\n') {
                preceding_comments.push((
                    Position {
                        start_offset: offset,
                        end_offset: offset + i,
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + i,
                        path: path.clone(),
                    },
                    &s["//".len()..i + 1],
                ));
                offset += i + 1;
            } else {
                // A comment on the last line of the file, with no
                // trailing newline.
                preceding_comments.push((
                    Position {
                        start_offset: offset,
                        end_offset: offset + s.len(),
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + s.len(),
                        path: path.clone(),
                    },
                    &s["//".len()..],
                ));
                offset += s.len();
            }
            continue;
        }

        let Some(first_char) = s.chars().next() else {
            // We've run out of characters.
            break;
        };
        if first_char.is_whitespace() {
            if first_char == '\n' {
                // A blank line after a comment means the comment isn't
                // attached to the token that follows it (see
                // test_lex_comment_not_touching).
                preceding_comments.clear();
            }

            // Advance by the whole character, so multi-byte whitespace
            // doesn't split a UTF-8 boundary.
            offset += first_char.len_utf8();
            continue;
        }

        // Two-character tokens. These must be matched before the
        // single-character tokens below, so that e.g. `==` lexes as one
        // token rather than two `=` tokens.
        for token_str in ["==", "!=", ">=", "<=", "&&", "||", "=>", "+=", "-=", "**"] {
            if s.starts_with(token_str) {
                let (line_number, column) = lp.from_offset(offset);

                tokens.push(Token {
                    position: Position {
                        start_offset: offset,
                        end_offset: offset + token_str.len(),
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + token_str.len(),
                        path: path.clone(),
                    },
                    text: &s[0..token_str.len()],
                    preceding_comments,
                });
                preceding_comments = vec![];

                offset += token_str.len();
                continue 'outer;
            }
        }

        // Integer literals, including a leading minus sign if present.
        if let Some(integer_match) = INTEGER_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + integer_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + integer_match.end(),
                    path: path.clone(),
                },
                text: integer_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += integer_match.end();
            continue;
        }

        for token_char in [
            '+', '-', '*', '/', '%', '^', '(', ')', '{', '}', '=', ',', '<', '>', '[', ']', '.',
            ':',
        ] {
            if s.starts_with(token_char) {
                let (line_number, column) = lp.from_offset(offset);

                tokens.push(Token {
                    position: Position {
                        start_offset: offset,
                        end_offset: offset + 1,
                        line_number: line_number.as_usize(),
                        end_line_number: line_number.as_usize(),
                        column,
                        end_column: column + 1,
                        path: path.clone(),
                    },
                    text: &s[0..1],
                    preceding_comments,
                });
                preceding_comments = vec![];

                offset += 1;
                continue 'outer;
            }
        }
        if let Some(string_match) = STRING_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + string_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + string_match.end(),
                    path: path.clone(),
                },
                text: string_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += string_match.end();
        } else if let Some(variable_match) = SYMBOL_RE.find(s) {
            let (line_number, column) = lp.from_offset(offset);

            tokens.push(Token {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + variable_match.end(),
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + variable_match.end(),
                    path: path.clone(),
                },
                text: variable_match.as_str(),
                preceding_comments,
            });
            preceding_comments = vec![];

            offset += variable_match.end();
        } else {
            let (line_number, column) = lp.from_offset(offset);
            // Advance by the whole character, so we never slice in the
            // middle of a multi-byte character.
            let char_len = first_char.len_utf8();

            errors.push(ParseError::Invalid {
                position: Position {
                    start_offset: offset,
                    end_offset: offset + char_len,
                    line_number: line_number.as_usize(),
                    end_line_number: line_number.as_usize(),
                    column,
                    end_column: column + 1,
                    path: path.clone(),
                },
                message: ErrorMessage(vec![
                    msgtext!("Unrecognized syntax "),
                    msgcode!("{}", &s[0..char_len]),
                ]),
                additional: vec![],
            });

            offset += char_len;
        }
    }

    (
        TokenStream {
            path: path.clone(),
            tokens,
            idx: 0,
            trailing_comments: preceding_comments,
        },
        errors,
    )
}

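/// Lex the whole of `s`.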
pub fn lex<'a>(path: &Path, s: &'a str) -> (TokenStream<'a>, Vec<ParseError>) {
    lex_between(path, s, 0, s.len())
}

#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    #[test]
    fn test_lex_no_offset() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 0,
                    end_offset: 1,
                    line_number: 0,
                    end_line_number: 0,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![],
            })
        );
    }

    #[test]
    fn test_lex_with_offset() {
        let tokens = lex(&PathBuf::from("__test.gdn"), " a").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 1,
                    end_offset: 2,
                    line_number: 0,
                    end_line_number: 0,
                    column: 1,
                    end_column: 2,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "a",
                preceding_comments: vec![],
            })
        );
    }

    #[test]
    fn test_lex_spaces() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), "1 + 2")
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec!["1", "+", "2"]
        );
    }
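
    // A sanity check on STRING_RE: `\"` escapes should not terminate a
    // string literal.
    #[test]
    fn test_lex_string_with_escaped_quote() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), r#""a\"b""#)
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec![r#""a\"b""#]
        );
    }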

    #[test]
    fn test_lex_no_spaces() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), "1+2")
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec!["1", "+", "2"]
        );
    }
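
    // Two-character operators should lex as a single token, not as two
    // one-character tokens.
    #[test]
    fn test_lex_two_char_operator() {
        assert_eq!(
            lex(&PathBuf::from("__test.gdn"), "a == b")
                .0
                .tokens
                .iter()
                .map(|token| token.text)
                .collect::<Vec<_>>(),
            vec!["a", "==", "b"]
        );
    }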

    #[test]
    fn test_lex_comment() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "// 2\n1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 5,
                    end_offset: 6,
                    line_number: 1,
                    end_line_number: 1,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![(
                    Position {
                        start_offset: 0,
                        end_offset: 4,
                        line_number: 0,
                        end_line_number: 0,
                        column: 0,
                        end_column: 4,
                        path: PathBuf::from("__test.gdn").into()
                    },
                    " 2\n"
                )],
            })
        );
    }

    #[test]
    fn test_lex_comment_not_touching() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "// 2\n\n1").0;
        assert_eq!(
            tokens.peek(),
            Some(Token {
                position: Position {
                    start_offset: 6,
                    end_offset: 7,
                    line_number: 2,
                    end_line_number: 2,
                    column: 0,
                    end_column: 1,
                    path: PathBuf::from("__test.gdn").into()
                },
                text: "1",
                preceding_comments: vec![],
            })
        );
    }
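
    // `unpop` steps the cursor back, so popping again yields the same
    // token.
    #[test]
    fn test_pop_then_unpop() {
        let mut tokens = lex(&PathBuf::from("__test.gdn"), "1 + 2").0;
        let first = tokens.pop();
        tokens.unpop();
        assert_eq!(tokens.pop(), first);
    }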

    #[test]
    fn test_lex_comment_leading_newline() {
        assert!(lex(&PathBuf::from("__test.gdn"), "\n// 2").0.is_empty());
    }

    #[test]
    fn test_lex_standalone_comment() {
        assert!(lex(&PathBuf::from("__test.gdn"), "// foo").0.is_empty());
    }
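
    // A comment with no token after it should end up in
    // `trailing_comments`, stored without its leading `//`.
    #[test]
    fn test_lex_standalone_comment_is_trailing() {
        let tokens = lex(&PathBuf::from("__test.gdn"), "// foo").0;
        assert_eq!(tokens.trailing_comments.len(), 1);
        assert_eq!(tokens.trailing_comments[0].1, " foo");
    }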
}