graphcal_compiler/syntax/
lexer.rs1use crate::syntax::comments::{BlankLine, Comment, CommentBody, CommentDelimiter, SourceMetadata};
2use crate::syntax::span::{Span, Spanned};
3use crate::syntax::token::{LexicalItem, LexicalToken, Token, TriviaToken};
4use logos::Logos;
5use peek_cache::{PeekCache, SourceItem};
6use std::num::NonZeroUsize;
7
8const LEXER_MAX_LOOKAHEAD: NonZeroUsize = NonZeroUsize::new(3).unwrap();
9
10pub struct Lexer<'src> {
37 inner: logos::Lexer<'src, LexicalToken>,
38 peek_cache: PeekCache<(Token, Span)>,
39 source: &'src str,
40 source_metadata: SourceMetadata,
41 first_error_span: Option<Span>,
43}
44
45impl<'src> Lexer<'src> {
46 #[must_use]
47 pub fn new(source: &'src str) -> Self {
48 Self {
49 inner: LexicalToken::lexer(source),
50 peek_cache: PeekCache::new(LEXER_MAX_LOOKAHEAD),
51 source,
52 source_metadata: SourceMetadata::default(),
53 first_error_span: None,
54 }
55 }
56
57 pub fn peek(&mut self) -> Option<&Token> {
59 self.peek_with_span().map(|(tok, _)| tok)
60 }
61
62 pub fn peek_second(&mut self) -> Option<&Token> {
64 self.peek_token_at(1)
65 }
66
67 pub fn peek_third(&mut self) -> Option<&Token> {
69 self.peek_token_at(2)
70 }
71
72 pub fn peek_with_span(&mut self) -> Option<(&Token, Span)> {
77 let inner = &mut self.inner;
78 let source = self.source;
79 let source_metadata = &mut self.source_metadata;
80 let first_error_span = &mut self.first_error_span;
81 self.peek_cache
82 .peek(|| read_next_token(inner, source, source_metadata, first_error_span))
83 .map(|(tok, span)| (tok, *span))
84 }
85
86 fn peek_token_at(&mut self, offset: usize) -> Option<&Token> {
87 self.peek_with_span_at(offset).map(|(tok, _)| tok)
88 }
89
90 fn peek_with_span_at(&mut self, offset: usize) -> Option<(&Token, Span)> {
91 let inner = &mut self.inner;
92 let source = self.source;
93 let source_metadata = &mut self.source_metadata;
94 let first_error_span = &mut self.first_error_span;
95 self.peek_cache
96 .peek_at(offset, || {
97 read_next_token(inner, source, source_metadata, first_error_span)
98 })
99 .map(|(tok, span)| (tok, *span))
100 }
101
102 #[must_use]
109 pub const fn first_error_span(&self) -> Option<Span> {
110 self.first_error_span
111 }
112
113 pub fn next_token(&mut self) -> Option<(Token, Span)> {
118 let inner = &mut self.inner;
119 let source = self.source;
120 let source_metadata = &mut self.source_metadata;
121 let first_error_span = &mut self.first_error_span;
122 self.peek_cache
123 .next(|| read_next_token(inner, source, source_metadata, first_error_span))
124 }
125
126 #[must_use]
128 pub fn slice_at(&self, span: Span) -> &'src str {
129 &self.source[span.offset()..span.offset() + span.len()]
130 }
131
132 #[must_use]
134 pub const fn source_len(&self) -> usize {
135 self.source.len()
136 }
137
138 #[must_use]
139 pub fn into_source_metadata(self) -> SourceMetadata {
140 self.source_metadata
141 }
142}
143
144fn read_next_token(
145 inner: &mut logos::Lexer<'_, LexicalToken>,
146 source: &str,
147 source_metadata: &mut SourceMetadata,
148 first_error_span: &mut Option<Span>,
149) -> SourceItem<(Token, Span)> {
150 loop {
151 let Some(result) = inner.next() else {
152 break SourceItem::Eof;
153 };
154 let slice_span = inner.span();
155 let span = Span::new(slice_span.start, slice_span.end - slice_span.start);
156 match result.map(LexicalToken::classify) {
157 Ok(LexicalItem::Trivia(TriviaToken::Whitespace)) => {
158 record_blank_lines(source, span, source_metadata);
159 }
160 Ok(LexicalItem::Trivia(TriviaToken::Comment)) => {
161 record_comment(source, span, source_metadata);
162 }
163 Ok(LexicalItem::Syntax(token)) => break SourceItem::Item((token, span)),
164 Err(()) => {
165 if first_error_span.is_none() {
166 *first_error_span = Some(span);
167 }
168 }
169 }
170 }
171}
172
173fn record_comment(source: &str, span: Span, source_metadata: &mut SourceMetadata) {
174 let lexeme = &source[span.offset()..span.offset() + span.len()];
175 let delimiter = comment_delimiter(lexeme);
176 let body = CommentBody::new(&lexeme[delimiter.len()..]);
177 source_metadata.push_comment(Spanned::new(Comment::new(delimiter, body), span));
178}
179
180fn comment_delimiter(lexeme: &str) -> CommentDelimiter {
181 let bytes = lexeme.as_bytes();
182 match (bytes.get(2).copied(), bytes.get(3).copied()) {
183 (Some(b'/'), Some(b'/')) => CommentDelimiter::Line,
184 (Some(b'/'), _) => CommentDelimiter::Doc,
185 _ => CommentDelimiter::Line,
186 }
187}
188
189fn record_blank_lines(source: &str, span: Span, source_metadata: &mut SourceMetadata) {
190 let whitespace = &source[span.offset()..span.offset() + span.len()];
191 let mut previous_line_ending_start: Option<usize> = None;
192 let mut pos = 0;
193 while pos < whitespace.len() {
194 match line_ending_at(whitespace.as_bytes(), pos) {
195 Some(line_ending) => {
196 if let Some(start) = previous_line_ending_start {
197 let end = pos + line_ending.len();
198 source_metadata.push_blank_line(BlankLine::new(Span::new(
199 span.offset() + start,
200 end - start,
201 )));
202 }
203 previous_line_ending_start = Some(pos);
204 pos += line_ending.len();
205 }
206 None => pos += 1,
207 }
208 }
209}
210
211fn line_ending_at(bytes: &[u8], pos: usize) -> Option<LineEnding> {
212 match bytes.get(pos).copied() {
213 Some(b'\n') => Some(LineEnding::Lf),
214 Some(b'\r') if bytes.get(pos + 1) == Some(&b'\n') => Some(LineEnding::CrLf),
215 Some(b'\r') => Some(LineEnding::Cr),
216 _ => None,
217 }
218}
219
220#[derive(Debug, Clone, Copy, PartialEq, Eq)]
221enum LineEnding {
222 Lf,
223 CrLf,
224 Cr,
225}
226
227impl LineEnding {
228 const fn len(self) -> usize {
229 match self {
230 Self::Lf | Self::Cr => 1,
231 Self::CrLf => 2,
232 }
233 }
234}
235
236mod peek_cache {
237 use std::collections::VecDeque;
238 use std::num::NonZeroUsize;
239
240 pub(super) struct PeekCache<T> {
241 items: VecDeque<T>,
242 eof_seen: bool,
243 max_lookahead: NonZeroUsize,
244 }
245
246 impl<T> PeekCache<T> {
247 pub(super) fn new(max_lookahead: NonZeroUsize) -> Self {
248 Self {
249 items: VecDeque::with_capacity(max_lookahead.get()),
250 eof_seen: false,
251 max_lookahead,
252 }
253 }
254
255 pub(super) fn peek<F>(&mut self, load_next: F) -> Option<&T>
256 where
257 F: FnMut() -> SourceItem<T>,
258 {
259 self.peek_at(0, load_next)
260 }
261
262 pub(super) fn peek_at<F>(&mut self, offset: usize, load_next: F) -> Option<&T>
263 where
264 F: FnMut() -> SourceItem<T>,
265 {
266 debug_assert!(offset < self.max_lookahead.get());
267 if offset >= self.max_lookahead.get() {
268 return None;
269 }
270
271 self.fill_until(offset, load_next);
272 self.items.get(offset)
273 }
274
275 pub(super) fn next<F>(&mut self, load_next: F) -> Option<T>
276 where
277 F: FnMut() -> SourceItem<T>,
278 {
279 self.fill_until(0, load_next);
280 self.items.pop_front()
281 }
282
283 fn fill_until<F>(&mut self, offset: usize, mut load_next: F)
284 where
285 F: FnMut() -> SourceItem<T>,
286 {
287 debug_assert!(offset < self.max_lookahead.get());
291 while self.items.len() <= offset && !self.eof_seen {
292 match load_next() {
293 SourceItem::Item(item) => self.items.push_back(item),
294 SourceItem::Eof => self.eof_seen = true,
295 }
296 }
297 }
298 }
299
300 pub(super) enum SourceItem<T> {
301 Item(T),
302 Eof,
303 }
304}
305
306#[cfg(test)]
307mod tests {
308 use super::peek_cache::SourceItem;
309 use super::*;
310
311 #[test]
312 fn lexer_yields_tokens_with_spans() {
313 let input = "param x = 1.0;";
314 let mut lexer = Lexer::new(input);
315
316 let (tok, span) = lexer.next_token().unwrap();
317 assert_eq!(tok, Token::Param);
318 assert_eq!(lexer.slice_at(span), "param");
319
320 let (tok, span) = lexer.next_token().unwrap();
321 assert_eq!(tok, Token::Ident);
322 assert_eq!(lexer.slice_at(span), "x");
323
324 let (tok, span) = lexer.next_token().unwrap();
325 assert_eq!(tok, Token::Eq);
326 assert_eq!(lexer.slice_at(span), "=");
327
328 let (tok, span) = lexer.next_token().unwrap();
329 assert_eq!(tok, Token::Number);
330 assert_eq!(lexer.slice_at(span), "1.0");
331
332 let (tok, _) = lexer.next_token().unwrap();
333 assert_eq!(tok, Token::Semicolon);
334
335 assert!(lexer.next_token().is_none());
336 }
337
338 #[test]
339 fn peek_does_not_consume() {
340 let input = "param x";
341 let mut lexer = Lexer::new(input);
342
343 assert_eq!(lexer.peek(), Some(&Token::Param));
344 assert_eq!(lexer.peek(), Some(&Token::Param));
345
346 lexer.next_token();
347 assert_eq!(lexer.peek(), Some(&Token::Ident));
348 }
349
350 #[test]
351 fn peek_with_span_returns_correct_span() {
352 let input = "const node g0 = 9.80665;";
353 let mut lexer = Lexer::new(input);
354
355 let (tok, span) = lexer.peek_with_span().unwrap();
356 assert_eq!(*tok, Token::Const);
357 assert_eq!(lexer.slice_at(span), "const");
358 }
359
360 #[test]
361 fn exhaust_lexer() {
362 let input = "42";
363 let mut lexer = Lexer::new(input);
364 let (tok, _) = lexer.next_token().unwrap();
365 assert_eq!(tok, Token::Number);
366 assert!(lexer.next_token().is_none());
367 assert!(lexer.peek().is_none());
368 }
369
370 #[test]
371 fn next_token_without_prior_peek() {
372 let input = "param x";
375 let mut lexer = Lexer::new(input);
376
377 let (tok, span) = lexer.next_token().unwrap();
378 assert_eq!(tok, Token::Param);
379 assert_eq!(lexer.slice_at(span), "param");
380
381 let (tok, span) = lexer.next_token().unwrap();
382 assert_eq!(tok, Token::Ident);
383 assert_eq!(lexer.slice_at(span), "x");
384
385 assert!(lexer.next_token().is_none());
386 }
387
388 #[test]
389 fn lookahead_does_not_consume() {
390 let input = "param x = 1.0;";
391 let mut lexer = Lexer::new(input);
392
393 assert_eq!(lexer.peek(), Some(&Token::Param));
394 assert_eq!(lexer.peek_second(), Some(&Token::Ident));
395 assert_eq!(lexer.peek_third(), Some(&Token::Eq));
396
397 let (token, _) = lexer.next_token().unwrap();
398 assert_eq!(token, Token::Param);
399 let (token, _) = lexer.next_token().unwrap();
400 assert_eq!(token, Token::Ident);
401 }
402
403 #[test]
404 fn unknown_character_is_recorded() {
405 let input = "param §x = 1.0;";
408 let mut lexer = Lexer::new(input);
409 while lexer.next_token().is_some() {}
411 let err_span = lexer.first_error_span().expect("expected an error span");
412 assert_eq!(
413 &input[err_span.offset()..err_span.offset() + err_span.len()],
414 "§"
415 );
416 }
417
418 #[test]
419 fn only_first_unknown_character_is_recorded() {
420 let input = "§§";
423 let mut lexer = Lexer::new(input);
424 assert!(lexer.next_token().is_none());
425 let err_span = lexer.first_error_span().expect("expected an error span");
426 assert_eq!(err_span.offset(), 0);
427 }
428
429 #[test]
430 fn spans_are_byte_accurate() {
431 let input = "node delta_v = @v_exhaust * ln(@mass_ratio);";
433 let mut lexer = Lexer::new(input);
434
435 let (_, span) = lexer.next_token().unwrap(); assert_eq!(span.offset(), 0);
437 assert_eq!(span.len(), 4);
438
439 let (_, span) = lexer.next_token().unwrap(); assert_eq!(span.offset(), 5);
441 assert_eq!(span.len(), 7);
442 }
443
444 #[test]
445 fn peek_cache_uses_supplied_items_without_lexer_knowledge() {
446 let mut cache = PeekCache::<u8>::new(NonZeroUsize::new(2).unwrap());
447 let mut next = 0;
448
449 assert_eq!(
450 cache.peek(|| {
451 next += 1;
452 SourceItem::Item(next)
453 }),
454 Some(&1)
455 );
456 assert_eq!(
457 cache.peek(|| {
458 next += 1;
459 SourceItem::Item(next)
460 }),
461 Some(&1)
462 );
463 assert_eq!(
464 cache.next(|| {
465 next += 1;
466 SourceItem::Item(next)
467 }),
468 Some(1)
469 );
470 assert_eq!(
471 cache.next(|| {
472 next += 1;
473 SourceItem::Item(next)
474 }),
475 Some(2)
476 );
477 assert_eq!(next, 2);
478 }
479
480 #[test]
481 fn peek_cache_respects_caller_supplied_lookahead() {
482 let mut cache = PeekCache::<u8>::new(NonZeroUsize::new(4).unwrap());
483 let mut next = 0;
484
485 assert_eq!(
486 cache.peek_at(3, || {
487 next += 1;
488 SourceItem::Item(next)
489 }),
490 Some(&4)
491 );
492 assert_eq!(next, 4);
493 }
494}