Skip to main content

graphcal_compiler/syntax/
lexer.rs

1use crate::syntax::comments::{BlankLine, Comment, CommentBody, CommentDelimiter, SourceMetadata};
2use crate::syntax::span::{Span, Spanned};
3use crate::syntax::token::{LexicalItem, LexicalToken, Token, TriviaToken};
4use logos::Logos;
5use peek_cache::{PeekCache, SourceItem};
6use std::num::NonZeroUsize;
7
8const LEXER_MAX_LOOKAHEAD: NonZeroUsize = NonZeroUsize::new(3).unwrap();
9
10/// A peekable wrapper around `logos::Lexer` that yields `(Token, Span)` pairs.
11///
12/// # Internal design
13///
14/// The inner `logos::Lexer` is **only** advanced by `read_next_token()`, which
15/// adapts the tokenizer into the generic private peek cache. `Lexer` cannot read
16/// or take the cache slots directly; the cache fields are private to a nested
17/// module, and its public methods fill the first slot before returning or
18/// consuming it.
19///
20/// ```text
21///   cache = []             (initial / just consumed)
22///       │
23///       ▼  cache calls read_next_token()
24///   cache = [Token(_), ...] (token ready)
25///   cache.eof_seen = true   (EOF)
26///       │
27///       ▼  cache.next() takes the token
28///   cache = [...]           (remaining lookahead)
29/// ```
30///
31/// When the underlying `logos::Lexer` encounters an unrecognized character, the
32/// span of the *first* such character is recorded in `first_error_span` and the
33/// character is skipped. The parser surfaces this as a `ParseError::UnknownToken`
34/// when a top-level `parse_*` entry point finishes, regardless of whether the
35/// downstream parse happened to succeed.
36pub struct Lexer<'src> {
37    inner: logos::Lexer<'src, LexicalToken>,
38    peek_cache: PeekCache<(Token, Span)>,
39    source: &'src str,
40    source_metadata: SourceMetadata,
41    /// Span of the first unrecognized character encountered during lexing, if any.
42    first_error_span: Option<Span>,
43}
44
45impl<'src> Lexer<'src> {
46    #[must_use]
47    pub fn new(source: &'src str) -> Self {
48        Self {
49            inner: LexicalToken::lexer(source),
50            peek_cache: PeekCache::new(LEXER_MAX_LOOKAHEAD),
51            source,
52            source_metadata: SourceMetadata::default(),
53            first_error_span: None,
54        }
55    }
56
57    /// Peek at the next token without consuming it.
58    pub fn peek(&mut self) -> Option<&Token> {
59        self.peek_with_span().map(|(tok, _)| tok)
60    }
61
62    /// Peek at the token after the next token without consuming either one.
63    pub fn peek_second(&mut self) -> Option<&Token> {
64        self.peek_token_at(1)
65    }
66
67    /// Peek at the third token from the current position without consuming any token.
68    pub fn peek_third(&mut self) -> Option<&Token> {
69        self.peek_token_at(2)
70    }
71
72    /// Peek at the next token and its span without consuming it.
73    ///
74    /// This delegates to the cache, which fills its first slot before returning
75    /// a reference to it.
76    pub fn peek_with_span(&mut self) -> Option<(&Token, Span)> {
77        let inner = &mut self.inner;
78        let source = self.source;
79        let source_metadata = &mut self.source_metadata;
80        let first_error_span = &mut self.first_error_span;
81        self.peek_cache
82            .peek(|| read_next_token(inner, source, source_metadata, first_error_span))
83            .map(|(tok, span)| (tok, *span))
84    }
85
86    fn peek_token_at(&mut self, offset: usize) -> Option<&Token> {
87        self.peek_with_span_at(offset).map(|(tok, _)| tok)
88    }
89
90    fn peek_with_span_at(&mut self, offset: usize) -> Option<(&Token, Span)> {
91        let inner = &mut self.inner;
92        let source = self.source;
93        let source_metadata = &mut self.source_metadata;
94        let first_error_span = &mut self.first_error_span;
95        self.peek_cache
96            .peek_at(offset, || {
97                read_next_token(inner, source, source_metadata, first_error_span)
98            })
99            .map(|(tok, span)| (tok, *span))
100    }
101
102    /// Return the span of the first unrecognized character encountered during lexing.
103    ///
104    /// Returns `None` if the lexer has not yet seen any invalid input. The value
105    /// is set lazily as tokens are consumed; callers that need an up-to-date
106    /// answer at a specific point should ensure lexing has progressed past the
107    /// region of interest (e.g., by draining the remaining tokens).
108    #[must_use]
109    pub const fn first_error_span(&self) -> Option<Span> {
110        self.first_error_span
111    }
112
113    /// Consume and return the next token and its span.
114    ///
115    /// The cache fills its first slot before taking from it, so this method
116    /// cannot accidentally consume an uninitialized cache slot.
117    pub fn next_token(&mut self) -> Option<(Token, Span)> {
118        let inner = &mut self.inner;
119        let source = self.source;
120        let source_metadata = &mut self.source_metadata;
121        let first_error_span = &mut self.first_error_span;
122        self.peek_cache
123            .next(|| read_next_token(inner, source, source_metadata, first_error_span))
124    }
125
126    /// Get the source text corresponding to a span.
127    #[must_use]
128    pub fn slice_at(&self, span: Span) -> &'src str {
129        &self.source[span.offset()..span.offset() + span.len()]
130    }
131
132    /// Return the total length (in bytes) of the source string.
133    #[must_use]
134    pub const fn source_len(&self) -> usize {
135        self.source.len()
136    }
137
138    #[must_use]
139    pub fn into_source_metadata(self) -> SourceMetadata {
140        self.source_metadata
141    }
142}
143
144fn read_next_token(
145    inner: &mut logos::Lexer<'_, LexicalToken>,
146    source: &str,
147    source_metadata: &mut SourceMetadata,
148    first_error_span: &mut Option<Span>,
149) -> SourceItem<(Token, Span)> {
150    loop {
151        let Some(result) = inner.next() else {
152            break SourceItem::Eof;
153        };
154        let slice_span = inner.span();
155        let span = Span::new(slice_span.start, slice_span.end - slice_span.start);
156        match result.map(LexicalToken::classify) {
157            Ok(LexicalItem::Trivia(TriviaToken::Whitespace)) => {
158                record_blank_lines(source, span, source_metadata);
159            }
160            Ok(LexicalItem::Trivia(TriviaToken::Comment)) => {
161                record_comment(source, span, source_metadata);
162            }
163            Ok(LexicalItem::Syntax(token)) => break SourceItem::Item((token, span)),
164            Err(()) => {
165                if first_error_span.is_none() {
166                    *first_error_span = Some(span);
167                }
168            }
169        }
170    }
171}
172
173fn record_comment(source: &str, span: Span, source_metadata: &mut SourceMetadata) {
174    let lexeme = &source[span.offset()..span.offset() + span.len()];
175    let delimiter = comment_delimiter(lexeme);
176    let body = CommentBody::new(&lexeme[delimiter.len()..]);
177    source_metadata.push_comment(Spanned::new(Comment::new(delimiter, body), span));
178}
179
180fn comment_delimiter(lexeme: &str) -> CommentDelimiter {
181    let bytes = lexeme.as_bytes();
182    match (bytes.get(2).copied(), bytes.get(3).copied()) {
183        (Some(b'/'), Some(b'/')) => CommentDelimiter::Line,
184        (Some(b'/'), _) => CommentDelimiter::Doc,
185        _ => CommentDelimiter::Line,
186    }
187}
188
189fn record_blank_lines(source: &str, span: Span, source_metadata: &mut SourceMetadata) {
190    let whitespace = &source[span.offset()..span.offset() + span.len()];
191    let mut previous_line_ending_start: Option<usize> = None;
192    let mut pos = 0;
193    while pos < whitespace.len() {
194        match line_ending_at(whitespace.as_bytes(), pos) {
195            Some(line_ending) => {
196                if let Some(start) = previous_line_ending_start {
197                    let end = pos + line_ending.len();
198                    source_metadata.push_blank_line(BlankLine::new(Span::new(
199                        span.offset() + start,
200                        end - start,
201                    )));
202                }
203                previous_line_ending_start = Some(pos);
204                pos += line_ending.len();
205            }
206            None => pos += 1,
207        }
208    }
209}
210
211fn line_ending_at(bytes: &[u8], pos: usize) -> Option<LineEnding> {
212    match bytes.get(pos).copied() {
213        Some(b'\n') => Some(LineEnding::Lf),
214        Some(b'\r') if bytes.get(pos + 1) == Some(&b'\n') => Some(LineEnding::CrLf),
215        Some(b'\r') => Some(LineEnding::Cr),
216        _ => None,
217    }
218}
219
220#[derive(Debug, Clone, Copy, PartialEq, Eq)]
221enum LineEnding {
222    Lf,
223    CrLf,
224    Cr,
225}
226
227impl LineEnding {
228    const fn len(self) -> usize {
229        match self {
230            Self::Lf | Self::Cr => 1,
231            Self::CrLf => 2,
232        }
233    }
234}
235
236mod peek_cache {
237    use std::collections::VecDeque;
238    use std::num::NonZeroUsize;
239
240    pub(super) struct PeekCache<T> {
241        items: VecDeque<T>,
242        eof_seen: bool,
243        max_lookahead: NonZeroUsize,
244    }
245
246    impl<T> PeekCache<T> {
247        pub(super) fn new(max_lookahead: NonZeroUsize) -> Self {
248            Self {
249                items: VecDeque::with_capacity(max_lookahead.get()),
250                eof_seen: false,
251                max_lookahead,
252            }
253        }
254
255        pub(super) fn peek<F>(&mut self, load_next: F) -> Option<&T>
256        where
257            F: FnMut() -> SourceItem<T>,
258        {
259            self.peek_at(0, load_next)
260        }
261
262        pub(super) fn peek_at<F>(&mut self, offset: usize, load_next: F) -> Option<&T>
263        where
264            F: FnMut() -> SourceItem<T>,
265        {
266            debug_assert!(offset < self.max_lookahead.get());
267            if offset >= self.max_lookahead.get() {
268                return None;
269            }
270
271            self.fill_until(offset, load_next);
272            self.items.get(offset)
273        }
274
275        pub(super) fn next<F>(&mut self, load_next: F) -> Option<T>
276        where
277            F: FnMut() -> SourceItem<T>,
278        {
279            self.fill_until(0, load_next);
280            self.items.pop_front()
281        }
282
283        fn fill_until<F>(&mut self, offset: usize, mut load_next: F)
284        where
285            F: FnMut() -> SourceItem<T>,
286        {
287            // Private callers preserve this invariant: `peek_at` validates
288            // arbitrary offsets, and `next` only asks for slot 0, which is
289            // guaranteed by the nonzero lookahead bound.
290            debug_assert!(offset < self.max_lookahead.get());
291            while self.items.len() <= offset && !self.eof_seen {
292                match load_next() {
293                    SourceItem::Item(item) => self.items.push_back(item),
294                    SourceItem::Eof => self.eof_seen = true,
295                }
296            }
297        }
298    }
299
300    pub(super) enum SourceItem<T> {
301        Item(T),
302        Eof,
303    }
304}
305
306#[cfg(test)]
307mod tests {
308    use super::peek_cache::SourceItem;
309    use super::*;
310
311    #[test]
312    fn lexer_yields_tokens_with_spans() {
313        let input = "param x = 1.0;";
314        let mut lexer = Lexer::new(input);
315
316        let (tok, span) = lexer.next_token().unwrap();
317        assert_eq!(tok, Token::Param);
318        assert_eq!(lexer.slice_at(span), "param");
319
320        let (tok, span) = lexer.next_token().unwrap();
321        assert_eq!(tok, Token::Ident);
322        assert_eq!(lexer.slice_at(span), "x");
323
324        let (tok, span) = lexer.next_token().unwrap();
325        assert_eq!(tok, Token::Eq);
326        assert_eq!(lexer.slice_at(span), "=");
327
328        let (tok, span) = lexer.next_token().unwrap();
329        assert_eq!(tok, Token::Number);
330        assert_eq!(lexer.slice_at(span), "1.0");
331
332        let (tok, _) = lexer.next_token().unwrap();
333        assert_eq!(tok, Token::Semicolon);
334
335        assert!(lexer.next_token().is_none());
336    }
337
338    #[test]
339    fn peek_does_not_consume() {
340        let input = "param x";
341        let mut lexer = Lexer::new(input);
342
343        assert_eq!(lexer.peek(), Some(&Token::Param));
344        assert_eq!(lexer.peek(), Some(&Token::Param));
345
346        lexer.next_token();
347        assert_eq!(lexer.peek(), Some(&Token::Ident));
348    }
349
350    #[test]
351    fn peek_with_span_returns_correct_span() {
352        let input = "const node g0 = 9.80665;";
353        let mut lexer = Lexer::new(input);
354
355        let (tok, span) = lexer.peek_with_span().unwrap();
356        assert_eq!(*tok, Token::Const);
357        assert_eq!(lexer.slice_at(span), "const");
358    }
359
360    #[test]
361    fn exhaust_lexer() {
362        let input = "42";
363        let mut lexer = Lexer::new(input);
364        let (tok, _) = lexer.next_token().unwrap();
365        assert_eq!(tok, Token::Number);
366        assert!(lexer.next_token().is_none());
367        assert!(lexer.peek().is_none());
368    }
369
370    #[test]
371    fn next_token_without_prior_peek() {
372        // next_token() internally peeks first, so calling it directly
373        // (without an explicit peek()) must work identically.
374        let input = "param x";
375        let mut lexer = Lexer::new(input);
376
377        let (tok, span) = lexer.next_token().unwrap();
378        assert_eq!(tok, Token::Param);
379        assert_eq!(lexer.slice_at(span), "param");
380
381        let (tok, span) = lexer.next_token().unwrap();
382        assert_eq!(tok, Token::Ident);
383        assert_eq!(lexer.slice_at(span), "x");
384
385        assert!(lexer.next_token().is_none());
386    }
387
388    #[test]
389    fn lookahead_does_not_consume() {
390        let input = "param x = 1.0;";
391        let mut lexer = Lexer::new(input);
392
393        assert_eq!(lexer.peek(), Some(&Token::Param));
394        assert_eq!(lexer.peek_second(), Some(&Token::Ident));
395        assert_eq!(lexer.peek_third(), Some(&Token::Eq));
396
397        let (token, _) = lexer.next_token().unwrap();
398        assert_eq!(token, Token::Param);
399        let (token, _) = lexer.next_token().unwrap();
400        assert_eq!(token, Token::Ident);
401    }
402
403    #[test]
404    fn unknown_character_is_recorded() {
405        // `§` is not part of the grammar. The lexer should skip it while
406        // recording the span for the parser to surface as `UnknownToken`.
407        let input = "param §x = 1.0;";
408        let mut lexer = Lexer::new(input);
409        // Drain all tokens so the lexer encounters the stray character.
410        while lexer.next_token().is_some() {}
411        let err_span = lexer.first_error_span().expect("expected an error span");
412        assert_eq!(
413            &input[err_span.offset()..err_span.offset() + err_span.len()],
414            "§"
415        );
416    }
417
418    #[test]
419    fn only_first_unknown_character_is_recorded() {
420        // Multiple stray characters: the lexer reports only the first so that
421        // the diagnostic points at the original culprit.
422        let input = "§§";
423        let mut lexer = Lexer::new(input);
424        assert!(lexer.next_token().is_none());
425        let err_span = lexer.first_error_span().expect("expected an error span");
426        assert_eq!(err_span.offset(), 0);
427    }
428
429    #[test]
430    fn spans_are_byte_accurate() {
431        //          0123456789...
432        let input = "node delta_v = @v_exhaust * ln(@mass_ratio);";
433        let mut lexer = Lexer::new(input);
434
435        let (_, span) = lexer.next_token().unwrap(); // node
436        assert_eq!(span.offset(), 0);
437        assert_eq!(span.len(), 4);
438
439        let (_, span) = lexer.next_token().unwrap(); // delta_v
440        assert_eq!(span.offset(), 5);
441        assert_eq!(span.len(), 7);
442    }
443
444    #[test]
445    fn peek_cache_uses_supplied_items_without_lexer_knowledge() {
446        let mut cache = PeekCache::<u8>::new(NonZeroUsize::new(2).unwrap());
447        let mut next = 0;
448
449        assert_eq!(
450            cache.peek(|| {
451                next += 1;
452                SourceItem::Item(next)
453            }),
454            Some(&1)
455        );
456        assert_eq!(
457            cache.peek(|| {
458                next += 1;
459                SourceItem::Item(next)
460            }),
461            Some(&1)
462        );
463        assert_eq!(
464            cache.next(|| {
465                next += 1;
466                SourceItem::Item(next)
467            }),
468            Some(1)
469        );
470        assert_eq!(
471            cache.next(|| {
472                next += 1;
473                SourceItem::Item(next)
474            }),
475            Some(2)
476        );
477        assert_eq!(next, 2);
478    }
479
480    #[test]
481    fn peek_cache_respects_caller_supplied_lookahead() {
482        let mut cache = PeekCache::<u8>::new(NonZeroUsize::new(4).unwrap());
483        let mut next = 0;
484
485        assert_eq!(
486            cache.peek_at(3, || {
487                next += 1;
488                SourceItem::Item(next)
489            }),
490            Some(&4)
491        );
492        assert_eq!(next, 4);
493    }
494}