Skip to main content

wirespec_syntax/
lexer.rs

//! Hand-written lexer (tokenizer) for wirespec.
//!
//! Produces a stream of `Token`s from source text. Handles keywords, operators,
//! integer literals (decimal, hex, and binary, with optional `_` digit separators),
//! string literals, and `#` and `//` line comments.
6
7use crate::span::Span;
8
9#[derive(Debug, Clone, PartialEq)]
10pub struct Token {
11    pub kind: TokenKind,
12    pub span: Span,
13}
14
/// Every category of token the lexer can produce.
///
/// Only `Name`, `Integer` and `StringLit` carry a payload; all other variants
/// are fully described by the variant itself. Every payload type is `Eq`, so
/// the enum derives `Eq` alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    // Identifiers and literals
    /// An identifier that is not one of the reserved words below.
    Name(String),
    /// A decimal, hexadecimal (`0x`) or binary (`0b`) integer literal.
    Integer(i64),
    /// A double-quoted string literal with its escapes already resolved.
    StringLit(String),

    // Keywords
    Module,
    Import,
    Const,
    Enum,
    Flags,
    Type,
    Packet,
    Frame,
    Capsule,
    State,
    Machine,
    Transition,
    Initial,
    Terminal,
    On,
    Guard,
    Action,
    Delegate,
    Match,
    If,
    Let,
    Require,
    /// The `static_assert` keyword.
    StaticAssert,
    Within,
    Export,
    Varint,
    Bytes,
    Bits,
    Bit,
    Fill,
    Remaining,
    True,
    False,
    Null,
    And,
    Or,
    Not,
    /// The `in_state` keyword.
    InState,
    All,

    // Punctuation / operators
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Colon,
    ColonColon,
    Semicolon,
    Comma,
    Dot,
    DotDot,
    DotDotEq,
    Arrow,      // ->
    FatArrow,   // =>
    Assign,     // =
    PlusAssign, // +=
    At,         // @
    LArrow,     // <-

    // Arithmetic
    Plus,
    Minus,
    Star,
    Slash,
    Percent,

    // Bitwise
    Amp,
    Pipe,
    Caret,
    Shl,
    Shr,
    Bang,

    // Comparison
    EqEq,
    BangEq,
    Lt,
    Le,
    Gt,
    Ge,

    // Misc
    QuestionQuestion, // ??
    Tilde,            // ~
    TildeGt,          // ~>

    // Special
    /// End-of-input marker; the lexer appends exactly one as the final token.
    Eof,
}
115
116pub struct Lexer<'src> {
117    source: &'src [u8],
118    pos: usize,
119    tokens: Vec<Token>,
120}
121
122impl<'src> Lexer<'src> {
123    pub fn new(source: &'src str) -> Self {
124        Self {
125            source: source.as_bytes(),
126            pos: 0,
127            tokens: Vec::new(),
128        }
129    }
130
131    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
132        while self.pos < self.source.len() {
133            self.skip_whitespace_and_comments();
134            if self.pos >= self.source.len() {
135                break;
136            }
137            self.next_token()?;
138        }
139        self.tokens.push(Token {
140            kind: TokenKind::Eof,
141            span: Span::new(self.pos as u32, 0),
142        });
143        Ok(self.tokens)
144    }
145
146    fn peek(&self) -> u8 {
147        if self.pos < self.source.len() {
148            self.source[self.pos]
149        } else {
150            0
151        }
152    }
153
154    fn peek_at(&self, offset: usize) -> u8 {
155        let idx = self.pos + offset;
156        if idx < self.source.len() {
157            self.source[idx]
158        } else {
159            0
160        }
161    }
162
163    fn advance(&mut self) -> u8 {
164        let ch = self.source[self.pos];
165        self.pos += 1;
166        ch
167    }
168
169    fn skip_whitespace_and_comments(&mut self) {
170        while self.pos < self.source.len() {
171            let ch = self.peek();
172            if ch == b' ' || ch == b'\t' || ch == b'\r' || ch == b'\n' {
173                self.pos += 1;
174            } else if ch == b'#' || (ch == b'/' && self.peek_at(1) == b'/') {
175                // Line comment (# or //)
176                while self.pos < self.source.len() && self.source[self.pos] != b'\n' {
177                    self.pos += 1;
178                }
179            } else {
180                break;
181            }
182        }
183    }
184
185    fn next_token(&mut self) -> Result<(), LexError> {
186        let start = self.pos;
187        let ch = self.advance();
188
189        let kind = match ch {
190            b'{' => TokenKind::LBrace,
191            b'}' => TokenKind::RBrace,
192            b'(' => TokenKind::LParen,
193            b')' => TokenKind::RParen,
194            b'[' => TokenKind::LBracket,
195            b']' => TokenKind::RBracket,
196            b';' => TokenKind::Semicolon,
197            b',' => TokenKind::Comma,
198            b'*' => TokenKind::Star,
199            b'%' => TokenKind::Percent,
200            b'^' => TokenKind::Caret,
201            b'@' => TokenKind::At,
202
203            b':' => {
204                if self.peek() == b':' {
205                    self.advance();
206                    TokenKind::ColonColon
207                } else {
208                    TokenKind::Colon
209                }
210            }
211
212            b'.' => {
213                if self.peek() == b'.' {
214                    self.advance();
215                    if self.peek() == b'=' {
216                        self.advance();
217                        TokenKind::DotDotEq
218                    } else {
219                        TokenKind::DotDot
220                    }
221                } else {
222                    TokenKind::Dot
223                }
224            }
225
226            b'-' => {
227                if self.peek() == b'>' {
228                    self.advance();
229                    TokenKind::Arrow
230                } else {
231                    TokenKind::Minus
232                }
233            }
234
235            b'=' => {
236                if self.peek() == b'>' {
237                    self.advance();
238                    TokenKind::FatArrow
239                } else if self.peek() == b'=' {
240                    self.advance();
241                    TokenKind::EqEq
242                } else {
243                    TokenKind::Assign
244                }
245            }
246
247            b'+' => {
248                if self.peek() == b'=' {
249                    self.advance();
250                    TokenKind::PlusAssign
251                } else {
252                    TokenKind::Plus
253                }
254            }
255
256            b'!' => {
257                if self.peek() == b'=' {
258                    self.advance();
259                    TokenKind::BangEq
260                } else {
261                    TokenKind::Bang
262                }
263            }
264
265            b'<' => {
266                if self.peek() == b'=' {
267                    self.advance();
268                    TokenKind::Le
269                } else if self.peek() == b'<' {
270                    self.advance();
271                    TokenKind::Shl
272                } else if self.peek() == b'-' {
273                    self.advance();
274                    TokenKind::LArrow
275                } else {
276                    TokenKind::Lt
277                }
278            }
279
280            b'>' => {
281                if self.peek() == b'=' {
282                    self.advance();
283                    TokenKind::Ge
284                } else if self.peek() == b'>' {
285                    self.advance();
286                    TokenKind::Shr
287                } else {
288                    TokenKind::Gt
289                }
290            }
291
292            b'&' => TokenKind::Amp,
293            b'|' => TokenKind::Pipe,
294            b'/' => TokenKind::Slash,
295            b'~' => {
296                if self.peek() == b'>' {
297                    self.advance();
298                    TokenKind::TildeGt
299                } else {
300                    TokenKind::Tilde
301                }
302            }
303
304            b'?' => {
305                if self.peek() == b'?' {
306                    self.advance();
307                    TokenKind::QuestionQuestion
308                } else {
309                    return Err(LexError {
310                        msg: "unexpected '?'".into(),
311                        offset: start,
312                    });
313                }
314            }
315
316            b'"' => return self.lex_string(start),
317
318            b'0' if self.peek() == b'x' || self.peek() == b'X' => {
319                self.advance(); // skip 'x'
320                return self.lex_hex(start);
321            }
322
323            b'0' if self.peek() == b'b' || self.peek() == b'B' => {
324                self.advance(); // skip 'b'
325                return self.lex_binary(start);
326            }
327
328            ch if ch.is_ascii_digit() => {
329                return self.lex_decimal(start);
330            }
331
332            ch if ch.is_ascii_alphabetic() || ch == b'_' => {
333                return self.lex_name(start);
334            }
335
336            _ => {
337                return Err(LexError {
338                    msg: format!("unexpected character: {:?}", ch as char),
339                    offset: start,
340                });
341            }
342        };
343
344        self.tokens.push(Token {
345            kind,
346            span: Span::new(start as u32, (self.pos - start) as u32),
347        });
348        Ok(())
349    }
350
351    fn lex_decimal(&mut self, start: usize) -> Result<(), LexError> {
352        while self.pos < self.source.len() && (self.peek().is_ascii_digit() || self.peek() == b'_')
353        {
354            self.advance();
355        }
356        let text: String = self.source[start..self.pos]
357            .iter()
358            .filter(|&&b| b != b'_')
359            .map(|&b| b as char)
360            .collect();
361        let value = text.parse::<i64>().map_err(|_| LexError {
362            msg: format!("invalid integer literal: {text}"),
363            offset: start,
364        })?;
365        self.tokens.push(Token {
366            kind: TokenKind::Integer(value),
367            span: Span::new(start as u32, (self.pos - start) as u32),
368        });
369        Ok(())
370    }
371
372    fn lex_hex(&mut self, start: usize) -> Result<(), LexError> {
373        if self.pos >= self.source.len() || !self.peek().is_ascii_hexdigit() {
374            return Err(LexError {
375                msg: "expected hex digit after 0x".into(),
376                offset: start,
377            });
378        }
379        while self.pos < self.source.len()
380            && (self.peek().is_ascii_hexdigit() || self.peek() == b'_')
381        {
382            self.advance();
383        }
384        // start points at '0', we skip "0x" prefix for parsing
385        let text: String = self.source[start + 2..self.pos]
386            .iter()
387            .filter(|&&b| b != b'_')
388            .map(|&b| b as char)
389            .collect();
390        let value = i64::from_str_radix(&text, 16).map_err(|_| LexError {
391            msg: format!("invalid hex literal: 0x{text}"),
392            offset: start,
393        })?;
394        self.tokens.push(Token {
395            kind: TokenKind::Integer(value),
396            span: Span::new(start as u32, (self.pos - start) as u32),
397        });
398        Ok(())
399    }
400
401    fn lex_binary(&mut self, start: usize) -> Result<(), LexError> {
402        if self.pos >= self.source.len() || (self.peek() != b'0' && self.peek() != b'1') {
403            return Err(LexError {
404                msg: "expected binary digit after 0b".into(),
405                offset: start,
406            });
407        }
408        while self.pos < self.source.len()
409            && (self.peek() == b'0' || self.peek() == b'1' || self.peek() == b'_')
410        {
411            self.advance();
412        }
413        let text: String = self.source[start + 2..self.pos]
414            .iter()
415            .filter(|&&b| b != b'_')
416            .map(|&b| b as char)
417            .collect();
418        let value = i64::from_str_radix(&text, 2).map_err(|_| LexError {
419            msg: format!("invalid binary literal: 0b{text}"),
420            offset: start,
421        })?;
422        self.tokens.push(Token {
423            kind: TokenKind::Integer(value),
424            span: Span::new(start as u32, (self.pos - start) as u32),
425        });
426        Ok(())
427    }
428
429    fn lex_string(&mut self, start: usize) -> Result<(), LexError> {
430        let mut value = String::new();
431        loop {
432            if self.pos >= self.source.len() {
433                return Err(LexError {
434                    msg: "unterminated string literal".into(),
435                    offset: start,
436                });
437            }
438            let ch = self.advance();
439            match ch {
440                b'"' => break,
441                b'\\' => {
442                    if self.pos >= self.source.len() {
443                        return Err(LexError {
444                            msg: "unterminated escape in string".into(),
445                            offset: start,
446                        });
447                    }
448                    let esc = self.advance();
449                    match esc {
450                        b'n' => value.push('\n'),
451                        b't' => value.push('\t'),
452                        b'\\' => value.push('\\'),
453                        b'"' => value.push('"'),
454                        _ => {
455                            return Err(LexError {
456                                msg: format!("unknown escape: \\{}", esc as char),
457                                offset: self.pos - 1,
458                            });
459                        }
460                    }
461                }
462                _ => value.push(ch as char),
463            }
464        }
465        self.tokens.push(Token {
466            kind: TokenKind::StringLit(value),
467            span: Span::new(start as u32, (self.pos - start) as u32),
468        });
469        Ok(())
470    }
471
472    fn lex_name(&mut self, start: usize) -> Result<(), LexError> {
473        while self.pos < self.source.len()
474            && (self.peek().is_ascii_alphanumeric() || self.peek() == b'_')
475        {
476            self.advance();
477        }
478        let text = std::str::from_utf8(&self.source[start..self.pos])
479            .expect("identifier bytes must be valid UTF-8");
480        let kind = match text {
481            "module" => TokenKind::Module,
482            "import" => TokenKind::Import,
483            "const" => TokenKind::Const,
484            "enum" => TokenKind::Enum,
485            "flags" => TokenKind::Flags,
486            "type" => TokenKind::Type,
487            "packet" => TokenKind::Packet,
488            "frame" => TokenKind::Frame,
489            "capsule" => TokenKind::Capsule,
490            "state" => TokenKind::State,
491            "machine" => TokenKind::Machine,
492            "transition" => TokenKind::Transition,
493            "initial" => TokenKind::Initial,
494            "terminal" => TokenKind::Terminal,
495            "on" => TokenKind::On,
496            "guard" => TokenKind::Guard,
497            "action" => TokenKind::Action,
498            "delegate" => TokenKind::Delegate,
499            "match" => TokenKind::Match,
500            "if" => TokenKind::If,
501            "let" => TokenKind::Let,
502            "require" => TokenKind::Require,
503            "static_assert" => TokenKind::StaticAssert,
504            "within" => TokenKind::Within,
505            "export" => TokenKind::Export,
506            "varint" => TokenKind::Varint,
507            "bytes" => TokenKind::Bytes,
508            "bits" => TokenKind::Bits,
509            "bit" => TokenKind::Bit,
510            "fill" => TokenKind::Fill,
511            "remaining" => TokenKind::Remaining,
512            "true" => TokenKind::True,
513            "false" => TokenKind::False,
514            "null" => TokenKind::Null,
515            "and" => TokenKind::And,
516            "or" => TokenKind::Or,
517            "not" => TokenKind::Not,
518            "in_state" => TokenKind::InState,
519            "all" => TokenKind::All,
520            _ => TokenKind::Name(text.to_string()),
521        };
522        self.tokens.push(Token {
523            kind,
524            span: Span::new(start as u32, (self.pos - start) as u32),
525        });
526        Ok(())
527    }
528}
529
/// Failure reported by the lexer.
#[derive(Debug, Clone)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub msg: String,
    /// Byte offset into the source text that the error refers to.
    pub offset: usize,
}

impl std::fmt::Display for LexError {
    /// Renders the error as `lex error at offset <offset>: <msg>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { msg, offset } = self;
        write!(f, "lex error at offset {offset}: {msg}")
    }
}

impl std::error::Error for LexError {}
543
#[cfg(test)]
mod tests {
    use super::{Lexer, TokenKind as K};

    /// Lexes `src`, panicking on any lex error, and keeps only the token kinds.
    fn tok_kinds(src: &str) -> Vec<K> {
        Lexer::new(src)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|token| token.kind)
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            tok_kinds("packet frame capsule type match"),
            [K::Packet, K::Frame, K::Capsule, K::Type, K::Match, K::Eof]
        );
    }

    #[test]
    fn integers() {
        assert_eq!(
            tok_kinds("42 0xFF 0b1010"),
            [
                K::Integer(42),
                K::Integer(0xFF),
                K::Integer(0b1010),
                K::Eof
            ]
        );
    }

    #[test]
    fn operators() {
        let expected = [
            K::Plus,
            K::Minus,
            K::Star,
            K::Slash,
            K::Amp,
            K::Pipe,
            K::Caret,
            K::Shl,
            K::Shr,
            K::EqEq,
            K::BangEq,
            K::Le,
            K::Ge,
            K::QuestionQuestion,
            K::FatArrow,
            K::Arrow,
            K::LArrow,
            K::DotDotEq,
            K::Eof,
        ];
        assert_eq!(
            tok_kinds("+ - * / & | ^ << >> == != <= >= ?? => -> <- ..="),
            expected
        );
    }

    #[test]
    fn string_literal() {
        assert_eq!(
            tok_kinds(r#""hello world""#),
            [K::StringLit("hello world".to_string()), K::Eof]
        );
    }

    #[test]
    fn comments() {
        // Both comment styles run to the end of their line and yield no tokens.
        assert_eq!(
            tok_kinds("packet # comment\nframe // also comment\ncapsule"),
            [K::Packet, K::Frame, K::Capsule, K::Eof]
        );
    }

    #[test]
    fn name_and_reserved() {
        let expected = [
            K::Name("src".into()),
            K::Name("dst".into()),
            K::Fill,
            K::Remaining,
            K::InState,
            K::All,
            K::True,
            K::False,
            K::Null,
            K::Eof,
        ];
        assert_eq!(
            tok_kinds("src dst fill remaining in_state all true false null"),
            expected
        );
    }

    #[test]
    fn hex_underscore() {
        assert_eq!(tok_kinds("0xFF_FF"), [K::Integer(0xFFFF), K::Eof]);
    }

    #[test]
    fn binary_underscore() {
        assert_eq!(tok_kinds("0b1010_0101"), [K::Integer(0b10100101), K::Eof]);
    }

    #[test]
    fn decimal_underscore() {
        assert_eq!(tok_kinds("1_000_000"), [K::Integer(1000000), K::Eof]);
    }

    #[test]
    fn empty_string() {
        assert_eq!(tok_kinds(r#""""#), [K::StringLit("".to_string()), K::Eof]);
    }

    #[test]
    fn string_escapes() {
        assert_eq!(
            tok_kinds(r#""\n\t\\\"" "#),
            [K::StringLit("\n\t\\\"".to_string()), K::Eof]
        );
    }

    #[test]
    fn consecutive_operators() {
        // Greedy matching splits `>>>=` into `>>` followed by `>=`.
        assert_eq!(tok_kinds(">>>="), [K::Shr, K::Ge, K::Eof]);
    }
}