flutmax_parser/
lexer.rs

//! Hand-written lexer for flutmax source code.
//!
//! Converts a source string into a stream of `Token`s that the recursive-descent
//! parser consumes. Key design decisions:
//!
//! - Identifiers: `[a-zA-Z_][a-zA-Z0-9_]*(-[a-zA-Z0-9_]+)*` (hyphens allowed, e.g. `drunk-walk`)
//! - Outside of number literals, a dot is emitted as a separate `Dot` token
//! - Tilde `~` is always emitted as a separate `Tilde` token
//! - The parser reassembles dotted names (`jit.gl.render`) and tilde names (`cycle~`)
//! - `.attr(` is recognized as a single `DotAttrLParen` token
//! - Operator chars (`?*+/%!<>=&|^-`) form `Operator` tokens when not part of numbers
//! - Keywords are checked after identifier scanning
13use crate::tokens::{Token, TokenType};
14
/// Byte-oriented cursor over a flutmax source string.
///
/// Holds the source as raw bytes plus the position of the next unread byte,
/// both as an offset and as a line/column pair used to stamp tokens and errors.
pub struct Lexer<'a> {
    /// The source text, viewed as bytes.
    source: &'a [u8],
    /// Byte offset of the next unread byte in `source`.
    pos: usize,
    /// Line of the next unread byte; starts at 1 and grows on every `\n`.
    line: usize,
    /// Column of the next unread byte; starts at 1, advances once per byte
    /// and resets to 1 after a `\n`.
    col: usize,
}
21
22impl<'a> Lexer<'a> {
23    pub fn new(source: &'a str) -> Self {
24        Self {
25            source: source.as_bytes(),
26            pos: 0,
27            line: 1,
28            col: 1,
29        }
30    }
31
32    /// Tokenize the entire source, returning a Vec of tokens (ending with Eof).
33    pub fn tokenize(source: &str) -> Result<Vec<Token>, LexError> {
34        let mut lexer = Lexer::new(source);
35        let mut tokens = Vec::new();
36        loop {
37            let tok = lexer.next_token()?;
38            let is_eof = tok.token_type == TokenType::Eof;
39            tokens.push(tok);
40            if is_eof {
41                break;
42            }
43        }
44        Ok(tokens)
45    }
46
47    /// Tokenize the entire source, including comment tokens (for semantic highlighting).
48    /// Comments are emitted as `TokenType::Comment` tokens instead of being skipped.
49    pub fn tokenize_with_comments(source: &str) -> Result<Vec<Token>, LexError> {
50        let mut lexer = Lexer::new(source);
51        let mut tokens = Vec::new();
52        loop {
53            let tok = lexer.next_token_with_comments()?;
54            let is_eof = tok.token_type == TokenType::Eof;
55            tokens.push(tok);
56            if is_eof {
57                break;
58            }
59        }
60        Ok(tokens)
61    }
62
63    fn peek(&self) -> Option<u8> {
64        self.source.get(self.pos).copied()
65    }
66
67    fn peek_at(&self, offset: usize) -> Option<u8> {
68        self.source.get(self.pos + offset).copied()
69    }
70
71    fn advance(&mut self) -> Option<u8> {
72        let ch = self.source.get(self.pos).copied()?;
73        self.pos += 1;
74        if ch == b'\n' {
75            self.line += 1;
76            self.col = 1;
77        } else {
78            self.col += 1;
79        }
80        Some(ch)
81    }
82
83    fn skip_whitespace_and_comments(&mut self) {
84        loop {
85            // Skip whitespace
86            while let Some(ch) = self.peek() {
87                if ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r' {
88                    self.advance();
89                } else {
90                    break;
91                }
92            }
93            // Skip line comments
94            if self.peek() == Some(b'/') && self.peek_at(1) == Some(b'/') {
95                // Consume until end of line
96                while let Some(ch) = self.peek() {
97                    if ch == b'\n' {
98                        break;
99                    }
100                    self.advance();
101                }
102                // Continue to skip more whitespace/comments
103                continue;
104            }
105            break;
106        }
107    }
108
109    /// Skip whitespace only (not comments). Returns `Some(Comment token)` if a
110    /// comment was found, or `None` if no comment follows whitespace.
111    fn skip_whitespace_and_maybe_comment(&mut self) -> Option<Token> {
112        // Skip whitespace
113        while let Some(ch) = self.peek() {
114            if ch == b' ' || ch == b'\t' || ch == b'\n' || ch == b'\r' {
115                self.advance();
116            } else {
117                break;
118            }
119        }
120        // Check for line comment — emit as token instead of skipping
121        if self.peek() == Some(b'/') && self.peek_at(1) == Some(b'/') {
122            let line = self.line;
123            let col = self.col;
124            let start = self.pos;
125            // Consume until end of line
126            while let Some(ch) = self.peek() {
127                if ch == b'\n' {
128                    break;
129                }
130                self.advance();
131            }
132            let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
133            return Some(Token::new(TokenType::Comment, text, line, col));
134        }
135        None
136    }
137
138    /// Like `next_token` but emits `Comment` tokens instead of skipping them.
139    fn next_token_with_comments(&mut self) -> Result<Token, LexError> {
140        if let Some(comment_tok) = self.skip_whitespace_and_maybe_comment() {
141            return Ok(comment_tok);
142        }
143
144        let line = self.line;
145        let col = self.col;
146
147        let ch = match self.peek() {
148            Some(ch) => ch,
149            None => return Ok(Token::new(TokenType::Eof, "", line, col)),
150        };
151
152        // Delegate to the same matching logic as next_token
153        self.lex_token_char(ch, line, col)
154    }
155
156    fn next_token(&mut self) -> Result<Token, LexError> {
157        self.skip_whitespace_and_comments();
158
159        let line = self.line;
160        let col = self.col;
161
162        let ch = match self.peek() {
163            Some(ch) => ch,
164            None => return Ok(Token::new(TokenType::Eof, "", line, col)),
165        };
166
167        self.lex_token_char(ch, line, col)
168    }
169
170    /// Core token matching logic shared by `next_token` and `next_token_with_comments`.
171    fn lex_token_char(&mut self, ch: u8, line: usize, col: usize) -> Result<Token, LexError> {
172        match ch {
173            b'(' => {
174                self.advance();
175                Ok(Token::new(TokenType::LParen, "(", line, col))
176            }
177            b')' => {
178                self.advance();
179                Ok(Token::new(TokenType::RParen, ")", line, col))
180            }
181            b'[' => {
182                self.advance();
183                Ok(Token::new(TokenType::LBracket, "[", line, col))
184            }
185            b']' => {
186                self.advance();
187                Ok(Token::new(TokenType::RBracket, "]", line, col))
188            }
189            b',' => {
190                self.advance();
191                Ok(Token::new(TokenType::Comma, ",", line, col))
192            }
193            b';' => {
194                self.advance();
195                Ok(Token::new(TokenType::Semicolon, ";", line, col))
196            }
197            b':' => {
198                self.advance();
199                Ok(Token::new(TokenType::Colon, ":", line, col))
200            }
201            b'~' => {
202                self.advance();
203                Ok(Token::new(TokenType::Tilde, "~", line, col))
204            }
205            b'.' => {
206                // Check for `.attr(` special token
207                if self.matches_ahead(b".attr(") {
208                    for _ in 0..6 {
209                        self.advance();
210                    }
211                    Ok(Token::new(TokenType::DotAttrLParen, ".attr(", line, col))
212                } else {
213                    self.advance();
214                    Ok(Token::new(TokenType::Dot, ".", line, col))
215                }
216            }
217            b'=' => {
218                // Could be `=`, `==`, or longer operator
219                // If followed by `=` or another operator char, treat as operator
220                if self.peek_at(1) == Some(b'=') || self.is_operator_char_at(1) {
221                    self.lex_operator(line, col)
222                } else {
223                    self.advance();
224                    Ok(Token::new(TokenType::Eq, "=", line, col))
225                }
226            }
227            b'"' => self.lex_string(line, col),
228            b'-' => {
229                // Negative number or operator
230                // It's a negative number if followed by a digit
231                // BUT only if the previous significant token is not an identifier/number/rparen
232                // (to handle `sub(1, -2)` vs operator `-`)
233                // For simplicity: if `-` followed by digit, lex as number
234                if self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
235                    self.lex_number(line, col)
236                } else {
237                    self.lex_operator(line, col)
238                }
239            }
240            _ if ch.is_ascii_digit() => self.lex_number(line, col),
241            _ if is_ident_start(ch) => self.lex_identifier(line, col),
242            _ if is_operator_char(ch) => self.lex_operator(line, col),
243            _ => Err(LexError {
244                message: format!("Unexpected character '{}'", ch as char),
245                line,
246                column: col,
247            }),
248        }
249    }
250
251    fn matches_ahead(&self, pattern: &[u8]) -> bool {
252        if self.pos + pattern.len() > self.source.len() {
253            return false;
254        }
255        &self.source[self.pos..self.pos + pattern.len()] == pattern
256    }
257
258    fn is_operator_char_at(&self, offset: usize) -> bool {
259        self.peek_at(offset).is_some_and(is_operator_char)
260    }
261
262    /// Lex an identifier: `[a-zA-Z_][a-zA-Z0-9_]*(-[a-zA-Z0-9_]+)*`
263    /// Then check if it's a keyword.
264    fn lex_identifier(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
265        let start = self.pos;
266        // First char: [a-zA-Z_]
267        self.advance();
268        // Continue: [a-zA-Z0-9_]
269        while let Some(ch) = self.peek() {
270            if ch.is_ascii_alphanumeric() || ch == b'_' {
271                self.advance();
272            } else {
273                break;
274            }
275        }
276        // Hyphenated segments: `-[a-zA-Z0-9_]+`
277        // A hyphen followed by alphanumeric (not a digit alone, to avoid `-7`)
278        while self.peek() == Some(b'-') {
279            // Look ahead: next char after `-` must be a letter or digit that forms
280            // part of the identifier, not a standalone negative number
281            if let Some(next) = self.peek_at(1) {
282                if next.is_ascii_alphanumeric() || next == b'_' {
283                    self.advance(); // consume `-`
284                                    // consume segment
285                    while let Some(ch) = self.peek() {
286                        if ch.is_ascii_alphanumeric() || ch == b'_' {
287                            self.advance();
288                        } else {
289                            break;
290                        }
291                    }
292                } else {
293                    break;
294                }
295            } else {
296                break;
297            }
298        }
299
300        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
301        let token_type = keyword_or_ident(text);
302        Ok(Token::new(token_type, text, line, col))
303    }
304
305    /// Lex a number: integer, float, scientific notation, trailing dot, negative.
306    fn lex_number(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
307        let start = self.pos;
308
309        // Optional leading `-`
310        if self.peek() == Some(b'-') {
311            self.advance();
312        }
313
314        // Digits
315        self.consume_digits();
316
317        // Optional decimal part
318        if self.peek() == Some(b'.') {
319            // Peek ahead: if the next char after `.` is a digit, or nothing follows
320            // (trailing dot like `100.`), consume the dot.
321            // But NOT if it's `.attr(` or `.in[` etc.
322            let after_dot = self.peek_at(1);
323            let consume_dot = match after_dot {
324                Some(d) if d.is_ascii_digit() => true,
325                // Trailing dot: `100.` — only if not followed by identifier start
326                // (which would be member access like `100.something`)
327                Some(d) if is_ident_start(d) => false,
328                _ => true, // end of input, space, comma, paren, etc.
329            };
330            if consume_dot {
331                self.advance(); // consume `.`
332                self.consume_digits(); // may be empty for trailing dot
333            }
334        }
335
336        // Optional scientific notation
337        if let Some(ch) = self.peek() {
338            if ch == b'e' || ch == b'E' {
339                self.advance(); // consume `e`/`E`
340                                // Optional sign
341                if let Some(sign) = self.peek() {
342                    if sign == b'+' || sign == b'-' {
343                        self.advance();
344                    }
345                }
346                self.consume_digits();
347            }
348        }
349
350        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
351        Ok(Token::new(TokenType::NumberLit, text, line, col))
352    }
353
354    fn consume_digits(&mut self) {
355        while let Some(ch) = self.peek() {
356            if ch.is_ascii_digit() {
357                self.advance();
358            } else {
359                break;
360            }
361        }
362    }
363
364    /// Lex a string literal: `"..."` with escape sequences.
365    fn lex_string(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
366        let start = self.pos;
367        self.advance(); // consume opening `"`
368
369        loop {
370            match self.peek() {
371                Some(b'"') => {
372                    self.advance(); // consume closing `"`
373                    break;
374                }
375                Some(b'\\') => {
376                    self.advance(); // consume `\`
377                    self.advance(); // consume escaped char
378                }
379                Some(_) => {
380                    self.advance();
381                }
382                None => {
383                    return Err(LexError {
384                        message: "Unterminated string literal".to_string(),
385                        line,
386                        column: col,
387                    });
388                }
389            }
390        }
391
392        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
393        Ok(Token::new(TokenType::StringLit, text, line, col))
394    }
395
396    /// Lex an operator: `[*+/%!<>=&|^?-]+`
397    /// Also consumes `=` characters when part of multi-char operators like `==`, `!=`, `<=`, `>=`.
398    fn lex_operator(&mut self, line: usize, col: usize) -> Result<Token, LexError> {
399        let start = self.pos;
400        while let Some(ch) = self.peek() {
401            if is_operator_char(ch) || ch == b'=' {
402                self.advance();
403            } else {
404                break;
405            }
406        }
407        let text = std::str::from_utf8(&self.source[start..self.pos]).unwrap();
408        Ok(Token::new(TokenType::Operator, text, line, col))
409    }
410}
411
/// Report whether `ch` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
415
/// Report whether `ch` is one of the operator characters `* + / % ! < > & | ^ ?`.
///
/// `-` and `=` are intentionally not part of this set; the lexer handles them
/// through dedicated branches.
fn is_operator_char(ch: u8) -> bool {
    b"*+/%!<>&|^?".contains(&ch)
}
422
423fn keyword_or_ident(text: &str) -> TokenType {
424    match text {
425        "wire" => TokenType::Wire,
426        "in" => TokenType::In,
427        "out" => TokenType::Out,
428        "state" => TokenType::State,
429        "msg" => TokenType::Msg,
430        "feedback" => TokenType::Feedback,
431        "signal" => TokenType::Signal,
432        "float" => TokenType::Float,
433        "int" => TokenType::Int,
434        "bang" => TokenType::Bang,
435        "list" => TokenType::List,
436        "symbol" => TokenType::Symbol,
437        _ => TokenType::Identifier,
438    }
439}
440
/// Error produced when the lexer meets input it cannot tokenize.
///
/// `line` and `column` locate the start of the offending token using the
/// lexer's 1-based position tracking.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Line on which the offending token starts.
    pub line: usize,
    /// Column at which the offending token starts.
    pub column: usize,
}
447
448impl std::fmt::Display for LexError {
449    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
450        write!(
451            f,
452            "Lex error at {}:{}: {}",
453            self.line, self.column, self.message
454        )
455    }
456}
457
// Marker impl: the `Debug` and `Display` impls above satisfy everything the
// trait requires, so no methods need overriding.
impl std::error::Error for LexError {}
459
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokens::TokenType::*;

    /// Tokenize `source` (panicking on a lex error) and keep only the token types.
    fn types(source: &str) -> Vec<TokenType> {
        let tokens = Lexer::tokenize(source).unwrap();
        tokens.into_iter().map(|tok| tok.token_type).collect()
    }

    /// Tokenize `source` (panicking on a lex error) and keep only the lexemes.
    fn lexemes(source: &str) -> Vec<String> {
        let tokens = Lexer::tokenize(source).unwrap();
        tokens.into_iter().map(|tok| tok.lexeme).collect()
    }

    #[test]
    fn test_simple_wire() {
        // wire osc = cycle ~ ( 440 ) ;
        let expected = [
            Wire, Identifier, Eq, Identifier, Tilde, LParen, NumberLit, RParen, Semicolon, Eof,
        ];
        assert_eq!(types("wire osc = cycle~(440);"), expected);
    }

    #[test]
    fn test_in_decl() {
        let expected = [
            In, NumberLit, LParen, Identifier, RParen, Colon, Signal, Semicolon, Eof,
        ];
        assert_eq!(types("in 0 (freq): signal;"), expected);
    }

    #[test]
    fn test_dotted_identifier() {
        // Dotted names are split on every dot; the parser reassembles them.
        let expected = ["jit", ".", "gl", ".", "render", "(", "440", ")", ""];
        assert_eq!(lexemes("jit.gl.render(440)"), expected);
    }

    #[test]
    fn test_port_access() {
        // `w_1.in[0]` -> w_1 Dot in LBracket 0 RBracket
        let expected = [Identifier, Dot, In, LBracket, NumberLit, RBracket, Eof];
        assert_eq!(types("w_1.in[0]"), expected);
    }

    #[test]
    fn test_output_port_access() {
        let expected = [Identifier, Dot, Out, LBracket, NumberLit, RBracket, Eof];
        assert_eq!(types("w_1.out[1]"), expected);
    }

    #[test]
    fn test_numbers() {
        // Each literal must come back as one token followed by the empty Eof lexeme.
        let literals = [
            "42",      // integer
            "3.14",    // float
            "-7",      // negative
            "100.",    // trailing dot
            "1e-6",    // scientific notation
            "3.14E+5", // float + scientific
        ];
        for literal in literals {
            assert_eq!(lexemes(literal), [literal, ""]);
        }
    }

    #[test]
    fn test_string() {
        let source = r#""hello world""#;
        let toks = Lexer::tokenize(source).unwrap();
        assert_eq!(toks.len(), 2); // string + eof
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, source);
    }

    #[test]
    fn test_string_with_escapes() {
        let source = r#""hello \"world\"""#;
        let toks = Lexer::tokenize(source).unwrap();
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, source);
    }

    #[test]
    fn test_operator_names() {
        let expected = [Operator, LParen, Identifier, Comma, Identifier, RParen, Eof];
        assert_eq!(types("?(a, b)"), expected);

        let star_call = lexemes("*(x, y)");
        assert_eq!(star_call[0], "*");
    }

    #[test]
    fn test_comment_skipped() {
        let expected = [Wire, Identifier, Eq, NumberLit, Semicolon, Eof];
        assert_eq!(types("// comment\nwire x = 1;"), expected);
    }

    #[test]
    fn test_hyphenated_identifier() {
        assert_eq!(lexemes("drunk-walk"), ["drunk-walk", ""]);
    }

    #[test]
    fn test_dot_attr_lparen() {
        let expected = [DotAttrLParen, Identifier, Colon, NumberLit, RParen, Eof];
        assert_eq!(types(".attr(minimum: 0)"), expected);
    }

    #[test]
    fn test_negative_float() {
        assert_eq!(lexemes("-3.14"), ["-3.14", ""]);
    }

    #[test]
    fn test_line_column_tracking() {
        let toks = Lexer::tokenize("wire x\n  = 1;").unwrap();
        // Expected (line, column) of `wire`, `x`, `=` and `1`, in that order.
        let expected = [(1, 1), (1, 6), (2, 3), (2, 5)];
        for (index, position) in expected.into_iter().enumerate() {
            assert_eq!((toks[index].line, toks[index].column), position);
        }
    }

    #[test]
    fn test_empty_source() {
        assert_eq!(types(""), [Eof]);
    }

    #[test]
    fn test_out_assignment_tokens() {
        let expected = [
            Out, LBracket, NumberLit, RBracket, Eq, Identifier, Semicolon, Eof,
        ];
        assert_eq!(types("out[0] = osc;"), expected);
    }

    #[test]
    fn test_operator_eq_disambiguation() {
        // `==` must be a single operator token, not Eq Eq.
        let source = "==(a, b)";
        assert_eq!(lexemes(source)[0], "==");

        let expected = [Operator, LParen, Identifier, Comma, Identifier, RParen, Eof];
        assert_eq!(types(source), expected);
    }

    #[test]
    fn test_dotted_segment_with_digit() {
        // A dotted segment that starts with a digit (`jit.3m`) is not one
        // identifier: `3` lexes as a number and `m` as an identifier. The
        // parser is responsible for reassembling such segments.
        assert_eq!(lexemes("jit.3m"), ["jit", ".", "3", "m", ""]);
    }

    #[test]
    fn test_msg_tokens() {
        let expected = [Msg, Identifier, Eq, StringLit, Semicolon, Eof];
        assert_eq!(types(r#"msg click = "bang";"#), expected);
    }

    #[test]
    fn test_feedback_tokens() {
        let expected = [Feedback, Identifier, Colon, Signal, Semicolon, Eof];
        assert_eq!(types("feedback fb: signal;"), expected);
    }

    #[test]
    fn test_state_tokens() {
        let expected = [
            State, Identifier, Colon, Int, Eq, NumberLit, Semicolon, Eof,
        ];
        assert_eq!(types("state counter: int = 0;"), expected);
    }

    #[test]
    fn test_string_with_url() {
        // `//` inside a string literal must not start a comment.
        let source = r#""http://example.com""#;
        let toks = Lexer::tokenize(source).unwrap();
        assert_eq!(toks.len(), 2);
        assert_eq!(toks[0].token_type, StringLit);
        assert_eq!(toks[0].lexeme, source);
    }

    #[test]
    fn test_complex_expr() {
        // Tilde name with a float argument.
        let expected = ["mul", "~", "(", "osc", ",", "0.5", ")", ""];
        assert_eq!(lexemes("mul~(osc, 0.5)"), expected);
    }
}
flutmax_parser/lexer.rs

flutmax_parser/
lexer.rs