// fiddler_script/lexer.rs
1//! Lexer (tokenizer) for FiddlerScript.
2//!
3//! The lexer transforms source code into a stream of tokens that can be
4//! consumed by the parser.
5
6use crate::error::{LexError, Position};
7
/// A token produced by the lexer.
///
/// Pairs a [`TokenKind`] with the [`Position`] of the token's first
/// character in the source.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The type and value of the token
    pub kind: TokenKind,
    /// Position in source code (first character of the token)
    pub position: Position,
}
16
17impl Token {
18    /// Create a new token.
19    pub fn new(kind: TokenKind, position: Position) -> Self {
20        Self { kind, position }
21    }
22}
23
/// The different kinds of tokens in FiddlerScript.
///
/// Derives only `PartialEq` (not `Eq`/`Hash`) because `Float` carries an
/// `f64`, which is not `Eq`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    /// Integer literal
    Integer(i64),
    /// Float literal
    Float(f64),
    /// String literal (escape sequences already decoded by the lexer)
    String(String),

    // Identifiers and keywords
    /// Identifier (variable or function name)
    Identifier(String),
    /// `let` keyword
    Let,
    /// `if` keyword
    If,
    /// `else` keyword
    Else,
    /// `for` keyword
    For,
    /// `fn` keyword
    Fn,
    /// `return` keyword
    Return,
    /// `true` keyword
    True,
    /// `false` keyword
    False,
    /// `null` keyword
    Null,

    // Operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `=`
    Assign,
    /// `==`
    Equal,
    /// `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterEqual,
    /// `!`
    Bang,
    /// `&&`
    And,
    /// `||`
    Or,

    // Delimiters
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `.`
    Dot,

    // Special
    /// End of file
    Eof,
}
115
116impl std::fmt::Display for TokenKind {
117    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
118        match self {
119            TokenKind::Integer(n) => write!(f, "{}", n),
120            TokenKind::Float(fl) => write!(f, "{}", fl),
121            TokenKind::String(s) => write!(f, "\"{}\"", s),
122            TokenKind::Identifier(s) => write!(f, "{}", s),
123            TokenKind::Let => write!(f, "let"),
124            TokenKind::If => write!(f, "if"),
125            TokenKind::Else => write!(f, "else"),
126            TokenKind::For => write!(f, "for"),
127            TokenKind::Fn => write!(f, "fn"),
128            TokenKind::Return => write!(f, "return"),
129            TokenKind::True => write!(f, "true"),
130            TokenKind::False => write!(f, "false"),
131            TokenKind::Null => write!(f, "null"),
132            TokenKind::Plus => write!(f, "+"),
133            TokenKind::Minus => write!(f, "-"),
134            TokenKind::Star => write!(f, "*"),
135            TokenKind::Slash => write!(f, "/"),
136            TokenKind::Percent => write!(f, "%"),
137            TokenKind::Assign => write!(f, "="),
138            TokenKind::Equal => write!(f, "=="),
139            TokenKind::NotEqual => write!(f, "!="),
140            TokenKind::LessThan => write!(f, "<"),
141            TokenKind::LessEqual => write!(f, "<="),
142            TokenKind::GreaterThan => write!(f, ">"),
143            TokenKind::GreaterEqual => write!(f, ">="),
144            TokenKind::Bang => write!(f, "!"),
145            TokenKind::And => write!(f, "&&"),
146            TokenKind::Or => write!(f, "||"),
147            TokenKind::LeftParen => write!(f, "("),
148            TokenKind::RightParen => write!(f, ")"),
149            TokenKind::LeftBrace => write!(f, "{{"),
150            TokenKind::RightBrace => write!(f, "}}"),
151            TokenKind::LeftBracket => write!(f, "["),
152            TokenKind::RightBracket => write!(f, "]"),
153            TokenKind::Comma => write!(f, ","),
154            TokenKind::Semicolon => write!(f, ";"),
155            TokenKind::Colon => write!(f, ":"),
156            TokenKind::Dot => write!(f, "."),
157            TokenKind::Eof => write!(f, "EOF"),
158        }
159    }
160}
161
/// The lexer that tokenizes FiddlerScript source code.
///
/// Invariant: `offset` is the byte offset just past the last consumed
/// character, which (because `char_indices` is contiguous) is also the
/// start of the next unconsumed character.
pub struct Lexer<'a> {
    /// Source code being tokenized (kept so token text can be sliced out)
    source: &'a str,
    /// Characters iterator with indices
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    /// Current line number (1-indexed)
    line: usize,
    /// Current column number (1-indexed, counted in chars, reset on '\n')
    column: usize,
    /// Current byte offset (just past the last consumed character)
    offset: usize,
    /// Whether we've reached EOF.
    /// NOTE(review): set by `next_token` but never read within this file —
    /// possibly consulted by code outside this view, or dead state.
    at_eof: bool,
}
177
impl<'a> Lexer<'a> {
    /// Create a new lexer for the given source code.
    ///
    /// Position tracking starts at line 1, column 1, byte offset 0.
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            chars: source.char_indices().peekable(),
            line: 1,
            column: 1,
            offset: 0,
            at_eof: false,
        }
    }

    /// Get the current position in the source.
    ///
    /// Since `advance` stores the offset *past* each consumed character,
    /// this is also the start position of the next unconsumed character.
    fn position(&self) -> Position {
        Position::new(self.line, self.column, self.offset)
    }

    /// Advance to the next character.
    ///
    /// Updates line/column bookkeeping (a newline bumps `line` and resets
    /// `column` to 1) and sets `offset` to the byte offset just past the
    /// consumed character. Returns `None` once the input is exhausted.
    fn advance(&mut self) -> Option<char> {
        if let Some((idx, ch)) = self.chars.next() {
            // Offset now points at the start of the following character.
            self.offset = idx + ch.len_utf8();
            if ch == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
            Some(ch)
        } else {
            None
        }
    }

    /// Peek at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().map(|(_, ch)| *ch)
    }

    /// Peek at the character after the current one without consuming.
    ///
    /// Works on a clone of the iterator (cheap — an index over the same
    /// `&str`), leaving the real cursor and position state untouched.
    fn peek_next(&self) -> Option<char> {
        let mut iter = self.chars.clone();
        iter.next(); // Skip current
        iter.peek().map(|(_, ch)| *ch)
    }

    /// Skip whitespace and comments.
    ///
    /// Alternates between skipping whitespace runs and `//` line comments
    /// until neither follows, so any interleaving of the two is consumed.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            while let Some(ch) = self.peek() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comment: // until end of line
            if self.peek() == Some('/') && self.peek_next() == Some('/') {
                // Skip until end of line
                self.advance(); // consume first /
                self.advance(); // consume second /
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        // Leave the newline for the whitespace pass so all
                        // line/column updates stay inside `advance`.
                        break;
                    }
                    self.advance();
                }
                continue; // Check for more whitespace/comments
            }

            break;
        }
    }

    /// Tokenize an identifier or keyword.
    ///
    /// `first_char` has already been consumed by the caller; `start_pos` is
    /// its position. Consumes following alphanumeric/`_` characters (via
    /// `char::is_alphanumeric`, so Unicode letters and digits are accepted),
    /// then checks the slice against the keyword table.
    fn scan_identifier(&mut self, first_char: char, start_pos: Position) -> Token {
        // Rewind over the already-consumed first char so the whole
        // identifier can be sliced out of `source` afterwards.
        let start_offset = self.offset - first_char.len_utf8();

        while let Some(ch) = self.peek() {
            if ch.is_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];
        let kind = match text {
            "let" => TokenKind::Let,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "for" => TokenKind::For,
            "fn" => TokenKind::Fn,
            "return" => TokenKind::Return,
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "null" => TokenKind::Null,
            _ => TokenKind::Identifier(text.to_string()),
        };

        Token::new(kind, start_pos)
    }

    /// Tokenize a number literal (integer or float).
    ///
    /// `first_char` (a digit) has already been consumed by the caller.
    /// A `.` is only treated as a decimal point when followed by another
    /// digit, so `3.abs()` lexes as `3` `.` `abs` `(` `)`.
    ///
    /// # Errors
    /// Returns `LexError::InvalidNumber` if the slice fails to parse
    /// (e.g. an integer literal that overflows `i64`).
    fn scan_number(&mut self, first_char: char, start_pos: Position) -> Result<Token, LexError> {
        let start_offset = self.offset - first_char.len_utf8();
        let mut has_decimal = false;

        // Scan digits and optional decimal part
        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                self.advance();
            } else if ch == '.' && !has_decimal {
                // Look ahead to ensure '.' is followed by a digit (not a method call)
                if let Some(next_ch) = self.peek_next() {
                    if next_ch.is_ascii_digit() {
                        has_decimal = true;
                        self.advance(); // consume '.'
                    } else {
                        // '.' not part of number (could be method call like 3.abs())
                        break;
                    }
                } else {
                    // '.' at end of input, not part of number
                    break;
                }
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];

        if has_decimal {
            // Parse as float
            match text.parse::<f64>() {
                Ok(value) => Ok(Token::new(TokenKind::Float(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        } else {
            // Parse as integer
            match text.parse::<i64>() {
                Ok(value) => Ok(Token::new(TokenKind::Integer(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        }
    }

    /// Tokenize a string literal.
    ///
    /// The opening `"` has already been consumed; `start_pos` is its
    /// position. Supports the escapes `\n`, `\t`, `\r`, `\\` and `\"`,
    /// decoding them into the token's value.
    ///
    /// # Errors
    /// - `LexError::InvalidEscape` for an unknown escape character (the
    ///   reported position is just past the offending character).
    /// - `LexError::UnterminatedString` if EOF is reached before the
    ///   closing `"`.
    fn scan_string(&mut self, start_pos: Position) -> Result<Token, LexError> {
        let mut value = String::new();

        loop {
            match self.advance() {
                Some('"') => break,
                Some('\\') => {
                    // Handle escape sequences
                    match self.advance() {
                        Some('n') => value.push('\n'),
                        Some('t') => value.push('\t'),
                        Some('r') => value.push('\r'),
                        Some('\\') => value.push('\\'),
                        Some('"') => value.push('"'),
                        Some(ch) => return Err(LexError::InvalidEscape(ch, self.position())),
                        None => return Err(LexError::UnterminatedString(start_pos)),
                    }
                }
                Some(ch) => value.push(ch),
                None => return Err(LexError::UnterminatedString(start_pos)),
            }
        }

        Ok(Token::new(TokenKind::String(value), start_pos))
    }

    /// Get the next token from the source.
    ///
    /// Skips leading whitespace and comments, then dispatches on the first
    /// character. Once the input is exhausted this returns an `Eof` token
    /// (and keeps returning `Eof` on subsequent calls).
    ///
    /// # Errors
    /// Returns a `LexError` for unexpected characters (including a lone
    /// `&` or `|`), bad escapes, unterminated strings, or unparsable
    /// numbers.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.skip_whitespace_and_comments();

        // Capture the token's start position before consuming anything.
        let pos = self.position();

        let Some(ch) = self.advance() else {
            self.at_eof = true;
            return Ok(Token::new(TokenKind::Eof, pos));
        };

        let kind = match ch {
            // Single-character tokens
            '+' => TokenKind::Plus,
            '-' => TokenKind::Minus,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '%' => TokenKind::Percent,
            '(' => TokenKind::LeftParen,
            ')' => TokenKind::RightParen,
            '{' => TokenKind::LeftBrace,
            '}' => TokenKind::RightBrace,
            '[' => TokenKind::LeftBracket,
            ']' => TokenKind::RightBracket,
            ',' => TokenKind::Comma,
            ';' => TokenKind::Semicolon,
            ':' => TokenKind::Colon,
            '.' => TokenKind::Dot,

            // Potentially two-character tokens
            '=' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::Equal
                } else {
                    TokenKind::Assign
                }
            }
            '!' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::NotEqual
                } else {
                    TokenKind::Bang
                }
            }
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::LessEqual
                } else {
                    TokenKind::LessThan
                }
            }
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::GreaterThan
                }
            }
            // `&` and `|` are only valid doubled (`&&` / `||`);
            // a lone one is an error.
            '&' => {
                if self.peek() == Some('&') {
                    self.advance();
                    TokenKind::And
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    TokenKind::Or
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }

            // String literal
            '"' => return self.scan_string(pos),

            // Number literal
            ch if ch.is_ascii_digit() => return self.scan_number(ch, pos),

            // Identifier or keyword
            ch if ch.is_alphabetic() || ch == '_' => {
                return Ok(self.scan_identifier(ch, pos));
            }

            // Unknown character
            _ => return Err(LexError::UnexpectedCharacter(ch, pos)),
        };

        Ok(Token::new(kind, pos))
    }

    /// Tokenize the entire source and return all tokens.
    ///
    /// The returned vector always ends with exactly one `Eof` token.
    ///
    /// # Errors
    /// Stops at and returns the first `LexError` encountered.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }
}
466
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` to completion, panicking on any lexer error.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().expect("lexing should succeed")
    }

    /// Like `lex`, but drops positions and keeps only the token kinds.
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    #[test]
    fn test_empty_source() {
        assert_eq!(kinds(""), vec![TokenKind::Eof]);
    }

    #[test]
    fn test_single_tokens() {
        assert_eq!(
            kinds("+ - * / % ( ) { } , ;"),
            vec![
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Slash,
                TokenKind::Percent,
                TokenKind::LeftParen,
                TokenKind::RightParen,
                TokenKind::LeftBrace,
                TokenKind::RightBrace,
                TokenKind::Comma,
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_comparison_operators() {
        assert_eq!(
            kinds("= == != < <= > >="),
            vec![
                TokenKind::Assign,
                TokenKind::Equal,
                TokenKind::NotEqual,
                TokenKind::LessThan,
                TokenKind::LessEqual,
                TokenKind::GreaterThan,
                TokenKind::GreaterEqual,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_logical_operators() {
        assert_eq!(
            kinds("! && ||"),
            vec![TokenKind::Bang, TokenKind::And, TokenKind::Or, TokenKind::Eof]
        );
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            kinds("let if else for fn return true false null"),
            vec![
                TokenKind::Let,
                TokenKind::If,
                TokenKind::Else,
                TokenKind::For,
                TokenKind::Fn,
                TokenKind::Return,
                TokenKind::True,
                TokenKind::False,
                TokenKind::Null,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_identifier() {
        let expected: Vec<TokenKind> = ["foo", "bar_123", "_test"]
            .iter()
            .map(|s| TokenKind::Identifier(s.to_string()))
            .chain(std::iter::once(TokenKind::Eof))
            .collect();
        assert_eq!(kinds("foo bar_123 _test"), expected);
    }

    #[test]
    fn test_integer() {
        assert_eq!(
            kinds("42 0 12345"),
            vec![
                TokenKind::Integer(42),
                TokenKind::Integer(0),
                TokenKind::Integer(12345),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_string() {
        assert_eq!(
            kinds(r#""hello" "world" "with\nescapes""#),
            vec![
                TokenKind::String("hello".to_string()),
                TokenKind::String("world".to_string()),
                TokenKind::String("with\nescapes".to_string()),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_comments() {
        let tokens = lex("let x = 10; // this is a comment\nlet y = 20;");
        // Comment is skipped entirely: let, x, =, 10, ;, let, y, =, 20, ;, EOF.
        assert_eq!(tokens.len(), 11);
        assert_eq!(tokens[0].kind, TokenKind::Let);
        assert_eq!(tokens[5].kind, TokenKind::Let);
    }

    #[test]
    fn test_position_tracking() {
        let tokens = lex("let x\ny");
        let coords: Vec<(usize, usize)> = tokens
            .iter()
            .take(3)
            .map(|t| (t.position.line, t.position.column))
            .collect();
        assert_eq!(coords, vec![(1, 1), (1, 5), (2, 1)]);
    }

    #[test]
    fn test_unterminated_string() {
        // LexError is matched structurally; it may not implement PartialEq.
        let result = Lexer::new(r#""hello"#).next_token();
        assert!(matches!(result, Err(LexError::UnterminatedString(_))));
    }

    #[test]
    fn test_unexpected_character() {
        let result = Lexer::new("@").next_token();
        assert!(matches!(result, Err(LexError::UnexpectedCharacter('@', _))));
    }

    #[test]
    fn test_dot_token() {
        assert_eq!(kinds("."), vec![TokenKind::Dot, TokenKind::Eof]);
    }

    #[test]
    fn test_method_call_tokens() {
        assert_eq!(
            kinds("foo.bar()"),
            vec![
                TokenKind::Identifier("foo".to_string()),
                TokenKind::Dot,
                TokenKind::Identifier("bar".to_string()),
                TokenKind::LeftParen,
                TokenKind::RightParen,
                TokenKind::Eof,
            ]
        );
    }
}