fiddler_script/lexer.rs

//! Lexer (tokenizer) for FiddlerScript.
//!
//! The lexer transforms source code into a stream of tokens that can be
//! consumed by the parser.
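//!
//! # Example
//!
//! A minimal usage sketch. The import path assumes this module is exposed
//! as `fiddler_script::lexer`; adjust it to match the actual crate layout.
//!
//! ```
//! use fiddler_script::lexer::{Lexer, TokenKind};
//!
//! let mut lexer = Lexer::new("let x = 1;");
//! let tokens = lexer.tokenize().unwrap();
//! assert!(matches!(tokens[0].kind, TokenKind::Let));
//! assert!(matches!(tokens.last().unwrap().kind, TokenKind::Eof));
//! ```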

use crate::error::{LexError, Position};

/// A token produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// The type and value of the token
    pub kind: TokenKind,
    /// Position in source code
    pub position: Position,
}

impl Token {
    /// Create a new token.
    pub fn new(kind: TokenKind, position: Position) -> Self {
        Self { kind, position }
    }
}

/// The different kinds of tokens in FiddlerScript.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    /// Integer literal
    Integer(i64),
    /// Float literal
    Float(f64),
    /// String literal
    String(String),

    // Identifiers and keywords
    /// Identifier (variable or function name)
    Identifier(String),
    /// `let` keyword
    Let,
    /// `if` keyword
    If,
    /// `else` keyword
    Else,
    /// `for` keyword
    For,
    /// `fn` keyword
    Fn,
    /// `return` keyword
    Return,
    /// `true` keyword
    True,
    /// `false` keyword
    False,

    // Operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `=`
    Assign,
    /// `==`
    Equal,
    /// `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterEqual,
    /// `!`
    Bang,
    /// `&&`
    And,
    /// `||`
    Or,

    // Delimiters
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `.`
    Dot,

    // Special
    /// End of file
    Eof,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::Integer(n) => write!(f, "{}", n),
            TokenKind::Float(fl) => write!(f, "{}", fl),
            TokenKind::String(s) => write!(f, "\"{}\"", s),
            TokenKind::Identifier(s) => write!(f, "{}", s),
            TokenKind::Let => write!(f, "let"),
            TokenKind::If => write!(f, "if"),
            TokenKind::Else => write!(f, "else"),
            TokenKind::For => write!(f, "for"),
            TokenKind::Fn => write!(f, "fn"),
            TokenKind::Return => write!(f, "return"),
            TokenKind::True => write!(f, "true"),
            TokenKind::False => write!(f, "false"),
            TokenKind::Plus => write!(f, "+"),
            TokenKind::Minus => write!(f, "-"),
            TokenKind::Star => write!(f, "*"),
            TokenKind::Slash => write!(f, "/"),
            TokenKind::Percent => write!(f, "%"),
            TokenKind::Assign => write!(f, "="),
            TokenKind::Equal => write!(f, "=="),
            TokenKind::NotEqual => write!(f, "!="),
            TokenKind::LessThan => write!(f, "<"),
            TokenKind::LessEqual => write!(f, "<="),
            TokenKind::GreaterThan => write!(f, ">"),
            TokenKind::GreaterEqual => write!(f, ">="),
            TokenKind::Bang => write!(f, "!"),
            TokenKind::And => write!(f, "&&"),
            TokenKind::Or => write!(f, "||"),
            TokenKind::LeftParen => write!(f, "("),
            TokenKind::RightParen => write!(f, ")"),
            TokenKind::LeftBrace => write!(f, "{{"),
            TokenKind::RightBrace => write!(f, "}}"),
            TokenKind::LeftBracket => write!(f, "["),
            TokenKind::RightBracket => write!(f, "]"),
            TokenKind::Comma => write!(f, ","),
            TokenKind::Semicolon => write!(f, ";"),
            TokenKind::Colon => write!(f, ":"),
            TokenKind::Dot => write!(f, "."),
            TokenKind::Eof => write!(f, "EOF"),
        }
    }
}

/// The lexer that tokenizes FiddlerScript source code.
pub struct Lexer<'a> {
    /// Source code being tokenized
    source: &'a str,
    /// Characters iterator with indices
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    /// Current line number (1-indexed)
    line: usize,
    /// Current column number (1-indexed)
    column: usize,
    /// Current byte offset
    offset: usize,
    /// Whether we've reached EOF
    at_eof: bool,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer for the given source code.
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            chars: source.char_indices().peekable(),
            line: 1,
            column: 1,
            offset: 0,
            at_eof: false,
        }
    }

    /// Get the current position in the source.
    fn position(&self) -> Position {
        Position::new(self.line, self.column, self.offset)
    }

    /// Advance to the next character.
    fn advance(&mut self) -> Option<char> {
        if let Some((idx, ch)) = self.chars.next() {
            self.offset = idx + ch.len_utf8();
            if ch == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
            Some(ch)
        } else {
            None
        }
    }

    /// Peek at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().map(|(_, ch)| *ch)
    }

    /// Peek two characters ahead (the character after the one `peek` would
    /// return) without consuming anything.
    fn peek_next(&self) -> Option<char> {
        let mut iter = self.chars.clone();
        iter.next(); // skip the character `peek` would return
        iter.peek().map(|(_, ch)| *ch)
    }

    /// Skip whitespace and comments.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            while let Some(ch) = self.peek() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comment: // until end of line
            if self.peek() == Some('/') && self.peek_next() == Some('/') {
                // Skip until end of line
                self.advance(); // consume first /
                self.advance(); // consume second /
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        break;
                    }
                    self.advance();
                }
                continue; // Check for more whitespace/comments
            }

            break;
        }
    }

    /// Tokenize an identifier or keyword.
    fn scan_identifier(&mut self, first_char: char, start_pos: Position) -> Token {
        let start_offset = self.offset - first_char.len_utf8();

        while let Some(ch) = self.peek() {
            if ch.is_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];
        let kind = match text {
            "let" => TokenKind::Let,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "for" => TokenKind::For,
            "fn" => TokenKind::Fn,
            "return" => TokenKind::Return,
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            _ => TokenKind::Identifier(text.to_string()),
        };

        Token::new(kind, start_pos)
    }

    /// Tokenize a number literal (integer or float).
    fn scan_number(&mut self, first_char: char, start_pos: Position) -> Result<Token, LexError> {
        let start_offset = self.offset - first_char.len_utf8();
        let mut has_decimal = false;

        // Scan digits and optional decimal part
        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                self.advance();
            } else if ch == '.' && !has_decimal {
                // Look ahead to ensure '.' is followed by a digit (not a method call)
                if let Some(next_ch) = self.peek_next() {
                    if next_ch.is_ascii_digit() {
                        has_decimal = true;
                        self.advance(); // consume '.'
                    } else {
                        // '.' not part of number (could be method call like 3.abs())
                        break;
                    }
                } else {
                    // '.' at end of input, not part of number
                    break;
                }
            } else {
                break;
            }
        }

        let text = &self.source[start_offset..self.offset];

        if has_decimal {
            // Parse as float
            match text.parse::<f64>() {
                Ok(value) => Ok(Token::new(TokenKind::Float(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        } else {
            // Parse as integer
            match text.parse::<i64>() {
                Ok(value) => Ok(Token::new(TokenKind::Integer(value), start_pos)),
                Err(_) => Err(LexError::InvalidNumber(start_pos)),
            }
        }
    }

    /// Tokenize a string literal.
    fn scan_string(&mut self, start_pos: Position) -> Result<Token, LexError> {
        let mut value = String::new();

        loop {
            match self.advance() {
                Some('"') => break,
                Some('\\') => {
                    // Handle escape sequences
                    match self.advance() {
                        Some('n') => value.push('\n'),
                        Some('t') => value.push('\t'),
                        Some('r') => value.push('\r'),
                        Some('\\') => value.push('\\'),
                        Some('"') => value.push('"'),
                        Some(ch) => return Err(LexError::InvalidEscape(ch, self.position())),
                        None => return Err(LexError::UnterminatedString(start_pos)),
                    }
                }
                Some(ch) => value.push(ch),
                None => return Err(LexError::UnterminatedString(start_pos)),
            }
        }

        Ok(Token::new(TokenKind::String(value), start_pos))
    }

    /// Get the next token from the source.
    pub fn next_token(&mut self) -> Result<Token, LexError> {
        self.skip_whitespace_and_comments();

        let pos = self.position();

        let Some(ch) = self.advance() else {
            self.at_eof = true;
            return Ok(Token::new(TokenKind::Eof, pos));
        };

        let kind = match ch {
            // Single-character tokens
            '+' => TokenKind::Plus,
            '-' => TokenKind::Minus,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '%' => TokenKind::Percent,
            '(' => TokenKind::LeftParen,
            ')' => TokenKind::RightParen,
            '{' => TokenKind::LeftBrace,
            '}' => TokenKind::RightBrace,
            '[' => TokenKind::LeftBracket,
            ']' => TokenKind::RightBracket,
            ',' => TokenKind::Comma,
            ';' => TokenKind::Semicolon,
            ':' => TokenKind::Colon,
            '.' => TokenKind::Dot,

            // Potentially two-character tokens
            '=' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::Equal
                } else {
                    TokenKind::Assign
                }
            }
            '!' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::NotEqual
                } else {
                    TokenKind::Bang
                }
            }
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::LessEqual
                } else {
                    TokenKind::LessThan
                }
            }
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    TokenKind::GreaterEqual
                } else {
                    TokenKind::GreaterThan
                }
            }
            '&' => {
                if self.peek() == Some('&') {
                    self.advance();
                    TokenKind::And
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    TokenKind::Or
                } else {
                    return Err(LexError::UnexpectedCharacter(ch, pos));
                }
            }

            // String literal
            '"' => return self.scan_string(pos),

            // Number literal
            ch if ch.is_ascii_digit() => return self.scan_number(ch, pos),

            // Identifier or keyword
            ch if ch.is_alphabetic() || ch == '_' => {
                return Ok(self.scan_identifier(ch, pos));
            }

            // Unknown character
            _ => return Err(LexError::UnexpectedCharacter(ch, pos)),
        };

        Ok(Token::new(kind, pos))
    }

    /// Tokenize the entire source and return all tokens.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token()?;
            let is_eof = token.kind == TokenKind::Eof;
            tokens.push(token);
            if is_eof {
                break;
            }
        }
        Ok(tokens)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_source() {
        let mut lexer = Lexer::new("");
        let token = lexer.next_token().unwrap();
        assert_eq!(token.kind, TokenKind::Eof);
    }

    #[test]
    fn test_single_tokens() {
        let mut lexer = Lexer::new("+ - * / % ( ) { } , ;");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Plus));
        assert!(matches!(tokens[1].kind, TokenKind::Minus));
        assert!(matches!(tokens[2].kind, TokenKind::Star));
        assert!(matches!(tokens[3].kind, TokenKind::Slash));
        assert!(matches!(tokens[4].kind, TokenKind::Percent));
        assert!(matches!(tokens[5].kind, TokenKind::LeftParen));
        assert!(matches!(tokens[6].kind, TokenKind::RightParen));
        assert!(matches!(tokens[7].kind, TokenKind::LeftBrace));
        assert!(matches!(tokens[8].kind, TokenKind::RightBrace));
        assert!(matches!(tokens[9].kind, TokenKind::Comma));
        assert!(matches!(tokens[10].kind, TokenKind::Semicolon));
    }

    #[test]
    fn test_comparison_operators() {
        let mut lexer = Lexer::new("= == != < <= > >=");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Assign));
        assert!(matches!(tokens[1].kind, TokenKind::Equal));
        assert!(matches!(tokens[2].kind, TokenKind::NotEqual));
        assert!(matches!(tokens[3].kind, TokenKind::LessThan));
        assert!(matches!(tokens[4].kind, TokenKind::LessEqual));
        assert!(matches!(tokens[5].kind, TokenKind::GreaterThan));
        assert!(matches!(tokens[6].kind, TokenKind::GreaterEqual));
    }

    #[test]
    fn test_logical_operators() {
        let mut lexer = Lexer::new("! && ||");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Bang));
        assert!(matches!(tokens[1].kind, TokenKind::And));
        assert!(matches!(tokens[2].kind, TokenKind::Or));
    }

    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("let if else for fn return true false");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Let));
        assert!(matches!(tokens[1].kind, TokenKind::If));
        assert!(matches!(tokens[2].kind, TokenKind::Else));
        assert!(matches!(tokens[3].kind, TokenKind::For));
        assert!(matches!(tokens[4].kind, TokenKind::Fn));
        assert!(matches!(tokens[5].kind, TokenKind::Return));
        assert!(matches!(tokens[6].kind, TokenKind::True));
        assert!(matches!(tokens[7].kind, TokenKind::False));
    }

    #[test]
    fn test_identifier() {
        let mut lexer = Lexer::new("foo bar_123 _test");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "foo"));
        assert!(matches!(&tokens[1].kind, TokenKind::Identifier(s) if s == "bar_123"));
        assert!(matches!(&tokens[2].kind, TokenKind::Identifier(s) if s == "_test"));
    }

    #[test]
    fn test_integer() {
        let mut lexer = Lexer::new("42 0 12345");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Integer(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Integer(0)));
        assert!(matches!(tokens[2].kind, TokenKind::Integer(12345)));
    }
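
    // Exercises the decimal-point lookahead in `scan_number`: a digit run
    // containing one '.' followed by a digit parses as a float.
    #[test]
    fn test_float() {
        let mut lexer = Lexer::new("3.14 0.5");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Float(v) if v == 3.14));
        assert!(matches!(tokens[1].kind, TokenKind::Float(v) if v == 0.5));
    }

    // Per the lookahead in `scan_number`, a digit followed by '.' and a
    // non-digit (e.g. a method call like 3.abs()) lexes as integer, then Dot.
    #[test]
    fn test_integer_then_dot() {
        let mut lexer = Lexer::new("3.abs");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Integer(3)));
        assert!(matches!(tokens[1].kind, TokenKind::Dot));
        assert!(matches!(&tokens[2].kind, TokenKind::Identifier(s) if s == "abs"));
    }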

    #[test]
    fn test_string() {
        let mut lexer = Lexer::new(r#""hello" "world" "with\nescapes""#);
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello"));
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "world"));
        assert!(matches!(&tokens[2].kind, TokenKind::String(s) if s == "with\nescapes"));
    }

    #[test]
    fn test_comments() {
        let mut lexer = Lexer::new("let x = 10; // this is a comment\nlet y = 20;");
        let tokens = lexer.tokenize().unwrap();
        // Should have: let, x, =, 10, ;, let, y, =, 20, ;, EOF
        assert_eq!(tokens.len(), 11);
        assert!(matches!(tokens[0].kind, TokenKind::Let));
        assert!(matches!(tokens[5].kind, TokenKind::Let));
    }
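
    // A comment that runs to end of input (no trailing newline) should also
    // be skipped; `skip_whitespace_and_comments` stops at EOF as well as '\n'.
    #[test]
    fn test_comment_at_eof() {
        let mut lexer = Lexer::new("let x // trailing comment");
        let tokens = lexer.tokenize().unwrap();
        // let, x, EOF
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[2].kind, TokenKind::Eof));
    }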

    #[test]
    fn test_position_tracking() {
        let mut lexer = Lexer::new("let x\ny");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].position.line, 1);
        assert_eq!(tokens[0].position.column, 1);
        assert_eq!(tokens[1].position.line, 1);
        assert_eq!(tokens[1].position.column, 5);
        assert_eq!(tokens[2].position.line, 2);
        assert_eq!(tokens[2].position.column, 1);
    }

    #[test]
    fn test_unterminated_string() {
        let mut lexer = Lexer::new(r#""hello"#);
        let result = lexer.next_token();
        assert!(matches!(result, Err(LexError::UnterminatedString(_))));
    }
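
    // An unknown escape such as `\q` should surface as `InvalidEscape`,
    // mirroring the escape handling in `scan_string`.
    #[test]
    fn test_invalid_escape() {
        let mut lexer = Lexer::new(r#""\q""#);
        let result = lexer.next_token();
        assert!(matches!(result, Err(LexError::InvalidEscape('q', _))));
    }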

    #[test]
    fn test_unexpected_character() {
        let mut lexer = Lexer::new("@");
        let result = lexer.next_token();
        assert!(matches!(result, Err(LexError::UnexpectedCharacter('@', _))));
    }
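
    // `&` and `|` are only valid as `&&` and `||`; a lone occurrence is an
    // error per the two-character matching in `next_token`.
    #[test]
    fn test_lone_ampersand_and_pipe() {
        assert!(matches!(
            Lexer::new("&").next_token(),
            Err(LexError::UnexpectedCharacter('&', _))
        ));
        assert!(matches!(
            Lexer::new("|").next_token(),
            Err(LexError::UnexpectedCharacter('|', _))
        ));
    }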

    #[test]
    fn test_dot_token() {
        let mut lexer = Lexer::new(".");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Dot));
    }

    #[test]
    fn test_method_call_tokens() {
        let mut lexer = Lexer::new("foo.bar()");
        let tokens = lexer.tokenize().unwrap();
        assert!(matches!(&tokens[0].kind, TokenKind::Identifier(s) if s == "foo"));
        assert!(matches!(tokens[1].kind, TokenKind::Dot));
        assert!(matches!(&tokens[2].kind, TokenKind::Identifier(s) if s == "bar"));
        assert!(matches!(tokens[3].kind, TokenKind::LeftParen));
        assert!(matches!(tokens[4].kind, TokenKind::RightParen));
    }
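
    // `Display` should render each token as its source text; note that the
    // `{{` / `}}` in the format strings above escape to literal braces.
    #[test]
    fn test_display() {
        assert_eq!(TokenKind::LeftBrace.to_string(), "{");
        assert_eq!(TokenKind::RightBrace.to_string(), "}");
        assert_eq!(TokenKind::Equal.to_string(), "==");
        assert_eq!(TokenKind::String("hi".to_string()).to_string(), "\"hi\"");
    }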
}