Skip to main content

proof_engine/scripting/
lexer.rs

//! Lexer — tokenizes script source into a token stream.
2
/// A single lexical token produced by [`Lexer::tokenize`].
///
/// Literal variants carry their decoded payload (string escapes already
/// resolved, numbers already parsed); every other variant is a unit marker.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Literals
    /// The `nil` literal.
    Nil,
    /// The `true` literal.
    True,
    /// The `false` literal.
    False,
    /// An integer literal (decimal or `0x` hexadecimal).
    Int(i64),
    /// A floating-point literal (has a fraction and/or an exponent).
    Float(f64),
    /// A string literal with escape sequences already decoded.
    Str(String),
    /// Any identifier that is not a reserved keyword.
    Ident(String),

    // Keywords
    And, Or, Not,
    If, Then, Else, ElseIf, End,
    While, Do, For, In, Repeat, Until,
    Function, Return, Local, Break, Continue,
    /// `Self_` is the `self` keyword; the trailing underscore avoids the Rust keyword.
    Class, Self_, New, Import, Export,
    Match, Case, Default,

    // Operators
    Plus, Minus, Star, Slash, Percent, Caret, Hash,
    Amp, Pipe, Tilde, ShiftLeft, ShiftRight, SlashSlash,
    /// `NotEq` is emitted for both the `~=` and `!=` spellings.
    EqEq, NotEq, Lt, LtEq, Gt, GtEq,
    Eq, PlusEq, MinusEq, StarEq, SlashEq,
    /// `..`, `...`, `->` and `=>` respectively.
    DotDot, DotDotDot, Arrow, FatArrow,
    Bang, Question,

    // Delimiters
    LParen, RParen, LBrace, RBrace, LBracket, RBracket,
    Comma, Semicolon, Colon, ColonColon, Dot,

    // Meta
    /// End of input; always the final token of a token stream.
    Eof,
}
37
/// Source position of a token: where its first character sits in the input.
///
/// The lexer starts counting both fields at 1; `column` counts characters
/// (not bytes) and resets to 1 after every `'\n'`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Span {
    /// Line number of the token's first character.
    pub line:   u32,
    /// Column number of the token's first character.
    pub column: u32,
}
43
44#[derive(Debug, Clone)]
45pub struct TokenWithSpan {
46    pub token: Token,
47    pub span:  Span,
48}
49
/// Tokenizes source code into a flat Vec of tokens.
///
/// The input is stored as a `Vec<char>` so the cursor can index and look
/// ahead by character rather than by byte.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The full input, one entry per character.
    source:  Vec<char>,
    /// Index into `source` of the next unconsumed character.
    pos:     usize,
    /// Line of the next unconsumed character (starts at 1).
    line:    u32,
    /// Column of the next unconsumed character (starts at 1).
    column:  u32,
}
57
58impl Lexer {
59    pub fn new(source: &str) -> Self {
60        Self { source: source.chars().collect(), pos: 0, line: 1, column: 1 }
61    }
62
63    fn peek(&self) -> Option<char> { self.source.get(self.pos).copied() }
64    fn peek2(&self) -> Option<char> { self.source.get(self.pos + 1).copied() }
65
66    fn advance(&mut self) -> Option<char> {
67        let c = self.source.get(self.pos).copied();
68        if let Some(ch) = c {
69            self.pos += 1;
70            if ch == '\n' { self.line += 1; self.column = 1; }
71            else { self.column += 1; }
72        }
73        c
74    }
75
76    fn span(&self) -> Span { Span { line: self.line, column: self.column } }
77
78    fn skip_whitespace_and_comments(&mut self) {
79        loop {
80            // Skip whitespace
81            while self.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
82                self.advance();
83            }
84            // Skip line comments  (-- or //)
85            if self.peek() == Some('-') && self.peek2() == Some('-') {
86                self.advance(); self.advance();
87                while self.peek().map(|c| c != '\n').unwrap_or(false) { self.advance(); }
88                continue;
89            }
90            if self.peek() == Some('/') && self.peek2() == Some('/') {
91                self.advance(); self.advance();
92                while self.peek().map(|c| c != '\n').unwrap_or(false) { self.advance(); }
93                continue;
94            }
95            // Skip block comments (/* */ or --[[ ]])
96            if self.peek() == Some('/') && self.peek2() == Some('*') {
97                self.advance(); self.advance();
98                while self.pos + 1 < self.source.len() {
99                    if self.peek() == Some('*') && self.peek2() == Some('/') {
100                        self.advance(); self.advance(); break;
101                    }
102                    self.advance();
103                }
104                continue;
105            }
106            break;
107        }
108    }
109
110    fn read_string(&mut self, delim: char) -> String {
111        let mut s = String::new();
112        while let Some(c) = self.peek() {
113            if c == delim { self.advance(); break; }
114            if c == '\\' {
115                self.advance();
116                match self.advance() {
117                    Some('n')  => s.push('\n'),
118                    Some('t')  => s.push('\t'),
119                    Some('r')  => s.push('\r'),
120                    Some('\\') => s.push('\\'),
121                    Some('\'') => s.push('\''),
122                    Some('"')  => s.push('"'),
123                    Some('0')  => s.push('\0'),
124                    Some(x)    => { s.push('\\'); s.push(x); }
125                    None       => break,
126                }
127            } else {
128                s.push(c);
129                self.advance();
130            }
131        }
132        s
133    }
134
135    fn read_number(&mut self, first: char) -> Token {
136        let mut num = first.to_string();
137        let mut is_float = false;
138        // Hex literal: 0x or 0X
139        if first == '0' && (self.peek() == Some('x') || self.peek() == Some('X')) {
140            self.advance(); // consume 'x'
141            let mut hex = String::new();
142            while let Some(c) = self.peek() {
143                if c.is_ascii_hexdigit() { hex.push(c); self.advance(); }
144                else { break; }
145            }
146            return Token::Int(i64::from_str_radix(&hex, 16).unwrap_or(0));
147        }
148        while let Some(c) = self.peek() {
149            if c.is_ascii_digit() { num.push(c); self.advance(); }
150            else if c == '.' && !is_float && self.peek2().map(|n| n.is_ascii_digit()).unwrap_or(false) {
151                is_float = true; num.push(c); self.advance();
152            }
153            else if (c == 'e' || c == 'E') && !num.contains('e') && !num.contains('E') {
154                is_float = true; num.push(c); self.advance();
155                if self.peek() == Some('+') || self.peek() == Some('-') {
156                    if let Some(sign) = self.advance() { num.push(sign); }
157                }
158            }
159            else { break; }
160        }
161        if is_float {
162            Token::Float(num.parse().unwrap_or(0.0))
163        } else {
164            Token::Int(num.parse().unwrap_or(0))
165        }
166    }
167
168    fn read_ident(&mut self, first: char) -> Token {
169        let mut ident = first.to_string();
170        while let Some(c) = self.peek() {
171            if c.is_alphanumeric() || c == '_' { ident.push(c); self.advance(); }
172            else { break; }
173        }
174        match ident.as_str() {
175            "nil"      => Token::Nil,
176            "true"     => Token::True,
177            "false"    => Token::False,
178            "and"      => Token::And,
179            "or"       => Token::Or,
180            "not"      => Token::Not,
181            "if"       => Token::If,
182            "then"     => Token::Then,
183            "else"     => Token::Else,
184            "elseif"   => Token::ElseIf,
185            "end"      => Token::End,
186            "while"    => Token::While,
187            "do"       => Token::Do,
188            "for"      => Token::For,
189            "in"       => Token::In,
190            "repeat"   => Token::Repeat,
191            "until"    => Token::Until,
192            "function" => Token::Function,
193            "return"   => Token::Return,
194            "local"    => Token::Local,
195            "break"    => Token::Break,
196            "continue" => Token::Continue,
197            "class"    => Token::Class,
198            "self"     => Token::Self_,
199            "new"      => Token::New,
200            "import"   => Token::Import,
201            "export"   => Token::Export,
202            "match"    => Token::Match,
203            "case"     => Token::Case,
204            "default"  => Token::Default,
205            _          => Token::Ident(ident),
206        }
207    }
208
209    pub fn tokenize(&mut self) -> Vec<TokenWithSpan> {
210        let mut tokens = Vec::new();
211        loop {
212            self.skip_whitespace_and_comments();
213            let span = self.span();
214            let ch = match self.advance() {
215                Some(c) => c,
216                None    => { tokens.push(TokenWithSpan { token: Token::Eof, span }); break; }
217            };
218
219            let token = match ch {
220                '+' => { if self.peek() == Some('=') { self.advance(); Token::PlusEq  } else { Token::Plus  } }
221                '-' => { if self.peek() == Some('=') { self.advance(); Token::MinusEq } else if self.peek() == Some('>') { self.advance(); Token::Arrow } else { Token::Minus } }
222                '*' => { if self.peek() == Some('=') { self.advance(); Token::StarEq  } else { Token::Star  } }
223                '/' => { if self.peek() == Some('=') { self.advance(); Token::SlashEq } else if self.peek() == Some('/') { self.advance(); Token::SlashSlash } else { Token::Slash } }
224                '%' => Token::Percent,
225                '^' => Token::Caret,
226                '#' => Token::Hash,
227                '&' => Token::Amp,
228                '|' => Token::Pipe,
229                '~' => { if self.peek() == Some('=') { self.advance(); Token::NotEq } else { Token::Tilde } }
230                '<' => { if self.peek() == Some('=') { self.advance(); Token::LtEq } else if self.peek() == Some('<') { self.advance(); Token::ShiftLeft } else { Token::Lt } }
231                '>' => { if self.peek() == Some('=') { self.advance(); Token::GtEq } else if self.peek() == Some('>') { self.advance(); Token::ShiftRight } else { Token::Gt } }
232                '=' => { if self.peek() == Some('=') { self.advance(); Token::EqEq } else if self.peek() == Some('>') { self.advance(); Token::FatArrow } else { Token::Eq } }
233                '!' => { if self.peek() == Some('=') { self.advance(); Token::NotEq } else { Token::Bang } }
234                '.' => {
235                    if self.peek() == Some('.') {
236                        self.advance();
237                        if self.peek() == Some('.') { self.advance(); Token::DotDotDot }
238                        else { Token::DotDot }
239                    } else { Token::Dot }
240                }
241                ':' => { if self.peek() == Some(':') { self.advance(); Token::ColonColon } else { Token::Colon } }
242                '(' => Token::LParen,
243                ')' => Token::RParen,
244                '{' => Token::LBrace,
245                '}' => Token::RBrace,
246                '[' => Token::LBracket,
247                ']' => Token::RBracket,
248                ',' => Token::Comma,
249                ';' => Token::Semicolon,
250                '?' => Token::Question,
251                '\'' | '"' => Token::Str(self.read_string(ch)),
252                '`' => Token::Str(self.read_string('`')),
253                c if c.is_ascii_digit() => self.read_number(c),
254                c if c.is_alphabetic() || c == '_' => self.read_ident(c),
255                _ => continue,
256            };
257            tokens.push(TokenWithSpan { token, span });
258        }
259        tokens
260    }
261}
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns only the tokens, dropping span information.
    fn lex(src: &str) -> Vec<Token> {
        let mut l = Lexer::new(src);
        l.tokenize().into_iter().map(|t| t.token).collect()
    }

    /// Shorthand for building an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.to_string())
    }

    #[test]
    fn test_lex_simple_assign() {
        // Assert the whole stream so ordering and stray tokens are checked too.
        assert_eq!(
            lex("local x = 42"),
            vec![Token::Local, ident("x"), Token::Eq, Token::Int(42), Token::Eof]
        );
    }

    #[test]
    fn test_lex_string() {
        assert_eq!(
            lex(r#"local s = "hello world""#),
            vec![
                Token::Local,
                ident("s"),
                Token::Eq,
                Token::Str("hello world".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_lex_float() {
        let toks = lex("3.14");
        assert_eq!(toks.len(), 2);
        assert!(matches!(&toks[0], Token::Float(v) if (*v - 3.14).abs() < 1e-6));
        assert_eq!(toks[1], Token::Eof);
    }

    #[test]
    fn test_lex_operators() {
        assert_eq!(
            lex("a == b ~= c <= d >= e"),
            vec![
                ident("a"), Token::EqEq,
                ident("b"), Token::NotEq,
                ident("c"), Token::LtEq,
                ident("d"), Token::GtEq,
                ident("e"), Token::Eof,
            ]
        );
    }

    #[test]
    fn test_lex_keywords() {
        assert_eq!(
            lex("if x then return end"),
            vec![Token::If, ident("x"), Token::Then, Token::Return, Token::End, Token::Eof]
        );
    }

    #[test]
    fn test_lex_comment_skip() {
        // Nothing from the `--` comment may survive; the next line must still lex.
        assert_eq!(
            lex("local x = 1 -- this is a comment\nlocal y = 2"),
            vec![
                Token::Local, ident("x"), Token::Eq, Token::Int(1),
                Token::Local, ident("y"), Token::Eq, Token::Int(2),
                Token::Eof,
            ]
        );
    }
}