// proof_engine/scripting/lexer.rs
//! Lexer — tokenizes script source into a token stream.

/// A single lexical token produced by the lexer.
///
/// The keyword set is Lua-flavored (`then`/`elseif`/`end`, `repeat`/`until`)
/// with extensions (`class`, `match`, `import`) not present in stock Lua.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Literals
    Nil,
    True,
    False,
    Int(i64),
    Float(f64),
    Str(String),
    Ident(String),

    // Keywords
    And, Or, Not,
    If, Then, Else, ElseIf, End,
    While, Do, For, In, Repeat, Until,
    Function, Return, Local, Break, Continue,
    Class, Self_, New, Import, Export,
    Match, Case, Default,

    // Operators
    Plus, Minus, Star, Slash, Percent, Caret, Hash,
    Amp, Pipe, Tilde, ShiftLeft, ShiftRight, SlashSlash,
    EqEq, NotEq, Lt, LtEq, Gt, GtEq,
    Eq, PlusEq, MinusEq, StarEq, SlashEq,
    DotDot, DotDotDot, Arrow,
    Bang,

    // Delimiters
    LParen, RParen, LBrace, RBrace, LBracket, RBracket,
    Comma, Semicolon, Colon, ColonColon, Dot,

    // Meta
    Eof, // appended exactly once at the end of every token stream
}
37
/// 1-based source position (line and column) where a token starts.
#[derive(Debug, Clone)]
pub struct Span {
    pub line:   u32,
    pub column: u32,
}
43
/// A token paired with the source position where it begins.
#[derive(Debug, Clone)]
pub struct TokenWithSpan {
    pub token: Token,
    pub span:  Span,
}
49
/// Tokenizes source code into a flat Vec of tokens.
pub struct Lexer {
    source:  Vec<char>, // decoded up front so multi-byte chars index uniformly
    pos:     usize,     // index of the next unread char in `source`
    line:    u32,       // current 1-based line (newline advances it)
    column:  u32,       // current 1-based column, reset to 1 after '\n'
}
57
58impl Lexer {
59    pub fn new(source: &str) -> Self {
60        Self { source: source.chars().collect(), pos: 0, line: 1, column: 1 }
61    }
62
63    fn peek(&self) -> Option<char> { self.source.get(self.pos).copied() }
64    fn peek2(&self) -> Option<char> { self.source.get(self.pos + 1).copied() }
65
66    fn advance(&mut self) -> Option<char> {
67        let c = self.source.get(self.pos).copied();
68        if let Some(ch) = c {
69            self.pos += 1;
70            if ch == '\n' { self.line += 1; self.column = 1; }
71            else { self.column += 1; }
72        }
73        c
74    }
75
76    fn span(&self) -> Span { Span { line: self.line, column: self.column } }
77
78    fn skip_whitespace_and_comments(&mut self) {
79        loop {
80            // Skip whitespace
81            while self.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
82                self.advance();
83            }
84            // Skip line comments  (-- or //)
85            if self.peek() == Some('-') && self.peek2() == Some('-') {
86                self.advance(); self.advance();
87                while self.peek().map(|c| c != '\n').unwrap_or(false) { self.advance(); }
88                continue;
89            }
90            if self.peek() == Some('/') && self.peek2() == Some('/') {
91                self.advance(); self.advance();
92                while self.peek().map(|c| c != '\n').unwrap_or(false) { self.advance(); }
93                continue;
94            }
95            // Skip block comments (/* */ or --[[ ]])
96            if self.peek() == Some('/') && self.peek2() == Some('*') {
97                self.advance(); self.advance();
98                while self.pos + 1 < self.source.len() {
99                    if self.peek() == Some('*') && self.peek2() == Some('/') {
100                        self.advance(); self.advance(); break;
101                    }
102                    self.advance();
103                }
104                continue;
105            }
106            break;
107        }
108    }
109
110    fn read_string(&mut self, delim: char) -> String {
111        let mut s = String::new();
112        while let Some(c) = self.peek() {
113            if c == delim { self.advance(); break; }
114            if c == '\\' {
115                self.advance();
116                match self.advance() {
117                    Some('n')  => s.push('\n'),
118                    Some('t')  => s.push('\t'),
119                    Some('r')  => s.push('\r'),
120                    Some('\\') => s.push('\\'),
121                    Some('\'') => s.push('\''),
122                    Some('"')  => s.push('"'),
123                    Some('0')  => s.push('\0'),
124                    Some(x)    => { s.push('\\'); s.push(x); }
125                    None       => break,
126                }
127            } else {
128                s.push(c);
129                self.advance();
130            }
131        }
132        s
133    }
134
135    fn read_number(&mut self, first: char) -> Token {
136        let mut num = first.to_string();
137        let mut is_float = false;
138        while let Some(c) = self.peek() {
139            if c.is_ascii_digit() { num.push(c); self.advance(); }
140            else if c == '.' && !is_float && self.peek2().map(|n| n.is_ascii_digit()).unwrap_or(false) {
141                is_float = true; num.push(c); self.advance();
142            }
143            else if (c == 'e' || c == 'E') && !num.contains('e') && !num.contains('E') {
144                is_float = true; num.push(c); self.advance();
145                if self.peek() == Some('+') || self.peek() == Some('-') {
146                    if let Some(sign) = self.advance() { num.push(sign); }
147                }
148            }
149            else { break; }
150        }
151        if is_float {
152            Token::Float(num.parse().unwrap_or(0.0))
153        } else {
154            Token::Int(num.parse().unwrap_or(0))
155        }
156    }
157
158    fn read_ident(&mut self, first: char) -> Token {
159        let mut ident = first.to_string();
160        while let Some(c) = self.peek() {
161            if c.is_alphanumeric() || c == '_' { ident.push(c); self.advance(); }
162            else { break; }
163        }
164        match ident.as_str() {
165            "nil"      => Token::Nil,
166            "true"     => Token::True,
167            "false"    => Token::False,
168            "and"      => Token::And,
169            "or"       => Token::Or,
170            "not"      => Token::Not,
171            "if"       => Token::If,
172            "then"     => Token::Then,
173            "else"     => Token::Else,
174            "elseif"   => Token::ElseIf,
175            "end"      => Token::End,
176            "while"    => Token::While,
177            "do"       => Token::Do,
178            "for"      => Token::For,
179            "in"       => Token::In,
180            "repeat"   => Token::Repeat,
181            "until"    => Token::Until,
182            "function" => Token::Function,
183            "return"   => Token::Return,
184            "local"    => Token::Local,
185            "break"    => Token::Break,
186            "continue" => Token::Continue,
187            "class"    => Token::Class,
188            "self"     => Token::Self_,
189            "new"      => Token::New,
190            "import"   => Token::Import,
191            "export"   => Token::Export,
192            "match"    => Token::Match,
193            "case"     => Token::Case,
194            "default"  => Token::Default,
195            _          => Token::Ident(ident),
196        }
197    }
198
199    pub fn tokenize(&mut self) -> Vec<TokenWithSpan> {
200        let mut tokens = Vec::new();
201        loop {
202            self.skip_whitespace_and_comments();
203            let span = self.span();
204            let ch = match self.advance() {
205                Some(c) => c,
206                None    => { tokens.push(TokenWithSpan { token: Token::Eof, span }); break; }
207            };
208
209            let token = match ch {
210                '+' => { if self.peek() == Some('=') { self.advance(); Token::PlusEq  } else { Token::Plus  } }
211                '-' => { if self.peek() == Some('=') { self.advance(); Token::MinusEq } else if self.peek() == Some('>') { self.advance(); Token::Arrow } else { Token::Minus } }
212                '*' => { if self.peek() == Some('=') { self.advance(); Token::StarEq  } else { Token::Star  } }
213                '/' => { if self.peek() == Some('=') { self.advance(); Token::SlashEq } else if self.peek() == Some('/') { self.advance(); Token::SlashSlash } else { Token::Slash } }
214                '%' => Token::Percent,
215                '^' => Token::Caret,
216                '#' => Token::Hash,
217                '&' => Token::Amp,
218                '|' => Token::Pipe,
219                '~' => { if self.peek() == Some('=') { self.advance(); Token::NotEq } else { Token::Tilde } }
220                '<' => { if self.peek() == Some('=') { self.advance(); Token::LtEq } else if self.peek() == Some('<') { self.advance(); Token::ShiftLeft } else { Token::Lt } }
221                '>' => { if self.peek() == Some('=') { self.advance(); Token::GtEq } else if self.peek() == Some('>') { self.advance(); Token::ShiftRight } else { Token::Gt } }
222                '=' => { if self.peek() == Some('=') { self.advance(); Token::EqEq } else { Token::Eq } }
223                '!' => { if self.peek() == Some('=') { self.advance(); Token::NotEq } else { Token::Bang } }
224                '.' => {
225                    if self.peek() == Some('.') {
226                        self.advance();
227                        if self.peek() == Some('.') { self.advance(); Token::DotDotDot }
228                        else { Token::DotDot }
229                    } else { Token::Dot }
230                }
231                ':' => { if self.peek() == Some(':') { self.advance(); Token::ColonColon } else { Token::Colon } }
232                '(' => Token::LParen,
233                ')' => Token::RParen,
234                '{' => Token::LBrace,
235                '}' => Token::RBrace,
236                '[' => Token::LBracket,
237                ']' => Token::RBracket,
238                ',' => Token::Comma,
239                ';' => Token::Semicolon,
240                '\'' | '"' => Token::Str(self.read_string(ch)),
241                '`' => Token::Str(self.read_string('`')),
242                c if c.is_ascii_digit() => self.read_number(c),
243                c if c.is_alphabetic() || c == '_' => self.read_ident(c),
244                _ => continue,
245            };
246            tokens.push(TokenWithSpan { token, span });
247        }
248        tokens
249    }
250}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and strips spans, leaving just the token sequence.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().into_iter().map(|ts| ts.token).collect()
    }

    #[test]
    fn test_lex_simple_assign() {
        let tokens = lex("local x = 42");
        let expected = [
            Token::Local,
            Token::Ident("x".to_string()),
            Token::Eq,
            Token::Int(42),
        ];
        for tok in &expected {
            assert!(tokens.contains(tok));
        }
    }

    #[test]
    fn test_lex_string() {
        let tokens = lex(r#"local s = "hello world""#);
        assert!(tokens.contains(&Token::Str("hello world".to_string())));
    }

    #[test]
    fn test_lex_float() {
        let has_pi = lex("3.14").into_iter().any(|t| match t {
            Token::Float(v) => (v - 3.14).abs() < 1e-6,
            _ => false,
        });
        assert!(has_pi);
    }

    #[test]
    fn test_lex_operators() {
        let tokens = lex("a == b ~= c <= d >= e");
        for op in [Token::EqEq, Token::NotEq, Token::LtEq, Token::GtEq] {
            assert!(tokens.contains(&op));
        }
    }

    #[test]
    fn test_lex_keywords() {
        let tokens = lex("if x then return end");
        for kw in [Token::If, Token::Then, Token::Return, Token::End] {
            assert!(tokens.contains(&kw));
        }
    }

    #[test]
    fn test_lex_comment_skip() {
        let tokens = lex("local x = 1 -- this is a comment\nlocal y = 2");
        let leaked = tokens
            .iter()
            .any(|t| matches!(t, Token::Ident(name) if name == "this"));
        assert!(!leaked);
    }
}
305}