Skip to main content

aver/
lexer.rs

1use std::fmt;
2use thiserror::Error;
3
/// The category of a lexed token, together with any literal payload.
///
/// Literal and identifier variants carry their decoded value; keywords,
/// operators and structural markers are plain unit variants.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- Literals ----
    /// Integer literal.
    Int(i64),
    /// Floating-point literal (digits, a dot, then more digits).
    Float(f64),
    /// String literal without interpolation; escapes are already resolved.
    Str(String),
    /// String literal containing at least one `{...}` interpolation.
    ///
    /// Each segment is `(is_expr, text)`: `true` marks the raw source text of
    /// an interpolated expression, `false` marks literal text.
    InterpStr(Vec<(bool, String)>),
    /// The keywords `true` / `false`.
    Bool(bool),

    // ---- Identifiers ----
    /// Any identifier that is not a reserved word.
    Ident(String),

    // ---- Keywords ----
    /// `module`
    Module,
    /// `depends`
    Depends,
    /// `exposes`
    Exposes,
    /// `intent`
    Intent,
    /// `type`
    Type,
    /// `record`
    Record,
    /// `fn`
    Fn,
    /// `effects`
    Effects,
    /// `decision`
    Decision,
    /// `verify`
    Verify,
    /// `match`
    Match,

    // ---- Operators and punctuation ----
    /// `->`
    Arrow,
    /// `=>`
    FatArrow,
    /// `==`
    Eq,
    /// `!=`
    Neq,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `=`
    Assign,
    /// `!`
    Bang,
    /// `?`
    Question,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,

    // ---- Structure ----
    /// Indentation increased relative to the enclosing block.
    Indent,
    /// Indentation decreased; closes one open block.
    Dedent,
    /// End of a line that contained at least one token.
    Newline,
    /// End of input; always the final token.
    Eof,
}
57
58impl fmt::Display for TokenKind {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        match self {
61            TokenKind::Int(n) => write!(f, "integer '{}'", n),
62            TokenKind::Float(n) => write!(f, "float '{}'", n),
63            TokenKind::Str(s) => write!(f, "string \"{}\"", s),
64            TokenKind::InterpStr(_) => write!(f, "interpolated string"),
65            TokenKind::Bool(b) => write!(f, "'{}'", b),
66            TokenKind::Ident(s) => write!(f, "'{}'", s),
67            TokenKind::Module => write!(f, "'module'"),
68            TokenKind::Depends => write!(f, "'depends'"),
69            TokenKind::Exposes => write!(f, "'exposes'"),
70            TokenKind::Intent => write!(f, "'intent'"),
71            TokenKind::Type => write!(f, "'type'"),
72            TokenKind::Record => write!(f, "'record'"),
73            TokenKind::Fn => write!(f, "'fn'"),
74            TokenKind::Effects => write!(f, "'effects'"),
75            TokenKind::Decision => write!(f, "'decision'"),
76            TokenKind::Verify => write!(f, "'verify'"),
77            TokenKind::Match => write!(f, "'match'"),
78            TokenKind::Arrow => write!(f, "'->'"),
79            TokenKind::FatArrow => write!(f, "'=>'"),
80            TokenKind::Eq => write!(f, "'=='"),
81            TokenKind::Neq => write!(f, "'!='"),
82            TokenKind::Lte => write!(f, "'<='"),
83            TokenKind::Gte => write!(f, "'>='"),
84            TokenKind::Assign => write!(f, "'='"),
85            TokenKind::Bang => write!(f, "'!'"),
86            TokenKind::Question => write!(f, "'?'"),
87            TokenKind::Lt => write!(f, "'<'"),
88            TokenKind::Gt => write!(f, "'>'"),
89            TokenKind::Plus => write!(f, "'+'"),
90            TokenKind::Minus => write!(f, "'-'"),
91            TokenKind::Star => write!(f, "'*'"),
92            TokenKind::Slash => write!(f, "'/'"),
93            TokenKind::Dot => write!(f, "'.'"),
94            TokenKind::Colon => write!(f, "':'"),
95            TokenKind::Comma => write!(f, "','"),
96            TokenKind::LParen => write!(f, "'('"),
97            TokenKind::RParen => write!(f, "')'"),
98            TokenKind::LBracket => write!(f, "'['"),
99            TokenKind::RBracket => write!(f, "']'"),
100            TokenKind::LBrace => write!(f, "'{{'"),
101            TokenKind::RBrace => write!(f, "'}}'"),
102            TokenKind::Indent => write!(f, "indentation"),
103            TokenKind::Dedent => write!(f, "end of block"),
104            TokenKind::Newline => write!(f, "end of line"),
105            TokenKind::Eof => write!(f, "end of file"),
106        }
107    }
108}
109
110#[derive(Debug, Clone)]
111pub struct Token {
112    pub kind: TokenKind,
113    pub line: usize,
114    pub col: usize,
115}
116
117#[derive(Debug, Error)]
118pub enum LexerError {
119    #[error("error[{line}:{col}]: {msg}")]
120    Error {
121        msg: String,
122        line: usize,
123        col: usize,
124    },
125}
126
127fn keyword(s: &str) -> Option<TokenKind> {
128    match s {
129        "module" => Some(TokenKind::Module),
130        "depends" => Some(TokenKind::Depends),
131        "exposes" => Some(TokenKind::Exposes),
132        "intent" => Some(TokenKind::Intent),
133        "type" => Some(TokenKind::Type),
134        "record" => Some(TokenKind::Record),
135        "fn" => Some(TokenKind::Fn),
136        "effects" => Some(TokenKind::Effects),
137        "decision" => Some(TokenKind::Decision),
138        "verify" => Some(TokenKind::Verify),
139        "match" => Some(TokenKind::Match),
140        "true" => Some(TokenKind::Bool(true)),
141        "false" => Some(TokenKind::Bool(false)),
142        _ => None,
143    }
144}
145
/// Indentation-aware lexer state: the source characters plus a cursor.
pub struct Lexer {
    /// The whole source, pre-split into `char`s so the cursor can index it.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    pos: usize,
    /// Line of the next unread character (starts at 1).
    line: usize,
    /// Column of the next unread character (starts at 1).
    col: usize,
    /// Indentation widths (in spaces) of the currently open blocks;
    /// initialised with a single `0` entry for the top level.
    indent_stack: Vec<usize>,
    /// `true` when the cursor sits at the start of a line whose indentation
    /// has not been measured yet.
    at_line_start: bool,
}
154
155impl Lexer {
156    pub fn new(source: &str) -> Self {
157        Lexer {
158            chars: source.chars().collect(),
159            pos: 0,
160            line: 1,
161            col: 1,
162            indent_stack: vec![0],
163            at_line_start: true,
164        }
165    }
166
167    fn error(&self, msg: impl Into<String>) -> LexerError {
168        LexerError::Error {
169            msg: msg.into(),
170            line: self.line,
171            col: self.col,
172        }
173    }
174
175    fn peek(&self, offset: usize) -> Option<char> {
176        self.chars.get(self.pos + offset).copied()
177    }
178
179    fn current(&self) -> Option<char> {
180        self.chars.get(self.pos).copied()
181    }
182
183    fn advance(&mut self) -> Option<char> {
184        let ch = self.chars.get(self.pos).copied()?;
185        self.pos += 1;
186        if ch == '\n' {
187            self.line += 1;
188            self.col = 1;
189        } else {
190            self.col += 1;
191        }
192        Some(ch)
193    }
194
195    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
196        let mut tokens = Vec::new();
197
198        while self.pos < self.chars.len() {
199            self.scan_token(&mut tokens)?;
200        }
201
202        // Emit remaining DEDENTs
203        while self.indent_stack.len() > 1 {
204            self.indent_stack.pop();
205            tokens.push(Token {
206                kind: TokenKind::Dedent,
207                line: self.line,
208                col: self.col,
209            });
210        }
211
212        tokens.push(Token {
213            kind: TokenKind::Eof,
214            line: self.line,
215            col: self.col,
216        });
217
218        Ok(tokens)
219    }
220
221    fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
222        if self.at_line_start {
223            self.handle_indentation(tokens)?;
224            if self.pos >= self.chars.len() {
225                return Ok(());
226            }
227        }
228
229        let ch = match self.current() {
230            Some(c) => c,
231            None => return Ok(()),
232        };
233
234        // Skip spaces (not at line start)
235        if ch == ' ' {
236            self.advance();
237            return Ok(());
238        }
239
240        // Newline
241        if ch == '\n' {
242            let line = self.line;
243            let col = self.col;
244            self.advance();
245
246            let last_is_structural = tokens
247                .last()
248                .map(|t| {
249                    matches!(
250                        t.kind,
251                        TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
252                    )
253                })
254                .unwrap_or(true);
255
256            if !tokens.is_empty() && !last_is_structural {
257                tokens.push(Token {
258                    kind: TokenKind::Newline,
259                    line,
260                    col,
261                });
262            }
263            self.at_line_start = true;
264            return Ok(());
265        }
266
267        // Carriage return
268        if ch == '\r' {
269            self.advance();
270            return Ok(());
271        }
272
273        // Comments
274        if ch == '/' && self.peek(1) == Some('/') {
275            self.skip_comment();
276            return Ok(());
277        }
278
279        // Strings
280        if ch == '"' {
281            let tok = self.scan_string()?;
282            tokens.push(tok);
283            return Ok(());
284        }
285
286        // Numbers
287        if ch.is_ascii_digit() {
288            let tok = self.scan_number()?;
289            tokens.push(tok);
290            return Ok(());
291        }
292
293        // Identifiers / keywords
294        if ch.is_alphabetic() || ch == '_' {
295            let tok = self.scan_identifier();
296            tokens.push(tok);
297            return Ok(());
298        }
299
300        // Operators
301        let tok = self.scan_operator()?;
302        tokens.push(tok);
303        Ok(())
304    }
305
306    fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
307        self.at_line_start = false;
308        let mut indent = 0;
309
310        while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
311            indent += 1;
312            self.pos += 1;
313            self.col += 1;
314        }
315
316        // Empty line or comment-only line — don't emit indent/dedent
317        if self.pos < self.chars.len() {
318            let ch = self.chars[self.pos];
319            if ch == '\n' || ch == '\r' {
320                return Ok(());
321            }
322            if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
323                return Ok(());
324            }
325        } else {
326            return Ok(());
327        }
328
329        let current = *self.indent_stack.last().unwrap();
330        let line = self.line;
331
332        if indent > current {
333            self.indent_stack.push(indent);
334            tokens.push(Token {
335                kind: TokenKind::Indent,
336                line,
337                col: 1,
338            });
339        } else if indent < current {
340            while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
341                self.indent_stack.pop();
342                tokens.push(Token {
343                    kind: TokenKind::Dedent,
344                    line,
345                    col: 1,
346                });
347            }
348            if *self.indent_stack.last().unwrap() != indent {
349                return Err(self.error(format!("Invalid indentation level: {}", indent)));
350            }
351        }
352
353        Ok(())
354    }
355
356    fn skip_comment(&mut self) {
357        while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
358            self.advance();
359        }
360    }
361
362    fn scan_string(&mut self) -> Result<Token, LexerError> {
363        let line = self.line;
364        let col = self.col;
365        self.advance(); // consume opening "
366
367        let mut parts: Vec<(bool, String)> = Vec::new(); // (is_expr, text)
368        let mut current = String::new();
369        let mut has_interp = false;
370
371        loop {
372            match self.current() {
373                None => return Err(self.error("Unterminated string literal")),
374                Some('"') => {
375                    self.advance();
376                    break;
377                }
378                Some('{') => {
379                    // {{ → literal {, otherwise start interpolation
380                    if self.chars.get(self.pos + 1).copied() == Some('{') {
381                        current.push('{');
382                        self.advance(); // first {
383                        self.advance(); // second {
384                    } else {
385                        has_interp = true;
386                        if !current.is_empty() {
387                            parts.push((false, current.clone()));
388                            current.clear();
389                        }
390                        self.advance(); // consume {
391                        let mut expr_text = String::new();
392                        let mut depth = 1usize;
393                        while self.pos < self.chars.len() && depth > 0 {
394                            match self.chars[self.pos] {
395                                '{' => {
396                                    depth += 1;
397                                    expr_text.push('{');
398                                    self.advance();
399                                }
400                                '}' => {
401                                    depth -= 1;
402                                    if depth > 0 {
403                                        expr_text.push('}');
404                                    }
405                                    self.advance();
406                                }
407                                c => {
408                                    expr_text.push(c);
409                                    self.advance();
410                                }
411                            }
412                        }
413                        parts.push((true, expr_text));
414                    }
415                }
416                Some('}') => {
417                    // }} → literal }, single } is just a literal character
418                    if self.chars.get(self.pos + 1).copied() == Some('}') {
419                        current.push('}');
420                        self.advance(); // first }
421                        self.advance(); // second }
422                    } else {
423                        current.push('}');
424                        self.advance();
425                    }
426                }
427                Some('\\') => {
428                    self.advance();
429                    match self.advance() {
430                        Some('n') => current.push('\n'),
431                        Some('t') => current.push('\t'),
432                        Some('r') => current.push('\r'),
433                        Some('"') => current.push('"'),
434                        Some('\\') => current.push('\\'),
435                        Some(c) => current.push(c),
436                        None => return Err(self.error("Unterminated string literal")),
437                    }
438                }
439                Some('\n') => return Err(self.error("Unterminated string literal")),
440                Some(c) => {
441                    current.push(c);
442                    self.advance();
443                }
444            }
445        }
446
447        if !current.is_empty() {
448            parts.push((false, current));
449        }
450
451        if has_interp {
452            Ok(Token {
453                kind: TokenKind::InterpStr(parts),
454                line,
455                col,
456            })
457        } else {
458            let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
459            Ok(Token {
460                kind: TokenKind::Str(plain),
461                line,
462                col,
463            })
464        }
465    }
466
467    fn scan_number(&mut self) -> Result<Token, LexerError> {
468        let line = self.line;
469        let col = self.col;
470        let mut num_str = String::new();
471        let mut is_float = false;
472
473        while let Some(c) = self.current() {
474            if c.is_ascii_digit() {
475                num_str.push(c);
476                self.advance();
477            } else {
478                break;
479            }
480        }
481
482        if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
483        {
484            is_float = true;
485            num_str.push('.');
486            self.advance(); // consume '.'
487            while let Some(c) = self.current() {
488                if c.is_ascii_digit() {
489                    num_str.push(c);
490                    self.advance();
491                } else {
492                    break;
493                }
494            }
495        }
496
497        if is_float {
498            let f: f64 = num_str
499                .parse()
500                .map_err(|_| self.error("Invalid floating-point number"))?;
501            Ok(Token {
502                kind: TokenKind::Float(f),
503                line,
504                col,
505            })
506        } else {
507            let i: i64 = num_str
508                .parse()
509                .map_err(|_| self.error("Invalid integer literal"))?;
510            Ok(Token {
511                kind: TokenKind::Int(i),
512                line,
513                col,
514            })
515        }
516    }
517
518    fn scan_identifier(&mut self) -> Token {
519        let line = self.line;
520        let col = self.col;
521        let mut ident = String::new();
522
523        while let Some(c) = self.current() {
524            if c.is_alphanumeric() || c == '_' {
525                ident.push(c);
526                self.advance();
527            } else {
528                break;
529            }
530        }
531
532        let kind = keyword(&ident).unwrap_or(TokenKind::Ident(ident));
533        Token { kind, line, col }
534    }
535
536    fn scan_operator(&mut self) -> Result<Token, LexerError> {
537        let line = self.line;
538        let col = self.col;
539        let ch = self.advance().unwrap();
540
541        let kind = match ch {
542            '-' if self.current() == Some('>') => {
543                self.advance();
544                TokenKind::Arrow
545            }
546            '=' if self.current() == Some('>') => {
547                self.advance();
548                TokenKind::FatArrow
549            }
550            '=' if self.current() == Some('=') => {
551                self.advance();
552                TokenKind::Eq
553            }
554            '!' if self.current() == Some('=') => {
555                self.advance();
556                TokenKind::Neq
557            }
558            '<' if self.current() == Some('=') => {
559                self.advance();
560                TokenKind::Lte
561            }
562            '>' if self.current() == Some('=') => {
563                self.advance();
564                TokenKind::Gte
565            }
566            '=' => TokenKind::Assign,
567            '<' => TokenKind::Lt,
568            '>' => TokenKind::Gt,
569            '+' => TokenKind::Plus,
570            '-' => TokenKind::Minus,
571            '*' => TokenKind::Star,
572            '/' => TokenKind::Slash,
573            '!' => TokenKind::Bang,
574            '?' => TokenKind::Question,
575            '.' => TokenKind::Dot,
576            ':' => TokenKind::Colon,
577            ',' => TokenKind::Comma,
578            '(' => TokenKind::LParen,
579            ')' => TokenKind::RParen,
580            '[' => TokenKind::LBracket,
581            ']' => TokenKind::RBracket,
582            '{' => TokenKind::LBrace,
583            '}' => TokenKind::RBrace,
584            other => return Err(self.error(format!("Unknown character: {:?}", other))),
585        };
586
587        Ok(Token { kind, line, col })
588    }
589}