//! `aver/lexer.rs` — indentation-aware lexer that turns source text into a flat list of tokens.

1use std::fmt;
2use thiserror::Error;
3
/// Every kind of token the lexer can produce.
///
/// Literal and identifier variants carry their decoded payload; keyword,
/// operator and structure variants are plain markers.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    /// Integer literal (a run of ASCII digits parsed as `i64`).
    Int(i64),
    /// Floating-point literal (`digits '.' digits`).
    Float(f64),
    /// String literal without interpolation; escapes are already resolved.
    Str(String),
    /// String literal containing at least one `{...}` interpolation.
    ///
    /// Parts are in source order as `(is_expr, text)`: `true` marks the raw
    /// source text found between the braces, `false` marks literal text.
    InterpStr(Vec<(bool, String)>),
    /// The `true` / `false` keywords.
    Bool(bool),
    // Identifiers
    /// Any identifier that is not a keyword.
    Ident(String),
    // Keywords
    Module,
    Depends,
    Exposes,
    Intent,
    Type,
    Record,
    Fn,
    Effects,
    Decision,
    Verify,
    Match,
    // Operators
    Arrow,    // ->
    FatArrow, // =>
    Eq,       // ==
    Neq,      // !=
    Lte,      // <=
    Gte,      // >=
    Assign,   // =
    Bang,     // !
    Question, // ?
    Lt,       // <
    Gt,       // >
    Plus,     // +
    Minus,    // -
    Star,     // *
    Slash,    // /
    Dot,      // .
    Colon,    // :
    Comma,    // ,
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }
    // Structure
    /// Indentation increased relative to the enclosing block.
    Indent,
    /// Indentation decreased; one token is emitted per closed block.
    Dedent,
    /// End of a line that contained at least one non-structural token.
    Newline,
    /// End of input; always the last token.
    Eof,
}
57
58impl fmt::Display for TokenKind {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        match self {
61            TokenKind::Int(n) => write!(f, "integer '{}'", n),
62            TokenKind::Float(n) => write!(f, "float '{}'", n),
63            TokenKind::Str(s) => write!(f, "string \"{}\"", s),
64            TokenKind::InterpStr(_) => write!(f, "interpolated string"),
65            TokenKind::Bool(b) => write!(f, "'{}'", b),
66            TokenKind::Ident(s) => write!(f, "'{}'", s),
67            TokenKind::Module => write!(f, "'module'"),
68            TokenKind::Depends => write!(f, "'depends'"),
69            TokenKind::Exposes => write!(f, "'exposes'"),
70            TokenKind::Intent => write!(f, "'intent'"),
71            TokenKind::Type => write!(f, "'type'"),
72            TokenKind::Record => write!(f, "'record'"),
73            TokenKind::Fn => write!(f, "'fn'"),
74            TokenKind::Effects => write!(f, "'effects'"),
75            TokenKind::Decision => write!(f, "'decision'"),
76            TokenKind::Verify => write!(f, "'verify'"),
77            TokenKind::Match => write!(f, "'match'"),
78            TokenKind::Arrow => write!(f, "'->'"),
79            TokenKind::FatArrow => write!(f, "'=>'"),
80            TokenKind::Eq => write!(f, "'=='"),
81            TokenKind::Neq => write!(f, "'!='"),
82            TokenKind::Lte => write!(f, "'<='"),
83            TokenKind::Gte => write!(f, "'>='"),
84            TokenKind::Assign => write!(f, "'='"),
85            TokenKind::Bang => write!(f, "'!'"),
86            TokenKind::Question => write!(f, "'?'"),
87            TokenKind::Lt => write!(f, "'<'"),
88            TokenKind::Gt => write!(f, "'>'"),
89            TokenKind::Plus => write!(f, "'+'"),
90            TokenKind::Minus => write!(f, "'-'"),
91            TokenKind::Star => write!(f, "'*'"),
92            TokenKind::Slash => write!(f, "'/'"),
93            TokenKind::Dot => write!(f, "'.'"),
94            TokenKind::Colon => write!(f, "':'"),
95            TokenKind::Comma => write!(f, "','"),
96            TokenKind::LParen => write!(f, "'('"),
97            TokenKind::RParen => write!(f, "')'"),
98            TokenKind::LBracket => write!(f, "'['"),
99            TokenKind::RBracket => write!(f, "']'"),
100            TokenKind::LBrace => write!(f, "'{{'"),
101            TokenKind::RBrace => write!(f, "'}}'"),
102            TokenKind::Indent => write!(f, "indentation"),
103            TokenKind::Dedent => write!(f, "end of block"),
104            TokenKind::Newline => write!(f, "end of line"),
105            TokenKind::Eof => write!(f, "end of file"),
106        }
107    }
108}
109
110#[derive(Debug, Clone)]
111pub struct Token {
112    pub kind: TokenKind,
113    pub line: usize,
114    pub col: usize,
115}
116
117#[derive(Debug, Error)]
118pub enum LexerError {
119    #[error("error[{line}:{col}]: {msg}")]
120    Error {
121        msg: String,
122        line: usize,
123        col: usize,
124    },
125}
126
127fn keyword(s: &str) -> Option<TokenKind> {
128    match s {
129        "module" => Some(TokenKind::Module),
130        "depends" => Some(TokenKind::Depends),
131        "exposes" => Some(TokenKind::Exposes),
132        "intent" => Some(TokenKind::Intent),
133        "type" => Some(TokenKind::Type),
134        "record" => Some(TokenKind::Record),
135        "fn" => Some(TokenKind::Fn),
136        "effects" => Some(TokenKind::Effects),
137        "decision" => Some(TokenKind::Decision),
138        "verify" => Some(TokenKind::Verify),
139        "match" => Some(TokenKind::Match),
140        "true" => Some(TokenKind::Bool(true)),
141        "false" => Some(TokenKind::Bool(false)),
142        _ => None,
143    }
144}
145
/// Indentation-aware lexer state over an in-memory source.
pub struct Lexer {
    /// Source decoded into characters so positions index characters, not bytes.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    pos: usize,
    /// 1-based line of the next unread character.
    line: usize,
    /// 1-based column of the next unread character.
    col: usize,
    /// Indentation widths of the currently open blocks; the bottom entry is 0.
    indent_stack: Vec<usize>,
    /// True when the next character starts a line whose indentation has not
    /// been measured yet.
    at_line_start: bool,
}
154
155impl Lexer {
156    pub fn new(source: &str) -> Self {
157        Lexer {
158            chars: source.chars().collect(),
159            pos: 0,
160            line: 1,
161            col: 1,
162            indent_stack: vec![0],
163            at_line_start: true,
164        }
165    }
166
167    fn error(&self, msg: impl Into<String>) -> LexerError {
168        LexerError::Error {
169            msg: msg.into(),
170            line: self.line,
171            col: self.col,
172        }
173    }
174
175    fn peek(&self, offset: usize) -> Option<char> {
176        self.chars.get(self.pos + offset).copied()
177    }
178
179    fn current(&self) -> Option<char> {
180        self.chars.get(self.pos).copied()
181    }
182
183    fn advance(&mut self) -> Option<char> {
184        let ch = self.chars.get(self.pos).copied()?;
185        self.pos += 1;
186        if ch == '\n' {
187            self.line += 1;
188            self.col = 1;
189        } else {
190            self.col += 1;
191        }
192        Some(ch)
193    }
194
195    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
196        let mut tokens = Vec::new();
197
198        while self.pos < self.chars.len() {
199            self.scan_token(&mut tokens)?;
200        }
201
202        // Emit remaining DEDENTs
203        while self.indent_stack.len() > 1 {
204            self.indent_stack.pop();
205            tokens.push(Token {
206                kind: TokenKind::Dedent,
207                line: self.line,
208                col: self.col,
209            });
210        }
211
212        tokens.push(Token {
213            kind: TokenKind::Eof,
214            line: self.line,
215            col: self.col,
216        });
217
218        Ok(tokens)
219    }
220
221    fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
222        if self.at_line_start {
223            self.handle_indentation(tokens)?;
224            if self.pos >= self.chars.len() {
225                return Ok(());
226            }
227        }
228
229        let ch = match self.current() {
230            Some(c) => c,
231            None => return Ok(()),
232        };
233
234        // Skip spaces (not at line start)
235        if ch == ' ' {
236            self.advance();
237            return Ok(());
238        }
239
240        // Newline
241        if ch == '\n' {
242            let line = self.line;
243            let col = self.col;
244            self.advance();
245
246            let last_is_structural = tokens
247                .last()
248                .map(|t| {
249                    matches!(
250                        t.kind,
251                        TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
252                    )
253                })
254                .unwrap_or(true);
255
256            if !tokens.is_empty() && !last_is_structural {
257                tokens.push(Token {
258                    kind: TokenKind::Newline,
259                    line,
260                    col,
261                });
262            }
263            self.at_line_start = true;
264            return Ok(());
265        }
266
267        // Carriage return
268        if ch == '\r' {
269            self.advance();
270            return Ok(());
271        }
272
273        // Comments
274        if ch == '/' && self.peek(1) == Some('/') {
275            self.skip_comment();
276            return Ok(());
277        }
278
279        // Strings
280        if ch == '"' {
281            let tok = self.scan_string()?;
282            tokens.push(tok);
283            return Ok(());
284        }
285
286        // Numbers
287        if ch.is_ascii_digit() {
288            let tok = self.scan_number()?;
289            tokens.push(tok);
290            return Ok(());
291        }
292
293        // Identifiers / keywords
294        if ch.is_alphabetic() || ch == '_' {
295            let tok = self.scan_identifier();
296            tokens.push(tok);
297            return Ok(());
298        }
299
300        // Operators
301        let tok = self.scan_operator()?;
302        tokens.push(tok);
303        Ok(())
304    }
305
306    fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
307        self.at_line_start = false;
308        let mut indent = 0;
309
310        while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
311            indent += 1;
312            self.pos += 1;
313            self.col += 1;
314        }
315
316        // Empty line or comment-only line — don't emit indent/dedent
317        if self.pos < self.chars.len() {
318            let ch = self.chars[self.pos];
319            if ch == '\n' || ch == '\r' {
320                return Ok(());
321            }
322            if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
323                return Ok(());
324            }
325        } else {
326            return Ok(());
327        }
328
329        let current = *self.indent_stack.last().unwrap();
330        let line = self.line;
331
332        if indent > current {
333            self.indent_stack.push(indent);
334            tokens.push(Token {
335                kind: TokenKind::Indent,
336                line,
337                col: 1,
338            });
339        } else if indent < current {
340            while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
341                self.indent_stack.pop();
342                tokens.push(Token {
343                    kind: TokenKind::Dedent,
344                    line,
345                    col: 1,
346                });
347            }
348            if *self.indent_stack.last().unwrap() != indent {
349                return Err(self.error(format!("Invalid indentation level: {}", indent)));
350            }
351        }
352
353        Ok(())
354    }
355
356    fn skip_comment(&mut self) {
357        while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
358            self.advance();
359        }
360    }
361
362    fn scan_string(&mut self) -> Result<Token, LexerError> {
363        let line = self.line;
364        let col = self.col;
365        self.advance(); // consume opening "
366
367        let mut parts: Vec<(bool, String)> = Vec::new(); // (is_expr, text)
368        let mut current = String::new();
369        let mut has_interp = false;
370
371        loop {
372            match self.current() {
373                None => return Err(self.error("Unterminated string literal")),
374                Some('"') => {
375                    self.advance();
376                    break;
377                }
378                Some('{') => {
379                    // {{ → literal {, otherwise start interpolation
380                    if self.chars.get(self.pos + 1).copied() == Some('{') {
381                        current.push('{');
382                        self.advance(); // first {
383                        self.advance(); // second {
384                    } else {
385                        has_interp = true;
386                        if !current.is_empty() {
387                            parts.push((false, current.clone()));
388                            current.clear();
389                        }
390                        self.advance(); // consume {
391                        let mut expr_text = String::new();
392                        let mut depth = 1usize;
393                        while self.pos < self.chars.len() && depth > 0 {
394                            match self.chars[self.pos] {
395                                '{' => {
396                                    depth += 1;
397                                    expr_text.push('{');
398                                    self.advance();
399                                }
400                                '}' => {
401                                    depth -= 1;
402                                    if depth > 0 {
403                                        expr_text.push('}');
404                                    }
405                                    self.advance();
406                                }
407                                c => {
408                                    expr_text.push(c);
409                                    self.advance();
410                                }
411                            }
412                        }
413                        parts.push((true, expr_text));
414                    }
415                }
416                Some('}') => {
417                    // }} → literal }, single } is just a literal character
418                    if self.chars.get(self.pos + 1).copied() == Some('}') {
419                        current.push('}');
420                        self.advance(); // first }
421                        self.advance(); // second }
422                    } else {
423                        current.push('}');
424                        self.advance();
425                    }
426                }
427                Some('\\') => {
428                    self.advance();
429                    match self.advance() {
430                        Some('b') => current.push('\u{0008}'),
431                        Some('f') => current.push('\u{000C}'),
432                        Some('n') => current.push('\n'),
433                        Some('t') => current.push('\t'),
434                        Some('r') => current.push('\r'),
435                        Some('"') => current.push('"'),
436                        Some('\\') => current.push('\\'),
437                        Some(c) => current.push(c),
438                        None => return Err(self.error("Unterminated string literal")),
439                    }
440                }
441                Some('\n') => return Err(self.error("Unterminated string literal")),
442                Some(c) => {
443                    current.push(c);
444                    self.advance();
445                }
446            }
447        }
448
449        if !current.is_empty() {
450            parts.push((false, current));
451        }
452
453        if has_interp {
454            Ok(Token {
455                kind: TokenKind::InterpStr(parts),
456                line,
457                col,
458            })
459        } else {
460            let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
461            Ok(Token {
462                kind: TokenKind::Str(plain),
463                line,
464                col,
465            })
466        }
467    }
468
469    fn scan_number(&mut self) -> Result<Token, LexerError> {
470        let line = self.line;
471        let col = self.col;
472        let mut num_str = String::new();
473        let mut is_float = false;
474
475        while let Some(c) = self.current() {
476            if c.is_ascii_digit() {
477                num_str.push(c);
478                self.advance();
479            } else {
480                break;
481            }
482        }
483
484        if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
485        {
486            is_float = true;
487            num_str.push('.');
488            self.advance(); // consume '.'
489            while let Some(c) = self.current() {
490                if c.is_ascii_digit() {
491                    num_str.push(c);
492                    self.advance();
493                } else {
494                    break;
495                }
496            }
497        }
498
499        if is_float {
500            let f: f64 = num_str
501                .parse()
502                .map_err(|_| self.error("Invalid floating-point number"))?;
503            Ok(Token {
504                kind: TokenKind::Float(f),
505                line,
506                col,
507            })
508        } else {
509            let i: i64 = num_str
510                .parse()
511                .map_err(|_| self.error("Invalid integer literal"))?;
512            Ok(Token {
513                kind: TokenKind::Int(i),
514                line,
515                col,
516            })
517        }
518    }
519
520    fn scan_identifier(&mut self) -> Token {
521        let line = self.line;
522        let col = self.col;
523        let mut ident = String::new();
524
525        while let Some(c) = self.current() {
526            if c.is_alphanumeric() || c == '_' {
527                ident.push(c);
528                self.advance();
529            } else {
530                break;
531            }
532        }
533
534        let kind = keyword(&ident).unwrap_or(TokenKind::Ident(ident));
535        Token { kind, line, col }
536    }
537
538    fn scan_operator(&mut self) -> Result<Token, LexerError> {
539        let line = self.line;
540        let col = self.col;
541        let ch = self.advance().unwrap();
542
543        let kind = match ch {
544            '-' if self.current() == Some('>') => {
545                self.advance();
546                TokenKind::Arrow
547            }
548            '=' if self.current() == Some('>') => {
549                self.advance();
550                TokenKind::FatArrow
551            }
552            '=' if self.current() == Some('=') => {
553                self.advance();
554                TokenKind::Eq
555            }
556            '!' if self.current() == Some('=') => {
557                self.advance();
558                TokenKind::Neq
559            }
560            '<' if self.current() == Some('=') => {
561                self.advance();
562                TokenKind::Lte
563            }
564            '>' if self.current() == Some('=') => {
565                self.advance();
566                TokenKind::Gte
567            }
568            '=' => TokenKind::Assign,
569            '<' => TokenKind::Lt,
570            '>' => TokenKind::Gt,
571            '+' => TokenKind::Plus,
572            '-' => TokenKind::Minus,
573            '*' => TokenKind::Star,
574            '/' => TokenKind::Slash,
575            '!' => TokenKind::Bang,
576            '?' => TokenKind::Question,
577            '.' => TokenKind::Dot,
578            ':' => TokenKind::Colon,
579            ',' => TokenKind::Comma,
580            '(' => TokenKind::LParen,
581            ')' => TokenKind::RParen,
582            '[' => TokenKind::LBracket,
583            ']' => TokenKind::RBracket,
584            '{' => TokenKind::LBrace,
585            '}' => TokenKind::RBrace,
586            other => return Err(self.error(format!("Unknown character: {:?}", other))),
587        };
588
589        Ok(Token { kind, line, col })
590    }
591}