Skip to main content

aver/
lexer.rs

1use std::fmt;
2use thiserror::Error;
3
/// The category of a lexical token, together with any literal payload it carries.
///
/// Values are produced by [`Lexer::tokenize`]. Besides ordinary literals,
/// identifiers, keywords and operators, the lexer also emits structural tokens
/// (`Indent`, `Dedent`, `Newline`, `Eof`) that describe the source layout.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    /// Integer literal, parsed as `i64`.
    Int(i64),
    /// Floating-point literal (digits, a `.`, then at least one digit), parsed as `f64`.
    Float(f64),
    /// String literal that contains no `{...}` interpolation; escapes are already resolved.
    Str(String),
    /// String literal with at least one `{...}` interpolation, split into ordered parts.
    /// Each part is `(is_expr, text)`: `true` marks the raw text found between braces,
    /// `false` marks literal text.
    InterpStr(Vec<(bool, String)>), // (is_expr, text)
    /// Boolean literal, produced for the reserved words `true` and `false`.
    Bool(bool),
    // Identifiers
    /// Any identifier that is not a reserved word.
    Ident(String),
    // Keywords
    Module,
    Depends,
    Exposes,
    Intent,
    Type,
    Record,
    Fn,
    Effects,
    Decision,
    Verify,
    Match,
    // Operators
    Arrow,    // ->
    FatArrow, // =>
    Eq,       // ==
    Neq,      // !=
    Lte,      // <=
    Gte,      // >=
    Assign,   // =
    Bang,     // !
    Question, // ?
    Lt,       // <
    Gt,       // >
    Plus,     // +
    Minus,    // -
    Star,     // *
    Slash,    // /
    Dot,      // .
    Colon,    // :
    Comma,    // ,
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }
    // Structure
    /// Emitted when a line is indented deeper than the enclosing block.
    Indent,
    /// Emitted once per block closed when a line is indented less than the current
    /// block, and for every block still open at end of input.
    Dedent,
    /// End of a line that contained at least one non-structural token.
    Newline,
    /// Always the final token of a successfully lexed source.
    Eof,
}
57
58impl fmt::Display for TokenKind {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        match self {
61            TokenKind::Int(n) => write!(f, "integer '{}'", n),
62            TokenKind::Float(n) => write!(f, "float '{}'", n),
63            TokenKind::Str(s) => write!(f, "string \"{}\"", s),
64            TokenKind::InterpStr(_) => write!(f, "interpolated string"),
65            TokenKind::Bool(b) => write!(f, "'{}'", b),
66            TokenKind::Ident(s) => write!(f, "'{}'", s),
67            TokenKind::Module => write!(f, "'module'"),
68            TokenKind::Depends => write!(f, "'depends'"),
69            TokenKind::Exposes => write!(f, "'exposes'"),
70            TokenKind::Intent => write!(f, "'intent'"),
71            TokenKind::Type => write!(f, "'type'"),
72            TokenKind::Record => write!(f, "'record'"),
73            TokenKind::Fn => write!(f, "'fn'"),
74            TokenKind::Effects => write!(f, "'effects'"),
75            TokenKind::Decision => write!(f, "'decision'"),
76            TokenKind::Verify => write!(f, "'verify'"),
77            TokenKind::Match => write!(f, "'match'"),
78            TokenKind::Arrow => write!(f, "'->'"),
79            TokenKind::FatArrow => write!(f, "'=>'"),
80            TokenKind::Eq => write!(f, "'=='"),
81            TokenKind::Neq => write!(f, "'!='"),
82            TokenKind::Lte => write!(f, "'<='"),
83            TokenKind::Gte => write!(f, "'>='"),
84            TokenKind::Assign => write!(f, "'='"),
85            TokenKind::Bang => write!(f, "'!'"),
86            TokenKind::Question => write!(f, "'?'"),
87            TokenKind::Lt => write!(f, "'<'"),
88            TokenKind::Gt => write!(f, "'>'"),
89            TokenKind::Plus => write!(f, "'+'"),
90            TokenKind::Minus => write!(f, "'-'"),
91            TokenKind::Star => write!(f, "'*'"),
92            TokenKind::Slash => write!(f, "'/'"),
93            TokenKind::Dot => write!(f, "'.'"),
94            TokenKind::Colon => write!(f, "':'"),
95            TokenKind::Comma => write!(f, "','"),
96            TokenKind::LParen => write!(f, "'('"),
97            TokenKind::RParen => write!(f, "')'"),
98            TokenKind::LBracket => write!(f, "'['"),
99            TokenKind::RBracket => write!(f, "']'"),
100            TokenKind::LBrace => write!(f, "'{{'"),
101            TokenKind::RBrace => write!(f, "'}}'"),
102            TokenKind::Indent => write!(f, "indentation"),
103            TokenKind::Dedent => write!(f, "end of block"),
104            TokenKind::Newline => write!(f, "end of line"),
105            TokenKind::Eof => write!(f, "end of file"),
106        }
107    }
108}
109
/// A single token together with the source position where it starts.
#[derive(Debug, Clone)]
pub struct Token {
    /// What kind of token this is, including any literal payload.
    pub kind: TokenKind,
    /// Line on which the token starts; the lexer numbers lines from 1.
    pub line: usize,
    /// Column at which the token starts, counted in characters from 1.
    /// `Indent` and `Dedent` tokens produced at the start of a line use column 1.
    pub col: usize,
}
116
/// Error returned when the lexer cannot tokenize the source.
#[derive(Debug, Error)]
pub enum LexerError {
    /// A lexing failure at a specific position; displayed as `error[line:col]: msg`.
    #[error("error[{line}:{col}]: {msg}")]
    Error {
        /// Human-readable description of the problem.
        msg: String,
        /// Line associated with the error (lines are numbered from 1).
        line: usize,
        /// Column associated with the error (columns are numbered from 1).
        col: usize,
    },
}
126
127fn keyword(s: &str) -> Option<TokenKind> {
128    match s {
129        "module" => Some(TokenKind::Module),
130        "depends" => Some(TokenKind::Depends),
131        "exposes" => Some(TokenKind::Exposes),
132        "intent" => Some(TokenKind::Intent),
133        "type" => Some(TokenKind::Type),
134        "record" => Some(TokenKind::Record),
135        "fn" => Some(TokenKind::Fn),
136        "effects" => Some(TokenKind::Effects),
137        "decision" => Some(TokenKind::Decision),
138        "verify" => Some(TokenKind::Verify),
139        "match" => Some(TokenKind::Match),
140        "true" => Some(TokenKind::Bool(true)),
141        "false" => Some(TokenKind::Bool(false)),
142        _ => None,
143    }
144}
145
/// Indentation-aware tokenizer state over a source string.
///
/// Construct with [`Lexer::new`] and run with [`Lexer::tokenize`].
pub struct Lexer {
    /// The source text, split into characters so it can be indexed by position.
    chars: Vec<char>,
    /// Index into `chars` of the next character to read.
    pos: usize,
    /// Line of the next character to read; starts at 1.
    line: usize,
    /// Column of the next character to read; starts at 1 and resets after each `\n`.
    col: usize,
    /// Indentation widths (in spaces) of the currently open blocks, innermost last.
    /// Starts as `[0]`; the bottom entry is never popped.
    indent_stack: Vec<usize>,
    /// `true` when the next scan step must measure the indentation of a new line.
    at_line_start: bool,
}
154
155impl Lexer {
156    pub fn new(source: &str) -> Self {
157        Lexer {
158            chars: source.chars().collect(),
159            pos: 0,
160            line: 1,
161            col: 1,
162            indent_stack: vec![0],
163            at_line_start: true,
164        }
165    }
166
167    fn error(&self, msg: impl Into<String>) -> LexerError {
168        LexerError::Error {
169            msg: msg.into(),
170            line: self.line,
171            col: self.col,
172        }
173    }
174
175    fn peek(&self, offset: usize) -> Option<char> {
176        self.chars.get(self.pos + offset).copied()
177    }
178
179    fn current(&self) -> Option<char> {
180        self.chars.get(self.pos).copied()
181    }
182
183    fn advance(&mut self) -> Option<char> {
184        let ch = self.chars.get(self.pos).copied()?;
185        self.pos += 1;
186        if ch == '\n' {
187            self.line += 1;
188            self.col = 1;
189        } else {
190            self.col += 1;
191        }
192        Some(ch)
193    }
194
195    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
196        let mut tokens = Vec::new();
197
198        while self.pos < self.chars.len() {
199            self.scan_token(&mut tokens)?;
200        }
201
202        // Emit remaining DEDENTs
203        while self.indent_stack.len() > 1 {
204            self.indent_stack.pop();
205            tokens.push(Token {
206                kind: TokenKind::Dedent,
207                line: self.line,
208                col: self.col,
209            });
210        }
211
212        tokens.push(Token {
213            kind: TokenKind::Eof,
214            line: self.line,
215            col: self.col,
216        });
217
218        Ok(tokens)
219    }
220
221    fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
222        if self.at_line_start {
223            self.handle_indentation(tokens)?;
224            if self.pos >= self.chars.len() {
225                return Ok(());
226            }
227        }
228
229        let ch = match self.current() {
230            Some(c) => c,
231            None => return Ok(()),
232        };
233
234        // Skip spaces (not at line start)
235        if ch == ' ' {
236            self.advance();
237            return Ok(());
238        }
239
240        // Newline
241        if ch == '\n' {
242            let line = self.line;
243            let col = self.col;
244            self.advance();
245
246            let last_is_structural = tokens
247                .last()
248                .map(|t| {
249                    matches!(
250                        t.kind,
251                        TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
252                    )
253                })
254                .unwrap_or(true);
255
256            if !tokens.is_empty() && !last_is_structural {
257                tokens.push(Token {
258                    kind: TokenKind::Newline,
259                    line,
260                    col,
261                });
262            }
263            self.at_line_start = true;
264            return Ok(());
265        }
266
267        // Carriage return
268        if ch == '\r' {
269            self.advance();
270            return Ok(());
271        }
272
273        // Comments
274        if ch == '/' && self.peek(1) == Some('/') {
275            self.skip_comment();
276            return Ok(());
277        }
278
279        // Strings
280        if ch == '"' {
281            let tok = self.scan_string()?;
282            tokens.push(tok);
283            return Ok(());
284        }
285
286        // Numbers
287        if ch.is_ascii_digit() {
288            let tok = self.scan_number()?;
289            tokens.push(tok);
290            return Ok(());
291        }
292
293        // Identifiers / keywords
294        if ch.is_alphabetic() || ch == '_' {
295            let tok = self.scan_identifier();
296            tokens.push(tok);
297            return Ok(());
298        }
299
300        // Operators
301        let tok = self.scan_operator()?;
302        tokens.push(tok);
303        Ok(())
304    }
305
306    fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
307        self.at_line_start = false;
308        let mut indent = 0;
309
310        while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
311            indent += 1;
312            self.pos += 1;
313            self.col += 1;
314        }
315
316        // Empty line or comment-only line — don't emit indent/dedent
317        if self.pos < self.chars.len() {
318            let ch = self.chars[self.pos];
319            if ch == '\n' || ch == '\r' {
320                return Ok(());
321            }
322            if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
323                return Ok(());
324            }
325        } else {
326            return Ok(());
327        }
328
329        let current = *self.indent_stack.last().unwrap();
330        let line = self.line;
331
332        if indent > current {
333            self.indent_stack.push(indent);
334            tokens.push(Token {
335                kind: TokenKind::Indent,
336                line,
337                col: 1,
338            });
339        } else if indent < current {
340            while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
341                self.indent_stack.pop();
342                tokens.push(Token {
343                    kind: TokenKind::Dedent,
344                    line,
345                    col: 1,
346                });
347            }
348            if *self.indent_stack.last().unwrap() != indent {
349                return Err(self.error(format!(
350                    "Invalid indentation level: {indent}. Aver uses \
351                     significant indentation with one consistent step per \
352                     block — every line in the same block dedents back to \
353                     a previously-opened indent level. Common cause: a \
354                     wrapped `fn` signature or a multi-line argument list \
355                     (Aver doesn't support either — keep each declaration \
356                     on a single line, or split the body into a named \
357                     helper function)."
358                )));
359            }
360        }
361
362        Ok(())
363    }
364
365    fn skip_comment(&mut self) {
366        while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
367            self.advance();
368        }
369    }
370
371    fn scan_string(&mut self) -> Result<Token, LexerError> {
372        let line = self.line;
373        let col = self.col;
374        self.advance(); // consume opening "
375
376        let mut parts: Vec<(bool, String)> = Vec::new(); // (is_expr, text)
377        let mut current = String::new();
378        let mut has_interp = false;
379
380        loop {
381            match self.current() {
382                None => return Err(self.error("Unterminated string literal")),
383                Some('"') => {
384                    self.advance();
385                    break;
386                }
387                Some('{') => {
388                    // {{ → literal {, otherwise start interpolation
389                    if self.chars.get(self.pos + 1).copied() == Some('{') {
390                        current.push('{');
391                        self.advance(); // first {
392                        self.advance(); // second {
393                    } else {
394                        has_interp = true;
395                        if !current.is_empty() {
396                            parts.push((false, current.clone()));
397                            current.clear();
398                        }
399                        self.advance(); // consume {
400                        let mut expr_text = String::new();
401                        let mut depth = 1usize;
402                        while self.pos < self.chars.len() && depth > 0 {
403                            match self.chars[self.pos] {
404                                '{' => {
405                                    depth += 1;
406                                    expr_text.push('{');
407                                    self.advance();
408                                }
409                                '}' => {
410                                    depth -= 1;
411                                    if depth > 0 {
412                                        expr_text.push('}');
413                                    }
414                                    self.advance();
415                                }
416                                c => {
417                                    expr_text.push(c);
418                                    self.advance();
419                                }
420                            }
421                        }
422                        parts.push((true, expr_text));
423                    }
424                }
425                Some('}') => {
426                    // }} → literal }, single } is just a literal character
427                    if self.chars.get(self.pos + 1).copied() == Some('}') {
428                        current.push('}');
429                        self.advance(); // first }
430                        self.advance(); // second }
431                    } else {
432                        current.push('}');
433                        self.advance();
434                    }
435                }
436                Some('\\') => {
437                    self.advance();
438                    match self.advance() {
439                        Some('b') => current.push('\u{0008}'),
440                        Some('f') => current.push('\u{000C}'),
441                        Some('n') => current.push('\n'),
442                        Some('t') => current.push('\t'),
443                        Some('r') => current.push('\r'),
444                        Some('"') => current.push('"'),
445                        Some('\\') => current.push('\\'),
446                        Some(c) => current.push(c),
447                        None => return Err(self.error("Unterminated string literal")),
448                    }
449                }
450                Some('\n') => return Err(self.error("Unterminated string literal")),
451                Some(c) => {
452                    current.push(c);
453                    self.advance();
454                }
455            }
456        }
457
458        if !current.is_empty() {
459            parts.push((false, current));
460        }
461
462        if has_interp {
463            Ok(Token {
464                kind: TokenKind::InterpStr(parts),
465                line,
466                col,
467            })
468        } else {
469            let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
470            Ok(Token {
471                kind: TokenKind::Str(plain),
472                line,
473                col,
474            })
475        }
476    }
477
478    fn scan_number(&mut self) -> Result<Token, LexerError> {
479        let line = self.line;
480        let col = self.col;
481        let mut num_str = String::new();
482        let mut is_float = false;
483
484        while let Some(c) = self.current() {
485            if c.is_ascii_digit() {
486                num_str.push(c);
487                self.advance();
488            } else {
489                break;
490            }
491        }
492
493        if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
494        {
495            is_float = true;
496            num_str.push('.');
497            self.advance(); // consume '.'
498            while let Some(c) = self.current() {
499                if c.is_ascii_digit() {
500                    num_str.push(c);
501                    self.advance();
502                } else {
503                    break;
504                }
505            }
506        }
507
508        if is_float {
509            let f: f64 = num_str
510                .parse()
511                .map_err(|_| self.error("Invalid floating-point number"))?;
512            Ok(Token {
513                kind: TokenKind::Float(f),
514                line,
515                col,
516            })
517        } else {
518            let i: i64 = num_str
519                .parse()
520                .map_err(|_| self.error("Invalid integer literal"))?;
521            Ok(Token {
522                kind: TokenKind::Int(i),
523                line,
524                col,
525            })
526        }
527    }
528
529    fn scan_identifier(&mut self) -> Token {
530        let line = self.line;
531        let col = self.col;
532        let mut ident = String::new();
533
534        while let Some(c) = self.current() {
535            if c.is_alphanumeric() || c == '_' {
536                ident.push(c);
537                self.advance();
538            } else {
539                break;
540            }
541        }
542
543        let kind = keyword(&ident).unwrap_or(TokenKind::Ident(ident));
544        Token { kind, line, col }
545    }
546
547    fn scan_operator(&mut self) -> Result<Token, LexerError> {
548        let line = self.line;
549        let col = self.col;
550        let ch = self.advance().unwrap();
551
552        let kind = match ch {
553            '-' if self.current() == Some('>') => {
554                self.advance();
555                TokenKind::Arrow
556            }
557            '=' if self.current() == Some('>') => {
558                self.advance();
559                TokenKind::FatArrow
560            }
561            '=' if self.current() == Some('=') => {
562                self.advance();
563                TokenKind::Eq
564            }
565            '!' if self.current() == Some('=') => {
566                self.advance();
567                TokenKind::Neq
568            }
569            '<' if self.current() == Some('=') => {
570                self.advance();
571                TokenKind::Lte
572            }
573            '>' if self.current() == Some('=') => {
574                self.advance();
575                TokenKind::Gte
576            }
577            '=' => TokenKind::Assign,
578            '<' => TokenKind::Lt,
579            '>' => TokenKind::Gt,
580            '+' => TokenKind::Plus,
581            '-' => TokenKind::Minus,
582            '*' => TokenKind::Star,
583            '/' => TokenKind::Slash,
584            '!' => TokenKind::Bang,
585            '?' => TokenKind::Question,
586            '.' => TokenKind::Dot,
587            ':' => TokenKind::Colon,
588            ',' => TokenKind::Comma,
589            '(' => TokenKind::LParen,
590            ')' => TokenKind::RParen,
591            '[' => TokenKind::LBracket,
592            ']' => TokenKind::RBracket,
593            '{' => TokenKind::LBrace,
594            '}' => TokenKind::RBrace,
595            other => return Err(self.error(format!("Unknown character: {:?}", other))),
596        };
597
598        Ok(Token { kind, line, col })
599    }
600}