Skip to main content

aver/
lexer.rs

1use std::fmt;
2use thiserror::Error;
3
/// Every kind of token the lexer can emit.
///
/// Literal and identifier variants carry their decoded payload; all other
/// variants are payload-free markers for keywords, operators, punctuation and
/// layout structure.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ----- Literals -----
    /// Integer literal.
    Int(i64),
    /// Floating-point literal.
    Float(f64),
    /// Plain string literal (no interpolation segments).
    Str(String),
    /// Interpolated string, as a sequence of `(is_expr, text)` segments:
    /// `is_expr == true` marks raw expression source, `false` marks literal text.
    InterpStr(Vec<(bool, String)>),
    /// Boolean literal (`true` / `false`).
    Bool(bool),

    // ----- Identifiers -----
    /// Any identifier that is not a keyword.
    Ident(String),

    // ----- Keywords -----
    /// `module`
    Module,
    /// `depends`
    Depends,
    /// `exposes`
    Exposes,
    /// `intent`
    Intent,
    /// `type`
    Type,
    /// `record`
    Record,
    /// `fn`
    Fn,
    /// `effect`
    Effect,
    /// `effects`
    Effects,
    /// `service`
    Service,
    /// `needs`
    Needs,
    /// `decision`
    Decision,
    /// `verify`
    Verify,
    /// `case`
    Case,
    /// `match`
    Match,
    /// `where`
    Where,
    /// `input`
    Input,
    /// `expect`
    Expect,

    // ----- Operators and punctuation -----
    /// `->`
    Arrow,
    /// `|>`
    Pipe,
    /// `=>`
    FatArrow,
    /// `==`
    Eq,
    /// `!=`
    Neq,
    /// `<=`
    Lte,
    /// `>=`
    Gte,
    /// `=`
    Assign,
    /// `!`
    Bang,
    /// `?`
    Question,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,

    // ----- Structure -----
    /// Indentation increased relative to the enclosing block.
    Indent,
    /// Indentation decreased; closes one block level.
    Dedent,
    /// End of a logical line.
    Newline,
    /// End of input.
    Eof,
}
65
66impl fmt::Display for TokenKind {
67    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68        match self {
69            TokenKind::Int(n) => write!(f, "integer '{}'", n),
70            TokenKind::Float(n) => write!(f, "float '{}'", n),
71            TokenKind::Str(s) => write!(f, "string \"{}\"", s),
72            TokenKind::InterpStr(_) => write!(f, "interpolated string"),
73            TokenKind::Bool(b) => write!(f, "'{}'", b),
74            TokenKind::Ident(s) => write!(f, "'{}'", s),
75            TokenKind::Module => write!(f, "'module'"),
76            TokenKind::Depends => write!(f, "'depends'"),
77            TokenKind::Exposes => write!(f, "'exposes'"),
78            TokenKind::Intent => write!(f, "'intent'"),
79            TokenKind::Type => write!(f, "'type'"),
80            TokenKind::Record => write!(f, "'record'"),
81            TokenKind::Fn => write!(f, "'fn'"),
82            TokenKind::Effect => write!(f, "'effect'"),
83            TokenKind::Effects => write!(f, "'effects'"),
84            TokenKind::Service => write!(f, "'service'"),
85            TokenKind::Needs => write!(f, "'needs'"),
86            TokenKind::Decision => write!(f, "'decision'"),
87            TokenKind::Verify => write!(f, "'verify'"),
88            TokenKind::Case => write!(f, "'case'"),
89            TokenKind::Match => write!(f, "'match'"),
90            TokenKind::Where => write!(f, "'where'"),
91            TokenKind::Input => write!(f, "'input'"),
92            TokenKind::Expect => write!(f, "'expect'"),
93            TokenKind::Arrow => write!(f, "'->'"),
94            TokenKind::Pipe => write!(f, "'|>'"),
95            TokenKind::FatArrow => write!(f, "'=>'"),
96            TokenKind::Eq => write!(f, "'=='"),
97            TokenKind::Neq => write!(f, "'!='"),
98            TokenKind::Lte => write!(f, "'<='"),
99            TokenKind::Gte => write!(f, "'>='"),
100            TokenKind::Assign => write!(f, "'='"),
101            TokenKind::Bang => write!(f, "'!'"),
102            TokenKind::Question => write!(f, "'?'"),
103            TokenKind::Lt => write!(f, "'<'"),
104            TokenKind::Gt => write!(f, "'>'"),
105            TokenKind::Plus => write!(f, "'+'"),
106            TokenKind::Minus => write!(f, "'-'"),
107            TokenKind::Star => write!(f, "'*'"),
108            TokenKind::Slash => write!(f, "'/'"),
109            TokenKind::Dot => write!(f, "'.'"),
110            TokenKind::Colon => write!(f, "':'"),
111            TokenKind::Comma => write!(f, "','"),
112            TokenKind::LParen => write!(f, "'('"),
113            TokenKind::RParen => write!(f, "')'"),
114            TokenKind::LBracket => write!(f, "'['"),
115            TokenKind::RBracket => write!(f, "']'"),
116            TokenKind::LBrace => write!(f, "'{{'"),
117            TokenKind::RBrace => write!(f, "'}}'"),
118            TokenKind::Indent => write!(f, "indentation"),
119            TokenKind::Dedent => write!(f, "end of block"),
120            TokenKind::Newline => write!(f, "end of line"),
121            TokenKind::Eof => write!(f, "end of file"),
122        }
123    }
124}
125
126#[derive(Debug, Clone)]
127pub struct Token {
128    pub kind: TokenKind,
129    pub line: usize,
130    pub col: usize,
131}
132
133#[derive(Debug, Error)]
134pub enum LexerError {
135    #[error("error[{line}:{col}]: {msg}")]
136    Error {
137        msg: String,
138        line: usize,
139        col: usize,
140    },
141}
142
143fn keyword(s: &str) -> Option<TokenKind> {
144    match s {
145        "module" => Some(TokenKind::Module),
146        "depends" => Some(TokenKind::Depends),
147        "exposes" => Some(TokenKind::Exposes),
148        "intent" => Some(TokenKind::Intent),
149        "type" => Some(TokenKind::Type),
150        "record" => Some(TokenKind::Record),
151        "fn" => Some(TokenKind::Fn),
152        "effect" => Some(TokenKind::Effect),
153        "effects" => Some(TokenKind::Effects),
154        "service" => Some(TokenKind::Service),
155        "needs" => Some(TokenKind::Needs),
156        "decision" => Some(TokenKind::Decision),
157        "verify" => Some(TokenKind::Verify),
158        "case" => Some(TokenKind::Case),
159        "match" => Some(TokenKind::Match),
160        "where" => Some(TokenKind::Where),
161        "input" => Some(TokenKind::Input),
162        "expect" => Some(TokenKind::Expect),
163        "true" => Some(TokenKind::Bool(true)),
164        "false" => Some(TokenKind::Bool(false)),
165        _ => None,
166    }
167}
168
/// Indentation-aware tokenizer state.
pub struct Lexer {
    /// The source text, decoded into characters so `pos` indexes characters.
    chars: Vec<char>,
    /// Index into `chars` of the next unread character.
    pos: usize,
    /// Line of the next unread character.
    line: usize,
    /// Column of the next unread character.
    col: usize,
    /// Stack of currently open indentation widths.
    indent_stack: Vec<usize>,
    /// Whether the next character begins a new line, so its leading
    /// indentation still has to be measured.
    at_line_start: bool,
}
177
178impl Lexer {
179    pub fn new(source: &str) -> Self {
180        Lexer {
181            chars: source.chars().collect(),
182            pos: 0,
183            line: 1,
184            col: 1,
185            indent_stack: vec![0],
186            at_line_start: true,
187        }
188    }
189
190    fn error(&self, msg: impl Into<String>) -> LexerError {
191        LexerError::Error {
192            msg: msg.into(),
193            line: self.line,
194            col: self.col,
195        }
196    }
197
198    fn peek(&self, offset: usize) -> Option<char> {
199        self.chars.get(self.pos + offset).copied()
200    }
201
202    fn current(&self) -> Option<char> {
203        self.chars.get(self.pos).copied()
204    }
205
206    fn advance(&mut self) -> Option<char> {
207        let ch = self.chars.get(self.pos).copied()?;
208        self.pos += 1;
209        if ch == '\n' {
210            self.line += 1;
211            self.col = 1;
212        } else {
213            self.col += 1;
214        }
215        Some(ch)
216    }
217
218    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
219        let mut tokens = Vec::new();
220
221        while self.pos < self.chars.len() {
222            self.scan_token(&mut tokens)?;
223        }
224
225        // Emit remaining DEDENTs
226        while self.indent_stack.len() > 1 {
227            self.indent_stack.pop();
228            tokens.push(Token {
229                kind: TokenKind::Dedent,
230                line: self.line,
231                col: self.col,
232            });
233        }
234
235        tokens.push(Token {
236            kind: TokenKind::Eof,
237            line: self.line,
238            col: self.col,
239        });
240
241        Ok(tokens)
242    }
243
244    fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
245        if self.at_line_start {
246            self.handle_indentation(tokens)?;
247            if self.pos >= self.chars.len() {
248                return Ok(());
249            }
250        }
251
252        let ch = match self.current() {
253            Some(c) => c,
254            None => return Ok(()),
255        };
256
257        // Skip spaces (not at line start)
258        if ch == ' ' {
259            self.advance();
260            return Ok(());
261        }
262
263        // Newline
264        if ch == '\n' {
265            let line = self.line;
266            let col = self.col;
267            self.advance();
268
269            let last_is_structural = tokens
270                .last()
271                .map(|t| {
272                    matches!(
273                        t.kind,
274                        TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
275                    )
276                })
277                .unwrap_or(true);
278
279            if !tokens.is_empty() && !last_is_structural {
280                tokens.push(Token {
281                    kind: TokenKind::Newline,
282                    line,
283                    col,
284                });
285            }
286            self.at_line_start = true;
287            return Ok(());
288        }
289
290        // Carriage return
291        if ch == '\r' {
292            self.advance();
293            return Ok(());
294        }
295
296        // Comments
297        if ch == '/' && self.peek(1) == Some('/') {
298            self.skip_comment();
299            return Ok(());
300        }
301
302        // Strings
303        if ch == '"' {
304            let tok = self.scan_string()?;
305            tokens.push(tok);
306            return Ok(());
307        }
308
309        // Numbers
310        if ch.is_ascii_digit() {
311            let tok = self.scan_number()?;
312            tokens.push(tok);
313            return Ok(());
314        }
315
316        // Identifiers / keywords
317        if ch.is_alphabetic() || ch == '_' {
318            let tok = self.scan_identifier();
319            tokens.push(tok);
320            return Ok(());
321        }
322
323        // Operators
324        let tok = self.scan_operator()?;
325        tokens.push(tok);
326        Ok(())
327    }
328
329    fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
330        self.at_line_start = false;
331        let mut indent = 0;
332
333        while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
334            indent += 1;
335            self.pos += 1;
336            self.col += 1;
337        }
338
339        // Empty line or comment-only line — don't emit indent/dedent
340        if self.pos < self.chars.len() {
341            let ch = self.chars[self.pos];
342            if ch == '\n' || ch == '\r' {
343                return Ok(());
344            }
345            if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
346                return Ok(());
347            }
348        } else {
349            return Ok(());
350        }
351
352        let current = *self.indent_stack.last().unwrap();
353        let line = self.line;
354
355        if indent > current {
356            self.indent_stack.push(indent);
357            tokens.push(Token {
358                kind: TokenKind::Indent,
359                line,
360                col: 1,
361            });
362        } else if indent < current {
363            while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
364                self.indent_stack.pop();
365                tokens.push(Token {
366                    kind: TokenKind::Dedent,
367                    line,
368                    col: 1,
369                });
370            }
371            if *self.indent_stack.last().unwrap() != indent {
372                return Err(self.error(format!("Invalid indentation level: {}", indent)));
373            }
374        }
375
376        Ok(())
377    }
378
379    fn skip_comment(&mut self) {
380        while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
381            self.advance();
382        }
383    }
384
385    fn scan_string(&mut self) -> Result<Token, LexerError> {
386        let line = self.line;
387        let col = self.col;
388        self.advance(); // consume opening "
389
390        let mut parts: Vec<(bool, String)> = Vec::new(); // (is_expr, text)
391        let mut current = String::new();
392        let mut has_interp = false;
393
394        loop {
395            match self.current() {
396                None => return Err(self.error("Unterminated string literal")),
397                Some('"') => {
398                    self.advance();
399                    break;
400                }
401                Some('{') => {
402                    // {{ → literal {, otherwise start interpolation
403                    if self.chars.get(self.pos + 1).copied() == Some('{') {
404                        current.push('{');
405                        self.advance(); // first {
406                        self.advance(); // second {
407                    } else {
408                        has_interp = true;
409                        if !current.is_empty() {
410                            parts.push((false, current.clone()));
411                            current.clear();
412                        }
413                        self.advance(); // consume {
414                        let mut expr_text = String::new();
415                        let mut depth = 1usize;
416                        while self.pos < self.chars.len() && depth > 0 {
417                            match self.chars[self.pos] {
418                                '{' => {
419                                    depth += 1;
420                                    expr_text.push('{');
421                                    self.advance();
422                                }
423                                '}' => {
424                                    depth -= 1;
425                                    if depth > 0 {
426                                        expr_text.push('}');
427                                    }
428                                    self.advance();
429                                }
430                                c => {
431                                    expr_text.push(c);
432                                    self.advance();
433                                }
434                            }
435                        }
436                        parts.push((true, expr_text));
437                    }
438                }
439                Some('}') => {
440                    // }} → literal }, single } is just a literal character
441                    if self.chars.get(self.pos + 1).copied() == Some('}') {
442                        current.push('}');
443                        self.advance(); // first }
444                        self.advance(); // second }
445                    } else {
446                        current.push('}');
447                        self.advance();
448                    }
449                }
450                Some('\\') => {
451                    self.advance();
452                    match self.advance() {
453                        Some('n') => current.push('\n'),
454                        Some('t') => current.push('\t'),
455                        Some('r') => current.push('\r'),
456                        Some('"') => current.push('"'),
457                        Some('\\') => current.push('\\'),
458                        Some(c) => current.push(c),
459                        None => return Err(self.error("Unterminated string literal")),
460                    }
461                }
462                Some('\n') => return Err(self.error("Unterminated string literal")),
463                Some(c) => {
464                    current.push(c);
465                    self.advance();
466                }
467            }
468        }
469
470        if !current.is_empty() {
471            parts.push((false, current));
472        }
473
474        if has_interp {
475            Ok(Token {
476                kind: TokenKind::InterpStr(parts),
477                line,
478                col,
479            })
480        } else {
481            let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
482            Ok(Token {
483                kind: TokenKind::Str(plain),
484                line,
485                col,
486            })
487        }
488    }
489
490    fn scan_number(&mut self) -> Result<Token, LexerError> {
491        let line = self.line;
492        let col = self.col;
493        let mut num_str = String::new();
494        let mut is_float = false;
495
496        while let Some(c) = self.current() {
497            if c.is_ascii_digit() {
498                num_str.push(c);
499                self.advance();
500            } else {
501                break;
502            }
503        }
504
505        if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
506        {
507            is_float = true;
508            num_str.push('.');
509            self.advance(); // consume '.'
510            while let Some(c) = self.current() {
511                if c.is_ascii_digit() {
512                    num_str.push(c);
513                    self.advance();
514                } else {
515                    break;
516                }
517            }
518        }
519
520        if is_float {
521            let f: f64 = num_str
522                .parse()
523                .map_err(|_| self.error("Invalid floating-point number"))?;
524            Ok(Token {
525                kind: TokenKind::Float(f),
526                line,
527                col,
528            })
529        } else {
530            let i: i64 = num_str
531                .parse()
532                .map_err(|_| self.error("Invalid integer literal"))?;
533            Ok(Token {
534                kind: TokenKind::Int(i),
535                line,
536                col,
537            })
538        }
539    }
540
541    fn scan_identifier(&mut self) -> Token {
542        let line = self.line;
543        let col = self.col;
544        let mut ident = String::new();
545
546        while let Some(c) = self.current() {
547            if c.is_alphanumeric() || c == '_' {
548                ident.push(c);
549                self.advance();
550            } else {
551                break;
552            }
553        }
554
555        let kind = keyword(&ident).unwrap_or_else(|| TokenKind::Ident(ident));
556        Token { kind, line, col }
557    }
558
559    fn scan_operator(&mut self) -> Result<Token, LexerError> {
560        let line = self.line;
561        let col = self.col;
562        let ch = self.advance().unwrap();
563
564        let kind = match ch {
565            '-' if self.current() == Some('>') => {
566                self.advance();
567                TokenKind::Arrow
568            }
569            '|' if self.current() == Some('>') => {
570                self.advance();
571                TokenKind::Pipe
572            }
573            '=' if self.current() == Some('>') => {
574                self.advance();
575                TokenKind::FatArrow
576            }
577            '=' if self.current() == Some('=') => {
578                self.advance();
579                TokenKind::Eq
580            }
581            '!' if self.current() == Some('=') => {
582                self.advance();
583                TokenKind::Neq
584            }
585            '<' if self.current() == Some('=') => {
586                self.advance();
587                TokenKind::Lte
588            }
589            '>' if self.current() == Some('=') => {
590                self.advance();
591                TokenKind::Gte
592            }
593            '=' => TokenKind::Assign,
594            '<' => TokenKind::Lt,
595            '>' => TokenKind::Gt,
596            '+' => TokenKind::Plus,
597            '-' => TokenKind::Minus,
598            '*' => TokenKind::Star,
599            '/' => TokenKind::Slash,
600            '!' => TokenKind::Bang,
601            '?' => TokenKind::Question,
602            '.' => TokenKind::Dot,
603            ':' => TokenKind::Colon,
604            ',' => TokenKind::Comma,
605            '(' => TokenKind::LParen,
606            ')' => TokenKind::RParen,
607            '[' => TokenKind::LBracket,
608            ']' => TokenKind::RBracket,
609            '{' => TokenKind::LBrace,
610            '}' => TokenKind::RBrace,
611            other => return Err(self.error(format!("Unknown character: {:?}", other))),
612        };
613
614        Ok(Token { kind, line, col })
615    }
616}