//! aver/lexer.rs — tokenizer producing a flat token stream with
//! Python-style `Indent`/`Dedent` structure tokens.
use std::fmt;

use thiserror::Error;

/// Every kind of token the lexer can produce.
///
/// Derives `PartialEq` but not `Eq`: the `Float(f64)` payload rules `Eq` out.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // Literals
    Int(i64),
    Float(f64),
    Str(String),
    /// Interpolated string split into segments of `(is_expr, text)`;
    /// `is_expr == true` marks the raw source text of a `{...}` expression.
    InterpStr(Vec<(bool, String)>),
    Bool(bool),
    // Identifiers
    Ident(String),
    // Keywords
    Module,
    Depends,
    Exposes,
    Intent,
    Type,
    Record,
    Reason,
    Fn,
    Effect,
    Effects,
    Service,
    Needs,
    Decision,
    Verify,
    Case,
    Match,
    Where,
    Input,
    Expect,
    Date,
    Author,
    Chosen,
    Rejected,
    Impacts,
    // Operators
    Arrow,    // ->
    Pipe,     // |>
    FatArrow, // =>
    Eq,       // ==
    Neq,      // !=
    Lte,      // <=
    Gte,      // >=
    Assign,   // =
    Bang,     // !
    Question, // ?
    Lt,       // <
    Gt,       // >
    Plus,     // +
    Minus,    // -
    Star,     // *
    Slash,    // /
    Dot,      // .
    Colon,    // :
    Comma,    // ,
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBrace,   // {
    RBrace,   // }
    // Structure (layout-derived tokens, no source text of their own)
    Indent,
    Dedent,
    Newline,
    Eof,
}
71
72impl fmt::Display for TokenKind {
73    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
74        match self {
75            TokenKind::Int(n) => write!(f, "integer '{}'", n),
76            TokenKind::Float(n) => write!(f, "float '{}'", n),
77            TokenKind::Str(s) => write!(f, "string \"{}\"", s),
78            TokenKind::InterpStr(_) => write!(f, "interpolated string"),
79            TokenKind::Bool(b) => write!(f, "'{}'", b),
80            TokenKind::Ident(s) => write!(f, "'{}'", s),
81            TokenKind::Module => write!(f, "'module'"),
82            TokenKind::Depends => write!(f, "'depends'"),
83            TokenKind::Exposes => write!(f, "'exposes'"),
84            TokenKind::Intent => write!(f, "'intent'"),
85            TokenKind::Type => write!(f, "'type'"),
86            TokenKind::Record => write!(f, "'record'"),
87            TokenKind::Reason => write!(f, "'reason'"),
88            TokenKind::Fn => write!(f, "'fn'"),
89            TokenKind::Effect => write!(f, "'effect'"),
90            TokenKind::Effects => write!(f, "'effects'"),
91            TokenKind::Service => write!(f, "'service'"),
92            TokenKind::Needs => write!(f, "'needs'"),
93            TokenKind::Decision => write!(f, "'decision'"),
94            TokenKind::Verify => write!(f, "'verify'"),
95            TokenKind::Case => write!(f, "'case'"),
96            TokenKind::Match => write!(f, "'match'"),
97            TokenKind::Where => write!(f, "'where'"),
98            TokenKind::Input => write!(f, "'input'"),
99            TokenKind::Expect => write!(f, "'expect'"),
100            TokenKind::Date => write!(f, "'date'"),
101            TokenKind::Author => write!(f, "'author'"),
102            TokenKind::Chosen => write!(f, "'chosen'"),
103            TokenKind::Rejected => write!(f, "'rejected'"),
104            TokenKind::Impacts => write!(f, "'impacts'"),
105            TokenKind::Arrow => write!(f, "'->'"),
106            TokenKind::Pipe => write!(f, "'|>'"),
107            TokenKind::FatArrow => write!(f, "'=>'"),
108            TokenKind::Eq => write!(f, "'=='"),
109            TokenKind::Neq => write!(f, "'!='"),
110            TokenKind::Lte => write!(f, "'<='"),
111            TokenKind::Gte => write!(f, "'>='"),
112            TokenKind::Assign => write!(f, "'='"),
113            TokenKind::Bang => write!(f, "'!'"),
114            TokenKind::Question => write!(f, "'?'"),
115            TokenKind::Lt => write!(f, "'<'"),
116            TokenKind::Gt => write!(f, "'>'"),
117            TokenKind::Plus => write!(f, "'+'"),
118            TokenKind::Minus => write!(f, "'-'"),
119            TokenKind::Star => write!(f, "'*'"),
120            TokenKind::Slash => write!(f, "'/'"),
121            TokenKind::Dot => write!(f, "'.'"),
122            TokenKind::Colon => write!(f, "':'"),
123            TokenKind::Comma => write!(f, "','"),
124            TokenKind::LParen => write!(f, "'('"),
125            TokenKind::RParen => write!(f, "')'"),
126            TokenKind::LBracket => write!(f, "'['"),
127            TokenKind::RBracket => write!(f, "']'"),
128            TokenKind::LBrace => write!(f, "'{{'"),
129            TokenKind::RBrace => write!(f, "'}}'"),
130            TokenKind::Indent => write!(f, "indentation"),
131            TokenKind::Dedent => write!(f, "end of block"),
132            TokenKind::Newline => write!(f, "end of line"),
133            TokenKind::Eof => write!(f, "end of file"),
134        }
135    }
136}
137
/// A token together with the 1-based source position of its first character.
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub line: usize, // 1-based line where the token starts
    pub col: usize,  // 1-based column where the token starts
}
144
/// Error produced when tokenization fails.
#[derive(Debug, Error)]
pub enum LexerError {
    /// Catch-all lexical error carrying a message and a 1-based position.
    #[error("error[{line}:{col}]: {msg}")]
    Error {
        msg: String,
        line: usize,
        col: usize,
    },
}
154
155fn keyword(s: &str) -> Option<TokenKind> {
156    match s {
157        "module" => Some(TokenKind::Module),
158        "depends" => Some(TokenKind::Depends),
159        "exposes" => Some(TokenKind::Exposes),
160        "intent" => Some(TokenKind::Intent),
161        "type" => Some(TokenKind::Type),
162        "record" => Some(TokenKind::Record),
163        "reason" => Some(TokenKind::Reason),
164        "fn" => Some(TokenKind::Fn),
165        "effect" => Some(TokenKind::Effect),
166        "effects" => Some(TokenKind::Effects),
167        "service" => Some(TokenKind::Service),
168        "needs" => Some(TokenKind::Needs),
169        "decision" => Some(TokenKind::Decision),
170        "verify" => Some(TokenKind::Verify),
171        "case" => Some(TokenKind::Case),
172        "match" => Some(TokenKind::Match),
173        "where" => Some(TokenKind::Where),
174        "input" => Some(TokenKind::Input),
175        "expect" => Some(TokenKind::Expect),
176        "date" => Some(TokenKind::Date),
177        "author" => Some(TokenKind::Author),
178        "chosen" => Some(TokenKind::Chosen),
179        "rejected" => Some(TokenKind::Rejected),
180        "impacts" => Some(TokenKind::Impacts),
181        "true" => Some(TokenKind::Bool(true)),
182        "false" => Some(TokenKind::Bool(false)),
183        _ => None,
184    }
185}
186
/// Hand-written lexer that turns source text into a flat token stream,
/// synthesizing `Indent`/`Dedent` tokens from leading spaces.
pub struct Lexer {
    chars: Vec<char>,         // source decoded to chars for O(1) lookahead
    pos: usize,               // index of the next unread char in `chars`
    line: usize,              // current line, 1-based
    col: usize,               // current column, 1-based
    indent_stack: Vec<usize>, // open indentation levels; base level 0 is never popped
    at_line_start: bool,      // true when the next scan must measure indentation
}
195
196impl Lexer {
197    pub fn new(source: &str) -> Self {
198        Lexer {
199            chars: source.chars().collect(),
200            pos: 0,
201            line: 1,
202            col: 1,
203            indent_stack: vec![0],
204            at_line_start: true,
205        }
206    }
207
208    fn error(&self, msg: impl Into<String>) -> LexerError {
209        LexerError::Error {
210            msg: msg.into(),
211            line: self.line,
212            col: self.col,
213        }
214    }
215
216    fn peek(&self, offset: usize) -> Option<char> {
217        self.chars.get(self.pos + offset).copied()
218    }
219
220    fn current(&self) -> Option<char> {
221        self.chars.get(self.pos).copied()
222    }
223
224    fn advance(&mut self) -> Option<char> {
225        let ch = self.chars.get(self.pos).copied()?;
226        self.pos += 1;
227        if ch == '\n' {
228            self.line += 1;
229            self.col = 1;
230        } else {
231            self.col += 1;
232        }
233        Some(ch)
234    }
235
236    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
237        let mut tokens = Vec::new();
238
239        while self.pos < self.chars.len() {
240            self.scan_token(&mut tokens)?;
241        }
242
243        // Emit remaining DEDENTs
244        while self.indent_stack.len() > 1 {
245            self.indent_stack.pop();
246            tokens.push(Token {
247                kind: TokenKind::Dedent,
248                line: self.line,
249                col: self.col,
250            });
251        }
252
253        tokens.push(Token {
254            kind: TokenKind::Eof,
255            line: self.line,
256            col: self.col,
257        });
258
259        Ok(tokens)
260    }
261
262    fn scan_token(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
263        if self.at_line_start {
264            self.handle_indentation(tokens)?;
265            if self.pos >= self.chars.len() {
266                return Ok(());
267            }
268        }
269
270        let ch = match self.current() {
271            Some(c) => c,
272            None => return Ok(()),
273        };
274
275        // Skip spaces (not at line start)
276        if ch == ' ' {
277            self.advance();
278            return Ok(());
279        }
280
281        // Newline
282        if ch == '\n' {
283            let line = self.line;
284            let col = self.col;
285            self.advance();
286
287            let last_is_structural = tokens
288                .last()
289                .map(|t| {
290                    matches!(
291                        t.kind,
292                        TokenKind::Newline | TokenKind::Indent | TokenKind::Dedent
293                    )
294                })
295                .unwrap_or(true);
296
297            if !tokens.is_empty() && !last_is_structural {
298                tokens.push(Token {
299                    kind: TokenKind::Newline,
300                    line,
301                    col,
302                });
303            }
304            self.at_line_start = true;
305            return Ok(());
306        }
307
308        // Carriage return
309        if ch == '\r' {
310            self.advance();
311            return Ok(());
312        }
313
314        // Comments
315        if ch == '/' && self.peek(1) == Some('/') {
316            self.skip_comment();
317            return Ok(());
318        }
319
320        // Strings
321        if ch == '"' {
322            let tok = self.scan_string()?;
323            tokens.push(tok);
324            return Ok(());
325        }
326
327        // Numbers
328        if ch.is_ascii_digit() {
329            let tok = self.scan_number()?;
330            tokens.push(tok);
331            return Ok(());
332        }
333
334        // Identifiers / keywords
335        if ch.is_alphabetic() || ch == '_' {
336            let tok = self.scan_identifier();
337            tokens.push(tok);
338            return Ok(());
339        }
340
341        // Operators
342        let tok = self.scan_operator()?;
343        tokens.push(tok);
344        Ok(())
345    }
346
347    fn handle_indentation(&mut self, tokens: &mut Vec<Token>) -> Result<(), LexerError> {
348        self.at_line_start = false;
349        let mut indent = 0;
350
351        while self.pos < self.chars.len() && self.chars[self.pos] == ' ' {
352            indent += 1;
353            self.pos += 1;
354            self.col += 1;
355        }
356
357        // Empty line or comment-only line — don't emit indent/dedent
358        if self.pos < self.chars.len() {
359            let ch = self.chars[self.pos];
360            if ch == '\n' || ch == '\r' {
361                return Ok(());
362            }
363            if ch == '/' && self.pos + 1 < self.chars.len() && self.chars[self.pos + 1] == '/' {
364                return Ok(());
365            }
366        } else {
367            return Ok(());
368        }
369
370        let current = *self.indent_stack.last().unwrap();
371        let line = self.line;
372
373        if indent > current {
374            self.indent_stack.push(indent);
375            tokens.push(Token {
376                kind: TokenKind::Indent,
377                line,
378                col: 1,
379            });
380        } else if indent < current {
381            while self.indent_stack.len() > 1 && *self.indent_stack.last().unwrap() > indent {
382                self.indent_stack.pop();
383                tokens.push(Token {
384                    kind: TokenKind::Dedent,
385                    line,
386                    col: 1,
387                });
388            }
389            if *self.indent_stack.last().unwrap() != indent {
390                return Err(self.error(format!("Invalid indentation level: {}", indent)));
391            }
392        }
393
394        Ok(())
395    }
396
397    fn skip_comment(&mut self) {
398        while self.pos < self.chars.len() && self.chars[self.pos] != '\n' {
399            self.advance();
400        }
401    }
402
403    fn scan_string(&mut self) -> Result<Token, LexerError> {
404        let line = self.line;
405        let col = self.col;
406        self.advance(); // consume opening "
407
408        let mut parts: Vec<(bool, String)> = Vec::new(); // (is_expr, text)
409        let mut current = String::new();
410        let mut has_interp = false;
411
412        loop {
413            match self.current() {
414                None => return Err(self.error("Unterminated string literal")),
415                Some('"') => {
416                    self.advance();
417                    break;
418                }
419                Some('{') => {
420                    // {{ → literal {, otherwise start interpolation
421                    if self.chars.get(self.pos + 1).copied() == Some('{') {
422                        current.push('{');
423                        self.advance(); // first {
424                        self.advance(); // second {
425                    } else {
426                        has_interp = true;
427                        if !current.is_empty() {
428                            parts.push((false, current.clone()));
429                            current.clear();
430                        }
431                        self.advance(); // consume {
432                        let mut expr_text = String::new();
433                        let mut depth = 1usize;
434                        while self.pos < self.chars.len() && depth > 0 {
435                            match self.chars[self.pos] {
436                                '{' => {
437                                    depth += 1;
438                                    expr_text.push('{');
439                                    self.advance();
440                                }
441                                '}' => {
442                                    depth -= 1;
443                                    if depth > 0 {
444                                        expr_text.push('}');
445                                    }
446                                    self.advance();
447                                }
448                                c => {
449                                    expr_text.push(c);
450                                    self.advance();
451                                }
452                            }
453                        }
454                        parts.push((true, expr_text));
455                    }
456                }
457                Some('}') => {
458                    // }} → literal }, single } is just a literal character
459                    if self.chars.get(self.pos + 1).copied() == Some('}') {
460                        current.push('}');
461                        self.advance(); // first }
462                        self.advance(); // second }
463                    } else {
464                        current.push('}');
465                        self.advance();
466                    }
467                }
468                Some('\\') => {
469                    self.advance();
470                    match self.advance() {
471                        Some('n') => current.push('\n'),
472                        Some('t') => current.push('\t'),
473                        Some('r') => current.push('\r'),
474                        Some('"') => current.push('"'),
475                        Some('\\') => current.push('\\'),
476                        Some(c) => current.push(c),
477                        None => return Err(self.error("Unterminated string literal")),
478                    }
479                }
480                Some('\n') => return Err(self.error("Unterminated string literal")),
481                Some(c) => {
482                    current.push(c);
483                    self.advance();
484                }
485            }
486        }
487
488        if !current.is_empty() {
489            parts.push((false, current));
490        }
491
492        if has_interp {
493            Ok(Token {
494                kind: TokenKind::InterpStr(parts),
495                line,
496                col,
497            })
498        } else {
499            let plain = parts.into_iter().map(|(_, s)| s).collect::<String>();
500            Ok(Token {
501                kind: TokenKind::Str(plain),
502                line,
503                col,
504            })
505        }
506    }
507
508    fn scan_number(&mut self) -> Result<Token, LexerError> {
509        let line = self.line;
510        let col = self.col;
511        let mut num_str = String::new();
512        let mut is_float = false;
513
514        while let Some(c) = self.current() {
515            if c.is_ascii_digit() {
516                num_str.push(c);
517                self.advance();
518            } else {
519                break;
520            }
521        }
522
523        if self.current() == Some('.') && self.peek(1).map(|c| c.is_ascii_digit()).unwrap_or(false)
524        {
525            is_float = true;
526            num_str.push('.');
527            self.advance(); // consume '.'
528            while let Some(c) = self.current() {
529                if c.is_ascii_digit() {
530                    num_str.push(c);
531                    self.advance();
532                } else {
533                    break;
534                }
535            }
536        }
537
538        if is_float {
539            let f: f64 = num_str
540                .parse()
541                .map_err(|_| self.error("Invalid floating-point number"))?;
542            Ok(Token {
543                kind: TokenKind::Float(f),
544                line,
545                col,
546            })
547        } else {
548            let i: i64 = num_str
549                .parse()
550                .map_err(|_| self.error("Invalid integer literal"))?;
551            Ok(Token {
552                kind: TokenKind::Int(i),
553                line,
554                col,
555            })
556        }
557    }
558
559    fn scan_identifier(&mut self) -> Token {
560        let line = self.line;
561        let col = self.col;
562        let mut ident = String::new();
563
564        while let Some(c) = self.current() {
565            if c.is_alphanumeric() || c == '_' {
566                ident.push(c);
567                self.advance();
568            } else {
569                break;
570            }
571        }
572
573        let kind = keyword(&ident).unwrap_or_else(|| TokenKind::Ident(ident));
574        Token { kind, line, col }
575    }
576
577    fn scan_operator(&mut self) -> Result<Token, LexerError> {
578        let line = self.line;
579        let col = self.col;
580        let ch = self.advance().unwrap();
581
582        let kind = match ch {
583            '-' if self.current() == Some('>') => {
584                self.advance();
585                TokenKind::Arrow
586            }
587            '|' if self.current() == Some('>') => {
588                self.advance();
589                TokenKind::Pipe
590            }
591            '=' if self.current() == Some('>') => {
592                self.advance();
593                TokenKind::FatArrow
594            }
595            '=' if self.current() == Some('=') => {
596                self.advance();
597                TokenKind::Eq
598            }
599            '!' if self.current() == Some('=') => {
600                self.advance();
601                TokenKind::Neq
602            }
603            '<' if self.current() == Some('=') => {
604                self.advance();
605                TokenKind::Lte
606            }
607            '>' if self.current() == Some('=') => {
608                self.advance();
609                TokenKind::Gte
610            }
611            '=' => TokenKind::Assign,
612            '<' => TokenKind::Lt,
613            '>' => TokenKind::Gt,
614            '+' => TokenKind::Plus,
615            '-' => TokenKind::Minus,
616            '*' => TokenKind::Star,
617            '/' => TokenKind::Slash,
618            '!' => TokenKind::Bang,
619            '?' => TokenKind::Question,
620            '.' => TokenKind::Dot,
621            ':' => TokenKind::Colon,
622            ',' => TokenKind::Comma,
623            '(' => TokenKind::LParen,
624            ')' => TokenKind::RParen,
625            '[' => TokenKind::LBracket,
626            ']' => TokenKind::RBracket,
627            '{' => TokenKind::LBrace,
628            '}' => TokenKind::RBrace,
629            other => return Err(self.error(format!("Unknown character: {:?}", other))),
630        };
631
632        Ok(Token { kind, line, col })
633    }
634}