//! allium_parser/lexer.rs — lexer for the Allium language: token kinds,
//! a source map for line/column lookup, and the tokeniser itself.

1use crate::Span;
2
3// ---------------------------------------------------------------------------
4// Token types
5// ---------------------------------------------------------------------------
6
/// The kind of a lexed token. Carries no text: a token's text is recovered
/// by slicing the source with its `Span`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Identifiers and literals
    Ident,
    Number,
    Duration,        // e.g. `24.hours`, `3.5.minutes`
    String,          // double-quoted, backslash escapes
    True,
    False,
    Null,

    // Block-level keywords
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,

    // Clause / expression keywords
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // Trigger keywords
    TransitionsTo,
    Becomes,

    // Contract and invariant keywords
    Implies,
    Contract,
    Invariant,

    // Sigil
    At,              // @

    // Context-sensitive identifiers treated as keywords
    Now,
    This,
    Within,

    // Operators
    Eq,              // =
    BangEq,          // !=
    Lt,              // <
    LtEq,            // <=
    Gt,              // >
    GtEq,            // >=
    Plus,            // +
    Minus,           // -
    Star,            // *
    Slash,           // /
    Pipe,            // |
    FatArrow,        // =>
    ThinArrow,       // ->
    QuestionQuestion, // ??
    QuestionDot,     // ?.
    Dot,             // .

    // Delimiters
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,        // [
    RBracket,        // ]
    Colon,
    Comma,
    QuestionMark,    // standalone ?

    // End of file (always the final token produced by `lex`)
    Eof,

    // Lexer error (unrecognised character or unterminated string)
    Error,
}
104
105impl std::fmt::Display for TokenKind {
106    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
107        match self {
108            TokenKind::Ident => write!(f, "identifier"),
109            TokenKind::Number => write!(f, "number"),
110            TokenKind::Duration => write!(f, "duration"),
111            TokenKind::String => write!(f, "string"),
112            TokenKind::True => write!(f, "'true'"),
113            TokenKind::False => write!(f, "'false'"),
114            TokenKind::Null => write!(f, "'null'"),
115            TokenKind::Rule => write!(f, "'rule'"),
116            TokenKind::Entity => write!(f, "'entity'"),
117            TokenKind::External => write!(f, "'external'"),
118            TokenKind::Value => write!(f, "'value'"),
119            TokenKind::Enum => write!(f, "'enum'"),
120            TokenKind::Given => write!(f, "'given'"),
121            TokenKind::Config => write!(f, "'config'"),
122            TokenKind::Surface => write!(f, "'surface'"),
123            TokenKind::Actor => write!(f, "'actor'"),
124            TokenKind::Default => write!(f, "'default'"),
125            TokenKind::Variant => write!(f, "'variant'"),
126            TokenKind::Deferred => write!(f, "'deferred'"),
127            TokenKind::Open => write!(f, "'open'"),
128            TokenKind::Question => write!(f, "'question'"),
129            TokenKind::Use => write!(f, "'use'"),
130            TokenKind::As => write!(f, "'as'"),
131            TokenKind::When => write!(f, "'when'"),
132            TokenKind::Requires => write!(f, "'requires'"),
133            TokenKind::Ensures => write!(f, "'ensures'"),
134            TokenKind::Let => write!(f, "'let'"),
135            TokenKind::For => write!(f, "'for'"),
136            TokenKind::In => write!(f, "'in'"),
137            TokenKind::If => write!(f, "'if'"),
138            TokenKind::Else => write!(f, "'else'"),
139            TokenKind::Where => write!(f, "'where'"),
140            TokenKind::With => write!(f, "'with'"),
141            TokenKind::Not => write!(f, "'not'"),
142            TokenKind::And => write!(f, "'and'"),
143            TokenKind::Or => write!(f, "'or'"),
144            TokenKind::Exists => write!(f, "'exists'"),
145            TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
146            TokenKind::Becomes => write!(f, "'becomes'"),
147            TokenKind::Implies => write!(f, "'implies'"),
148            TokenKind::Contract => write!(f, "'contract'"),
149            TokenKind::Invariant => write!(f, "'invariant'"),
150            TokenKind::At => write!(f, "'@'"),
151            TokenKind::Now => write!(f, "'now'"),
152            TokenKind::This => write!(f, "'this'"),
153            TokenKind::Within => write!(f, "'within'"),
154            TokenKind::Eq => write!(f, "'='"),
155            TokenKind::BangEq => write!(f, "'!='"),
156            TokenKind::Lt => write!(f, "'<'"),
157            TokenKind::LtEq => write!(f, "'<='"),
158            TokenKind::Gt => write!(f, "'>'"),
159            TokenKind::GtEq => write!(f, "'>='"),
160            TokenKind::Plus => write!(f, "'+'"),
161            TokenKind::Minus => write!(f, "'-'"),
162            TokenKind::Star => write!(f, "'*'"),
163            TokenKind::Slash => write!(f, "'/'"),
164            TokenKind::Pipe => write!(f, "'|'"),
165            TokenKind::FatArrow => write!(f, "'=>'"),
166            TokenKind::ThinArrow => write!(f, "'->'"),
167            TokenKind::QuestionQuestion => write!(f, "'??'"),
168            TokenKind::QuestionDot => write!(f, "'?.'"),
169            TokenKind::Dot => write!(f, "'.'"),
170            TokenKind::LBrace => write!(f, "'{{'"),
171            TokenKind::RBrace => write!(f, "'}}'"),
172            TokenKind::LParen => write!(f, "'('"),
173            TokenKind::RParen => write!(f, "')'"),
174            TokenKind::LBracket => write!(f, "'['"),
175            TokenKind::RBracket => write!(f, "']'"),
176            TokenKind::Colon => write!(f, "':'"),
177            TokenKind::Comma => write!(f, "','"),
178            TokenKind::QuestionMark => write!(f, "'?'"),
179            TokenKind::Eof => write!(f, "end of file"),
180            TokenKind::Error => write!(f, "unrecognised token"),
181        }
182    }
183}
184
185impl TokenKind {
186    /// True for any keyword or identifier — tokens that look like words.
187    pub fn is_word(self) -> bool {
188        matches!(
189            self,
190            TokenKind::Ident
191                | TokenKind::True
192                | TokenKind::False
193                | TokenKind::Null
194                | TokenKind::Rule
195                | TokenKind::Entity
196                | TokenKind::External
197                | TokenKind::Value
198                | TokenKind::Enum
199                | TokenKind::Given
200                | TokenKind::Config
201                | TokenKind::Surface
202                | TokenKind::Actor
203                | TokenKind::Default
204                | TokenKind::Variant
205                | TokenKind::Deferred
206                | TokenKind::Open
207                | TokenKind::Question
208                | TokenKind::Use
209                | TokenKind::As
210                | TokenKind::When
211                | TokenKind::Requires
212                | TokenKind::Ensures
213                | TokenKind::Let
214                | TokenKind::For
215                | TokenKind::In
216                | TokenKind::If
217                | TokenKind::Else
218                | TokenKind::Where
219                | TokenKind::With
220                | TokenKind::Not
221                | TokenKind::And
222                | TokenKind::Or
223                | TokenKind::Exists
224                | TokenKind::TransitionsTo
225                | TokenKind::Becomes
226                | TokenKind::Implies
227                | TokenKind::Contract
228                | TokenKind::Invariant
229                | TokenKind::Now
230                | TokenKind::This
231                | TokenKind::Within
232        )
233    }
234}
235
/// A single lexed token: its kind plus the byte span it covers in the source.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    // What sort of token this is.
    pub kind: TokenKind,
    // Byte range of the token's text in the original source.
    pub span: Span,
}
241
242// ---------------------------------------------------------------------------
243// Source map — precomputed line start offsets for O(1) line/col lookup
244// ---------------------------------------------------------------------------
245
/// Precomputed byte offsets of each line start, enabling O(log n) line/col
/// lookup for any byte offset.
pub struct SourceMap {
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Scan `source` once and record the byte offset following every `\n`.
    /// Offset 0 is always present, so there is at least one line.
    pub fn new(source: &str) -> Self {
        let mut line_starts = vec![0];
        line_starts.extend(
            source
                .bytes()
                .enumerate()
                .filter(|&(_, b)| b == b'\n')
                .map(|(i, _)| i + 1),
        );
        Self { line_starts }
    }

    /// Convert a byte offset into 0-based (line, column). The column is a
    /// byte offset within the line, not a character count.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // Index of the first line that starts *after* `offset`, minus one.
        let next = self.line_starts.partition_point(|&s| s <= offset);
        let line = next.saturating_sub(1);
        (line as u32, (offset - self.line_starts[line]) as u32)
    }

    /// Return the text of a single source line (0-indexed), without trailing newline.
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let start = self.line_starts[idx];
        // The line runs to the next line start, or to end of input for the
        // final line.
        let end = self
            .line_starts
            .get(idx + 1)
            .copied()
            .unwrap_or(source.len());
        source[start..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}
282
283// ---------------------------------------------------------------------------
284// Lexer
285// ---------------------------------------------------------------------------
286
287/// Tokenise `source` into a flat list of tokens, ending with `Eof`.
288/// Whitespace and comments are skipped. The special comment `-- allium: N`
289/// at the very start of the file is preserved as an `Ident` token followed
290/// by normal tokens (version detection happens in the parser by inspecting
291/// the raw source before tokenising).
292pub fn lex(source: &str) -> Vec<Token> {
293    let mut lexer = Lexer::new(source);
294    let mut tokens = Vec::new();
295    loop {
296        let tok = lexer.next_token();
297        let done = tok.kind == TokenKind::Eof;
298        tokens.push(tok);
299        if done {
300            break;
301        }
302    }
303    tokens
304}
305
// Byte-oriented cursor over the source text; produces one token per call
// to `next_token`.
struct Lexer<'s> {
    src: &'s [u8],  // raw source bytes
    pos: usize,     // current byte offset into `src`
}
310
311impl<'s> Lexer<'s> {
312    fn new(source: &'s str) -> Self {
313        Self {
314            src: source.as_bytes(),
315            pos: 0,
316        }
317    }
318
319    fn next_token(&mut self) -> Token {
320        self.skip_whitespace_and_comments();
321
322        if self.pos >= self.src.len() {
323            return Token {
324                kind: TokenKind::Eof,
325                span: Span::new(self.pos, self.pos),
326            };
327        }
328
329        let start = self.pos;
330        let b = self.src[self.pos];
331
332        if b == b'"' {
333            return self.lex_string(start);
334        }
335        if b.is_ascii_digit() {
336            return self.lex_number(start);
337        }
338        if is_ident_start(b) {
339            return self.lex_ident(start);
340        }
341
342        self.lex_operator(start)
343    }
344
345    // -- whitespace / comments ------------------------------------------
346
347    fn skip_whitespace_and_comments(&mut self) {
348        loop {
349            while self.pos < self.src.len()
350                && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
351            {
352                self.pos += 1;
353            }
354            if self.pos + 1 < self.src.len()
355                && self.src[self.pos] == b'-'
356                && self.src[self.pos + 1] == b'-'
357            {
358                while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
359                    self.pos += 1;
360                }
361                continue;
362            }
363            break;
364        }
365    }
366
367    // -- string literals ------------------------------------------------
368
369    fn lex_string(&mut self, start: usize) -> Token {
370        self.pos += 1; // opening "
371        while self.pos < self.src.len() {
372            match self.src[self.pos] {
373                b'"' => {
374                    self.pos += 1;
375                    return Token {
376                        kind: TokenKind::String,
377                        span: Span::new(start, self.pos),
378                    };
379                }
380                b'\\' => {
381                    self.pos += 1;
382                    if self.pos < self.src.len() {
383                        self.pos += 1;
384                    }
385                }
386                b'\n' => {
387                    return Token {
388                        kind: TokenKind::Error,
389                        span: Span::new(start, self.pos),
390                    };
391                }
392                _ => self.pos += 1,
393            }
394        }
395        Token {
396            kind: TokenKind::Error,
397            span: Span::new(start, self.pos),
398        }
399    }
400
401    // -- numbers and durations ------------------------------------------
402
403    fn lex_number(&mut self, start: usize) -> Token {
404        self.consume_digits();
405
406        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
407            let after_dot = self.pos + 1;
408            if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
409                // Decimal part
410                self.pos += 1;
411                self.consume_digits();
412                // Check for .unit (e.g. 3.14.hours — unusual but valid)
413                if self.check_duration_suffix() {
414                    return Token {
415                        kind: TokenKind::Duration,
416                        span: Span::new(start, self.pos),
417                    };
418                }
419                return Token {
420                    kind: TokenKind::Number,
421                    span: Span::new(start, self.pos),
422                };
423            }
424            if self.peek_duration_unit(after_dot).is_some() {
425                let unit_len = self.peek_duration_unit(after_dot).unwrap();
426                self.pos = after_dot + unit_len;
427                return Token {
428                    kind: TokenKind::Duration,
429                    span: Span::new(start, self.pos),
430                };
431            }
432        }
433
434        Token {
435            kind: TokenKind::Number,
436            span: Span::new(start, self.pos),
437        }
438    }
439
440    fn consume_digits(&mut self) {
441        while self.pos < self.src.len()
442            && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
443        {
444            self.pos += 1;
445        }
446    }
447
448    /// After consuming a decimal number, check for `.unit` suffix.
449    fn check_duration_suffix(&mut self) -> bool {
450        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
451            if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
452                self.pos += 1 + unit_len;
453                return true;
454            }
455        }
456        false
457    }
458
459    fn peek_duration_unit(&self, from: usize) -> Option<usize> {
460        const UNITS: &[&str] = &[
461            "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
462            "week", "months", "month", "years", "year",
463        ];
464        for unit in UNITS {
465            let end = from + unit.len();
466            if end <= self.src.len()
467                && &self.src[from..end] == unit.as_bytes()
468                && (end >= self.src.len() || !is_ident_continue(self.src[end]))
469            {
470                return Some(unit.len());
471            }
472        }
473        None
474    }
475
476    // -- identifiers and keywords ---------------------------------------
477
478    fn lex_ident(&mut self, start: usize) -> Token {
479        while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
480            self.pos += 1;
481        }
482        let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
483        Token {
484            kind: classify_keyword(text),
485            span: Span::new(start, self.pos),
486        }
487    }
488
489    // -- operators and punctuation --------------------------------------
490
491    fn lex_operator(&mut self, start: usize) -> Token {
492        let b = self.src[self.pos];
493        let next = if self.pos + 1 < self.src.len() {
494            self.src[self.pos + 1]
495        } else {
496            0
497        };
498
499        let (kind, len) = match (b, next) {
500            (b'=', b'>') => (TokenKind::FatArrow, 2),
501            (b'=', _) => (TokenKind::Eq, 1),
502            (b'!', b'=') => (TokenKind::BangEq, 2),
503            (b'<', b'=') => (TokenKind::LtEq, 2),
504            (b'<', _) => (TokenKind::Lt, 1),
505            (b'>', b'=') => (TokenKind::GtEq, 2),
506            (b'>', _) => (TokenKind::Gt, 1),
507            (b'+', _) => (TokenKind::Plus, 1),
508            (b'-', b'>') => (TokenKind::ThinArrow, 2),
509            (b'-', _) => (TokenKind::Minus, 1),
510            (b'*', _) => (TokenKind::Star, 1),
511            (b'/', _) => (TokenKind::Slash, 1),
512            (b'|', _) => (TokenKind::Pipe, 1),
513            (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
514            (b'?', b'.') => (TokenKind::QuestionDot, 2),
515            (b'?', _) => (TokenKind::QuestionMark, 1),
516            (b'.', _) => (TokenKind::Dot, 1),
517            (b'{', _) => (TokenKind::LBrace, 1),
518            (b'}', _) => (TokenKind::RBrace, 1),
519            (b'(', _) => (TokenKind::LParen, 1),
520            (b')', _) => (TokenKind::RParen, 1),
521            (b'[', _) => (TokenKind::LBracket, 1),
522            (b']', _) => (TokenKind::RBracket, 1),
523            (b':', _) => (TokenKind::Colon, 1),
524            (b',', _) => (TokenKind::Comma, 1),
525            (b'@', _) => (TokenKind::At, 1),
526            _ => (TokenKind::Error, 1),
527        };
528
529        self.pos += len;
530        Token {
531            kind,
532            span: Span::new(start, self.pos),
533        }
534    }
535}
536
537// ---------------------------------------------------------------------------
538// Helpers
539// ---------------------------------------------------------------------------
540
/// First byte of an identifier: ASCII letter or underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// Subsequent identifier byte: ASCII letter, digit, or underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
548
549fn classify_keyword(text: &str) -> TokenKind {
550    match text {
551        "rule" => TokenKind::Rule,
552        "entity" => TokenKind::Entity,
553        "external" => TokenKind::External,
554        "value" => TokenKind::Value,
555        "enum" => TokenKind::Enum,
556        "given" => TokenKind::Given,
557        "config" => TokenKind::Config,
558        "surface" => TokenKind::Surface,
559        "actor" => TokenKind::Actor,
560        "default" => TokenKind::Default,
561        "variant" => TokenKind::Variant,
562        "deferred" => TokenKind::Deferred,
563        "open" => TokenKind::Open,
564        "question" => TokenKind::Question,
565        "use" => TokenKind::Use,
566        "as" => TokenKind::As,
567        "when" => TokenKind::When,
568        "requires" => TokenKind::Requires,
569        "ensures" => TokenKind::Ensures,
570        "let" => TokenKind::Let,
571        "for" => TokenKind::For,
572        "in" => TokenKind::In,
573        "if" => TokenKind::If,
574        "else" => TokenKind::Else,
575        "where" => TokenKind::Where,
576        "with" => TokenKind::With,
577        "not" => TokenKind::Not,
578        "and" => TokenKind::And,
579        "or" => TokenKind::Or,
580        "exists" => TokenKind::Exists,
581        "implies" => TokenKind::Implies,
582        "contract" => TokenKind::Contract,
583        "invariant" => TokenKind::Invariant,
584        "transitions_to" => TokenKind::TransitionsTo,
585        "becomes" => TokenKind::Becomes,
586        "true" => TokenKind::True,
587        "false" => TokenKind::False,
588        "null" => TokenKind::Null,
589        "now" => TokenKind::Now,
590        "this" => TokenKind::This,
591        "within" => TokenKind::Within,
592        _ => TokenKind::Ident,
593    }
594}
595
596// ---------------------------------------------------------------------------
597// Tests
598// ---------------------------------------------------------------------------
599
#[cfg(test)]
mod tests {
    use super::*;

    // Lex `src` and return just the token kinds (always ends with Eof).
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    // Lex `src` and return the slice of source text each token's span covers.
    // The trailing Eof token yields an empty string.
    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            kinds("rule entity enum"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            kinds("my_var User"),
            vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        assert_eq!(kinds("42"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("3.14"), vec![TokenKind::Number, TokenKind::Eof]);
        // Underscore digit separators lex as part of the number.
        assert_eq!(kinds("100_000"), vec![TokenKind::Number, TokenKind::Eof]);
    }

    #[test]
    fn durations() {
        assert_eq!(kinds("24.hours"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("7.days"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("1.second"), vec![TokenKind::Duration, TokenKind::Eof]);
        // Decimal value with a unit suffix is a single duration token.
        assert_eq!(kinds("3.5.minutes"), vec![TokenKind::Duration, TokenKind::Eof]);
    }

    #[test]
    fn duration_vs_member_access() {
        // 42.count is number + dot + ident, not a duration
        assert_eq!(
            kinds("42.count"),
            vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn strings() {
        assert_eq!(kinds(r#""hello""#), vec![TokenKind::String, TokenKind::Eof]);
        // Braces inside a string are plain characters to the lexer.
        assert_eq!(
            kinds(r#""hello {name}""#),
            vec![TokenKind::String, TokenKind::Eof]
        );
    }

    #[test]
    fn operators() {
        assert_eq!(
            kinds("=> -> ?? ?. != <= >="),
            vec![
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn comments_skipped() {
        // A `--` comment runs to end of line and produces no tokens.
        assert_eq!(
            kinds("rule -- this is a comment\nentity"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            kinds("{ } ( ) : ,"),
            vec![
                TokenKind::LBrace, TokenKind::RBrace,
                TokenKind::LParen, TokenKind::RParen,
                TokenKind::Colon, TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn full_line() {
        let src = "status: pending | active | completed";
        // Final "" is the empty slice covered by the Eof token's span.
        assert_eq!(
            text_of(src),
            vec!["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn source_map_line_col() {
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_col(0), (0, 0)); // 'a'
        assert_eq!(map.line_col(3), (0, 3)); // '\n'
        assert_eq!(map.line_col(4), (1, 0)); // 'd'
        assert_eq!(map.line_col(8), (2, 0)); // 'g'
    }
}