allium_parser/lexer.rs
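
//! Lexer for Allium source text: a hand-written, byte-at-a-time scanner that
//! turns a source string into a flat `Vec<Token>`, each token carrying only a
//! kind and a byte span into the original source.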

use crate::Span;

// ---------------------------------------------------------------------------
// Token types
// ---------------------------------------------------------------------------

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Identifiers and literals
    Ident,
    Number,
    Duration,
    String,
    True,
    False,
    Null,

    // Block-level keywords
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,
    Module,

    // Clause / expression keywords
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // Trigger / predicate keywords
    TransitionsTo,
    Becomes,
    Includes,
    Excludes,

    // Context-sensitive identifiers treated as keywords
    Now,
    This,
    Within,

    // Operators
    Eq,               // =
    EqEq,             // ==
    BangEq,           // !=
    Lt,               // <
    LtEq,             // <=
    Gt,               // >
    GtEq,             // >=
    Plus,             // +
    Minus,            // -
    Star,             // *
    Slash,            // /
    Pipe,             // |
    FatArrow,         // =>
    ThinArrow,        // ->
    QuestionQuestion, // ??
    QuestionDot,      // ?.
    Dot,              // .
    DotDot,           // ..

    // Delimiters
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,     // [
    RBracket,     // ]
    Colon,
    Comma,
    QuestionMark, // standalone ?

    // End of file
    Eof,

    // Lexer error (unrecognised character or unterminated string)
    Error,
}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::Ident => write!(f, "identifier"),
            TokenKind::Number => write!(f, "number"),
            TokenKind::Duration => write!(f, "duration"),
            TokenKind::String => write!(f, "string"),
            TokenKind::True => write!(f, "'true'"),
            TokenKind::False => write!(f, "'false'"),
            TokenKind::Null => write!(f, "'null'"),
            TokenKind::Rule => write!(f, "'rule'"),
            TokenKind::Entity => write!(f, "'entity'"),
            TokenKind::External => write!(f, "'external'"),
            TokenKind::Value => write!(f, "'value'"),
            TokenKind::Enum => write!(f, "'enum'"),
            TokenKind::Given => write!(f, "'given'"),
            TokenKind::Config => write!(f, "'config'"),
            TokenKind::Surface => write!(f, "'surface'"),
            TokenKind::Actor => write!(f, "'actor'"),
            TokenKind::Default => write!(f, "'default'"),
            TokenKind::Variant => write!(f, "'variant'"),
            TokenKind::Deferred => write!(f, "'deferred'"),
            TokenKind::Open => write!(f, "'open'"),
            TokenKind::Question => write!(f, "'question'"),
            TokenKind::Use => write!(f, "'use'"),
            TokenKind::As => write!(f, "'as'"),
            TokenKind::Module => write!(f, "'module'"),
            TokenKind::When => write!(f, "'when'"),
            TokenKind::Requires => write!(f, "'requires'"),
            TokenKind::Ensures => write!(f, "'ensures'"),
            TokenKind::Let => write!(f, "'let'"),
            TokenKind::For => write!(f, "'for'"),
            TokenKind::In => write!(f, "'in'"),
            TokenKind::If => write!(f, "'if'"),
            TokenKind::Else => write!(f, "'else'"),
            TokenKind::Where => write!(f, "'where'"),
            TokenKind::With => write!(f, "'with'"),
            TokenKind::Not => write!(f, "'not'"),
            TokenKind::And => write!(f, "'and'"),
            TokenKind::Or => write!(f, "'or'"),
            TokenKind::Exists => write!(f, "'exists'"),
            TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
            TokenKind::Becomes => write!(f, "'becomes'"),
            TokenKind::Includes => write!(f, "'includes'"),
            TokenKind::Excludes => write!(f, "'excludes'"),
            TokenKind::Now => write!(f, "'now'"),
            TokenKind::This => write!(f, "'this'"),
            TokenKind::Within => write!(f, "'within'"),
            TokenKind::Eq => write!(f, "'='"),
            TokenKind::EqEq => write!(f, "'=='"),
            TokenKind::BangEq => write!(f, "'!='"),
            TokenKind::Lt => write!(f, "'<'"),
            TokenKind::LtEq => write!(f, "'<='"),
            TokenKind::Gt => write!(f, "'>'"),
            TokenKind::GtEq => write!(f, "'>='"),
            TokenKind::Plus => write!(f, "'+'"),
            TokenKind::Minus => write!(f, "'-'"),
            TokenKind::Star => write!(f, "'*'"),
            TokenKind::Slash => write!(f, "'/'"),
            TokenKind::Pipe => write!(f, "'|'"),
            TokenKind::FatArrow => write!(f, "'=>'"),
            TokenKind::ThinArrow => write!(f, "'->'"),
            TokenKind::QuestionQuestion => write!(f, "'??'"),
            TokenKind::QuestionDot => write!(f, "'?.'"),
            TokenKind::Dot => write!(f, "'.'"),
            TokenKind::DotDot => write!(f, "'..'"),
            TokenKind::LBrace => write!(f, "'{{'"),
            TokenKind::RBrace => write!(f, "'}}'"),
            TokenKind::LParen => write!(f, "'('"),
            TokenKind::RParen => write!(f, "')'"),
            TokenKind::LBracket => write!(f, "'['"),
            TokenKind::RBracket => write!(f, "']'"),
            TokenKind::Colon => write!(f, "':'"),
            TokenKind::Comma => write!(f, "','"),
            TokenKind::QuestionMark => write!(f, "'?'"),
            TokenKind::Eof => write!(f, "end of file"),
            TokenKind::Error => write!(f, "unrecognised token"),
        }
    }
}

impl TokenKind {
    /// True for any keyword or identifier — tokens that look like words.
    /// Keep this list in sync with `classify_keyword` below.
    pub fn is_word(self) -> bool {
        matches!(
            self,
            TokenKind::Ident
                | TokenKind::True
                | TokenKind::False
                | TokenKind::Null
                | TokenKind::Rule
                | TokenKind::Entity
                | TokenKind::External
                | TokenKind::Value
                | TokenKind::Enum
                | TokenKind::Given
                | TokenKind::Config
                | TokenKind::Surface
                | TokenKind::Actor
                | TokenKind::Default
                | TokenKind::Variant
                | TokenKind::Deferred
                | TokenKind::Open
                | TokenKind::Question
                | TokenKind::Use
                | TokenKind::As
                | TokenKind::Module
                | TokenKind::When
                | TokenKind::Requires
                | TokenKind::Ensures
                | TokenKind::Let
                | TokenKind::For
                | TokenKind::In
                | TokenKind::If
                | TokenKind::Else
                | TokenKind::Where
                | TokenKind::With
                | TokenKind::Not
                | TokenKind::And
                | TokenKind::Or
                | TokenKind::Exists
                | TokenKind::TransitionsTo
                | TokenKind::Becomes
                | TokenKind::Includes
                | TokenKind::Excludes
                | TokenKind::Now
                | TokenKind::This
                | TokenKind::Within
        )
    }
}

#[derive(Debug, Clone, Copy)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

// ---------------------------------------------------------------------------
// Source map — precomputed line start offsets for O(log n) line/col lookup
// ---------------------------------------------------------------------------

pub struct SourceMap {
    line_starts: Vec<usize>,
}

impl SourceMap {
    pub fn new(source: &str) -> Self {
        let mut starts = vec![0];
        for (i, b) in source.bytes().enumerate() {
            if b == b'\n' {
                starts.push(i + 1);
            }
        }
        Self { line_starts: starts }
    }

    /// Map a byte offset to a 0-indexed (line, column) pair. The column is a
    /// byte offset within its line, not a character count.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // Binary search for the last line start at or before `offset`.
        let line = self
            .line_starts
            .partition_point(|&s| s <= offset)
            .saturating_sub(1);
        let col = offset - self.line_starts[line];
        (line as u32, col as u32)
    }

    /// Return the text of a single source line (0-indexed), without trailing newline.
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let start = self.line_starts[idx];
        let end = if idx + 1 < self.line_starts.len() {
            self.line_starts[idx + 1]
        } else {
            source.len()
        };
        source[start..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}

// ---------------------------------------------------------------------------
// Lexer
// ---------------------------------------------------------------------------

/// Tokenise `source` into a flat list of tokens, ending with `Eof`.
/// Whitespace and comments are skipped. The special comment `-- allium: N`
/// at the very start of the file is skipped like any other comment; version
/// detection happens in the parser by inspecting the raw source before
/// tokenising.
pub fn lex(source: &str) -> Vec<Token> {
    let mut lexer = Lexer::new(source);
    let mut tokens = Vec::new();
    loop {
        let tok = lexer.next_token();
        let done = tok.kind == TokenKind::Eof;
        tokens.push(tok);
        if done {
            break;
        }
    }
    tokens
}

struct Lexer<'s> {
    src: &'s [u8],
    pos: usize,
}

impl<'s> Lexer<'s> {
    fn new(source: &'s str) -> Self {
        Self {
            src: source.as_bytes(),
            pos: 0,
        }
    }

    fn next_token(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        if self.pos >= self.src.len() {
            return Token {
                kind: TokenKind::Eof,
                span: Span::new(self.pos, self.pos),
            };
        }

        let start = self.pos;
        let b = self.src[self.pos];

        if b == b'"' {
            return self.lex_string(start);
        }
        if b.is_ascii_digit() {
            return self.lex_number(start);
        }
        if is_ident_start(b) {
            return self.lex_ident(start);
        }

        self.lex_operator(start)
    }

    // -- whitespace / comments ------------------------------------------

    fn skip_whitespace_and_comments(&mut self) {
        loop {
            while self.pos < self.src.len()
                && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
            {
                self.pos += 1;
            }
            // `--` starts a line comment that runs to the end of the line.
            if self.pos + 1 < self.src.len()
                && self.src[self.pos] == b'-'
                && self.src[self.pos + 1] == b'-'
            {
                while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
                    self.pos += 1;
                }
                continue;
            }
            break;
        }
    }

    // -- string literals ------------------------------------------------

    fn lex_string(&mut self, start: usize) -> Token {
        self.pos += 1; // opening "
        while self.pos < self.src.len() {
            match self.src[self.pos] {
                b'"' => {
                    self.pos += 1;
                    return Token {
                        kind: TokenKind::String,
                        span: Span::new(start, self.pos),
                    };
                }
                b'\\' => {
                    // Skip the escaped byte; escapes are not validated here.
                    self.pos += 1;
                    if self.pos < self.src.len() {
                        self.pos += 1;
                    }
                }
                b'\n' => {
                    // Strings may not span lines: unterminated.
                    return Token {
                        kind: TokenKind::Error,
                        span: Span::new(start, self.pos),
                    };
                }
                _ => self.pos += 1,
            }
        }
        // Ran off the end of input without a closing quote.
        Token {
            kind: TokenKind::Error,
            span: Span::new(start, self.pos),
        }
    }

    // -- numbers and durations ------------------------------------------

    fn lex_number(&mut self, start: usize) -> Token {
        self.consume_digits();

        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
            let after_dot = self.pos + 1;
            if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
                // Decimal part
                self.pos += 1;
                self.consume_digits();
                // Check for .unit (e.g. 3.14.hours — unusual but valid)
                if self.check_duration_suffix() {
                    return Token {
                        kind: TokenKind::Duration,
                        span: Span::new(start, self.pos),
                    };
                }
                return Token {
                    kind: TokenKind::Number,
                    span: Span::new(start, self.pos),
                };
            }
            if let Some(unit_len) = self.peek_duration_unit(after_dot) {
                self.pos = after_dot + unit_len;
                return Token {
                    kind: TokenKind::Duration,
                    span: Span::new(start, self.pos),
                };
            }
        }

        Token {
            kind: TokenKind::Number,
            span: Span::new(start, self.pos),
        }
    }

    fn consume_digits(&mut self) {
        while self.pos < self.src.len()
            && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
        {
            self.pos += 1;
        }
    }

    /// After consuming a decimal number, check for `.unit` suffix.
    fn check_duration_suffix(&mut self) -> bool {
        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
            if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
                self.pos += 1 + unit_len;
                return true;
            }
        }
        false
    }

    fn peek_duration_unit(&self, from: usize) -> Option<usize> {
        const UNITS: &[&str] = &[
            "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
            "week", "months", "month", "years", "year",
        ];
        for unit in UNITS {
            let end = from + unit.len();
            // A match must end at a word boundary, so `3.hourss` stays
            // number + dot + identifier rather than becoming a duration.
            if end <= self.src.len()
                && &self.src[from..end] == unit.as_bytes()
                && (end >= self.src.len() || !is_ident_continue(self.src[end]))
            {
                return Some(unit.len());
            }
        }
        None
    }

    // -- identifiers and keywords ---------------------------------------

    fn lex_ident(&mut self, start: usize) -> Token {
        while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
            self.pos += 1;
        }
        // The unwrap is safe: identifier bytes are ASCII by construction.
        let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
        Token {
            kind: classify_keyword(text),
            span: Span::new(start, self.pos),
        }
    }

    // -- operators and punctuation --------------------------------------

    fn lex_operator(&mut self, start: usize) -> Token {
        let b = self.src[self.pos];
        // NUL sentinel at end of input; it never matches a two-byte operator.
        let next = if self.pos + 1 < self.src.len() {
            self.src[self.pos + 1]
        } else {
            0
        };

        let (kind, len) = match (b, next) {
            (b'=', b'>') => (TokenKind::FatArrow, 2),
            (b'=', b'=') => (TokenKind::EqEq, 2),
            (b'=', _) => (TokenKind::Eq, 1),
            (b'!', b'=') => (TokenKind::BangEq, 2),
            (b'<', b'=') => (TokenKind::LtEq, 2),
            (b'<', _) => (TokenKind::Lt, 1),
            (b'>', b'=') => (TokenKind::GtEq, 2),
            (b'>', _) => (TokenKind::Gt, 1),
            (b'+', _) => (TokenKind::Plus, 1),
            (b'-', b'>') => (TokenKind::ThinArrow, 2),
            (b'-', _) => (TokenKind::Minus, 1),
            (b'*', _) => (TokenKind::Star, 1),
            (b'/', _) => (TokenKind::Slash, 1),
            (b'|', _) => (TokenKind::Pipe, 1),
            (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
            (b'?', b'.') => (TokenKind::QuestionDot, 2),
            (b'?', _) => (TokenKind::QuestionMark, 1),
            (b'.', b'.') => (TokenKind::DotDot, 2),
            (b'.', _) => (TokenKind::Dot, 1),
            (b'{', _) => (TokenKind::LBrace, 1),
            (b'}', _) => (TokenKind::RBrace, 1),
            (b'(', _) => (TokenKind::LParen, 1),
            (b')', _) => (TokenKind::RParen, 1),
            (b'[', _) => (TokenKind::LBracket, 1),
            (b']', _) => (TokenKind::RBracket, 1),
            (b':', _) => (TokenKind::Colon, 1),
            (b',', _) => (TokenKind::Comma, 1),
            _ => (TokenKind::Error, 1),
        };

        self.pos += len;
        Token {
            kind,
            span: Span::new(start, self.pos),
        }
    }
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

fn is_ident_start(b: u8) -> bool {
    b.is_ascii_alphabetic() || b == b'_'
}

fn is_ident_continue(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

fn classify_keyword(text: &str) -> TokenKind {
    match text {
        "rule" => TokenKind::Rule,
        "entity" => TokenKind::Entity,
        "external" => TokenKind::External,
        "value" => TokenKind::Value,
        "enum" => TokenKind::Enum,
        "given" => TokenKind::Given,
        "config" => TokenKind::Config,
        "surface" => TokenKind::Surface,
        "actor" => TokenKind::Actor,
        "default" => TokenKind::Default,
        "variant" => TokenKind::Variant,
        "deferred" => TokenKind::Deferred,
        "open" => TokenKind::Open,
        "question" => TokenKind::Question,
        "use" => TokenKind::Use,
        "as" => TokenKind::As,
        "module" => TokenKind::Module,
        "when" => TokenKind::When,
        "requires" => TokenKind::Requires,
        "ensures" => TokenKind::Ensures,
        "let" => TokenKind::Let,
        "for" => TokenKind::For,
        "in" => TokenKind::In,
        "if" => TokenKind::If,
        "else" => TokenKind::Else,
        "where" => TokenKind::Where,
        "with" => TokenKind::With,
        "not" => TokenKind::Not,
        "and" => TokenKind::And,
        "or" => TokenKind::Or,
        "exists" => TokenKind::Exists,
        "transitions_to" => TokenKind::TransitionsTo,
        "becomes" => TokenKind::Becomes,
        "includes" => TokenKind::Includes,
        "excludes" => TokenKind::Excludes,
        "true" => TokenKind::True,
        "false" => TokenKind::False,
        "null" => TokenKind::Null,
        "now" => TokenKind::Now,
        "this" => TokenKind::This,
        "within" => TokenKind::Within,
        _ => TokenKind::Ident,
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            kinds("rule entity enum"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }
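
    // Keyword recognition is whole-word and case-sensitive: the lexer consumes
    // the full identifier before classifying, so neither a longer word nor a
    // capitalised spelling becomes a keyword.
    #[test]
    fn keyword_boundaries() {
        assert_eq!(kinds("ruler"), vec![TokenKind::Ident, TokenKind::Eof]);
        assert_eq!(kinds("Rule"), vec![TokenKind::Ident, TokenKind::Eof]);
    }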

    #[test]
    fn identifiers() {
        assert_eq!(
            kinds("my_var User"),
            vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        assert_eq!(kinds("42"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("3.14"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("100_000"), vec![TokenKind::Number, TokenKind::Eof]);
    }

    #[test]
    fn durations() {
        assert_eq!(kinds("24.hours"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("7.days"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("1.second"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("3.5.minutes"), vec![TokenKind::Duration, TokenKind::Eof]);
    }
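
    // A unit must end at a word boundary (see peek_duration_unit), so a near
    // miss like `hourss` falls back to number + dot + identifier.
    #[test]
    fn duration_unit_needs_word_boundary() {
        assert_eq!(
            kinds("3.hourss"),
            vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }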

    #[test]
    fn duration_vs_member_access() {
        // 42.count is number + dot + ident, not a duration
        assert_eq!(
            kinds("42.count"),
            vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }
654    #[test]
655    fn strings() {
656        assert_eq!(kinds(r#""hello""#), vec![TokenKind::String, TokenKind::Eof]);
657        assert_eq!(
658            kinds(r#""hello {name}""#),
659            vec![TokenKind::String, TokenKind::Eof]
660        );
661    }
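
    // Unterminated strings surface as Error tokens: both at end of input and
    // at a newline, since strings may not span lines.
    #[test]
    fn unterminated_strings() {
        assert_eq!(kinds(r#""oops"#), vec![TokenKind::Error, TokenKind::Eof]);
        assert_eq!(
            kinds("\"a\nb"),
            vec![TokenKind::Error, TokenKind::Ident, TokenKind::Eof]
        );
    }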

    #[test]
    fn operators() {
        assert_eq!(
            kinds("=> -> ?? ?. != <= >="),
            vec![
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }
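
    // Single-character forms still lex on their own when the second byte of a
    // two-character operator does not follow.
    #[test]
    fn single_char_operators() {
        assert_eq!(
            kinds("= < > ? ."),
            vec![
                TokenKind::Eq,
                TokenKind::Lt,
                TokenKind::Gt,
                TokenKind::QuestionMark,
                TokenKind::Dot,
                TokenKind::Eof,
            ]
        );
    }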

    #[test]
    fn comments_skipped() {
        assert_eq!(
            kinds("rule -- this is a comment\nentity"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            kinds("{ } ( ) : ,"),
            vec![
                TokenKind::LBrace, TokenKind::RBrace,
                TokenKind::LParen, TokenKind::RParen,
                TokenKind::Colon, TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }
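
    // A byte with no token rule produces an Error token rather than a panic.
    #[test]
    fn unknown_char_is_error() {
        assert_eq!(kinds("@"), vec![TokenKind::Error, TokenKind::Eof]);
    }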

    #[test]
    fn full_line() {
        let src = "status: pending | active | completed";
        assert_eq!(
            text_of(src),
            vec!["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn source_map_line_col() {
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_col(0), (0, 0)); // 'a'
        assert_eq!(map.line_col(3), (0, 3)); // '\n'
        assert_eq!(map.line_col(4), (1, 0)); // 'd'
        assert_eq!(map.line_col(8), (2, 0)); // 'g'
    }
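
    // Companion to source_map_line_col: pins down line_text's trimming of
    // trailing LF and CRLF line endings.
    #[test]
    fn source_map_line_text() {
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_text(src, 0), "abc");
        assert_eq!(map.line_text(src, 1), "def");
        assert_eq!(map.line_text(src, 2), "ghi"); // last line, no trailing newline
        assert_eq!(SourceMap::new("a\r\nb").line_text("a\r\nb", 0), "a");
    }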
}