// allium_parser/lexer.rs
use crate::Span;
2
// ---------------------------------------------------------------------------
// Token types
// ---------------------------------------------------------------------------

/// Every distinct token the lexer can produce.
///
/// Keyword variants correspond one-to-one with the spellings recognised by
/// `classify_keyword`; operator/delimiter variants carry their glyph in a
/// trailing comment where the name alone is ambiguous.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Identifiers and literal values.
    Ident,
    Number,
    Duration,
    String,
    True,
    False,
    Null,

    // Keywords that introduce top-level blocks.
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,

    // Keywords used inside clauses and expressions.
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // Keywords for state-transition triggers.
    TransitionsTo,
    Becomes,

    // Context-sensitive identifiers treated as keywords.
    Now,
    This,
    Within,

    // Operators.
    Eq,               // =
    BangEq,           // !=
    Lt,               // <
    LtEq,             // <=
    Gt,               // >
    GtEq,             // >=
    Plus,             // +
    Minus,            // -
    Star,             // *
    Slash,            // /
    Pipe,             // |
    FatArrow,         // =>
    ThinArrow,        // ->
    QuestionQuestion, // ??
    QuestionDot,      // ?.
    Dot,              // .

    // Delimiters.
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,     // [
    RBracket,     // ]
    Colon,
    Comma,
    QuestionMark, // standalone ?

    // Synthetic end-of-input token.
    Eof,

    // Lexer error (unrecognised character or unterminated string).
    Error,
}

97impl std::fmt::Display for TokenKind {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        match self {
100            TokenKind::Ident => write!(f, "identifier"),
101            TokenKind::Number => write!(f, "number"),
102            TokenKind::Duration => write!(f, "duration"),
103            TokenKind::String => write!(f, "string"),
104            TokenKind::True => write!(f, "'true'"),
105            TokenKind::False => write!(f, "'false'"),
106            TokenKind::Null => write!(f, "'null'"),
107            TokenKind::Rule => write!(f, "'rule'"),
108            TokenKind::Entity => write!(f, "'entity'"),
109            TokenKind::External => write!(f, "'external'"),
110            TokenKind::Value => write!(f, "'value'"),
111            TokenKind::Enum => write!(f, "'enum'"),
112            TokenKind::Given => write!(f, "'given'"),
113            TokenKind::Config => write!(f, "'config'"),
114            TokenKind::Surface => write!(f, "'surface'"),
115            TokenKind::Actor => write!(f, "'actor'"),
116            TokenKind::Default => write!(f, "'default'"),
117            TokenKind::Variant => write!(f, "'variant'"),
118            TokenKind::Deferred => write!(f, "'deferred'"),
119            TokenKind::Open => write!(f, "'open'"),
120            TokenKind::Question => write!(f, "'question'"),
121            TokenKind::Use => write!(f, "'use'"),
122            TokenKind::As => write!(f, "'as'"),
123            TokenKind::When => write!(f, "'when'"),
124            TokenKind::Requires => write!(f, "'requires'"),
125            TokenKind::Ensures => write!(f, "'ensures'"),
126            TokenKind::Let => write!(f, "'let'"),
127            TokenKind::For => write!(f, "'for'"),
128            TokenKind::In => write!(f, "'in'"),
129            TokenKind::If => write!(f, "'if'"),
130            TokenKind::Else => write!(f, "'else'"),
131            TokenKind::Where => write!(f, "'where'"),
132            TokenKind::With => write!(f, "'with'"),
133            TokenKind::Not => write!(f, "'not'"),
134            TokenKind::And => write!(f, "'and'"),
135            TokenKind::Or => write!(f, "'or'"),
136            TokenKind::Exists => write!(f, "'exists'"),
137            TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
138            TokenKind::Becomes => write!(f, "'becomes'"),
139            TokenKind::Now => write!(f, "'now'"),
140            TokenKind::This => write!(f, "'this'"),
141            TokenKind::Within => write!(f, "'within'"),
142            TokenKind::Eq => write!(f, "'='"),
143            TokenKind::BangEq => write!(f, "'!='"),
144            TokenKind::Lt => write!(f, "'<'"),
145            TokenKind::LtEq => write!(f, "'<='"),
146            TokenKind::Gt => write!(f, "'>'"),
147            TokenKind::GtEq => write!(f, "'>='"),
148            TokenKind::Plus => write!(f, "'+'"),
149            TokenKind::Minus => write!(f, "'-'"),
150            TokenKind::Star => write!(f, "'*'"),
151            TokenKind::Slash => write!(f, "'/'"),
152            TokenKind::Pipe => write!(f, "'|'"),
153            TokenKind::FatArrow => write!(f, "'=>'"),
154            TokenKind::ThinArrow => write!(f, "'->'"),
155            TokenKind::QuestionQuestion => write!(f, "'??'"),
156            TokenKind::QuestionDot => write!(f, "'?.'"),
157            TokenKind::Dot => write!(f, "'.'"),
158            TokenKind::LBrace => write!(f, "'{{'"),
159            TokenKind::RBrace => write!(f, "'}}'"),
160            TokenKind::LParen => write!(f, "'('"),
161            TokenKind::RParen => write!(f, "')'"),
162            TokenKind::LBracket => write!(f, "'['"),
163            TokenKind::RBracket => write!(f, "']'"),
164            TokenKind::Colon => write!(f, "':'"),
165            TokenKind::Comma => write!(f, "','"),
166            TokenKind::QuestionMark => write!(f, "'?'"),
167            TokenKind::Eof => write!(f, "end of file"),
168            TokenKind::Error => write!(f, "unrecognised token"),
169        }
170    }
171}
172
173impl TokenKind {
174    /// True for any keyword or identifier — tokens that look like words.
175    pub fn is_word(self) -> bool {
176        matches!(
177            self,
178            TokenKind::Ident
179                | TokenKind::True
180                | TokenKind::False
181                | TokenKind::Null
182                | TokenKind::Rule
183                | TokenKind::Entity
184                | TokenKind::External
185                | TokenKind::Value
186                | TokenKind::Enum
187                | TokenKind::Given
188                | TokenKind::Config
189                | TokenKind::Surface
190                | TokenKind::Actor
191                | TokenKind::Default
192                | TokenKind::Variant
193                | TokenKind::Deferred
194                | TokenKind::Open
195                | TokenKind::Question
196                | TokenKind::Use
197                | TokenKind::As
198                | TokenKind::When
199                | TokenKind::Requires
200                | TokenKind::Ensures
201                | TokenKind::Let
202                | TokenKind::For
203                | TokenKind::In
204                | TokenKind::If
205                | TokenKind::Else
206                | TokenKind::Where
207                | TokenKind::With
208                | TokenKind::Not
209                | TokenKind::And
210                | TokenKind::Or
211                | TokenKind::Exists
212                | TokenKind::TransitionsTo
213                | TokenKind::Becomes
214                | TokenKind::Now
215                | TokenKind::This
216                | TokenKind::Within
217        )
218    }
219}
220
221#[derive(Debug, Clone, Copy)]
222pub struct Token {
223    pub kind: TokenKind,
224    pub span: Span,
225}
226
// ---------------------------------------------------------------------------
// Source map — precomputed line start offsets for O(1) line/col lookup
// ---------------------------------------------------------------------------

/// Precomputed byte offsets of every line start, for translating byte
/// offsets into line/column positions. Lines and columns are 0-indexed;
/// columns are byte offsets within the line, not character counts.
pub struct SourceMap {
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Scan `source` once, recording the byte offset immediately after
    /// every `\n` (line 0 always starts at offset 0).
    pub fn new(source: &str) -> Self {
        let mut line_starts = vec![0];
        line_starts.extend(
            source
                .bytes()
                .enumerate()
                .filter(|&(_, byte)| byte == b'\n')
                .map(|(i, _)| i + 1),
        );
        Self { line_starts }
    }

    /// Translate a byte offset into a 0-indexed `(line, column)` pair.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // Index of the last recorded line start at or before `offset`.
        let line = self
            .line_starts
            .partition_point(|&start| start <= offset)
            .saturating_sub(1);
        let col = offset - self.line_starts[line];
        (line as u32, col as u32)
    }

    /// Return the text of a single source line (0-indexed), without trailing newline.
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let start = self.line_starts[idx];
        // The line runs up to the next line start, or to end-of-source for
        // the final line; strip "\r\n" or "\n" terminators in either case.
        let end = match self.line_starts.get(idx + 1) {
            Some(&next_start) => next_start,
            None => source.len(),
        };
        source[start..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}

268// ---------------------------------------------------------------------------
269// Lexer
270// ---------------------------------------------------------------------------
271
272/// Tokenise `source` into a flat list of tokens, ending with `Eof`.
273/// Whitespace and comments are skipped. The special comment `-- allium: N`
274/// at the very start of the file is preserved as an `Ident` token followed
275/// by normal tokens (version detection happens in the parser by inspecting
276/// the raw source before tokenising).
277pub fn lex(source: &str) -> Vec<Token> {
278    let mut lexer = Lexer::new(source);
279    let mut tokens = Vec::new();
280    loop {
281        let tok = lexer.next_token();
282        let done = tok.kind == TokenKind::Eof;
283        tokens.push(tok);
284        if done {
285            break;
286        }
287    }
288    tokens
289}
290
291struct Lexer<'s> {
292    src: &'s [u8],
293    pos: usize,
294}
295
296impl<'s> Lexer<'s> {
297    fn new(source: &'s str) -> Self {
298        Self {
299            src: source.as_bytes(),
300            pos: 0,
301        }
302    }
303
304    fn next_token(&mut self) -> Token {
305        self.skip_whitespace_and_comments();
306
307        if self.pos >= self.src.len() {
308            return Token {
309                kind: TokenKind::Eof,
310                span: Span::new(self.pos, self.pos),
311            };
312        }
313
314        let start = self.pos;
315        let b = self.src[self.pos];
316
317        if b == b'"' {
318            return self.lex_string(start);
319        }
320        if b.is_ascii_digit() {
321            return self.lex_number(start);
322        }
323        if is_ident_start(b) {
324            return self.lex_ident(start);
325        }
326
327        self.lex_operator(start)
328    }
329
330    // -- whitespace / comments ------------------------------------------
331
332    fn skip_whitespace_and_comments(&mut self) {
333        loop {
334            while self.pos < self.src.len()
335                && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
336            {
337                self.pos += 1;
338            }
339            if self.pos + 1 < self.src.len()
340                && self.src[self.pos] == b'-'
341                && self.src[self.pos + 1] == b'-'
342            {
343                while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
344                    self.pos += 1;
345                }
346                continue;
347            }
348            break;
349        }
350    }
351
352    // -- string literals ------------------------------------------------
353
354    fn lex_string(&mut self, start: usize) -> Token {
355        self.pos += 1; // opening "
356        while self.pos < self.src.len() {
357            match self.src[self.pos] {
358                b'"' => {
359                    self.pos += 1;
360                    return Token {
361                        kind: TokenKind::String,
362                        span: Span::new(start, self.pos),
363                    };
364                }
365                b'\\' => {
366                    self.pos += 1;
367                    if self.pos < self.src.len() {
368                        self.pos += 1;
369                    }
370                }
371                b'\n' => {
372                    return Token {
373                        kind: TokenKind::Error,
374                        span: Span::new(start, self.pos),
375                    };
376                }
377                _ => self.pos += 1,
378            }
379        }
380        Token {
381            kind: TokenKind::Error,
382            span: Span::new(start, self.pos),
383        }
384    }
385
386    // -- numbers and durations ------------------------------------------
387
388    fn lex_number(&mut self, start: usize) -> Token {
389        self.consume_digits();
390
391        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
392            let after_dot = self.pos + 1;
393            if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
394                // Decimal part
395                self.pos += 1;
396                self.consume_digits();
397                // Check for .unit (e.g. 3.14.hours — unusual but valid)
398                if self.check_duration_suffix() {
399                    return Token {
400                        kind: TokenKind::Duration,
401                        span: Span::new(start, self.pos),
402                    };
403                }
404                return Token {
405                    kind: TokenKind::Number,
406                    span: Span::new(start, self.pos),
407                };
408            }
409            if self.peek_duration_unit(after_dot).is_some() {
410                let unit_len = self.peek_duration_unit(after_dot).unwrap();
411                self.pos = after_dot + unit_len;
412                return Token {
413                    kind: TokenKind::Duration,
414                    span: Span::new(start, self.pos),
415                };
416            }
417        }
418
419        Token {
420            kind: TokenKind::Number,
421            span: Span::new(start, self.pos),
422        }
423    }
424
425    fn consume_digits(&mut self) {
426        while self.pos < self.src.len()
427            && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
428        {
429            self.pos += 1;
430        }
431    }
432
433    /// After consuming a decimal number, check for `.unit` suffix.
434    fn check_duration_suffix(&mut self) -> bool {
435        if self.pos < self.src.len() && self.src[self.pos] == b'.' {
436            if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
437                self.pos += 1 + unit_len;
438                return true;
439            }
440        }
441        false
442    }
443
444    fn peek_duration_unit(&self, from: usize) -> Option<usize> {
445        const UNITS: &[&str] = &[
446            "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
447            "week", "months", "month", "years", "year",
448        ];
449        for unit in UNITS {
450            let end = from + unit.len();
451            if end <= self.src.len()
452                && &self.src[from..end] == unit.as_bytes()
453                && (end >= self.src.len() || !is_ident_continue(self.src[end]))
454            {
455                return Some(unit.len());
456            }
457        }
458        None
459    }
460
461    // -- identifiers and keywords ---------------------------------------
462
463    fn lex_ident(&mut self, start: usize) -> Token {
464        while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
465            self.pos += 1;
466        }
467        let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
468        Token {
469            kind: classify_keyword(text),
470            span: Span::new(start, self.pos),
471        }
472    }
473
474    // -- operators and punctuation --------------------------------------
475
476    fn lex_operator(&mut self, start: usize) -> Token {
477        let b = self.src[self.pos];
478        let next = if self.pos + 1 < self.src.len() {
479            self.src[self.pos + 1]
480        } else {
481            0
482        };
483
484        let (kind, len) = match (b, next) {
485            (b'=', b'>') => (TokenKind::FatArrow, 2),
486            (b'=', _) => (TokenKind::Eq, 1),
487            (b'!', b'=') => (TokenKind::BangEq, 2),
488            (b'<', b'=') => (TokenKind::LtEq, 2),
489            (b'<', _) => (TokenKind::Lt, 1),
490            (b'>', b'=') => (TokenKind::GtEq, 2),
491            (b'>', _) => (TokenKind::Gt, 1),
492            (b'+', _) => (TokenKind::Plus, 1),
493            (b'-', b'>') => (TokenKind::ThinArrow, 2),
494            (b'-', _) => (TokenKind::Minus, 1),
495            (b'*', _) => (TokenKind::Star, 1),
496            (b'/', _) => (TokenKind::Slash, 1),
497            (b'|', _) => (TokenKind::Pipe, 1),
498            (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
499            (b'?', b'.') => (TokenKind::QuestionDot, 2),
500            (b'?', _) => (TokenKind::QuestionMark, 1),
501            (b'.', _) => (TokenKind::Dot, 1),
502            (b'{', _) => (TokenKind::LBrace, 1),
503            (b'}', _) => (TokenKind::RBrace, 1),
504            (b'(', _) => (TokenKind::LParen, 1),
505            (b')', _) => (TokenKind::RParen, 1),
506            (b'[', _) => (TokenKind::LBracket, 1),
507            (b']', _) => (TokenKind::RBracket, 1),
508            (b':', _) => (TokenKind::Colon, 1),
509            (b',', _) => (TokenKind::Comma, 1),
510            _ => (TokenKind::Error, 1),
511        };
512
513        self.pos += len;
514        Token {
515            kind,
516            span: Span::new(start, self.pos),
517        }
518    }
519}
520
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Can `b` begin an identifier? (ASCII letter or underscore.)
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// Can `b` appear after the first byte of an identifier?
/// (ASCII letter, digit, or underscore.)
fn is_ident_continue(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}

533fn classify_keyword(text: &str) -> TokenKind {
534    match text {
535        "rule" => TokenKind::Rule,
536        "entity" => TokenKind::Entity,
537        "external" => TokenKind::External,
538        "value" => TokenKind::Value,
539        "enum" => TokenKind::Enum,
540        "given" => TokenKind::Given,
541        "config" => TokenKind::Config,
542        "surface" => TokenKind::Surface,
543        "actor" => TokenKind::Actor,
544        "default" => TokenKind::Default,
545        "variant" => TokenKind::Variant,
546        "deferred" => TokenKind::Deferred,
547        "open" => TokenKind::Open,
548        "question" => TokenKind::Question,
549        "use" => TokenKind::Use,
550        "as" => TokenKind::As,
551        "when" => TokenKind::When,
552        "requires" => TokenKind::Requires,
553        "ensures" => TokenKind::Ensures,
554        "let" => TokenKind::Let,
555        "for" => TokenKind::For,
556        "in" => TokenKind::In,
557        "if" => TokenKind::If,
558        "else" => TokenKind::Else,
559        "where" => TokenKind::Where,
560        "with" => TokenKind::With,
561        "not" => TokenKind::Not,
562        "and" => TokenKind::And,
563        "or" => TokenKind::Or,
564        "exists" => TokenKind::Exists,
565        "transitions_to" => TokenKind::TransitionsTo,
566        "becomes" => TokenKind::Becomes,
567        "true" => TokenKind::True,
568        "false" => TokenKind::False,
569        "null" => TokenKind::Null,
570        "now" => TokenKind::Now,
571        "this" => TokenKind::This,
572        "within" => TokenKind::Within,
573        _ => TokenKind::Ident,
574    }
575}
576
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` and return just the token kinds.
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).iter().map(|tok| tok.kind).collect()
    }

    /// Lex `src` and return the source text each token spans.
    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .iter()
            .map(|tok| &src[tok.span.start..tok.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        let expected = vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof];
        assert_eq!(kinds("rule entity enum"), expected);
    }

    #[test]
    fn identifiers() {
        let expected = vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof];
        assert_eq!(kinds("my_var User"), expected);
    }

    #[test]
    fn numbers() {
        for src in ["42", "3.14", "100_000"] {
            assert_eq!(kinds(src), vec![TokenKind::Number, TokenKind::Eof]);
        }
    }

    #[test]
    fn durations() {
        for src in ["24.hours", "7.days", "1.second", "3.5.minutes"] {
            assert_eq!(kinds(src), vec![TokenKind::Duration, TokenKind::Eof]);
        }
    }

    #[test]
    fn duration_vs_member_access() {
        // 42.count is number + dot + ident, not a duration.
        let expected = vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof];
        assert_eq!(kinds("42.count"), expected);
    }

    #[test]
    fn strings() {
        assert_eq!(kinds(r#""hello""#), vec![TokenKind::String, TokenKind::Eof]);
        // Interpolation braces are opaque to the lexer — still one string.
        assert_eq!(kinds(r#""hello {name}""#), vec![TokenKind::String, TokenKind::Eof]);
    }

    #[test]
    fn operators() {
        let expected = vec![
            TokenKind::FatArrow,
            TokenKind::ThinArrow,
            TokenKind::QuestionQuestion,
            TokenKind::QuestionDot,
            TokenKind::BangEq,
            TokenKind::LtEq,
            TokenKind::GtEq,
            TokenKind::Eof,
        ];
        assert_eq!(kinds("=> -> ?? ?. != <= >="), expected);
    }

    #[test]
    fn comments_skipped() {
        let expected = vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Eof];
        assert_eq!(kinds("rule -- this is a comment\nentity"), expected);
    }

    #[test]
    fn delimiters() {
        let expected = vec![
            TokenKind::LBrace, TokenKind::RBrace,
            TokenKind::LParen, TokenKind::RParen,
            TokenKind::Colon, TokenKind::Comma,
            TokenKind::Eof,
        ];
        assert_eq!(kinds("{ } ( ) : ,"), expected);
    }

    #[test]
    fn full_line() {
        // The final "" is the zero-width Eof span.
        let src = "status: pending | active | completed";
        let expected = vec!["status", ":", "pending", "|", "active", "|", "completed", ""];
        assert_eq!(text_of(src), expected);
    }

    #[test]
    fn source_map_line_col() {
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_col(0), (0, 0)); // 'a'
        assert_eq!(map.line_col(3), (0, 3)); // '\n'
        assert_eq!(map.line_col(4), (1, 0)); // 'd'
        assert_eq!(map.line_col(8), (2, 0)); // 'g'
    }
}
701}