1use crate::Span;
2
/// The lexical category of a token produced by the lexer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Literals and names.
    Ident,
    Number,
    Duration,
    String,
    BacktickLiteral,
    True,
    False,
    Null,

    // Declaration keywords.
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,

    // Expression and clause keywords.
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // State-transition keywords.
    TransitionsTo,
    Becomes,

    // Contract keywords.
    Implies,
    Contract,
    Invariant,

    // Process keywords.
    Transitions,
    Produces,
    Consumes,
    Terminal,

    // Temporal tokens and keywords.
    At,
    Now,
    This,
    Within,

    // Operators and punctuation.
    Eq,
    BangEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    Plus,
    Minus,
    Star,
    Slash,
    Pipe,
    FatArrow,
    ThinArrow,
    QuestionQuestion,
    QuestionDot,
    Dot,
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Colon,
    Comma,
    QuestionMark,

    /// End of input; always the final token emitted.
    Eof,

    /// An unrecognised byte sequence.
    Error,
}
111
112impl std::fmt::Display for TokenKind {
113 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
114 match self {
115 TokenKind::Ident => write!(f, "identifier"),
116 TokenKind::Number => write!(f, "number"),
117 TokenKind::Duration => write!(f, "duration"),
118 TokenKind::String => write!(f, "string"),
119 TokenKind::BacktickLiteral => write!(f, "backtick literal"),
120 TokenKind::True => write!(f, "'true'"),
121 TokenKind::False => write!(f, "'false'"),
122 TokenKind::Null => write!(f, "'null'"),
123 TokenKind::Rule => write!(f, "'rule'"),
124 TokenKind::Entity => write!(f, "'entity'"),
125 TokenKind::External => write!(f, "'external'"),
126 TokenKind::Value => write!(f, "'value'"),
127 TokenKind::Enum => write!(f, "'enum'"),
128 TokenKind::Given => write!(f, "'given'"),
129 TokenKind::Config => write!(f, "'config'"),
130 TokenKind::Surface => write!(f, "'surface'"),
131 TokenKind::Actor => write!(f, "'actor'"),
132 TokenKind::Default => write!(f, "'default'"),
133 TokenKind::Variant => write!(f, "'variant'"),
134 TokenKind::Deferred => write!(f, "'deferred'"),
135 TokenKind::Open => write!(f, "'open'"),
136 TokenKind::Question => write!(f, "'question'"),
137 TokenKind::Use => write!(f, "'use'"),
138 TokenKind::As => write!(f, "'as'"),
139 TokenKind::When => write!(f, "'when'"),
140 TokenKind::Requires => write!(f, "'requires'"),
141 TokenKind::Ensures => write!(f, "'ensures'"),
142 TokenKind::Let => write!(f, "'let'"),
143 TokenKind::For => write!(f, "'for'"),
144 TokenKind::In => write!(f, "'in'"),
145 TokenKind::If => write!(f, "'if'"),
146 TokenKind::Else => write!(f, "'else'"),
147 TokenKind::Where => write!(f, "'where'"),
148 TokenKind::With => write!(f, "'with'"),
149 TokenKind::Not => write!(f, "'not'"),
150 TokenKind::And => write!(f, "'and'"),
151 TokenKind::Or => write!(f, "'or'"),
152 TokenKind::Exists => write!(f, "'exists'"),
153 TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
154 TokenKind::Becomes => write!(f, "'becomes'"),
155 TokenKind::Implies => write!(f, "'implies'"),
156 TokenKind::Contract => write!(f, "'contract'"),
157 TokenKind::Invariant => write!(f, "'invariant'"),
158 TokenKind::Transitions => write!(f, "'transitions'"),
159 TokenKind::Produces => write!(f, "'produces'"),
160 TokenKind::Consumes => write!(f, "'consumes'"),
161 TokenKind::Terminal => write!(f, "'terminal'"),
162 TokenKind::At => write!(f, "'@'"),
163 TokenKind::Now => write!(f, "'now'"),
164 TokenKind::This => write!(f, "'this'"),
165 TokenKind::Within => write!(f, "'within'"),
166 TokenKind::Eq => write!(f, "'='"),
167 TokenKind::BangEq => write!(f, "'!='"),
168 TokenKind::Lt => write!(f, "'<'"),
169 TokenKind::LtEq => write!(f, "'<='"),
170 TokenKind::Gt => write!(f, "'>'"),
171 TokenKind::GtEq => write!(f, "'>='"),
172 TokenKind::Plus => write!(f, "'+'"),
173 TokenKind::Minus => write!(f, "'-'"),
174 TokenKind::Star => write!(f, "'*'"),
175 TokenKind::Slash => write!(f, "'/'"),
176 TokenKind::Pipe => write!(f, "'|'"),
177 TokenKind::FatArrow => write!(f, "'=>'"),
178 TokenKind::ThinArrow => write!(f, "'->'"),
179 TokenKind::QuestionQuestion => write!(f, "'??'"),
180 TokenKind::QuestionDot => write!(f, "'?.'"),
181 TokenKind::Dot => write!(f, "'.'"),
182 TokenKind::LBrace => write!(f, "'{{'"),
183 TokenKind::RBrace => write!(f, "'}}'"),
184 TokenKind::LParen => write!(f, "'('"),
185 TokenKind::RParen => write!(f, "')'"),
186 TokenKind::LBracket => write!(f, "'['"),
187 TokenKind::RBracket => write!(f, "']'"),
188 TokenKind::Colon => write!(f, "':'"),
189 TokenKind::Comma => write!(f, "','"),
190 TokenKind::QuestionMark => write!(f, "'?'"),
191 TokenKind::Eof => write!(f, "end of file"),
192 TokenKind::Error => write!(f, "unrecognised token"),
193 }
194 }
195}
196
197impl TokenKind {
198 pub fn is_word(self) -> bool {
200 matches!(
201 self,
202 TokenKind::Ident
203 | TokenKind::True
204 | TokenKind::False
205 | TokenKind::Null
206 | TokenKind::Rule
207 | TokenKind::Entity
208 | TokenKind::External
209 | TokenKind::Value
210 | TokenKind::Enum
211 | TokenKind::Given
212 | TokenKind::Config
213 | TokenKind::Surface
214 | TokenKind::Actor
215 | TokenKind::Default
216 | TokenKind::Variant
217 | TokenKind::Deferred
218 | TokenKind::Open
219 | TokenKind::Question
220 | TokenKind::Use
221 | TokenKind::As
222 | TokenKind::When
223 | TokenKind::Requires
224 | TokenKind::Ensures
225 | TokenKind::Let
226 | TokenKind::For
227 | TokenKind::In
228 | TokenKind::If
229 | TokenKind::Else
230 | TokenKind::Where
231 | TokenKind::With
232 | TokenKind::Not
233 | TokenKind::And
234 | TokenKind::Or
235 | TokenKind::Exists
236 | TokenKind::TransitionsTo
237 | TokenKind::Becomes
238 | TokenKind::Implies
239 | TokenKind::Contract
240 | TokenKind::Invariant
241 | TokenKind::Transitions
242 | TokenKind::Produces
243 | TokenKind::Consumes
244 | TokenKind::Terminal
245 | TokenKind::Now
246 | TokenKind::This
247 | TokenKind::Within
248 )
249 }
250}
251
/// A single lexed token: its category plus the byte range it covers in the
/// source string.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    // The token's lexical category.
    pub kind: TokenKind,
    // Byte offsets of the token's text within the original source.
    pub span: Span,
}
257
/// Precomputed table mapping byte offsets in a source string to zero-based
/// (line, column) positions.
pub struct SourceMap {
    // Byte offset at which each line begins; line 0 always starts at 0.
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Builds the line table for `source` by recording the offset just past
    /// every `\n` byte.
    pub fn new(source: &str) -> Self {
        let mut line_starts = vec![0];
        line_starts.extend(
            source
                .bytes()
                .enumerate()
                .filter(|&(_, b)| b == b'\n')
                .map(|(i, _)| i + 1),
        );
        Self { line_starts }
    }

    /// Converts a byte `offset` into a zero-based (line, column) pair.
    /// The column counts bytes from the start of that line.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // `line_starts` is strictly increasing and begins with 0, so the
        // `Err` insertion point is always at least 1.
        let line = match self.line_starts.binary_search(&offset) {
            Ok(exact) => exact,
            Err(insert) => insert - 1,
        };
        let col = offset - self.line_starts[line];
        (line as u32, col as u32)
    }

    /// Returns the text of the given zero-based `line`, without its trailing
    /// line terminator (`\n` or `\r\n`).
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let start = self.line_starts[idx];
        let end = self
            .line_starts
            .get(idx + 1)
            .copied()
            .unwrap_or(source.len());
        source[start..end]
            .trim_end_matches('\n')
            .trim_end_matches('\r')
    }
}
298
299pub fn lex(source: &str) -> Vec<Token> {
309 let mut lexer = Lexer::new(source);
310 let mut tokens = Vec::new();
311 loop {
312 let tok = lexer.next_token();
313 let done = tok.kind == TokenKind::Eof;
314 tokens.push(tok);
315 if done {
316 break;
317 }
318 }
319 tokens
320}
321
/// A cursor over the raw bytes of a source string.
struct Lexer<'s> {
    // The source text as bytes; token spans are byte ranges into this slice.
    src: &'s [u8],
    // Current byte offset; always <= src.len().
    pos: usize,
}
326
327impl<'s> Lexer<'s> {
328 fn new(source: &'s str) -> Self {
329 Self {
330 src: source.as_bytes(),
331 pos: 0,
332 }
333 }
334
335 fn next_token(&mut self) -> Token {
336 self.skip_whitespace_and_comments();
337
338 if self.pos >= self.src.len() {
339 return Token {
340 kind: TokenKind::Eof,
341 span: Span::new(self.pos, self.pos),
342 };
343 }
344
345 let start = self.pos;
346 let b = self.src[self.pos];
347
348 if b == b'"' {
349 return self.lex_string(start);
350 }
351 if b == b'`' {
352 return self.lex_backtick(start);
353 }
354 if b.is_ascii_digit() {
355 return self.lex_number(start);
356 }
357 if is_ident_start(b) {
358 return self.lex_ident(start);
359 }
360
361 self.lex_operator(start)
362 }
363
364 fn skip_whitespace_and_comments(&mut self) {
367 loop {
368 while self.pos < self.src.len()
369 && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
370 {
371 self.pos += 1;
372 }
373 if self.pos + 1 < self.src.len()
374 && self.src[self.pos] == b'-'
375 && self.src[self.pos + 1] == b'-'
376 {
377 while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
378 self.pos += 1;
379 }
380 continue;
381 }
382 break;
383 }
384 }
385
386 fn lex_string(&mut self, start: usize) -> Token {
389 self.pos += 1; while self.pos < self.src.len() {
391 match self.src[self.pos] {
392 b'"' => {
393 self.pos += 1;
394 return Token {
395 kind: TokenKind::String,
396 span: Span::new(start, self.pos),
397 };
398 }
399 b'\\' => {
400 self.pos += 1;
401 if self.pos < self.src.len() {
402 self.pos += 1;
403 }
404 }
405 b'\n' => {
406 return Token {
407 kind: TokenKind::Error,
408 span: Span::new(start, self.pos),
409 };
410 }
411 _ => self.pos += 1,
412 }
413 }
414 Token {
415 kind: TokenKind::Error,
416 span: Span::new(start, self.pos),
417 }
418 }
419
420 fn lex_backtick(&mut self, start: usize) -> Token {
423 self.pos += 1; while self.pos < self.src.len() {
425 match self.src[self.pos] {
426 b'`' => {
427 self.pos += 1;
428 return Token {
429 kind: TokenKind::BacktickLiteral,
430 span: Span::new(start, self.pos),
431 };
432 }
433 b'\n' | b'\r' => {
434 return Token {
435 kind: TokenKind::Error,
436 span: Span::new(start, self.pos),
437 };
438 }
439 _ => self.pos += 1,
440 }
441 }
442 Token {
443 kind: TokenKind::Error,
444 span: Span::new(start, self.pos),
445 }
446 }
447
448 fn lex_number(&mut self, start: usize) -> Token {
451 self.consume_digits();
452
453 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
454 let after_dot = self.pos + 1;
455 if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
456 self.pos += 1;
458 self.consume_digits();
459 if self.check_duration_suffix() {
461 return Token {
462 kind: TokenKind::Duration,
463 span: Span::new(start, self.pos),
464 };
465 }
466 return Token {
467 kind: TokenKind::Number,
468 span: Span::new(start, self.pos),
469 };
470 }
471 if self.peek_duration_unit(after_dot).is_some() {
472 let unit_len = self.peek_duration_unit(after_dot).unwrap();
473 self.pos = after_dot + unit_len;
474 return Token {
475 kind: TokenKind::Duration,
476 span: Span::new(start, self.pos),
477 };
478 }
479 }
480
481 Token {
482 kind: TokenKind::Number,
483 span: Span::new(start, self.pos),
484 }
485 }
486
487 fn consume_digits(&mut self) {
488 while self.pos < self.src.len()
489 && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
490 {
491 self.pos += 1;
492 }
493 }
494
495 fn check_duration_suffix(&mut self) -> bool {
497 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
498 if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
499 self.pos += 1 + unit_len;
500 return true;
501 }
502 }
503 false
504 }
505
506 fn peek_duration_unit(&self, from: usize) -> Option<usize> {
507 const UNITS: &[&str] = &[
508 "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
509 "week", "months", "month", "years", "year",
510 ];
511 for unit in UNITS {
512 let end = from + unit.len();
513 if end <= self.src.len()
514 && &self.src[from..end] == unit.as_bytes()
515 && (end >= self.src.len() || !is_ident_continue(self.src[end]))
516 {
517 return Some(unit.len());
518 }
519 }
520 None
521 }
522
523 fn lex_ident(&mut self, start: usize) -> Token {
526 while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
527 self.pos += 1;
528 }
529 let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
530 Token {
531 kind: classify_keyword(text),
532 span: Span::new(start, self.pos),
533 }
534 }
535
536 fn lex_operator(&mut self, start: usize) -> Token {
539 let b = self.src[self.pos];
540 let next = if self.pos + 1 < self.src.len() {
541 self.src[self.pos + 1]
542 } else {
543 0
544 };
545
546 let (kind, len) = match (b, next) {
547 (b'=', b'>') => (TokenKind::FatArrow, 2),
548 (b'=', _) => (TokenKind::Eq, 1),
549 (b'!', b'=') => (TokenKind::BangEq, 2),
550 (b'<', b'=') => (TokenKind::LtEq, 2),
551 (b'<', _) => (TokenKind::Lt, 1),
552 (b'>', b'=') => (TokenKind::GtEq, 2),
553 (b'>', _) => (TokenKind::Gt, 1),
554 (b'+', _) => (TokenKind::Plus, 1),
555 (b'-', b'>') => (TokenKind::ThinArrow, 2),
556 (b'-', _) => (TokenKind::Minus, 1),
557 (b'*', _) => (TokenKind::Star, 1),
558 (b'/', _) => (TokenKind::Slash, 1),
559 (b'|', _) => (TokenKind::Pipe, 1),
560 (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
561 (b'?', b'.') => (TokenKind::QuestionDot, 2),
562 (b'?', _) => (TokenKind::QuestionMark, 1),
563 (b'.', _) => (TokenKind::Dot, 1),
564 (b'{', _) => (TokenKind::LBrace, 1),
565 (b'}', _) => (TokenKind::RBrace, 1),
566 (b'(', _) => (TokenKind::LParen, 1),
567 (b')', _) => (TokenKind::RParen, 1),
568 (b'[', _) => (TokenKind::LBracket, 1),
569 (b']', _) => (TokenKind::RBracket, 1),
570 (b':', _) => (TokenKind::Colon, 1),
571 (b',', _) => (TokenKind::Comma, 1),
572 (b'@', _) => (TokenKind::At, 1),
573 _ => (TokenKind::Error, 1),
574 };
575
576 self.pos += len;
577 Token {
578 kind,
579 span: Span::new(start, self.pos),
580 }
581 }
582}
583
/// Whether `b` may begin an identifier: an ASCII letter or underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
591
/// Whether `b` may continue an identifier: an ASCII letter, digit, or
/// underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
595
596fn classify_keyword(text: &str) -> TokenKind {
597 match text {
598 "rule" => TokenKind::Rule,
599 "entity" => TokenKind::Entity,
600 "external" => TokenKind::External,
601 "value" => TokenKind::Value,
602 "enum" => TokenKind::Enum,
603 "given" => TokenKind::Given,
604 "config" => TokenKind::Config,
605 "surface" => TokenKind::Surface,
606 "actor" => TokenKind::Actor,
607 "default" => TokenKind::Default,
608 "variant" => TokenKind::Variant,
609 "deferred" => TokenKind::Deferred,
610 "open" => TokenKind::Open,
611 "question" => TokenKind::Question,
612 "use" => TokenKind::Use,
613 "as" => TokenKind::As,
614 "when" => TokenKind::When,
615 "requires" => TokenKind::Requires,
616 "ensures" => TokenKind::Ensures,
617 "let" => TokenKind::Let,
618 "for" => TokenKind::For,
619 "in" => TokenKind::In,
620 "if" => TokenKind::If,
621 "else" => TokenKind::Else,
622 "where" => TokenKind::Where,
623 "with" => TokenKind::With,
624 "not" => TokenKind::Not,
625 "and" => TokenKind::And,
626 "or" => TokenKind::Or,
627 "exists" => TokenKind::Exists,
628 "implies" => TokenKind::Implies,
629 "contract" => TokenKind::Contract,
630 "invariant" => TokenKind::Invariant,
631 "transitions_to" => TokenKind::TransitionsTo,
632 "becomes" => TokenKind::Becomes,
633 "transitions" => TokenKind::Transitions,
634 "produces" => TokenKind::Produces,
635 "consumes" => TokenKind::Consumes,
636 "terminal" => TokenKind::Terminal,
637 "true" => TokenKind::True,
638 "false" => TokenKind::False,
639 "null" => TokenKind::Null,
640 "now" => TokenKind::Now,
641 "this" => TokenKind::This,
642 "within" => TokenKind::Within,
643 _ => TokenKind::Ident,
644 }
645}
646
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns just the token kinds (including trailing Eof).
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    /// Lexes `src` and returns each token's source text (Eof yields "").
    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            kinds("rule entity enum"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            kinds("my_var User"),
            vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        assert_eq!(kinds("42"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("3.14"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("100_000"), vec![TokenKind::Number, TokenKind::Eof]);
    }

    #[test]
    fn durations() {
        assert_eq!(kinds("24.hours"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("7.days"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("1.second"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("3.5.minutes"), vec![TokenKind::Duration, TokenKind::Eof]);
    }

    #[test]
    fn duration_vs_member_access() {
        // A dot followed by a non-unit word is member access, not a duration.
        assert_eq!(
            kinds("42.count"),
            vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn strings() {
        assert_eq!(kinds(r#""hello""#), vec![TokenKind::String, TokenKind::Eof]);
        assert_eq!(
            kinds(r#""hello {name}""#),
            vec![TokenKind::String, TokenKind::Eof]
        );
    }

    #[test]
    fn operators() {
        assert_eq!(
            kinds("=> -> ?? ?. != <= >="),
            vec![
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn comments_skipped() {
        assert_eq!(
            kinds("rule -- this is a comment\nentity"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            kinds("{ } ( ) : ,"),
            vec![
                TokenKind::LBrace, TokenKind::RBrace,
                TokenKind::LParen, TokenKind::RParen,
                TokenKind::Colon, TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn full_line() {
        let src = "status: pending | active | completed";
        assert_eq!(
            text_of(src),
            vec!["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn v3_keywords() {
        assert_eq!(
            kinds("transitions produces consumes terminal"),
            vec![
                TokenKind::Transitions,
                TokenKind::Produces,
                TokenKind::Consumes,
                TokenKind::Terminal,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn backtick_literal() {
        assert_eq!(kinds("`de-CH-1996`"), vec![TokenKind::BacktickLiteral, TokenKind::Eof]);
        assert_eq!(kinds("`no-cache`"), vec![TokenKind::BacktickLiteral, TokenKind::Eof]);
    }

    #[test]
    fn backtick_literal_text() {
        let src = "`de-CH-1996`";
        assert_eq!(text_of(src), vec!["`de-CH-1996`", ""]);
    }

    #[test]
    fn backtick_in_enum_context() {
        assert_eq!(
            kinds("en | `de-CH-1996` | fr"),
            vec![
                TokenKind::Ident,
                TokenKind::Pipe,
                TokenKind::BacktickLiteral,
                TokenKind::Pipe,
                TokenKind::Ident,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn backtick_unterminated() {
        assert_eq!(kinds("`unterminated"), vec![TokenKind::Error, TokenKind::Eof]);
    }

    #[test]
    fn backtick_newline_terminates() {
        assert_eq!(
            kinds("`bad\nstuff`"),
            vec![TokenKind::Error, TokenKind::Ident, TokenKind::Error, TokenKind::Eof]
        );
    }

    #[test]
    fn source_map_line_col() {
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_col(0), (0, 0));
        assert_eq!(map.line_col(3), (0, 3));
        assert_eq!(map.line_col(4), (1, 0));
        assert_eq!(map.line_col(8), (2, 0));
    }
}