1use crate::Span;
2
/// The lexical category of a token produced by [`lex`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Literals and names.
    Ident,
    Number,
    Duration,
    String,
    True,
    False,
    Null,

    // Declaration keywords.
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,

    // Clause and expression keywords.
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // State-transition keywords.
    TransitionsTo,
    Becomes,

    // Contract keywords.
    Implies,
    Contract,
    Invariant,

    // Temporal keywords and the `@` sigil.
    At,
    Now,
    This,
    Within,

    // Operators and punctuation.
    Eq,
    BangEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    Plus,
    Minus,
    Star,
    Slash,
    Pipe,
    FatArrow,
    ThinArrow,
    QuestionQuestion,
    QuestionDot,
    Dot,
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Colon,
    Comma,
    QuestionMark,

    /// End of input; always the final token emitted by [`lex`].
    Eof,
    /// A byte sequence the lexer did not recognise.
    Error,
}

impl std::fmt::Display for TokenKind {
    /// Human-readable description of the token kind, used in diagnostics
    /// (e.g. "expected 'rule'").
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.describe())
    }
}

impl TokenKind {
    /// Static description backing the `Display` impl: keywords and
    /// punctuation are quoted, value-carrying kinds get a category name.
    fn describe(self) -> &'static str {
        use TokenKind::*;
        match self {
            Ident => "identifier",
            Number => "number",
            Duration => "duration",
            String => "string",
            True => "'true'",
            False => "'false'",
            Null => "'null'",
            Rule => "'rule'",
            Entity => "'entity'",
            External => "'external'",
            Value => "'value'",
            Enum => "'enum'",
            Given => "'given'",
            Config => "'config'",
            Surface => "'surface'",
            Actor => "'actor'",
            Default => "'default'",
            Variant => "'variant'",
            Deferred => "'deferred'",
            Open => "'open'",
            Question => "'question'",
            Use => "'use'",
            As => "'as'",
            When => "'when'",
            Requires => "'requires'",
            Ensures => "'ensures'",
            Let => "'let'",
            For => "'for'",
            In => "'in'",
            If => "'if'",
            Else => "'else'",
            Where => "'where'",
            With => "'with'",
            Not => "'not'",
            And => "'and'",
            Or => "'or'",
            Exists => "'exists'",
            TransitionsTo => "'transitions_to'",
            Becomes => "'becomes'",
            Implies => "'implies'",
            Contract => "'contract'",
            Invariant => "'invariant'",
            At => "'@'",
            Now => "'now'",
            This => "'this'",
            Within => "'within'",
            Eq => "'='",
            BangEq => "'!='",
            Lt => "'<'",
            LtEq => "'<='",
            Gt => "'>'",
            GtEq => "'>='",
            Plus => "'+'",
            Minus => "'-'",
            Star => "'*'",
            Slash => "'/'",
            Pipe => "'|'",
            FatArrow => "'=>'",
            ThinArrow => "'->'",
            QuestionQuestion => "'??'",
            QuestionDot => "'?.'",
            Dot => "'.'",
            LBrace => "'{'",
            RBrace => "'}'",
            LParen => "'('",
            RParen => "')'",
            LBracket => "'['",
            RBracket => "']'",
            Colon => "':'",
            Comma => "','",
            QuestionMark => "'?'",
            Eof => "end of file",
            Error => "unrecognised token",
        }
    }

    /// True for tokens spelled as a bare word — identifiers and every
    /// alphabetic keyword — as opposed to punctuation, literals with
    /// dedicated syntax (`Number`, `Duration`, `String`), `Eof`, or `Error`.
    pub fn is_word(self) -> bool {
        use TokenKind::*;
        match self {
            Ident | True | False | Null | Rule | Entity | External | Value | Enum
            | Given | Config | Surface | Actor | Default | Variant | Deferred
            | Open | Question | Use | As | When | Requires | Ensures | Let | For
            | In | If | Else | Where | With | Not | And | Or | Exists
            | TransitionsTo | Becomes | Implies | Contract | Invariant | Now
            | This | Within => true,
            _ => false,
        }
    }
}
235
/// A single lexed token: its category plus the byte range it covers in
/// the source text.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Lexical category of this token.
    pub kind: TokenKind,
    /// Byte span of the token's text in the original source.
    pub span: Span,
}
241
/// Maps byte offsets in a source string to zero-based line/column
/// positions, precomputed from the newline positions of that string.
pub struct SourceMap {
    // Byte offset at which each line begins; index 0 is always 0.
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Records the start offset of every line in `source`.
    pub fn new(source: &str) -> Self {
        let line_starts = std::iter::once(0)
            .chain(
                source
                    .bytes()
                    .enumerate()
                    .filter_map(|(i, b)| (b == b'\n').then_some(i + 1)),
            )
            .collect();
        Self { line_starts }
    }

    /// Converts a byte `offset` into a zero-based `(line, column)` pair.
    /// The column is a byte offset within the line, not a char count.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // An exact hit is the start of that line; otherwise the offset
        // belongs to the line starting just before the insertion point.
        let line = match self.line_starts.binary_search(&offset) {
            Ok(exact) => exact,
            Err(next) => next.saturating_sub(1),
        };
        (line as u32, (offset - self.line_starts[line]) as u32)
    }

    /// Returns the text of zero-based `line` without its trailing line
    /// terminator (handles both `\n` and `\r\n` endings).
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let start = self.line_starts[idx];
        let end = self
            .line_starts
            .get(idx + 1)
            .copied()
            .unwrap_or(source.len());
        source[start..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}
282
283pub fn lex(source: &str) -> Vec<Token> {
293 let mut lexer = Lexer::new(source);
294 let mut tokens = Vec::new();
295 loop {
296 let tok = lexer.next_token();
297 let done = tok.kind == TokenKind::Eof;
298 tokens.push(tok);
299 if done {
300 break;
301 }
302 }
303 tokens
304}
305
/// Internal cursor over the source bytes; `pos` is the byte offset of
/// the next unconsumed byte.
struct Lexer<'s> {
    // Raw bytes of the source text (the token grammar is ASCII-oriented).
    src: &'s [u8],
    // Current byte offset into `src`.
    pos: usize,
}
310
311impl<'s> Lexer<'s> {
312 fn new(source: &'s str) -> Self {
313 Self {
314 src: source.as_bytes(),
315 pos: 0,
316 }
317 }
318
319 fn next_token(&mut self) -> Token {
320 self.skip_whitespace_and_comments();
321
322 if self.pos >= self.src.len() {
323 return Token {
324 kind: TokenKind::Eof,
325 span: Span::new(self.pos, self.pos),
326 };
327 }
328
329 let start = self.pos;
330 let b = self.src[self.pos];
331
332 if b == b'"' {
333 return self.lex_string(start);
334 }
335 if b.is_ascii_digit() {
336 return self.lex_number(start);
337 }
338 if is_ident_start(b) {
339 return self.lex_ident(start);
340 }
341
342 self.lex_operator(start)
343 }
344
345 fn skip_whitespace_and_comments(&mut self) {
348 loop {
349 while self.pos < self.src.len()
350 && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
351 {
352 self.pos += 1;
353 }
354 if self.pos + 1 < self.src.len()
355 && self.src[self.pos] == b'-'
356 && self.src[self.pos + 1] == b'-'
357 {
358 while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
359 self.pos += 1;
360 }
361 continue;
362 }
363 break;
364 }
365 }
366
367 fn lex_string(&mut self, start: usize) -> Token {
370 self.pos += 1; while self.pos < self.src.len() {
372 match self.src[self.pos] {
373 b'"' => {
374 self.pos += 1;
375 return Token {
376 kind: TokenKind::String,
377 span: Span::new(start, self.pos),
378 };
379 }
380 b'\\' => {
381 self.pos += 1;
382 if self.pos < self.src.len() {
383 self.pos += 1;
384 }
385 }
386 b'\n' => {
387 return Token {
388 kind: TokenKind::Error,
389 span: Span::new(start, self.pos),
390 };
391 }
392 _ => self.pos += 1,
393 }
394 }
395 Token {
396 kind: TokenKind::Error,
397 span: Span::new(start, self.pos),
398 }
399 }
400
401 fn lex_number(&mut self, start: usize) -> Token {
404 self.consume_digits();
405
406 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
407 let after_dot = self.pos + 1;
408 if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
409 self.pos += 1;
411 self.consume_digits();
412 if self.check_duration_suffix() {
414 return Token {
415 kind: TokenKind::Duration,
416 span: Span::new(start, self.pos),
417 };
418 }
419 return Token {
420 kind: TokenKind::Number,
421 span: Span::new(start, self.pos),
422 };
423 }
424 if self.peek_duration_unit(after_dot).is_some() {
425 let unit_len = self.peek_duration_unit(after_dot).unwrap();
426 self.pos = after_dot + unit_len;
427 return Token {
428 kind: TokenKind::Duration,
429 span: Span::new(start, self.pos),
430 };
431 }
432 }
433
434 Token {
435 kind: TokenKind::Number,
436 span: Span::new(start, self.pos),
437 }
438 }
439
440 fn consume_digits(&mut self) {
441 while self.pos < self.src.len()
442 && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
443 {
444 self.pos += 1;
445 }
446 }
447
448 fn check_duration_suffix(&mut self) -> bool {
450 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
451 if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
452 self.pos += 1 + unit_len;
453 return true;
454 }
455 }
456 false
457 }
458
459 fn peek_duration_unit(&self, from: usize) -> Option<usize> {
460 const UNITS: &[&str] = &[
461 "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
462 "week", "months", "month", "years", "year",
463 ];
464 for unit in UNITS {
465 let end = from + unit.len();
466 if end <= self.src.len()
467 && &self.src[from..end] == unit.as_bytes()
468 && (end >= self.src.len() || !is_ident_continue(self.src[end]))
469 {
470 return Some(unit.len());
471 }
472 }
473 None
474 }
475
476 fn lex_ident(&mut self, start: usize) -> Token {
479 while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
480 self.pos += 1;
481 }
482 let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
483 Token {
484 kind: classify_keyword(text),
485 span: Span::new(start, self.pos),
486 }
487 }
488
489 fn lex_operator(&mut self, start: usize) -> Token {
492 let b = self.src[self.pos];
493 let next = if self.pos + 1 < self.src.len() {
494 self.src[self.pos + 1]
495 } else {
496 0
497 };
498
499 let (kind, len) = match (b, next) {
500 (b'=', b'>') => (TokenKind::FatArrow, 2),
501 (b'=', _) => (TokenKind::Eq, 1),
502 (b'!', b'=') => (TokenKind::BangEq, 2),
503 (b'<', b'=') => (TokenKind::LtEq, 2),
504 (b'<', _) => (TokenKind::Lt, 1),
505 (b'>', b'=') => (TokenKind::GtEq, 2),
506 (b'>', _) => (TokenKind::Gt, 1),
507 (b'+', _) => (TokenKind::Plus, 1),
508 (b'-', b'>') => (TokenKind::ThinArrow, 2),
509 (b'-', _) => (TokenKind::Minus, 1),
510 (b'*', _) => (TokenKind::Star, 1),
511 (b'/', _) => (TokenKind::Slash, 1),
512 (b'|', _) => (TokenKind::Pipe, 1),
513 (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
514 (b'?', b'.') => (TokenKind::QuestionDot, 2),
515 (b'?', _) => (TokenKind::QuestionMark, 1),
516 (b'.', _) => (TokenKind::Dot, 1),
517 (b'{', _) => (TokenKind::LBrace, 1),
518 (b'}', _) => (TokenKind::RBrace, 1),
519 (b'(', _) => (TokenKind::LParen, 1),
520 (b')', _) => (TokenKind::RParen, 1),
521 (b'[', _) => (TokenKind::LBracket, 1),
522 (b']', _) => (TokenKind::RBracket, 1),
523 (b':', _) => (TokenKind::Colon, 1),
524 (b',', _) => (TokenKind::Comma, 1),
525 (b'@', _) => (TokenKind::At, 1),
526 _ => (TokenKind::Error, 1),
527 };
528
529 self.pos += len;
530 Token {
531 kind,
532 span: Span::new(start, self.pos),
533 }
534 }
535}
536
/// First byte of an identifier: an ASCII letter or underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
544
/// Subsequent byte of an identifier: an ASCII letter, digit, or underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
548
549fn classify_keyword(text: &str) -> TokenKind {
550 match text {
551 "rule" => TokenKind::Rule,
552 "entity" => TokenKind::Entity,
553 "external" => TokenKind::External,
554 "value" => TokenKind::Value,
555 "enum" => TokenKind::Enum,
556 "given" => TokenKind::Given,
557 "config" => TokenKind::Config,
558 "surface" => TokenKind::Surface,
559 "actor" => TokenKind::Actor,
560 "default" => TokenKind::Default,
561 "variant" => TokenKind::Variant,
562 "deferred" => TokenKind::Deferred,
563 "open" => TokenKind::Open,
564 "question" => TokenKind::Question,
565 "use" => TokenKind::Use,
566 "as" => TokenKind::As,
567 "when" => TokenKind::When,
568 "requires" => TokenKind::Requires,
569 "ensures" => TokenKind::Ensures,
570 "let" => TokenKind::Let,
571 "for" => TokenKind::For,
572 "in" => TokenKind::In,
573 "if" => TokenKind::If,
574 "else" => TokenKind::Else,
575 "where" => TokenKind::Where,
576 "with" => TokenKind::With,
577 "not" => TokenKind::Not,
578 "and" => TokenKind::And,
579 "or" => TokenKind::Or,
580 "exists" => TokenKind::Exists,
581 "implies" => TokenKind::Implies,
582 "contract" => TokenKind::Contract,
583 "invariant" => TokenKind::Invariant,
584 "transitions_to" => TokenKind::TransitionsTo,
585 "becomes" => TokenKind::Becomes,
586 "true" => TokenKind::True,
587 "false" => TokenKind::False,
588 "null" => TokenKind::Null,
589 "now" => TokenKind::Now,
590 "this" => TokenKind::This,
591 "within" => TokenKind::Within,
592 _ => TokenKind::Ident,
593 }
594}
595
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: lex `src` and keep only the token kinds.
    fn lex_kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    /// Shorthand: lex `src` and slice out each token's text by span.
    fn lexemes(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            lex_kinds("rule entity enum"),
            [TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            lex_kinds("my_var User"),
            [TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        for src in ["42", "3.14", "100_000"] {
            assert_eq!(lex_kinds(src), [TokenKind::Number, TokenKind::Eof], "{src}");
        }
    }

    #[test]
    fn durations() {
        for src in ["24.hours", "7.days", "1.second", "3.5.minutes"] {
            assert_eq!(lex_kinds(src), [TokenKind::Duration, TokenKind::Eof], "{src}");
        }
    }

    #[test]
    fn duration_vs_member_access() {
        // `count` is not a duration unit, so the dot is member access.
        assert_eq!(
            lex_kinds("42.count"),
            [TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn strings() {
        assert_eq!(lex_kinds(r#""hello""#), [TokenKind::String, TokenKind::Eof]);
        assert_eq!(
            lex_kinds(r#""hello {name}""#),
            [TokenKind::String, TokenKind::Eof]
        );
    }

    #[test]
    fn operators() {
        assert_eq!(
            lex_kinds("=> -> ?? ?. != <= >="),
            [
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn comments_skipped() {
        assert_eq!(
            lex_kinds("rule -- this is a comment\nentity"),
            [TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            lex_kinds("{ } ( ) : ,"),
            [
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::Colon,
                TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn full_line() {
        // The trailing "" is the zero-width slice of the Eof token.
        let src = "status: pending | active | completed";
        assert_eq!(
            lexemes(src),
            ["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn source_map_line_col() {
        let map = SourceMap::new("abc\ndef\nghi");
        assert_eq!(map.line_col(0), (0, 0));
        assert_eq!(map.line_col(3), (0, 3));
        assert_eq!(map.line_col(4), (1, 0));
        assert_eq!(map.line_col(8), (2, 0));
    }
}
720}