1use crate::Span;
2
/// The category of a single lexed token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Literals and names.
    Ident,
    Number,
    Duration,
    String,
    True,
    False,
    Null,

    // Declaration keywords.
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,
    Module,

    // Clause and expression keywords.
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // Relation keywords.
    TransitionsTo,
    Becomes,
    Includes,
    Excludes,

    // Contextual value keywords.
    Now,
    This,
    Within,

    // Operators and punctuation.
    Eq,
    EqEq,
    BangEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    Plus,
    Minus,
    Star,
    Slash,
    Pipe,
    FatArrow,
    ThinArrow,
    QuestionQuestion,
    QuestionDot,
    Dot,
    DotDot,
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Colon,
    Comma,
    QuestionMark,

    /// Zero-width marker emitted once at the end of input.
    Eof,

    /// A byte sequence the lexer could not recognise.
    Error,
}
101
102impl std::fmt::Display for TokenKind {
103 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
104 match self {
105 TokenKind::Ident => write!(f, "identifier"),
106 TokenKind::Number => write!(f, "number"),
107 TokenKind::Duration => write!(f, "duration"),
108 TokenKind::String => write!(f, "string"),
109 TokenKind::True => write!(f, "'true'"),
110 TokenKind::False => write!(f, "'false'"),
111 TokenKind::Null => write!(f, "'null'"),
112 TokenKind::Rule => write!(f, "'rule'"),
113 TokenKind::Entity => write!(f, "'entity'"),
114 TokenKind::External => write!(f, "'external'"),
115 TokenKind::Value => write!(f, "'value'"),
116 TokenKind::Enum => write!(f, "'enum'"),
117 TokenKind::Given => write!(f, "'given'"),
118 TokenKind::Config => write!(f, "'config'"),
119 TokenKind::Surface => write!(f, "'surface'"),
120 TokenKind::Actor => write!(f, "'actor'"),
121 TokenKind::Default => write!(f, "'default'"),
122 TokenKind::Variant => write!(f, "'variant'"),
123 TokenKind::Deferred => write!(f, "'deferred'"),
124 TokenKind::Open => write!(f, "'open'"),
125 TokenKind::Question => write!(f, "'question'"),
126 TokenKind::Use => write!(f, "'use'"),
127 TokenKind::As => write!(f, "'as'"),
128 TokenKind::Module => write!(f, "'module'"),
129 TokenKind::When => write!(f, "'when'"),
130 TokenKind::Requires => write!(f, "'requires'"),
131 TokenKind::Ensures => write!(f, "'ensures'"),
132 TokenKind::Let => write!(f, "'let'"),
133 TokenKind::For => write!(f, "'for'"),
134 TokenKind::In => write!(f, "'in'"),
135 TokenKind::If => write!(f, "'if'"),
136 TokenKind::Else => write!(f, "'else'"),
137 TokenKind::Where => write!(f, "'where'"),
138 TokenKind::With => write!(f, "'with'"),
139 TokenKind::Not => write!(f, "'not'"),
140 TokenKind::And => write!(f, "'and'"),
141 TokenKind::Or => write!(f, "'or'"),
142 TokenKind::Exists => write!(f, "'exists'"),
143 TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
144 TokenKind::Becomes => write!(f, "'becomes'"),
145 TokenKind::Includes => write!(f, "'includes'"),
146 TokenKind::Excludes => write!(f, "'excludes'"),
147 TokenKind::Now => write!(f, "'now'"),
148 TokenKind::This => write!(f, "'this'"),
149 TokenKind::Within => write!(f, "'within'"),
150 TokenKind::Eq => write!(f, "'='"),
151 TokenKind::EqEq => write!(f, "'=='"),
152 TokenKind::BangEq => write!(f, "'!='"),
153 TokenKind::Lt => write!(f, "'<'"),
154 TokenKind::LtEq => write!(f, "'<='"),
155 TokenKind::Gt => write!(f, "'>'"),
156 TokenKind::GtEq => write!(f, "'>='"),
157 TokenKind::Plus => write!(f, "'+'"),
158 TokenKind::Minus => write!(f, "'-'"),
159 TokenKind::Star => write!(f, "'*'"),
160 TokenKind::Slash => write!(f, "'/'"),
161 TokenKind::Pipe => write!(f, "'|'"),
162 TokenKind::FatArrow => write!(f, "'=>'"),
163 TokenKind::ThinArrow => write!(f, "'->'"),
164 TokenKind::QuestionQuestion => write!(f, "'??'"),
165 TokenKind::QuestionDot => write!(f, "'?.'"),
166 TokenKind::Dot => write!(f, "'.'"),
167 TokenKind::DotDot => write!(f, "'..'"),
168 TokenKind::LBrace => write!(f, "'{{'"),
169 TokenKind::RBrace => write!(f, "'}}'"),
170 TokenKind::LParen => write!(f, "'('"),
171 TokenKind::RParen => write!(f, "')'"),
172 TokenKind::LBracket => write!(f, "'['"),
173 TokenKind::RBracket => write!(f, "']'"),
174 TokenKind::Colon => write!(f, "':'"),
175 TokenKind::Comma => write!(f, "','"),
176 TokenKind::QuestionMark => write!(f, "'?'"),
177 TokenKind::Eof => write!(f, "end of file"),
178 TokenKind::Error => write!(f, "unrecognised token"),
179 }
180 }
181}
182
183impl TokenKind {
184 pub fn is_word(self) -> bool {
186 matches!(
187 self,
188 TokenKind::Ident
189 | TokenKind::True
190 | TokenKind::False
191 | TokenKind::Null
192 | TokenKind::Rule
193 | TokenKind::Entity
194 | TokenKind::External
195 | TokenKind::Value
196 | TokenKind::Enum
197 | TokenKind::Given
198 | TokenKind::Config
199 | TokenKind::Surface
200 | TokenKind::Actor
201 | TokenKind::Default
202 | TokenKind::Variant
203 | TokenKind::Deferred
204 | TokenKind::Open
205 | TokenKind::Question
206 | TokenKind::Use
207 | TokenKind::As
208 | TokenKind::Module
209 | TokenKind::When
210 | TokenKind::Requires
211 | TokenKind::Ensures
212 | TokenKind::Let
213 | TokenKind::For
214 | TokenKind::In
215 | TokenKind::If
216 | TokenKind::Else
217 | TokenKind::Where
218 | TokenKind::With
219 | TokenKind::Not
220 | TokenKind::And
221 | TokenKind::Or
222 | TokenKind::Exists
223 | TokenKind::TransitionsTo
224 | TokenKind::Becomes
225 | TokenKind::Includes
226 | TokenKind::Excludes
227 | TokenKind::Now
228 | TokenKind::This
229 | TokenKind::Within
230 )
231 }
232}
233
/// A single lexed token: its category plus where it sits in the source.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// What kind of token this is.
    pub kind: TokenKind,
    /// Byte range of the token's text in the original source string.
    pub span: Span,
}
239
/// Maps byte offsets in a source string to zero-based (line, column)
/// positions. Built once per source; lookups are binary searches over the
/// recorded line-start offsets.
pub struct SourceMap {
    // Byte offset of the first byte of each line; always begins with 0.
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Scans `source` once, recording the offset just past every `'\n'`.
    pub fn new(source: &str) -> Self {
        let mut starts = vec![0];
        for (i, b) in source.bytes().enumerate() {
            if b == b'\n' {
                starts.push(i + 1);
            }
        }
        Self { line_starts: starts }
    }

    /// Returns the zero-based (line, column) containing byte `offset`.
    ///
    /// The column is a byte offset within the line, not a character count.
    /// Offsets past the end of the source resolve to the final line.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // Index of the last recorded line start that is <= offset.
        let line = self
            .line_starts
            .partition_point(|&s| s <= offset)
            .saturating_sub(1);
        let col = offset - self.line_starts[line];
        (line as u32, col as u32)
    }

    /// Returns the text of zero-based `line` without its trailing newline,
    /// or `""` if `line` is out of range. `source` must be the same string
    /// this map was built from.
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        // Robustness: an out-of-range line yields "" instead of panicking
        // on an unchecked index.
        let start = match self.line_starts.get(idx) {
            Some(&s) => s,
            None => return "",
        };
        let end = self
            .line_starts
            .get(idx + 1)
            .copied()
            .unwrap_or(source.len());
        // Strip the terminator; handles both "\n" and "\r\n".
        source[start..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}
280
281pub fn lex(source: &str) -> Vec<Token> {
291 let mut lexer = Lexer::new(source);
292 let mut tokens = Vec::new();
293 loop {
294 let tok = lexer.next_token();
295 let done = tok.kind == TokenKind::Eof;
296 tokens.push(tok);
297 if done {
298 break;
299 }
300 }
301 tokens
302}
303
/// Internal cursor-based scanner that produces one token per call.
struct Lexer<'s> {
    /// The source text viewed as raw bytes.
    src: &'s [u8],
    /// Current byte offset into `src`.
    pos: usize,
}
308
309impl<'s> Lexer<'s> {
310 fn new(source: &'s str) -> Self {
311 Self {
312 src: source.as_bytes(),
313 pos: 0,
314 }
315 }
316
317 fn next_token(&mut self) -> Token {
318 self.skip_whitespace_and_comments();
319
320 if self.pos >= self.src.len() {
321 return Token {
322 kind: TokenKind::Eof,
323 span: Span::new(self.pos, self.pos),
324 };
325 }
326
327 let start = self.pos;
328 let b = self.src[self.pos];
329
330 if b == b'"' {
331 return self.lex_string(start);
332 }
333 if b.is_ascii_digit() {
334 return self.lex_number(start);
335 }
336 if is_ident_start(b) {
337 return self.lex_ident(start);
338 }
339
340 self.lex_operator(start)
341 }
342
343 fn skip_whitespace_and_comments(&mut self) {
346 loop {
347 while self.pos < self.src.len()
348 && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
349 {
350 self.pos += 1;
351 }
352 if self.pos + 1 < self.src.len()
353 && self.src[self.pos] == b'-'
354 && self.src[self.pos + 1] == b'-'
355 {
356 while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
357 self.pos += 1;
358 }
359 continue;
360 }
361 break;
362 }
363 }
364
365 fn lex_string(&mut self, start: usize) -> Token {
368 self.pos += 1; while self.pos < self.src.len() {
370 match self.src[self.pos] {
371 b'"' => {
372 self.pos += 1;
373 return Token {
374 kind: TokenKind::String,
375 span: Span::new(start, self.pos),
376 };
377 }
378 b'\\' => {
379 self.pos += 1;
380 if self.pos < self.src.len() {
381 self.pos += 1;
382 }
383 }
384 b'\n' => {
385 return Token {
386 kind: TokenKind::Error,
387 span: Span::new(start, self.pos),
388 };
389 }
390 _ => self.pos += 1,
391 }
392 }
393 Token {
394 kind: TokenKind::Error,
395 span: Span::new(start, self.pos),
396 }
397 }
398
399 fn lex_number(&mut self, start: usize) -> Token {
402 self.consume_digits();
403
404 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
405 let after_dot = self.pos + 1;
406 if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
407 self.pos += 1;
409 self.consume_digits();
410 if self.check_duration_suffix() {
412 return Token {
413 kind: TokenKind::Duration,
414 span: Span::new(start, self.pos),
415 };
416 }
417 return Token {
418 kind: TokenKind::Number,
419 span: Span::new(start, self.pos),
420 };
421 }
422 if self.peek_duration_unit(after_dot).is_some() {
423 let unit_len = self.peek_duration_unit(after_dot).unwrap();
424 self.pos = after_dot + unit_len;
425 return Token {
426 kind: TokenKind::Duration,
427 span: Span::new(start, self.pos),
428 };
429 }
430 }
431
432 Token {
433 kind: TokenKind::Number,
434 span: Span::new(start, self.pos),
435 }
436 }
437
438 fn consume_digits(&mut self) {
439 while self.pos < self.src.len()
440 && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
441 {
442 self.pos += 1;
443 }
444 }
445
446 fn check_duration_suffix(&mut self) -> bool {
448 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
449 if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
450 self.pos += 1 + unit_len;
451 return true;
452 }
453 }
454 false
455 }
456
457 fn peek_duration_unit(&self, from: usize) -> Option<usize> {
458 const UNITS: &[&str] = &[
459 "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
460 "week", "months", "month", "years", "year",
461 ];
462 for unit in UNITS {
463 let end = from + unit.len();
464 if end <= self.src.len()
465 && &self.src[from..end] == unit.as_bytes()
466 && (end >= self.src.len() || !is_ident_continue(self.src[end]))
467 {
468 return Some(unit.len());
469 }
470 }
471 None
472 }
473
474 fn lex_ident(&mut self, start: usize) -> Token {
477 while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
478 self.pos += 1;
479 }
480 let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
481 Token {
482 kind: classify_keyword(text),
483 span: Span::new(start, self.pos),
484 }
485 }
486
487 fn lex_operator(&mut self, start: usize) -> Token {
490 let b = self.src[self.pos];
491 let next = if self.pos + 1 < self.src.len() {
492 self.src[self.pos + 1]
493 } else {
494 0
495 };
496
497 let (kind, len) = match (b, next) {
498 (b'=', b'>') => (TokenKind::FatArrow, 2),
499 (b'=', b'=') => (TokenKind::EqEq, 2),
500 (b'=', _) => (TokenKind::Eq, 1),
501 (b'!', b'=') => (TokenKind::BangEq, 2),
502 (b'<', b'=') => (TokenKind::LtEq, 2),
503 (b'<', _) => (TokenKind::Lt, 1),
504 (b'>', b'=') => (TokenKind::GtEq, 2),
505 (b'>', _) => (TokenKind::Gt, 1),
506 (b'+', _) => (TokenKind::Plus, 1),
507 (b'-', b'>') => (TokenKind::ThinArrow, 2),
508 (b'-', _) => (TokenKind::Minus, 1),
509 (b'*', _) => (TokenKind::Star, 1),
510 (b'/', _) => (TokenKind::Slash, 1),
511 (b'|', _) => (TokenKind::Pipe, 1),
512 (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
513 (b'?', b'.') => (TokenKind::QuestionDot, 2),
514 (b'?', _) => (TokenKind::QuestionMark, 1),
515 (b'.', b'.') => (TokenKind::DotDot, 2),
516 (b'.', _) => (TokenKind::Dot, 1),
517 (b'{', _) => (TokenKind::LBrace, 1),
518 (b'}', _) => (TokenKind::RBrace, 1),
519 (b'(', _) => (TokenKind::LParen, 1),
520 (b')', _) => (TokenKind::RParen, 1),
521 (b'[', _) => (TokenKind::LBracket, 1),
522 (b']', _) => (TokenKind::RBracket, 1),
523 (b':', _) => (TokenKind::Colon, 1),
524 (b',', _) => (TokenKind::Comma, 1),
525 _ => (TokenKind::Error, 1),
526 };
527
528 self.pos += len;
529 Token {
530 kind,
531 span: Span::new(start, self.pos),
532 }
533 }
534}
535
/// True for bytes that may begin an identifier: ASCII letters and `_`.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
543
/// True for bytes that may continue an identifier: ASCII letters, digits, `_`.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
547
548fn classify_keyword(text: &str) -> TokenKind {
549 match text {
550 "rule" => TokenKind::Rule,
551 "entity" => TokenKind::Entity,
552 "external" => TokenKind::External,
553 "value" => TokenKind::Value,
554 "enum" => TokenKind::Enum,
555 "given" => TokenKind::Given,
556 "config" => TokenKind::Config,
557 "surface" => TokenKind::Surface,
558 "actor" => TokenKind::Actor,
559 "default" => TokenKind::Default,
560 "variant" => TokenKind::Variant,
561 "deferred" => TokenKind::Deferred,
562 "open" => TokenKind::Open,
563 "question" => TokenKind::Question,
564 "use" => TokenKind::Use,
565 "as" => TokenKind::As,
566 "module" => TokenKind::Module,
567 "when" => TokenKind::When,
568 "requires" => TokenKind::Requires,
569 "ensures" => TokenKind::Ensures,
570 "let" => TokenKind::Let,
571 "for" => TokenKind::For,
572 "in" => TokenKind::In,
573 "if" => TokenKind::If,
574 "else" => TokenKind::Else,
575 "where" => TokenKind::Where,
576 "with" => TokenKind::With,
577 "not" => TokenKind::Not,
578 "and" => TokenKind::And,
579 "or" => TokenKind::Or,
580 "exists" => TokenKind::Exists,
581 "transitions_to" => TokenKind::TransitionsTo,
582 "becomes" => TokenKind::Becomes,
583 "includes" => TokenKind::Includes,
584 "excludes" => TokenKind::Excludes,
585 "true" => TokenKind::True,
586 "false" => TokenKind::False,
587 "null" => TokenKind::Null,
588 "now" => TokenKind::Now,
589 "this" => TokenKind::This,
590 "within" => TokenKind::Within,
591 _ => TokenKind::Ident,
592 }
593}
594
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and keeps only the token kinds.
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    /// Lexes `src` and maps each token back to its source text.
    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            kinds("rule entity enum"),
            [TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            kinds("my_var User"),
            [TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        assert_eq!(kinds("42"), [TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("3.14"), [TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("100_000"), [TokenKind::Number, TokenKind::Eof]);
    }

    #[test]
    fn durations() {
        assert_eq!(kinds("24.hours"), [TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("7.days"), [TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("1.second"), [TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("3.5.minutes"), [TokenKind::Duration, TokenKind::Eof]);
    }

    #[test]
    fn duration_vs_member_access() {
        // A dot followed by a plain identifier is member access, not a
        // duration literal.
        assert_eq!(
            kinds("42.count"),
            [TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn strings() {
        assert_eq!(kinds(r#""hello""#), [TokenKind::String, TokenKind::Eof]);
        assert_eq!(
            kinds(r#""hello {name}""#),
            [TokenKind::String, TokenKind::Eof]
        );
    }

    #[test]
    fn operators() {
        assert_eq!(
            kinds("=> -> ?? ?. != <= >="),
            [
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn comments_skipped() {
        assert_eq!(
            kinds("rule -- this is a comment\nentity"),
            [TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            kinds("{ } ( ) : ,"),
            [
                TokenKind::LBrace,
                TokenKind::RBrace,
                TokenKind::LParen,
                TokenKind::RParen,
                TokenKind::Colon,
                TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn full_line() {
        // The trailing "" is the text of the zero-width Eof token.
        let src = "status: pending | active | completed";
        assert_eq!(
            text_of(src),
            ["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn source_map_line_col() {
        let map = SourceMap::new("abc\ndef\nghi");
        assert_eq!(map.line_col(0), (0, 0));
        assert_eq!(map.line_col(3), (0, 3));
        assert_eq!(map.line_col(4), (1, 0));
        assert_eq!(map.line_col(8), (2, 0));
    }
}