1use crate::Span;
2
/// Every kind of token the lexer can produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    // Literals and names.
    Ident,
    Number,
    Duration,
    String,
    True,
    False,
    Null,

    // Keywords introducing declarations (see `classify_keyword`).
    Rule,
    Entity,
    External,
    Value,
    Enum,
    Given,
    Config,
    Surface,
    Actor,
    Default,
    Variant,
    Deferred,
    Open,
    Question,
    Use,
    As,

    // Keywords used inside rule bodies and expressions.
    When,
    Requires,
    Ensures,
    Let,
    For,
    In,
    If,
    Else,
    Where,
    With,
    Not,
    And,
    Or,
    Exists,

    // Transition keywords.
    TransitionsTo,
    Becomes,

    // Built-in value keywords.
    Now,
    This,
    Within,

    // Operators.
    Eq,
    BangEq,
    Lt,
    LtEq,
    Gt,
    GtEq,
    Plus,
    Minus,
    Star,
    Slash,
    Pipe,
    FatArrow,
    ThinArrow,
    QuestionQuestion,
    QuestionDot,
    Dot,

    // Delimiters and punctuation.
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
    Colon,
    Comma,
    QuestionMark,

    // End of input.
    Eof,

    // Byte sequence the lexer could not recognise.
    Error,
}
96
97impl std::fmt::Display for TokenKind {
98 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99 match self {
100 TokenKind::Ident => write!(f, "identifier"),
101 TokenKind::Number => write!(f, "number"),
102 TokenKind::Duration => write!(f, "duration"),
103 TokenKind::String => write!(f, "string"),
104 TokenKind::True => write!(f, "'true'"),
105 TokenKind::False => write!(f, "'false'"),
106 TokenKind::Null => write!(f, "'null'"),
107 TokenKind::Rule => write!(f, "'rule'"),
108 TokenKind::Entity => write!(f, "'entity'"),
109 TokenKind::External => write!(f, "'external'"),
110 TokenKind::Value => write!(f, "'value'"),
111 TokenKind::Enum => write!(f, "'enum'"),
112 TokenKind::Given => write!(f, "'given'"),
113 TokenKind::Config => write!(f, "'config'"),
114 TokenKind::Surface => write!(f, "'surface'"),
115 TokenKind::Actor => write!(f, "'actor'"),
116 TokenKind::Default => write!(f, "'default'"),
117 TokenKind::Variant => write!(f, "'variant'"),
118 TokenKind::Deferred => write!(f, "'deferred'"),
119 TokenKind::Open => write!(f, "'open'"),
120 TokenKind::Question => write!(f, "'question'"),
121 TokenKind::Use => write!(f, "'use'"),
122 TokenKind::As => write!(f, "'as'"),
123 TokenKind::When => write!(f, "'when'"),
124 TokenKind::Requires => write!(f, "'requires'"),
125 TokenKind::Ensures => write!(f, "'ensures'"),
126 TokenKind::Let => write!(f, "'let'"),
127 TokenKind::For => write!(f, "'for'"),
128 TokenKind::In => write!(f, "'in'"),
129 TokenKind::If => write!(f, "'if'"),
130 TokenKind::Else => write!(f, "'else'"),
131 TokenKind::Where => write!(f, "'where'"),
132 TokenKind::With => write!(f, "'with'"),
133 TokenKind::Not => write!(f, "'not'"),
134 TokenKind::And => write!(f, "'and'"),
135 TokenKind::Or => write!(f, "'or'"),
136 TokenKind::Exists => write!(f, "'exists'"),
137 TokenKind::TransitionsTo => write!(f, "'transitions_to'"),
138 TokenKind::Becomes => write!(f, "'becomes'"),
139 TokenKind::Now => write!(f, "'now'"),
140 TokenKind::This => write!(f, "'this'"),
141 TokenKind::Within => write!(f, "'within'"),
142 TokenKind::Eq => write!(f, "'='"),
143 TokenKind::BangEq => write!(f, "'!='"),
144 TokenKind::Lt => write!(f, "'<'"),
145 TokenKind::LtEq => write!(f, "'<='"),
146 TokenKind::Gt => write!(f, "'>'"),
147 TokenKind::GtEq => write!(f, "'>='"),
148 TokenKind::Plus => write!(f, "'+'"),
149 TokenKind::Minus => write!(f, "'-'"),
150 TokenKind::Star => write!(f, "'*'"),
151 TokenKind::Slash => write!(f, "'/'"),
152 TokenKind::Pipe => write!(f, "'|'"),
153 TokenKind::FatArrow => write!(f, "'=>'"),
154 TokenKind::ThinArrow => write!(f, "'->'"),
155 TokenKind::QuestionQuestion => write!(f, "'??'"),
156 TokenKind::QuestionDot => write!(f, "'?.'"),
157 TokenKind::Dot => write!(f, "'.'"),
158 TokenKind::LBrace => write!(f, "'{{'"),
159 TokenKind::RBrace => write!(f, "'}}'"),
160 TokenKind::LParen => write!(f, "'('"),
161 TokenKind::RParen => write!(f, "')'"),
162 TokenKind::LBracket => write!(f, "'['"),
163 TokenKind::RBracket => write!(f, "']'"),
164 TokenKind::Colon => write!(f, "':'"),
165 TokenKind::Comma => write!(f, "','"),
166 TokenKind::QuestionMark => write!(f, "'?'"),
167 TokenKind::Eof => write!(f, "end of file"),
168 TokenKind::Error => write!(f, "unrecognised token"),
169 }
170 }
171}
172
173impl TokenKind {
174 pub fn is_word(self) -> bool {
176 matches!(
177 self,
178 TokenKind::Ident
179 | TokenKind::True
180 | TokenKind::False
181 | TokenKind::Null
182 | TokenKind::Rule
183 | TokenKind::Entity
184 | TokenKind::External
185 | TokenKind::Value
186 | TokenKind::Enum
187 | TokenKind::Given
188 | TokenKind::Config
189 | TokenKind::Surface
190 | TokenKind::Actor
191 | TokenKind::Default
192 | TokenKind::Variant
193 | TokenKind::Deferred
194 | TokenKind::Open
195 | TokenKind::Question
196 | TokenKind::Use
197 | TokenKind::As
198 | TokenKind::When
199 | TokenKind::Requires
200 | TokenKind::Ensures
201 | TokenKind::Let
202 | TokenKind::For
203 | TokenKind::In
204 | TokenKind::If
205 | TokenKind::Else
206 | TokenKind::Where
207 | TokenKind::With
208 | TokenKind::Not
209 | TokenKind::And
210 | TokenKind::Or
211 | TokenKind::Exists
212 | TokenKind::TransitionsTo
213 | TokenKind::Becomes
214 | TokenKind::Now
215 | TokenKind::This
216 | TokenKind::Within
217 )
218 }
219}
220
/// A single lexed token: its kind plus the byte span it occupies in the
/// original source text.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    // What sort of token this is.
    pub kind: TokenKind,
    // Byte range covered by the token's text (empty for `Eof`).
    pub span: Span,
}
226
/// Maps byte offsets in a source string to zero-based (line, column)
/// positions. Columns are byte offsets within the line, not character
/// counts.
pub struct SourceMap {
    // Byte offset of the first byte of every line; always begins with 0.
    line_starts: Vec<usize>,
}

impl SourceMap {
    /// Builds the line-start table for `source` by recording the offset
    /// just past every `\n`.
    pub fn new(source: &str) -> Self {
        let line_starts = std::iter::once(0)
            .chain(source.match_indices('\n').map(|(i, _)| i + 1))
            .collect();
        Self { line_starts }
    }

    /// Converts a byte `offset` into a zero-based (line, column) pair.
    /// Offsets past the end of the source map onto the last line.
    pub fn line_col(&self, offset: usize) -> (u32, u32) {
        // Find the last line whose start is <= offset. `line_starts[0]` is 0,
        // so the Err branch always has insert >= 1.
        let line = match self.line_starts.binary_search(&offset) {
            Ok(exact) => exact,
            Err(insert) => insert - 1,
        };
        let col = offset - self.line_starts[line];
        (line as u32, col as u32)
    }

    /// Returns the text of zero-based `line` with any trailing `\r\n` or
    /// `\n` stripped. Panics if `line` is out of range.
    pub fn line_text<'a>(&self, source: &'a str, line: u32) -> &'a str {
        let idx = line as usize;
        let begin = self.line_starts[idx];
        // The line ends where the next one starts, or at end of input.
        let end = self.line_starts.get(idx + 1).copied().unwrap_or(source.len());
        source[begin..end].trim_end_matches('\n').trim_end_matches('\r')
    }
}
267
268pub fn lex(source: &str) -> Vec<Token> {
278 let mut lexer = Lexer::new(source);
279 let mut tokens = Vec::new();
280 loop {
281 let tok = lexer.next_token();
282 let done = tok.kind == TokenKind::Eof;
283 tokens.push(tok);
284 if done {
285 break;
286 }
287 }
288 tokens
289}
290
/// Hand-written, byte-oriented lexer state: the source as raw bytes plus
/// the current cursor position.
struct Lexer<'s> {
    // The source text being tokenized, viewed as bytes.
    src: &'s [u8],
    // Current byte offset into `src`.
    pos: usize,
}
295
296impl<'s> Lexer<'s> {
297 fn new(source: &'s str) -> Self {
298 Self {
299 src: source.as_bytes(),
300 pos: 0,
301 }
302 }
303
304 fn next_token(&mut self) -> Token {
305 self.skip_whitespace_and_comments();
306
307 if self.pos >= self.src.len() {
308 return Token {
309 kind: TokenKind::Eof,
310 span: Span::new(self.pos, self.pos),
311 };
312 }
313
314 let start = self.pos;
315 let b = self.src[self.pos];
316
317 if b == b'"' {
318 return self.lex_string(start);
319 }
320 if b.is_ascii_digit() {
321 return self.lex_number(start);
322 }
323 if is_ident_start(b) {
324 return self.lex_ident(start);
325 }
326
327 self.lex_operator(start)
328 }
329
330 fn skip_whitespace_and_comments(&mut self) {
333 loop {
334 while self.pos < self.src.len()
335 && matches!(self.src[self.pos], b' ' | b'\t' | b'\n' | b'\r')
336 {
337 self.pos += 1;
338 }
339 if self.pos + 1 < self.src.len()
340 && self.src[self.pos] == b'-'
341 && self.src[self.pos + 1] == b'-'
342 {
343 while self.pos < self.src.len() && self.src[self.pos] != b'\n' {
344 self.pos += 1;
345 }
346 continue;
347 }
348 break;
349 }
350 }
351
352 fn lex_string(&mut self, start: usize) -> Token {
355 self.pos += 1; while self.pos < self.src.len() {
357 match self.src[self.pos] {
358 b'"' => {
359 self.pos += 1;
360 return Token {
361 kind: TokenKind::String,
362 span: Span::new(start, self.pos),
363 };
364 }
365 b'\\' => {
366 self.pos += 1;
367 if self.pos < self.src.len() {
368 self.pos += 1;
369 }
370 }
371 b'\n' => {
372 return Token {
373 kind: TokenKind::Error,
374 span: Span::new(start, self.pos),
375 };
376 }
377 _ => self.pos += 1,
378 }
379 }
380 Token {
381 kind: TokenKind::Error,
382 span: Span::new(start, self.pos),
383 }
384 }
385
386 fn lex_number(&mut self, start: usize) -> Token {
389 self.consume_digits();
390
391 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
392 let after_dot = self.pos + 1;
393 if after_dot < self.src.len() && self.src[after_dot].is_ascii_digit() {
394 self.pos += 1;
396 self.consume_digits();
397 if self.check_duration_suffix() {
399 return Token {
400 kind: TokenKind::Duration,
401 span: Span::new(start, self.pos),
402 };
403 }
404 return Token {
405 kind: TokenKind::Number,
406 span: Span::new(start, self.pos),
407 };
408 }
409 if self.peek_duration_unit(after_dot).is_some() {
410 let unit_len = self.peek_duration_unit(after_dot).unwrap();
411 self.pos = after_dot + unit_len;
412 return Token {
413 kind: TokenKind::Duration,
414 span: Span::new(start, self.pos),
415 };
416 }
417 }
418
419 Token {
420 kind: TokenKind::Number,
421 span: Span::new(start, self.pos),
422 }
423 }
424
425 fn consume_digits(&mut self) {
426 while self.pos < self.src.len()
427 && (self.src[self.pos].is_ascii_digit() || self.src[self.pos] == b'_')
428 {
429 self.pos += 1;
430 }
431 }
432
433 fn check_duration_suffix(&mut self) -> bool {
435 if self.pos < self.src.len() && self.src[self.pos] == b'.' {
436 if let Some(unit_len) = self.peek_duration_unit(self.pos + 1) {
437 self.pos += 1 + unit_len;
438 return true;
439 }
440 }
441 false
442 }
443
444 fn peek_duration_unit(&self, from: usize) -> Option<usize> {
445 const UNITS: &[&str] = &[
446 "seconds", "second", "minutes", "minute", "hours", "hour", "days", "day", "weeks",
447 "week", "months", "month", "years", "year",
448 ];
449 for unit in UNITS {
450 let end = from + unit.len();
451 if end <= self.src.len()
452 && &self.src[from..end] == unit.as_bytes()
453 && (end >= self.src.len() || !is_ident_continue(self.src[end]))
454 {
455 return Some(unit.len());
456 }
457 }
458 None
459 }
460
461 fn lex_ident(&mut self, start: usize) -> Token {
464 while self.pos < self.src.len() && is_ident_continue(self.src[self.pos]) {
465 self.pos += 1;
466 }
467 let text = std::str::from_utf8(&self.src[start..self.pos]).unwrap();
468 Token {
469 kind: classify_keyword(text),
470 span: Span::new(start, self.pos),
471 }
472 }
473
474 fn lex_operator(&mut self, start: usize) -> Token {
477 let b = self.src[self.pos];
478 let next = if self.pos + 1 < self.src.len() {
479 self.src[self.pos + 1]
480 } else {
481 0
482 };
483
484 let (kind, len) = match (b, next) {
485 (b'=', b'>') => (TokenKind::FatArrow, 2),
486 (b'=', _) => (TokenKind::Eq, 1),
487 (b'!', b'=') => (TokenKind::BangEq, 2),
488 (b'<', b'=') => (TokenKind::LtEq, 2),
489 (b'<', _) => (TokenKind::Lt, 1),
490 (b'>', b'=') => (TokenKind::GtEq, 2),
491 (b'>', _) => (TokenKind::Gt, 1),
492 (b'+', _) => (TokenKind::Plus, 1),
493 (b'-', b'>') => (TokenKind::ThinArrow, 2),
494 (b'-', _) => (TokenKind::Minus, 1),
495 (b'*', _) => (TokenKind::Star, 1),
496 (b'/', _) => (TokenKind::Slash, 1),
497 (b'|', _) => (TokenKind::Pipe, 1),
498 (b'?', b'?') => (TokenKind::QuestionQuestion, 2),
499 (b'?', b'.') => (TokenKind::QuestionDot, 2),
500 (b'?', _) => (TokenKind::QuestionMark, 1),
501 (b'.', _) => (TokenKind::Dot, 1),
502 (b'{', _) => (TokenKind::LBrace, 1),
503 (b'}', _) => (TokenKind::RBrace, 1),
504 (b'(', _) => (TokenKind::LParen, 1),
505 (b')', _) => (TokenKind::RParen, 1),
506 (b'[', _) => (TokenKind::LBracket, 1),
507 (b']', _) => (TokenKind::RBracket, 1),
508 (b':', _) => (TokenKind::Colon, 1),
509 (b',', _) => (TokenKind::Comma, 1),
510 _ => (TokenKind::Error, 1),
511 };
512
513 self.pos += len;
514 Token {
515 kind,
516 span: Span::new(start, self.pos),
517 }
518 }
519}
520
/// True if `b` may begin an identifier: an ASCII letter or underscore.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
528
/// True if `b` may continue an identifier: an ASCII letter, digit, or
/// underscore.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
532
533fn classify_keyword(text: &str) -> TokenKind {
534 match text {
535 "rule" => TokenKind::Rule,
536 "entity" => TokenKind::Entity,
537 "external" => TokenKind::External,
538 "value" => TokenKind::Value,
539 "enum" => TokenKind::Enum,
540 "given" => TokenKind::Given,
541 "config" => TokenKind::Config,
542 "surface" => TokenKind::Surface,
543 "actor" => TokenKind::Actor,
544 "default" => TokenKind::Default,
545 "variant" => TokenKind::Variant,
546 "deferred" => TokenKind::Deferred,
547 "open" => TokenKind::Open,
548 "question" => TokenKind::Question,
549 "use" => TokenKind::Use,
550 "as" => TokenKind::As,
551 "when" => TokenKind::When,
552 "requires" => TokenKind::Requires,
553 "ensures" => TokenKind::Ensures,
554 "let" => TokenKind::Let,
555 "for" => TokenKind::For,
556 "in" => TokenKind::In,
557 "if" => TokenKind::If,
558 "else" => TokenKind::Else,
559 "where" => TokenKind::Where,
560 "with" => TokenKind::With,
561 "not" => TokenKind::Not,
562 "and" => TokenKind::And,
563 "or" => TokenKind::Or,
564 "exists" => TokenKind::Exists,
565 "transitions_to" => TokenKind::TransitionsTo,
566 "becomes" => TokenKind::Becomes,
567 "true" => TokenKind::True,
568 "false" => TokenKind::False,
569 "null" => TokenKind::Null,
570 "now" => TokenKind::Now,
571 "this" => TokenKind::This,
572 "within" => TokenKind::Within,
573 _ => TokenKind::Ident,
574 }
575}
576
#[cfg(test)]
mod tests {
    use super::*;

    // Lexes `src` and returns only the token kinds (including the final Eof).
    fn kinds(src: &str) -> Vec<TokenKind> {
        lex(src).into_iter().map(|t| t.kind).collect()
    }

    // Lexes `src` and returns the source text covered by each token's span;
    // the Eof token contributes an empty string.
    fn text_of(src: &str) -> Vec<&str> {
        lex(src)
            .into_iter()
            .map(|t| &src[t.span.start..t.span.end])
            .collect()
    }

    #[test]
    fn keywords() {
        assert_eq!(
            kinds("rule entity enum"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Enum, TokenKind::Eof]
        );
    }

    #[test]
    fn identifiers() {
        assert_eq!(
            kinds("my_var User"),
            vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn numbers() {
        // Integers, decimals, and underscore separators all lex as Number.
        assert_eq!(kinds("42"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("3.14"), vec![TokenKind::Number, TokenKind::Eof]);
        assert_eq!(kinds("100_000"), vec![TokenKind::Number, TokenKind::Eof]);
    }

    #[test]
    fn durations() {
        // `N.unit` and fractional `N.M.unit` forms are single Duration tokens.
        assert_eq!(kinds("24.hours"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("7.days"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("1.second"), vec![TokenKind::Duration, TokenKind::Eof]);
        assert_eq!(kinds("3.5.minutes"), vec![TokenKind::Duration, TokenKind::Eof]);
    }

    #[test]
    fn duration_vs_member_access() {
        // A dot followed by a non-unit identifier is member access, not a
        // duration: Number, Dot, Ident.
        assert_eq!(
            kinds("42.count"),
            vec![TokenKind::Number, TokenKind::Dot, TokenKind::Ident, TokenKind::Eof]
        );
    }

    #[test]
    fn strings() {
        assert_eq!(kinds(r#""hello""#), vec![TokenKind::String, TokenKind::Eof]);
        // Braces inside a string are plain characters to the lexer.
        assert_eq!(
            kinds(r#""hello {name}""#),
            vec![TokenKind::String, TokenKind::Eof]
        );
    }

    #[test]
    fn operators() {
        // Two-byte operators must win over their one-byte prefixes.
        assert_eq!(
            kinds("=> -> ?? ?. != <= >="),
            vec![
                TokenKind::FatArrow,
                TokenKind::ThinArrow,
                TokenKind::QuestionQuestion,
                TokenKind::QuestionDot,
                TokenKind::BangEq,
                TokenKind::LtEq,
                TokenKind::GtEq,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn comments_skipped() {
        // `--` comments run to end of line and produce no tokens.
        assert_eq!(
            kinds("rule -- this is a comment\nentity"),
            vec![TokenKind::Rule, TokenKind::Entity, TokenKind::Eof]
        );
    }

    #[test]
    fn delimiters() {
        assert_eq!(
            kinds("{ } ( ) : ,"),
            vec![
                TokenKind::LBrace, TokenKind::RBrace,
                TokenKind::LParen, TokenKind::RParen,
                TokenKind::Colon, TokenKind::Comma,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn full_line() {
        // Token spans round-trip back to the exact source text; the trailing
        // "" is the empty span of the Eof token.
        let src = "status: pending | active | completed";
        assert_eq!(
            text_of(src),
            vec!["status", ":", "pending", "|", "active", "|", "completed", ""]
        );
    }

    #[test]
    fn source_map_line_col() {
        // Both line and column are zero-based byte positions.
        let src = "abc\ndef\nghi";
        let map = SourceMap::new(src);
        assert_eq!(map.line_col(0), (0, 0)); assert_eq!(map.line_col(3), (0, 3)); assert_eq!(map.line_col(4), (1, 0)); assert_eq!(map.line_col(8), (2, 0)); }
}