1use crate::span::Span;
8
9#[derive(Debug, Clone, PartialEq)]
10pub struct Token {
11 pub kind: TokenKind,
12 pub span: Span,
13}
14
/// Every kind of token the lexer can emit.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- Tokens that carry a payload ----
    /// An identifier that is not one of the reserved words below.
    Name(String),
    /// A decimal, `0x` hexadecimal or `0b` binary literal, already parsed.
    Integer(i64),
    /// A double-quoted string literal with its escape sequences resolved.
    StringLit(String),

    // ---- Reserved words (spelling shown on the right) ----
    Module,       // module
    Import,       // import
    Const,        // const
    Enum,         // enum
    Flags,        // flags
    Type,         // type
    Packet,       // packet
    Frame,        // frame
    Capsule,      // capsule
    State,        // state
    Machine,      // machine
    Transition,   // transition
    Initial,      // initial
    Terminal,     // terminal
    On,           // on
    Guard,        // guard
    Action,       // action
    Delegate,     // delegate
    Match,        // match
    If,           // if
    Let,          // let
    Require,      // require
    StaticAssert, // static_assert
    Within,       // within
    Export,       // export
    Varint,       // varint
    Bytes,        // bytes
    Bits,         // bits
    Bit,          // bit
    Fill,         // fill
    Remaining,    // remaining
    True,         // true
    False,        // false
    Null,         // null
    And,          // and
    Or,           // or
    Not,          // not
    InState,      // in_state
    All,          // all

    // ---- Delimiters and separators ----
    LBrace,     // {
    RBrace,     // }
    LParen,     // (
    RParen,     // )
    LBracket,   // [
    RBracket,   // ]
    Colon,      // :
    ColonColon, // ::
    Semicolon,  // ;
    Comma,      // ,
    Dot,        // .
    DotDot,     // ..
    DotDotEq,   // ..=

    // ---- Arrows, assignment and arithmetic ----
    Arrow,      // ->
    FatArrow,   // =>
    Assign,     // =
    PlusAssign, // +=
    At,         // @
    LArrow,     // <-
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %

    // ---- Bitwise operators and `!` ----
    Amp,   // &
    Pipe,  // |
    Caret, // ^
    Shl,   // <<
    Shr,   // >>
    Bang,  // !

    // ---- Comparisons ----
    EqEq,   // ==
    BangEq, // !=
    Lt,     // <
    Le,     // <=
    Gt,     // >
    Ge,     // >=

    // ---- Miscellaneous ----
    QuestionQuestion, // ??
    Tilde,            // ~
    TildeGt,          // ~>
    /// Appended once by the lexer after the last real token.
    Eof,
}
115
116pub struct Lexer<'src> {
117 source: &'src [u8],
118 pos: usize,
119 tokens: Vec<Token>,
120}
121
122impl<'src> Lexer<'src> {
123 pub fn new(source: &'src str) -> Self {
124 Self {
125 source: source.as_bytes(),
126 pos: 0,
127 tokens: Vec::new(),
128 }
129 }
130
131 pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
132 while self.pos < self.source.len() {
133 self.skip_whitespace_and_comments();
134 if self.pos >= self.source.len() {
135 break;
136 }
137 self.next_token()?;
138 }
139 self.tokens.push(Token {
140 kind: TokenKind::Eof,
141 span: Span::new(self.pos as u32, 0),
142 });
143 Ok(self.tokens)
144 }
145
146 fn peek(&self) -> u8 {
147 if self.pos < self.source.len() {
148 self.source[self.pos]
149 } else {
150 0
151 }
152 }
153
154 fn peek_at(&self, offset: usize) -> u8 {
155 let idx = self.pos + offset;
156 if idx < self.source.len() {
157 self.source[idx]
158 } else {
159 0
160 }
161 }
162
163 fn advance(&mut self) -> u8 {
164 let ch = self.source[self.pos];
165 self.pos += 1;
166 ch
167 }
168
169 fn skip_whitespace_and_comments(&mut self) {
170 while self.pos < self.source.len() {
171 let ch = self.peek();
172 if ch == b' ' || ch == b'\t' || ch == b'\r' || ch == b'\n' {
173 self.pos += 1;
174 } else if ch == b'#' || (ch == b'/' && self.peek_at(1) == b'/') {
175 while self.pos < self.source.len() && self.source[self.pos] != b'\n' {
177 self.pos += 1;
178 }
179 } else {
180 break;
181 }
182 }
183 }
184
185 fn next_token(&mut self) -> Result<(), LexError> {
186 let start = self.pos;
187 let ch = self.advance();
188
189 let kind = match ch {
190 b'{' => TokenKind::LBrace,
191 b'}' => TokenKind::RBrace,
192 b'(' => TokenKind::LParen,
193 b')' => TokenKind::RParen,
194 b'[' => TokenKind::LBracket,
195 b']' => TokenKind::RBracket,
196 b';' => TokenKind::Semicolon,
197 b',' => TokenKind::Comma,
198 b'*' => TokenKind::Star,
199 b'%' => TokenKind::Percent,
200 b'^' => TokenKind::Caret,
201 b'@' => TokenKind::At,
202
203 b':' => {
204 if self.peek() == b':' {
205 self.advance();
206 TokenKind::ColonColon
207 } else {
208 TokenKind::Colon
209 }
210 }
211
212 b'.' => {
213 if self.peek() == b'.' {
214 self.advance();
215 if self.peek() == b'=' {
216 self.advance();
217 TokenKind::DotDotEq
218 } else {
219 TokenKind::DotDot
220 }
221 } else {
222 TokenKind::Dot
223 }
224 }
225
226 b'-' => {
227 if self.peek() == b'>' {
228 self.advance();
229 TokenKind::Arrow
230 } else {
231 TokenKind::Minus
232 }
233 }
234
235 b'=' => {
236 if self.peek() == b'>' {
237 self.advance();
238 TokenKind::FatArrow
239 } else if self.peek() == b'=' {
240 self.advance();
241 TokenKind::EqEq
242 } else {
243 TokenKind::Assign
244 }
245 }
246
247 b'+' => {
248 if self.peek() == b'=' {
249 self.advance();
250 TokenKind::PlusAssign
251 } else {
252 TokenKind::Plus
253 }
254 }
255
256 b'!' => {
257 if self.peek() == b'=' {
258 self.advance();
259 TokenKind::BangEq
260 } else {
261 TokenKind::Bang
262 }
263 }
264
265 b'<' => {
266 if self.peek() == b'=' {
267 self.advance();
268 TokenKind::Le
269 } else if self.peek() == b'<' {
270 self.advance();
271 TokenKind::Shl
272 } else if self.peek() == b'-' {
273 self.advance();
274 TokenKind::LArrow
275 } else {
276 TokenKind::Lt
277 }
278 }
279
280 b'>' => {
281 if self.peek() == b'=' {
282 self.advance();
283 TokenKind::Ge
284 } else if self.peek() == b'>' {
285 self.advance();
286 TokenKind::Shr
287 } else {
288 TokenKind::Gt
289 }
290 }
291
292 b'&' => TokenKind::Amp,
293 b'|' => TokenKind::Pipe,
294 b'/' => TokenKind::Slash,
295 b'~' => {
296 if self.peek() == b'>' {
297 self.advance();
298 TokenKind::TildeGt
299 } else {
300 TokenKind::Tilde
301 }
302 }
303
304 b'?' => {
305 if self.peek() == b'?' {
306 self.advance();
307 TokenKind::QuestionQuestion
308 } else {
309 return Err(LexError {
310 msg: "unexpected '?'".into(),
311 offset: start,
312 });
313 }
314 }
315
316 b'"' => return self.lex_string(start),
317
318 b'0' if self.peek() == b'x' || self.peek() == b'X' => {
319 self.advance(); return self.lex_hex(start);
321 }
322
323 b'0' if self.peek() == b'b' || self.peek() == b'B' => {
324 self.advance(); return self.lex_binary(start);
326 }
327
328 ch if ch.is_ascii_digit() => {
329 return self.lex_decimal(start);
330 }
331
332 ch if ch.is_ascii_alphabetic() || ch == b'_' => {
333 return self.lex_name(start);
334 }
335
336 _ => {
337 return Err(LexError {
338 msg: format!("unexpected character: {:?}", ch as char),
339 offset: start,
340 });
341 }
342 };
343
344 self.tokens.push(Token {
345 kind,
346 span: Span::new(start as u32, (self.pos - start) as u32),
347 });
348 Ok(())
349 }
350
351 fn lex_decimal(&mut self, start: usize) -> Result<(), LexError> {
352 while self.pos < self.source.len() && (self.peek().is_ascii_digit() || self.peek() == b'_')
353 {
354 self.advance();
355 }
356 let text: String = self.source[start..self.pos]
357 .iter()
358 .filter(|&&b| b != b'_')
359 .map(|&b| b as char)
360 .collect();
361 let value = text.parse::<i64>().map_err(|_| LexError {
362 msg: format!("invalid integer literal: {text}"),
363 offset: start,
364 })?;
365 self.tokens.push(Token {
366 kind: TokenKind::Integer(value),
367 span: Span::new(start as u32, (self.pos - start) as u32),
368 });
369 Ok(())
370 }
371
372 fn lex_hex(&mut self, start: usize) -> Result<(), LexError> {
373 if self.pos >= self.source.len() || !self.peek().is_ascii_hexdigit() {
374 return Err(LexError {
375 msg: "expected hex digit after 0x".into(),
376 offset: start,
377 });
378 }
379 while self.pos < self.source.len()
380 && (self.peek().is_ascii_hexdigit() || self.peek() == b'_')
381 {
382 self.advance();
383 }
384 let text: String = self.source[start + 2..self.pos]
386 .iter()
387 .filter(|&&b| b != b'_')
388 .map(|&b| b as char)
389 .collect();
390 let value = i64::from_str_radix(&text, 16).map_err(|_| LexError {
391 msg: format!("invalid hex literal: 0x{text}"),
392 offset: start,
393 })?;
394 self.tokens.push(Token {
395 kind: TokenKind::Integer(value),
396 span: Span::new(start as u32, (self.pos - start) as u32),
397 });
398 Ok(())
399 }
400
401 fn lex_binary(&mut self, start: usize) -> Result<(), LexError> {
402 if self.pos >= self.source.len() || (self.peek() != b'0' && self.peek() != b'1') {
403 return Err(LexError {
404 msg: "expected binary digit after 0b".into(),
405 offset: start,
406 });
407 }
408 while self.pos < self.source.len()
409 && (self.peek() == b'0' || self.peek() == b'1' || self.peek() == b'_')
410 {
411 self.advance();
412 }
413 let text: String = self.source[start + 2..self.pos]
414 .iter()
415 .filter(|&&b| b != b'_')
416 .map(|&b| b as char)
417 .collect();
418 let value = i64::from_str_radix(&text, 2).map_err(|_| LexError {
419 msg: format!("invalid binary literal: 0b{text}"),
420 offset: start,
421 })?;
422 self.tokens.push(Token {
423 kind: TokenKind::Integer(value),
424 span: Span::new(start as u32, (self.pos - start) as u32),
425 });
426 Ok(())
427 }
428
429 fn lex_string(&mut self, start: usize) -> Result<(), LexError> {
430 let mut value = String::new();
431 loop {
432 if self.pos >= self.source.len() {
433 return Err(LexError {
434 msg: "unterminated string literal".into(),
435 offset: start,
436 });
437 }
438 let ch = self.advance();
439 match ch {
440 b'"' => break,
441 b'\\' => {
442 if self.pos >= self.source.len() {
443 return Err(LexError {
444 msg: "unterminated escape in string".into(),
445 offset: start,
446 });
447 }
448 let esc = self.advance();
449 match esc {
450 b'n' => value.push('\n'),
451 b't' => value.push('\t'),
452 b'\\' => value.push('\\'),
453 b'"' => value.push('"'),
454 _ => {
455 return Err(LexError {
456 msg: format!("unknown escape: \\{}", esc as char),
457 offset: self.pos - 1,
458 });
459 }
460 }
461 }
462 _ => value.push(ch as char),
463 }
464 }
465 self.tokens.push(Token {
466 kind: TokenKind::StringLit(value),
467 span: Span::new(start as u32, (self.pos - start) as u32),
468 });
469 Ok(())
470 }
471
472 fn lex_name(&mut self, start: usize) -> Result<(), LexError> {
473 while self.pos < self.source.len()
474 && (self.peek().is_ascii_alphanumeric() || self.peek() == b'_')
475 {
476 self.advance();
477 }
478 let text = std::str::from_utf8(&self.source[start..self.pos])
479 .expect("identifier bytes must be valid UTF-8");
480 let kind = match text {
481 "module" => TokenKind::Module,
482 "import" => TokenKind::Import,
483 "const" => TokenKind::Const,
484 "enum" => TokenKind::Enum,
485 "flags" => TokenKind::Flags,
486 "type" => TokenKind::Type,
487 "packet" => TokenKind::Packet,
488 "frame" => TokenKind::Frame,
489 "capsule" => TokenKind::Capsule,
490 "state" => TokenKind::State,
491 "machine" => TokenKind::Machine,
492 "transition" => TokenKind::Transition,
493 "initial" => TokenKind::Initial,
494 "terminal" => TokenKind::Terminal,
495 "on" => TokenKind::On,
496 "guard" => TokenKind::Guard,
497 "action" => TokenKind::Action,
498 "delegate" => TokenKind::Delegate,
499 "match" => TokenKind::Match,
500 "if" => TokenKind::If,
501 "let" => TokenKind::Let,
502 "require" => TokenKind::Require,
503 "static_assert" => TokenKind::StaticAssert,
504 "within" => TokenKind::Within,
505 "export" => TokenKind::Export,
506 "varint" => TokenKind::Varint,
507 "bytes" => TokenKind::Bytes,
508 "bits" => TokenKind::Bits,
509 "bit" => TokenKind::Bit,
510 "fill" => TokenKind::Fill,
511 "remaining" => TokenKind::Remaining,
512 "true" => TokenKind::True,
513 "false" => TokenKind::False,
514 "null" => TokenKind::Null,
515 "and" => TokenKind::And,
516 "or" => TokenKind::Or,
517 "not" => TokenKind::Not,
518 "in_state" => TokenKind::InState,
519 "all" => TokenKind::All,
520 _ => TokenKind::Name(text.to_string()),
521 };
522 self.tokens.push(Token {
523 kind,
524 span: Span::new(start as u32, (self.pos - start) as u32),
525 });
526 Ok(())
527 }
528}
529
/// Error reported when the lexer cannot turn the input into tokens.
#[derive(Debug, Clone)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub msg: String,
    /// Byte offset into the source that the lexer associates with the error.
    pub offset: usize,
}
535
536impl std::fmt::Display for LexError {
537 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
538 write!(f, "lex error at offset {}: {}", self.offset, self.msg)
539 }
540}
541
542impl std::error::Error for LexError {}
543
#[cfg(test)]
mod tests {
    use super::Lexer;
    use super::TokenKind as K;

    /// Lexes `src`, panicking on any lex error, and keeps only the kinds.
    fn tok_kinds(src: &str) -> Vec<K> {
        let mut kinds = Vec::new();
        for token in Lexer::new(src).tokenize().unwrap() {
            kinds.push(token.kind);
        }
        kinds
    }

    /// Asserts that `src` lexes to exactly `expected` followed by `Eof`.
    fn assert_lexes(src: &str, mut expected: Vec<K>) {
        expected.push(K::Eof);
        assert_eq!(tok_kinds(src), expected);
    }

    #[test]
    fn keywords() {
        assert_lexes(
            "packet frame capsule type match",
            vec![K::Packet, K::Frame, K::Capsule, K::Type, K::Match],
        );
    }

    #[test]
    fn integers() {
        assert_lexes(
            "42 0xFF 0b1010",
            vec![K::Integer(42), K::Integer(0xFF), K::Integer(0b1010)],
        );
    }

    #[test]
    fn operators() {
        assert_lexes(
            "+ - * / & | ^ << >> == != <= >= ?? => -> <- ..=",
            vec![
                K::Plus,
                K::Minus,
                K::Star,
                K::Slash,
                K::Amp,
                K::Pipe,
                K::Caret,
                K::Shl,
                K::Shr,
                K::EqEq,
                K::BangEq,
                K::Le,
                K::Ge,
                K::QuestionQuestion,
                K::FatArrow,
                K::Arrow,
                K::LArrow,
                K::DotDotEq,
            ],
        );
    }

    #[test]
    fn string_literal() {
        assert_lexes(
            r#""hello world""#,
            vec![K::StringLit("hello world".to_string())],
        );
    }

    #[test]
    fn comments() {
        // Both `#` and `//` start a comment that runs to the end of the line.
        assert_lexes(
            "packet # comment\nframe // also comment\ncapsule",
            vec![K::Packet, K::Frame, K::Capsule],
        );
    }

    #[test]
    fn name_and_reserved() {
        assert_lexes(
            "src dst fill remaining in_state all true false null",
            vec![
                K::Name("src".into()),
                K::Name("dst".into()),
                K::Fill,
                K::Remaining,
                K::InState,
                K::All,
                K::True,
                K::False,
                K::Null,
            ],
        );
    }

    #[test]
    fn hex_underscore() {
        assert_lexes("0xFF_FF", vec![K::Integer(0xFFFF)]);
    }

    #[test]
    fn binary_underscore() {
        assert_lexes("0b1010_0101", vec![K::Integer(0b10100101)]);
    }

    #[test]
    fn decimal_underscore() {
        assert_lexes("1_000_000", vec![K::Integer(1000000)]);
    }

    #[test]
    fn empty_string() {
        assert_lexes(r#""""#, vec![K::StringLit("".to_string())]);
    }

    #[test]
    fn string_escapes() {
        assert_lexes(
            r#""\n\t\\\"" "#,
            vec![K::StringLit("\n\t\\\"".to_string())],
        );
    }

    #[test]
    fn consecutive_operators() {
        // Greedy matching: `>>` is taken first, leaving `>=`.
        assert_lexes(">>>=", vec![K::Shr, K::Ge]);
    }
}