1use crate::types::Span;
4use logos::Logos;
5
/// A token paired with the byte span of input it was lexed from.
pub type SpannedToken = (Token, Span);
8
/// Error produced when the input contains a character sequence that does not
/// form any valid token.
#[derive(Debug, Clone, PartialEq)]
pub struct LexError {
    /// Human-readable description (e.g. `unexpected character '…'`).
    pub message: String,
    /// Byte range of the offending input.
    pub span: Span,
}
15
/// All lexical tokens, produced by the `logos`-derived lexer.
///
/// Whitespace and `//` line comments are skipped. The string/bytes regexes
/// match only the opening delimiter; their callbacks consume the rest of the
/// literal from the lexer's remainder and decode it.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\n\r\f]+")]
#[logos(skip r"//[^\n]*")]
pub enum Token {
    // Unsigned integers: hex or decimal, with a mandatory `u`/`U` suffix.
    // Priority 4 keeps the suffixed decimal form ahead of `Int`.
    #[regex(r"0[xX][0-9a-fA-F]+[uU]", lex_hex_uint)]
    #[regex(r"[0-9]+[uU]", lex_decimal_uint, priority = 4)]
    UInt(u64),

    // Signed integers: hex (`0x1F`) or plain decimal.
    #[regex(r"0[xX][0-9a-fA-F]+", lex_hex_int, priority = 3)]
    #[regex(r"[0-9]+", lex_decimal_int, priority = 1)]
    Int(i64),

    // Exactly 2^63, which overflows `i64` on its own but is valid as the
    // operand of unary minus (it is `|i64::MIN|`). Carried as text so the
    // parser can decide.
    // NOTE(review): other out-of-range decimals make `lex_decimal_int`
    // return `None` and surface as lex errors, not `IntOverflow` — confirm
    // that is the intended behavior.
    #[token("9223372036854775808", |_| "9223372036854775808".to_string(), priority = 2)]
    IntOverflow(String),

    // Floats: `1.5`, `1e10`, and leading-dot `.5` forms, each with an
    // optional exponent. Priorities keep these ahead of `Int` and `Dot`.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?", lex_float, priority = 5)]
    #[regex(r"[0-9]+[eE][+-]?[0-9]+", lex_float, priority = 2)]
    #[regex(r"\.[0-9]+([eE][+-]?[0-9]+)?", lex_float, priority = 6)]
    Float(f64),

    // Strings: triple-quoted, raw (`r`/`R` prefix), and plain forms, with
    // `"` or `'` delimiters. Each regex matches only the opening quote(s).
    #[regex(r#"""""#, lex_triple_double_string)]
    #[regex(r"'''", lex_triple_single_string)]
    #[regex(r#"[rR]""""#, lex_raw_triple_double_string)]
    #[regex(r"[rR]'''", lex_raw_triple_single_string)]
    #[regex(r#"[rR]""#, lex_raw_double_string)]
    #[regex(r"[rR]'", lex_raw_single_string)]
    #[regex(r#"""#, lex_double_string)]
    #[regex(r"'", lex_single_string)]
    String(String),

    // Bytes literals: `b`/`B` prefix, optionally combined with `r`/`R` in
    // either order, in plain or triple-quoted form.
    // NOTE(review): non-raw bytes reuse the string unescaper, so `\xFF`
    // yields the UTF-8 encoding of U+00FF (two bytes) rather than the single
    // byte 0xFF — confirm against the intended literal semantics.
    #[regex(r#"[bB][rR]""""#, lex_raw_bytes_triple_double)]
    #[regex(r"[bB][rR]'''", lex_raw_bytes_triple_single)]
    #[regex(r#"[rR][bB]""""#, lex_raw_bytes_triple_double)]
    #[regex(r"[rR][bB]'''", lex_raw_bytes_triple_single)]
    #[regex(r#"[bB]""""#, lex_bytes_triple_double)]
    #[regex(r"[bB]'''", lex_bytes_triple_single)]
    #[regex(r#"[bB][rR]""#, lex_raw_bytes_double)]
    #[regex(r"[bB][rR]'", lex_raw_bytes_single)]
    #[regex(r#"[rR][bB]""#, lex_raw_bytes_double)]
    #[regex(r"[rR][bB]'", lex_raw_bytes_single)]
    #[regex(r#"[bB]""#, lex_bytes_double)]
    #[regex(r"[bB]'", lex_bytes_single)]
    Bytes(Vec<u8>),

    // Keyword literals and the membership operator.
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("null")]
    Null,
    #[token("in")]
    In,

    // Reserved words: not usable as identifiers; carried as their text so
    // error messages can show which word was used.
    #[token("as", |_| "as".to_string())]
    #[token("break", |_| "break".to_string())]
    #[token("const", |_| "const".to_string())]
    #[token("continue", |_| "continue".to_string())]
    #[token("else", |_| "else".to_string())]
    #[token("for", |_| "for".to_string())]
    #[token("function", |_| "function".to_string())]
    #[token("if", |_| "if".to_string())]
    #[token("import", |_| "import".to_string())]
    #[token("let", |_| "let".to_string())]
    #[token("loop", |_| "loop".to_string())]
    #[token("package", |_| "package".to_string())]
    #[token("namespace", |_| "namespace".to_string())]
    #[token("return", |_| "return".to_string())]
    #[token("var", |_| "var".to_string())]
    #[token("void", |_| "void".to_string())]
    #[token("while", |_| "while".to_string())]
    Reserved(String),

    // Identifiers: priority 0 so keywords and reserved words win; the
    // backtick-quoted form admits otherwise-reserved names.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string(), priority = 0)]
    #[regex(r"`[^`]+`", lex_backtick_ident)]
    Ident(String),

    // Comparison and logical operators.
    #[token("==")]
    EqEq,
    #[token("!=")]
    Ne,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token("&&")]
    And,
    #[token("||")]
    Or,

    // Arithmetic and single-character operators.
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("!")]
    Not,
    #[token("?")]
    Question,
    #[token(":")]
    Colon,

    // Delimiters.
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(".")]
    Dot,
    #[token(",")]
    Comma,
}
172
173impl std::fmt::Display for Token {
174 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
175 match self {
176 Token::Int(n) => write!(f, "{}", n),
177 Token::IntOverflow(s) => write!(f, "{}", s),
178 Token::UInt(n) => write!(f, "{}u", n),
179 Token::Float(n) => write!(f, "{}", n),
180 Token::String(s) => write!(f, "\"{}\"", s),
181 Token::Bytes(b) => write!(f, "b\"{}\"", String::from_utf8_lossy(b)),
182 Token::True => write!(f, "true"),
183 Token::False => write!(f, "false"),
184 Token::Null => write!(f, "null"),
185 Token::In => write!(f, "in"),
186 Token::Reserved(s) => write!(f, "{}", s),
187 Token::Ident(s) => write!(f, "{}", s),
188 Token::Plus => write!(f, "+"),
189 Token::Minus => write!(f, "-"),
190 Token::Star => write!(f, "*"),
191 Token::Slash => write!(f, "/"),
192 Token::Percent => write!(f, "%"),
193 Token::EqEq => write!(f, "=="),
194 Token::Ne => write!(f, "!="),
195 Token::Lt => write!(f, "<"),
196 Token::Le => write!(f, "<="),
197 Token::Gt => write!(f, ">"),
198 Token::Ge => write!(f, ">="),
199 Token::And => write!(f, "&&"),
200 Token::Or => write!(f, "||"),
201 Token::Not => write!(f, "!"),
202 Token::Question => write!(f, "?"),
203 Token::Colon => write!(f, ":"),
204 Token::LParen => write!(f, "("),
205 Token::RParen => write!(f, ")"),
206 Token::LBracket => write!(f, "["),
207 Token::RBracket => write!(f, "]"),
208 Token::LBrace => write!(f, "{{"),
209 Token::RBrace => write!(f, "}}"),
210 Token::Dot => write!(f, "."),
211 Token::Comma => write!(f, ","),
212 }
213 }
214}
215
216fn lex_decimal_int(lex: &mut logos::Lexer<Token>) -> Option<i64> {
219 lex.slice().parse().ok()
220}
221
222fn lex_decimal_uint(lex: &mut logos::Lexer<Token>) -> Option<u64> {
223 let s = lex.slice();
224 s[..s.len() - 1].parse().ok() }
226
227fn lex_hex_int(lex: &mut logos::Lexer<Token>) -> Option<i64> {
228 let s = lex.slice();
229 i64::from_str_radix(&s[2..], 16).ok() }
231
232fn lex_hex_uint(lex: &mut logos::Lexer<Token>) -> Option<u64> {
233 let s = lex.slice();
234 u64::from_str_radix(&s[2..s.len() - 1], 16).ok() }
236
237fn lex_float(lex: &mut logos::Lexer<Token>) -> Option<f64> {
238 lex.slice().parse().ok()
239}
240
241fn lex_backtick_ident(lex: &mut logos::Lexer<Token>) -> String {
242 let s = lex.slice();
243 s[1..s.len() - 1].to_string() }
245
/// Callback for a string opened with `"`; the opening quote was consumed by
/// the regex, so only the body and closing quote remain in the remainder.
fn lex_double_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_quoted_string(lex, '"')
}

/// Callback for a string opened with `'`.
fn lex_single_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_quoted_string(lex, '\'')
}
255
/// Consumes the body of a single- or double-quoted string from the lexer's
/// remainder (the opening quote has already been matched by the regex) and
/// interprets escape sequences.
///
/// Returns `None` — which logos reports as a lex error — on an unterminated
/// literal, a raw newline inside the literal, or a malformed escape.
fn lex_quoted_string(lex: &mut logos::Lexer<Token>, quote: char) -> Option<String> {
    let remainder = lex.remainder();
    let mut chars = remainder.chars().peekable();
    let mut result = std::string::String::new();
    // Bytes consumed from the remainder; handed to `bump` so the token's
    // span covers the whole literal including the closing quote.
    let mut consumed = 0;

    while let Some(c) = chars.next() {
        consumed += c.len_utf8();
        if c == quote {
            lex.bump(consumed);
            return Some(result);
        } else if c == '\\' {
            let escape_char = chars.next()?;
            consumed += escape_char.len_utf8();
            match escape_char {
                '\\' => result.push('\\'),
                '/' => result.push('/'),
                '"' => result.push('"'),
                '\'' => result.push('\''),
                '`' => result.push('`'),
                'a' => result.push('\x07'), // bell
                'b' => result.push('\x08'), // backspace
                'f' => result.push('\x0C'), // form feed
                'n' => result.push('\n'),
                'r' => result.push('\r'),
                't' => result.push('\t'),
                'v' => result.push('\x0B'), // vertical tab
                '?' => result.push('?'),
                'x' | 'X' => {
                    // \xHH: two hex digits, pushed as code point U+00HH.
                    // NOTE(review): bytes literals reuse this function, so
                    // \x80..\xFF become the UTF-8 encoding of the code point
                    // (two bytes) rather than one raw byte — confirm intended.
                    let h1 = chars.next()?;
                    let h2 = chars.next()?;
                    consumed += h1.len_utf8() + h2.len_utf8();
                    let hex = format!("{}{}", h1, h2);
                    let val = u8::from_str_radix(&hex, 16).ok()?;
                    result.push(val as char);
                }
                'u' => {
                    // \uHHHH: exactly four hex digits.
                    let hex: String = chars.by_ref().take(4).collect();
                    consumed += hex.len();
                    if hex.len() != 4 {
                        return None;
                    }
                    let val = u32::from_str_radix(&hex, 16).ok()?;
                    // from_u32 rejects surrogate code points.
                    result.push(char::from_u32(val)?);
                }
                'U' => {
                    // \UHHHHHHHH: exactly eight hex digits.
                    let hex: String = chars.by_ref().take(8).collect();
                    consumed += hex.len();
                    if hex.len() != 8 {
                        return None;
                    }
                    let val = u32::from_str_radix(&hex, 16).ok()?;
                    result.push(char::from_u32(val)?);
                }
                c @ '0'..='3' => {
                    // \OOO: three octal digits; leading 0-3 caps the value
                    // at \377 (= 255) so it always fits in a u8.
                    let d2 = chars.next()?;
                    let d3 = chars.next()?;
                    consumed += d2.len_utf8() + d3.len_utf8();
                    if !matches!(d2, '0'..='7') || !matches!(d3, '0'..='7') {
                        return None;
                    }
                    let octal = format!("{}{}{}", c, d2, d3);
                    let val = u8::from_str_radix(&octal, 8).ok()?;
                    result.push(val as char);
                }
                _ => return None, // unknown escape character
            }
        } else if c == '\n' {
            // Raw newlines are not allowed inside single-line literals.
            return None;
        } else {
            result.push(c);
        }
    }

    // Ran out of input before finding the closing quote.
    None
}
337
/// Callback for a raw string opened with `r"`/`R"`; no escape processing.
fn lex_raw_double_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_raw_string(lex, '"')
}

/// Callback for a raw string opened with `r'`/`R'`.
fn lex_raw_single_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_raw_string(lex, '\'')
}
345
346fn lex_raw_string(lex: &mut logos::Lexer<Token>, quote: char) -> Option<String> {
347 let remainder = lex.remainder();
348 let mut result = std::string::String::new();
349 let mut consumed = 0;
350
351 for c in remainder.chars() {
352 consumed += c.len_utf8();
353 if c == quote {
354 lex.bump(consumed);
355 return Some(result);
356 }
357 result.push(c);
358 }
359
360 None }
362
/// Callback for a string opened with `"""`.
/// NOTE(review): this delegates to `lex_triple_string`, which does no escape
/// processing — so non-raw triple-quoted strings behave like raw ones here.
/// Confirm that matches the intended language spec.
fn lex_triple_double_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_triple_string(lex, "\"\"\"")
}

/// Callback for a string opened with `'''` (same caveat as above).
fn lex_triple_single_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_triple_string(lex, "'''")
}
370
371fn lex_triple_string(lex: &mut logos::Lexer<Token>, end_quote: &str) -> Option<String> {
372 let remainder = lex.remainder();
373
374 if let Some(end_pos) = remainder.find(end_quote) {
375 let content = &remainder[..end_pos];
376 lex.bump(end_pos + end_quote.len());
377 Some(content.to_string())
378 } else {
379 None }
381}
382
/// Callback for a raw string opened with `r"""`/`R"""`.
fn lex_raw_triple_double_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_triple_string(lex, "\"\"\"")
}

/// Callback for a raw string opened with `r'''`/`R'''`.
fn lex_raw_triple_single_string(lex: &mut logos::Lexer<Token>) -> Option<String> {
    lex_triple_string(lex, "'''")
}
390
391fn lex_bytes_double(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
394 lex_quoted_string(lex, '"').map(|s| s.into_bytes())
395}
396
397fn lex_bytes_single(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
398 lex_quoted_string(lex, '\'').map(|s| s.into_bytes())
399}
400
401fn lex_bytes_triple_double(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
402 lex_triple_string(lex, "\"\"\"").map(|s| s.into_bytes())
403}
404
405fn lex_bytes_triple_single(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
406 lex_triple_string(lex, "'''").map(|s| s.into_bytes())
407}
408
409fn lex_raw_bytes_double(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
410 lex_raw_string(lex, '"').map(|s| s.into_bytes())
411}
412
413fn lex_raw_bytes_single(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
414 lex_raw_string(lex, '\'').map(|s| s.into_bytes())
415}
416
417fn lex_raw_bytes_triple_double(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
418 lex_triple_string(lex, "\"\"\"").map(|s| s.into_bytes())
419}
420
421fn lex_raw_bytes_triple_single(lex: &mut logos::Lexer<Token>) -> Option<Vec<u8>> {
422 lex_triple_string(lex, "'''").map(|s| s.into_bytes())
423}
424
425pub fn lex(input: &str) -> Result<Vec<SpannedToken>, LexError> {
429 let mut tokens = Vec::new();
430 let mut lexer = Token::lexer(input);
431
432 while let Some(result) = lexer.next() {
433 let span = lexer.span();
434 match result {
435 Ok(token) => tokens.push((token, span)),
436 Err(_) => {
437 return Err(LexError {
438 message: format!("unexpected character '{}'", &input[span.clone()]),
439 span,
440 })
441 }
442 }
443 }
444
445 Ok(tokens)
446}
447
// Unit tests exercising the lexer end-to-end through `lex`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` and strips the spans, panicking on any lex error.
    fn lex_tokens(input: &str) -> Vec<Token> {
        lex(input)
            .unwrap()
            .into_iter()
            .map(|(tok, _)| tok)
            .collect()
    }

    #[test]
    fn lex_integers() {
        assert_eq!(lex_tokens("123"), vec![Token::Int(123)]);
        assert_eq!(lex_tokens("0"), vec![Token::Int(0)]);
        assert_eq!(lex_tokens("0x1F"), vec![Token::Int(31)]);
        assert_eq!(lex_tokens("0XAB"), vec![Token::Int(171)]);
    }

    #[test]
    fn lex_unsigned_integers() {
        assert_eq!(lex_tokens("123u"), vec![Token::UInt(123)]);
        assert_eq!(lex_tokens("123U"), vec![Token::UInt(123)]);
        assert_eq!(lex_tokens("0x1Fu"), vec![Token::UInt(31)]);
    }

    #[test]
    fn lex_floats() {
        assert_eq!(lex_tokens("1.5"), vec![Token::Float(1.5)]);
        assert_eq!(lex_tokens("1e10"), vec![Token::Float(1e10)]);
        assert_eq!(lex_tokens("1.5e-3"), vec![Token::Float(1.5e-3)]);
    }

    #[test]
    fn lex_strings() {
        assert_eq!(
            lex_tokens(r#""hello""#),
            vec![Token::String("hello".to_string())]
        );
        assert_eq!(
            lex_tokens("'world'"),
            vec![Token::String("world".to_string())]
        );
        // Escape sequences are decoded in non-raw strings.
        assert_eq!(
            lex_tokens(r#""hello\nworld""#),
            vec![Token::String("hello\nworld".to_string())]
        );
    }

    #[test]
    fn lex_raw_strings() {
        // Raw strings keep the backslash sequences verbatim.
        assert_eq!(
            lex_tokens(r#"r"hello\n""#),
            vec![Token::String(r"hello\n".to_string())]
        );
        assert_eq!(
            lex_tokens(r"r'hello\n'"),
            vec![Token::String(r"hello\n".to_string())]
        );
    }

    #[test]
    fn lex_triple_strings() {
        // Triple-quoted strings may span multiple lines.
        assert_eq!(
            lex_tokens(
                r#""""multi
line""""#
            ),
            vec![Token::String("multi\nline".to_string())]
        );
    }

    #[test]
    fn lex_bytes() {
        assert_eq!(
            lex_tokens(r#"b"hello""#),
            vec![Token::Bytes(b"hello".to_vec())]
        );
        assert_eq!(
            lex_tokens("b'world'"),
            vec![Token::Bytes(b"world".to_vec())]
        );
    }

    #[test]
    fn lex_keywords() {
        assert_eq!(lex_tokens("true"), vec![Token::True]);
        assert_eq!(lex_tokens("false"), vec![Token::False]);
        assert_eq!(lex_tokens("null"), vec![Token::Null]);
        assert_eq!(lex_tokens("in"), vec![Token::In]);
    }

    #[test]
    fn lex_identifiers() {
        assert_eq!(
            lex_tokens("foo"),
            vec![Token::Ident("foo".to_string())]
        );
        assert_eq!(
            lex_tokens("_bar"),
            vec![Token::Ident("_bar".to_string())]
        );
        assert_eq!(
            lex_tokens("baz123"),
            vec![Token::Ident("baz123".to_string())]
        );
    }

    #[test]
    fn lex_operators() {
        assert_eq!(
            lex_tokens("+ - * / %"),
            vec![
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Percent
            ]
        );
        assert_eq!(
            lex_tokens("== != < <= > >="),
            vec![
                Token::EqEq,
                Token::Ne,
                Token::Lt,
                Token::Le,
                Token::Gt,
                Token::Ge
            ]
        );
        assert_eq!(
            lex_tokens("&& || !"),
            vec![Token::And, Token::Or, Token::Not]
        );
        assert_eq!(lex_tokens("? :"), vec![Token::Question, Token::Colon]);
    }

    #[test]
    fn lex_delimiters() {
        assert_eq!(
            lex_tokens("( ) [ ] { } . ,"),
            vec![
                Token::LParen,
                Token::RParen,
                Token::LBracket,
                Token::RBracket,
                Token::LBrace,
                Token::RBrace,
                Token::Dot,
                Token::Comma
            ]
        );
    }

    #[test]
    fn lex_expression() {
        assert_eq!(
            lex_tokens("a + b * 2"),
            vec![
                Token::Ident("a".to_string()),
                Token::Plus,
                Token::Ident("b".to_string()),
                Token::Star,
                Token::Int(2)
            ]
        );
    }

    #[test]
    fn lex_with_comments() {
        // `//` comments are skipped through end of line.
        assert_eq!(
            lex_tokens("a // comment\n+ b"),
            vec![
                Token::Ident("a".to_string()),
                Token::Plus,
                Token::Ident("b".to_string())
            ]
        );
    }

    #[test]
    fn lex_unicode_escapes() {
        assert_eq!(
            lex_tokens(r#""\u0041""#),
            vec![Token::String("A".to_string())]
        );
        assert_eq!(
            lex_tokens(r#""\u03B1""#),
            vec![Token::String("α".to_string())]
        );
        assert_eq!(
            lex_tokens(r#""\U00000041""#),
            vec![Token::String("A".to_string())]
        );
        assert_eq!(
            lex_tokens(r#""\U0001F600""#),
            vec![Token::String("😀".to_string())]
        );
    }

    #[test]
    fn lex_octal_escapes() {
        assert_eq!(
            lex_tokens(r#""\101""#),
            vec![Token::String("A".to_string())]
        );
        assert_eq!(
            lex_tokens(r#""\000""#),
            vec![Token::String("\0".to_string())]
        );
        // \377 is the maximum octal escape (code point U+00FF).
        assert_eq!(
            lex_tokens(r#""\377""#),
            vec![Token::String("\u{FF}".to_string())]
        );
    }

    #[test]
    fn lex_reserved_words() {
        assert_eq!(
            lex_tokens("if"),
            vec![Token::Reserved("if".to_string())]
        );
        assert_eq!(
            lex_tokens("else"),
            vec![Token::Reserved("else".to_string())]
        );
        assert_eq!(
            lex_tokens("for"),
            vec![Token::Reserved("for".to_string())]
        );
        assert_eq!(
            lex_tokens("while"),
            vec![Token::Reserved("while".to_string())]
        );
        assert_eq!(
            lex_tokens("return"),
            vec![Token::Reserved("return".to_string())]
        );
        assert_eq!(
            lex_tokens("let"),
            vec![Token::Reserved("let".to_string())]
        );
        assert_eq!(
            lex_tokens("const"),
            vec![Token::Reserved("const".to_string())]
        );
        assert_eq!(
            lex_tokens("var"),
            vec![Token::Reserved("var".to_string())]
        );
        assert_eq!(
            lex_tokens("function"),
            vec![Token::Reserved("function".to_string())]
        );
        assert_eq!(
            lex_tokens("namespace"),
            vec![Token::Reserved("namespace".to_string())]
        );
    }

    #[test]
    fn lex_integer_overflow() {
        // i64::MAX still lexes as a plain Int.
        assert_eq!(
            lex_tokens("9223372036854775807"),
            vec![Token::Int(9223372036854775807)]
        );

        // 2^63 (= |i64::MIN|) is tokenized as IntOverflow so the parser can
        // accept it under unary minus.
        assert_eq!(
            lex_tokens("9223372036854775808"),
            vec![Token::IntOverflow("9223372036854775808".to_string())]
        );

        assert_eq!(
            lex_tokens("1000000000000000000"),
            vec![Token::Int(1000000000000000000)]
        );
    }
}