1use sema_core::{SemaError, Span};
2
/// One segment of an f-string literal: either raw literal text or the
/// source text of a `${...}` interpolation (stored unparsed).
#[derive(Debug, Clone, PartialEq)]
pub enum FStringPart {
    /// Literal text between interpolations, with escapes already processed.
    Literal(String),
    /// Source text of a `${...}` interpolation, trimmed of surrounding
    /// whitespace; parsing the expression is left to the reader.
    Expr(String),
}
8
/// Lexical tokens produced by `tokenize`. Trivia (comments and newlines)
/// is represented as tokens rather than discarded.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `'` quote shorthand.
    Quote,
    /// `` ` `` quasiquote shorthand.
    Quasiquote,
    /// `,` unquote shorthand.
    Unquote,
    /// `,@` unquote-splicing shorthand.
    UnquoteSplice,
    /// Integer literal, e.g. `42` or `-7`.
    Int(i64),
    /// Float literal, e.g. `3.14`.
    Float(f64),
    /// String literal; escape sequences have already been decoded.
    String(String),
    /// f-string literal (`f"..."`) split into literal and `${...}` parts.
    FString(Vec<FStringPart>),
    /// `#(` — opens a short-lambda form.
    ShortLambdaStart,
    /// Identifier/symbol (also used for `nil`).
    Symbol(String),
    /// `:name` keyword; payload excludes the leading `:`.
    Keyword(String),
    /// Boolean from `#t`/`#f` or `true`/`false`.
    Bool(bool),
    /// Character literal `#\...` (named, e.g. `#\space`, or a single char).
    Char(char),
    /// `#u8(` — opens a bytevector literal.
    BytevectorStart,
    /// Standalone `.` symbol (dotted syntax).
    Dot,
    /// `;` line comment; payload includes the leading `;` but not the newline.
    Comment(String),
    /// A literal newline in the source, preserved as trivia.
    Newline,
    /// `#"..."` regex literal; payload is the raw pattern (only `\"` is unescaped).
    Regex(String),
}
36
/// A token together with its source location, both as a 1-based
/// line/column `Span` and as a half-open byte range into the input.
#[derive(Debug, Clone)]
pub struct SpannedToken {
    pub token: Token,
    /// Line/column span covering the token's source text (1-based).
    pub span: Span,
    /// Byte offset of the token's first byte in the original input.
    pub byte_start: usize,
    /// Byte offset one past the token's last byte (half-open: `byte_start..byte_end`).
    pub byte_end: usize,
}
46
/// Tokenize `input` into a flat list of [`SpannedToken`]s.
///
/// Trivia (line comments and newlines) is emitted as tokens so callers can
/// reconstruct source layout. Each token carries a line/column span and a
/// half-open byte range into `input`.
///
/// # Errors
/// Returns `SemaError::Reader` on malformed input: unterminated strings,
/// regexes or f-strings, invalid `#` forms, unknown character names,
/// malformed escapes, and invalid numeric literals.
pub fn tokenize(input: &str) -> Result<Vec<SpannedToken>, SemaError> {
    let mut tokens = Vec::new();
    let chars: Vec<char> = input.chars().collect();
    // byte_offsets[k] is the byte offset of chars[k] in `input`; one extra
    // trailing entry holds the total byte length so byte_offsets[i + 1] is
    // always valid for any consumed char index i.
    let byte_offsets: Vec<usize> = {
        let mut offsets = Vec::with_capacity(chars.len() + 1);
        let mut pos = 0;
        for c in &chars {
            offsets.push(pos);
            pos += c.len_utf8();
        }
        offsets.push(pos);
        offsets
    };
    // Scanner state: i indexes `chars`; line/col are 1-based and track the
    // position of chars[i].
    let mut i = 0;
    let mut line = 1;
    let mut col = 1;

    while i < chars.len() {
        let ch = chars[i];
        // Start position of whatever token begins here.
        let span = Span::point(line, col);

        match ch {
            // Horizontal whitespace: skipped, no token emitted.
            ' ' | '\t' | '\r' => {
                col += 1;
                i += 1;
            }
            // Newlines are preserved as trivia tokens.
            '\n' => {
                tokens.push(SpannedToken {
                    token: Token::Newline,
                    span: span.with_end(line, col + 1),
                    byte_start: byte_offsets[i],
                    byte_end: byte_offsets[i + 1],
                });
                line += 1;
                col = 1;
                i += 1;
            }

            // Line comment: runs to (but does not include) the newline.
            // The emitted text keeps the leading ';'.
            ';' => {
                let start = i;
                while i < chars.len() && chars[i] != '\n' {
                    i += 1;
                }
                let text: String = chars[start..i].iter().collect();
                let end_col = col + (i - start);
                tokens.push(SpannedToken {
                    token: Token::Comment(text),
                    span: span.with_end(line, end_col),
                    byte_start: byte_offsets[start],
                    byte_end: byte_offsets[i],
                });
                col = end_col;
            }

            // Single-character delimiters.
            '(' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::LParen,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            ')' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::RParen,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            '[' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::LBracket,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            ']' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::RBracket,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            '{' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::LBrace,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            '}' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::RBrace,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }

            // Reader-macro shorthands: ' ` , ,@
            '\'' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::Quote,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            '`' => {
                col += 1;
                i += 1;
                tokens.push(SpannedToken {
                    token: Token::Quasiquote,
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[i - 1],
                    byte_end: byte_offsets[i],
                });
            }
            ',' => {
                // ",@" is unquote-splice; a bare "," is unquote.
                if i + 1 < chars.len() && chars[i + 1] == '@' {
                    col += 2;
                    i += 2;
                    tokens.push(SpannedToken {
                        token: Token::UnquoteSplice,
                        span: span.with_end(line, col),
                        byte_start: byte_offsets[i - 2],
                        byte_end: byte_offsets[i],
                    });
                } else {
                    col += 1;
                    i += 1;
                    tokens.push(SpannedToken {
                        token: Token::Unquote,
                        span: span.with_end(line, col),
                        byte_start: byte_offsets[i - 1],
                        byte_end: byte_offsets[i],
                    });
                }
            }

            // String literal. Escapes are decoded by read_string_escape;
            // literal newlines are allowed and tracked for line/col.
            '"' => {
                let token_start = i;
                let mut s = String::new();
                i += 1;
                col += 1;
                while i < chars.len() && chars[i] != '"' {
                    if chars[i] == '\\' && i + 1 < chars.len() {
                        // Step onto the escape designator; the helper may
                        // consume further chars (\x...; \uXXXX \UXXXXXXXX).
                        i += 1;
                        col += 1;
                        read_string_escape(&chars, &mut i, &mut col, &mut s, span)?;
                    } else {
                        if chars[i] == '\n' {
                            line += 1;
                            // col becomes 1 after the shared increment below.
                            col = 0;
                        }
                        s.push(chars[i]);
                    }
                    i += 1;
                    col += 1;
                }
                if i >= chars.len() {
                    return Err(SemaError::Reader {
                        message: "unterminated string".to_string(),
                        span,
                    });
                }
                // Consume the closing quote.
                i += 1; col += 1;
                tokens.push(SpannedToken {
                    token: Token::String(s),
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[token_start],
                    byte_end: byte_offsets[i],
                });
            }

            // '#' dispatch: #t #f #\char #u8( #( #"regex" #!shebang
            '#' => {
                let token_start = i;
                if i + 1 < chars.len() {
                    match chars[i + 1] {
                        't' => {
                            i += 2;
                            col += 2;
                            tokens.push(SpannedToken {
                                token: Token::Bool(true),
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        'f' => {
                            i += 2;
                            col += 2;
                            tokens.push(SpannedToken {
                                token: Token::Bool(false),
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        // Character literal: #\x (single char) or a named
                        // character like #\space.
                        '\\' => {
                            i += 2; col += 2;
                            if i >= chars.len() {
                                return Err(SemaError::Reader {
                                    message: "unexpected end of input after #\\".to_string(),
                                    span,
                                });
                            }
                            let start = i;
                            if chars[i].is_alphabetic() {
                                // Alphabetic start: greedily read a name
                                // ("space", "newline", ... or a 1-char name).
                                while i < chars.len() && is_symbol_char(chars[i]) {
                                    i += 1;
                                    col += 1;
                                }
                            } else {
                                // Punctuation/digit: exactly one char, e.g. #\(.
                                i += 1;
                                col += 1;
                            }
                            let name: String = chars[start..i].iter().collect();
                            let c = match name.as_str() {
                                "space" => ' ',
                                "newline" => '\n',
                                "tab" => '\t',
                                "return" => '\r',
                                "nul" => '\0',
                                s if s.chars().count() == 1 => s.chars().next().unwrap(),
                                _ => {
                                    return Err(SemaError::Reader {
                                        message: format!("unknown character name: {name}"),
                                        span,
                                    });
                                }
                            };
                            tokens.push(SpannedToken {
                                token: Token::Char(c),
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        // Bytevector opener: exactly "#u8(".
                        'u' if i + 3 < chars.len()
                            && chars[i + 2] == '8'
                            && chars[i + 3] == '(' =>
                        {
                            i += 4;
                            col += 4;
                            tokens.push(SpannedToken {
                                token: Token::BytevectorStart,
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        // Short-lambda opener "#(" — the matching ')' is a
                        // normal RParen handled elsewhere.
                        '(' => {
                            i += 2; col += 2;
                            tokens.push(SpannedToken {
                                token: Token::ShortLambdaStart,
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        // Regex literal #"...". Only \" is unescaped; every
                        // other backslash is kept verbatim for the regex
                        // engine to interpret.
                        '"' => {
                            i += 2; col += 2;
                            let mut s = String::new();
                            while i < chars.len() && chars[i] != '"' {
                                if chars[i] == '\\' && i + 1 < chars.len() && chars[i + 1] == '"' {
                                    s.push('"');
                                    i += 2;
                                    col += 2;
                                } else {
                                    if chars[i] == '\n' {
                                        line += 1;
                                        col = 0;
                                    }
                                    s.push(chars[i]);
                                    i += 1;
                                    col += 1;
                                }
                            }
                            if i >= chars.len() {
                                return Err(SemaError::Reader {
                                    message: "unterminated regex literal".to_string(),
                                    span,
                                }
                                .with_hint(
                                    "add a closing `\"` to end the #\"...\" regex literal",
                                ));
                            }
                            // Consume the closing quote.
                            i += 1; col += 1;
                            tokens.push(SpannedToken {
                                token: Token::Regex(s),
                                span: span.with_end(line, col),
                                byte_start: byte_offsets[token_start],
                                byte_end: byte_offsets[i],
                            });
                        }
                        // Shebang (#!...) at the very start of the file:
                        // silently skipped, no token.
                        // NOTE(review): `col` is not advanced here; harmless
                        // in practice because the following '\n' resets it,
                        // but worth confirming if spans on line 1 matter.
                        '!' if line == 1 && col == 1 => {
                            while i < chars.len() && chars[i] != '\n' {
                                i += 1;
                            }
                        }
                        _ => {
                            return Err(SemaError::Reader {
                                message: format!(
                                    "unexpected character after #: '{}'",
                                    chars[i + 1]
                                ),
                                span,
                            });
                        }
                    }
                } else {
                    return Err(SemaError::Reader {
                        message: "unexpected end of input after `#`".to_string(),
                        span,
                    }
                    .with_hint("# starts a special form: #t, #f, #\\char, #u8(...)"));
                }
            }

            // Keyword: ':' followed by at least one symbol char.
            ':' => {
                let token_start = i;
                i += 1;
                col += 1;
                let start = i;
                while i < chars.len() && is_symbol_char(chars[i]) {
                    i += 1;
                    col += 1;
                }
                if i == start {
                    return Err(SemaError::Reader {
                        message: "expected keyword name after ':'".to_string(),
                        span,
                    });
                }
                let name: String = chars[start..i].iter().collect();
                tokens.push(SpannedToken {
                    token: Token::Keyword(name),
                    span: span.with_end(line, col),
                    byte_start: byte_offsets[token_start],
                    byte_end: byte_offsets[i],
                });
            }

            // Everything else: f-strings, numbers, symbols.
            _ => {
                // f-string: f"...${expr}...". Checked before the symbol path
                // so `f` followed by a quote is not lexed as a symbol.
                if ch == 'f' && i + 1 < chars.len() && chars[i + 1] == '"' {
                    let token_start = i;
                    // Skip the 'f' and the opening quote.
                    i += 1; col += 1;
                    i += 1; col += 1;
                    let mut parts: Vec<FStringPart> = Vec::new();
                    let mut current = String::new();

                    while i < chars.len() && chars[i] != '"' {
                        if chars[i] == '\\' && i + 1 < chars.len() {
                            i += 1;
                            col += 1;
                            read_string_escape(&chars, &mut i, &mut col, &mut current, span)?;
                        } else if chars[i] == '$' && i + 1 < chars.len() && chars[i + 1] == '{' {
                            // Flush accumulated literal text before the
                            // interpolation.
                            if !current.is_empty() {
                                parts.push(FStringPart::Literal(std::mem::take(&mut current)));
                            }
                            i += 2; col += 2;
                            // Collect the ${...} body; braces nest via depth.
                            let mut expr = String::new();
                            let mut depth = 1;
                            while i < chars.len() && depth > 0 {
                                if chars[i] == '{' {
                                    depth += 1;
                                } else if chars[i] == '}' {
                                    depth -= 1;
                                    if depth == 0 {
                                        // Leave i on the closing '}'; the
                                        // outer loop's increment skips it.
                                        break;
                                    }
                                }
                                if chars[i] == '\n' {
                                    line += 1;
                                    col = 0;
                                }
                                expr.push(chars[i]);
                                i += 1;
                                col += 1;
                            }
                            if depth != 0 {
                                return Err(SemaError::Reader {
                                    message: "unterminated interpolation in f-string".to_string(),
                                    span,
                                }
                                .with_hint("add a closing `}` to end the ${...} interpolation"));
                            }
                            let trimmed = expr.trim().to_string();
                            if trimmed.is_empty() {
                                return Err(SemaError::Reader {
                                    message: "empty interpolation in f-string".to_string(),
                                    span,
                                }
                                .with_hint("${} must contain an expression, e.g. ${name}"));
                            }
                            parts.push(FStringPart::Expr(trimmed));
                        } else {
                            if chars[i] == '\n' {
                                line += 1;
                                col = 0;
                            }
                            current.push(chars[i]);
                        }
                        i += 1;
                        col += 1;
                    }

                    if i >= chars.len() {
                        return Err(SemaError::Reader {
                            message: "unterminated f-string".to_string(),
                            span,
                        }
                        .with_hint("add a closing `\"` to end the f-string"));
                    }
                    // Consume the closing quote.
                    i += 1; col += 1;

                    // Flush any trailing literal text.
                    if !current.is_empty() {
                        parts.push(FStringPart::Literal(current));
                    }

                    tokens.push(SpannedToken {
                        token: Token::FString(parts),
                        span: span.with_end(line, col),
                        byte_start: byte_offsets[token_start],
                        byte_end: byte_offsets[i],
                    });
                } else if ch == '-' && i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
                    // Negative number: '-' directly followed by a digit.
                    // A bare '-' falls through to the symbol path instead.
                    let token_start = i;
                    let (tok, len) = read_number(&chars[i..], &span)?;
                    i += len;
                    col += len;
                    tokens.push(SpannedToken {
                        token: tok,
                        span: span.with_end(line, col),
                        byte_start: byte_offsets[token_start],
                        byte_end: byte_offsets[i],
                    });
                } else if ch.is_ascii_digit() {
                    let token_start = i;
                    let (tok, len) = read_number(&chars[i..], &span)?;
                    i += len;
                    col += len;
                    tokens.push(SpannedToken {
                        token: tok,
                        span: span.with_end(line, col),
                        byte_start: byte_offsets[token_start],
                        byte_end: byte_offsets[i],
                    });
                } else if is_symbol_start(ch) {
                    // Symbol; a few names map to dedicated tokens.
                    let start = i;
                    while i < chars.len() && is_symbol_char(chars[i]) {
                        i += 1;
                        col += 1;
                    }
                    let name: String = chars[start..i].iter().collect();
                    let token_span = span.with_end(line, col);
                    let token_byte_start = byte_offsets[start];
                    let token_byte_end = byte_offsets[i];
                    match name.as_str() {
                        "true" => tokens.push(SpannedToken {
                            token: Token::Bool(true),
                            span: token_span,
                            byte_start: token_byte_start,
                            byte_end: token_byte_end,
                        }),
                        "false" => tokens.push(SpannedToken {
                            token: Token::Bool(false),
                            span: token_span,
                            byte_start: token_byte_start,
                            byte_end: token_byte_end,
                        }),
                        "nil" => tokens.push(SpannedToken {
                            token: Token::Symbol("nil".to_string()),
                            span: token_span,
                            byte_start: token_byte_start,
                            byte_end: token_byte_end,
                        }),
                        "." => tokens.push(SpannedToken {
                            token: Token::Dot,
                            span: token_span,
                            byte_start: token_byte_start,
                            byte_end: token_byte_end,
                        }),
                        _ => tokens.push(SpannedToken {
                            token: Token::Symbol(name),
                            span: token_span,
                            byte_start: token_byte_start,
                            byte_end: token_byte_end,
                        }),
                    }
                } else {
                    return Err(SemaError::Reader {
                        message: format!("unexpected character: '{ch}'"),
                        span,
                    });
                }
            }
        }
    }

    Ok(tokens)
}
594
/// Decode one backslash escape inside a string or f-string literal,
/// appending the decoded character(s) to `buf`.
///
/// Contract with the caller: on entry `chars[*i]` is the escape designator
/// (the character after the backslash, e.g. the `n` in `\n`); on exit `*i`
/// points at the LAST character consumed by the escape, and the caller's
/// loop increment advances past it. `*col` is advanced in step with `*i`.
///
/// Supported escapes: `\n \t \r \\ \" \0 \$`, `\x<hex>;` (R7RS-style,
/// semicolon-terminated), `\uXXXX` (exactly 4 hex digits), and `\UXXXXXXXX`
/// (exactly 8 hex digits). An unrecognized designator is NOT an error: the
/// backslash and the designator are passed through literally.
///
/// # Errors
/// Returns `SemaError::Reader` for an empty/unterminated `\x` escape,
/// wrong-length `\u`/`\U` escapes, or a code point that is not a valid
/// Unicode scalar value.
fn read_string_escape(
    chars: &[char],
    i: &mut usize,
    col: &mut usize,
    buf: &mut String,
    span: Span,
) -> Result<(), SemaError> {
    match chars[*i] {
        'n' => buf.push('\n'),
        't' => buf.push('\t'),
        'r' => buf.push('\r'),
        '\\' => buf.push('\\'),
        '"' => buf.push('"'),
        '0' => buf.push('\0'),
        // `\$` lets f-strings contain a literal '$' without starting ${...}.
        '$' => buf.push('$'),
        'x' => {
            // Variable-length hex, terminated by ';': \x41;
            let mut hex = String::new();
            while *i + 1 < chars.len() && chars[*i + 1] != ';' && chars[*i + 1].is_ascii_hexdigit()
            {
                *i += 1;
                *col += 1;
                hex.push(chars[*i]);
            }
            if hex.is_empty() {
                return Err(SemaError::Reader {
                    message: "empty hex escape \\x;".to_string(),
                    span,
                });
            }
            if *i + 1 >= chars.len() || chars[*i + 1] != ';' {
                return Err(SemaError::Reader {
                    message: "hex escape \\x missing terminating semicolon".to_string(),
                    span,
                });
            }
            // Consume the ';'.
            *i += 1;
            *col += 1;
            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
                message: format!("invalid hex escape \\x{};", hex),
                span,
            })?;
            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
                message: format!("invalid unicode scalar value \\x{};", hex),
                span,
            })?;
            buf.push(ch);
        }
        'u' => {
            // Fixed-length: exactly 4 hex digits.
            let mut hex = String::new();
            for _ in 0..4 {
                if *i + 1 >= chars.len() || !chars[*i + 1].is_ascii_hexdigit() {
                    return Err(SemaError::Reader {
                        message: "\\u escape requires exactly 4 hex digits".to_string(),
                        span,
                    });
                }
                *i += 1;
                *col += 1;
                hex.push(chars[*i]);
            }
            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
                message: format!("invalid hex escape \\u{}", hex),
                span,
            })?;
            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
                message: format!("invalid unicode scalar value \\u{}", hex),
                span,
            })?;
            buf.push(ch);
        }
        'U' => {
            // Fixed-length: exactly 8 hex digits.
            let mut hex = String::new();
            for _ in 0..8 {
                if *i + 1 >= chars.len() || !chars[*i + 1].is_ascii_hexdigit() {
                    return Err(SemaError::Reader {
                        message: "\\U escape requires exactly 8 hex digits".to_string(),
                        span,
                    });
                }
                *i += 1;
                *col += 1;
                hex.push(chars[*i]);
            }
            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
                message: format!("invalid hex escape \\U{}", hex),
                span,
            })?;
            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
                message: format!("invalid unicode scalar value \\U{}", hex),
                span,
            })?;
            buf.push(ch);
        }
        // Unknown escape: keep backslash + designator verbatim.
        other => {
            buf.push('\\');
            buf.push(other);
        }
    }
    Ok(())
}
701
702fn read_number(chars: &[char], span: &Span) -> Result<(Token, usize), SemaError> {
703 let mut i = 0;
704 if chars[i] == '-' {
705 i += 1;
706 }
707 while i < chars.len() && chars[i].is_ascii_digit() {
708 i += 1;
709 }
710 if i < chars.len() && chars[i] == '.' && i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
711 i += 1; while i < chars.len() && chars[i].is_ascii_digit() {
713 i += 1;
714 }
715 let s: String = chars[..i].iter().collect();
716 let f: f64 = s.parse().map_err(|_| SemaError::Reader {
717 message: format!("invalid float: {s}"),
718 span: *span,
719 })?;
720 Ok((Token::Float(f), i))
721 } else {
722 let s: String = chars[..i].iter().collect();
723 let n: i64 = s.parse().map_err(|_| SemaError::Reader {
724 message: format!("invalid integer: {s}"),
725 span: *span,
726 })?;
727 Ok((Token::Int(n), i))
728 }
729}
730
/// True when `ch` may begin a symbol: any alphabetic character or one of
/// the permitted punctuation characters.
fn is_symbol_start(ch: char) -> bool {
    const PUNCT: &[char] = &[
        '+', '-', '*', '/', '!', '?', '<', '>', '=', '_', '&', '%', '^', '~', '.',
    ];
    ch.is_alphabetic() || PUNCT.contains(&ch)
}
738
/// True when `ch` may continue a symbol: anything that may start one,
/// plus ASCII digits and `#`.
fn is_symbol_char(ch: char) -> bool {
    ch.is_alphabetic()
        || ch.is_ascii_digit()
        || matches!(
            ch,
            '+' | '-' | '*' | '/' | '!' | '?' | '<' | '>' | '=' | '_' | '&' | '%' | '^' | '~'
                | '.' | '#'
        )
}
742
#[cfg(test)]
mod tests {
    //! Tokenizer tests focused on trivia handling (comments, newlines)
    //! and regex literals.
    use super::*;

    // A trailing line comment becomes exactly one Comment token, with the
    // leading ';' preserved in the payload.
    #[test]
    fn test_comment_token_emitted() {
        let tokens = tokenize("(+ 1 2) ; comment").unwrap();
        let comment_tokens: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(&t.token, Token::Comment(_)))
            .collect();
        assert_eq!(comment_tokens.len(), 1);
        match &comment_tokens[0].token {
            Token::Comment(text) => assert_eq!(text, "; comment"),
            _ => panic!("expected Comment token"),
        }
    }

    // Newlines are emitted as tokens between surrounding symbols.
    #[test]
    fn test_newline_token_emitted() {
        let tokens = tokenize("a\nb").unwrap();
        let token_types: Vec<_> = tokens.iter().map(|t| &t.token).collect();
        assert!(
            matches!(token_types[0], Token::Symbol(s) if s == "a"),
            "first token should be symbol 'a'"
        );
        assert!(
            matches!(token_types[1], Token::Newline),
            "second token should be Newline"
        );
        assert!(
            matches!(token_types[2], Token::Symbol(s) if s == "b"),
            "third token should be symbol 'b'"
        );
    }

    // #"..." produces a Regex token whose payload keeps backslashes raw.
    #[test]
    fn test_regex_token_emitted() {
        let tokens = tokenize(r#"#"\d+""#).unwrap();
        assert_eq!(tokens.len(), 1);
        match &tokens[0].token {
            Token::Regex(s) => assert_eq!(s, r"\d+"),
            other => panic!("expected Regex token, got {:?}", other),
        }
    }

    // Regex literals must not be confused with plain string literals.
    #[test]
    fn test_regex_not_string() {
        let tokens = tokenize(r#"#"[a-z]+""#).unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(
            !matches!(&tokens[0].token, Token::String(_)),
            "regex should not produce Token::String"
        );
        assert!(
            matches!(&tokens[0].token, Token::Regex(_)),
            "regex should produce Token::Regex"
        );
    }

    // Trivia tokens appear in source order and none are dropped.
    #[test]
    fn test_multiple_comments_and_newlines_preserved() {
        let tokens = tokenize("; first\n; second\n42").unwrap();
        let token_types: Vec<&Token> = tokens.iter().map(|t| &t.token).collect();
        assert!(matches!(token_types[0], Token::Comment(s) if s == "; first"));
        assert!(matches!(token_types[1], Token::Newline));
        assert!(matches!(token_types[2], Token::Comment(s) if s == "; second"));
        assert!(matches!(token_types[3], Token::Newline));
        assert!(matches!(token_types[4], Token::Int(42)));
    }

    // The comment token stops before the newline; the newline is its own token.
    #[test]
    fn test_comment_does_not_include_trailing_newline() {
        let tokens = tokenize("; hello world\n").unwrap();
        match &tokens[0].token {
            Token::Comment(text) => {
                assert!(
                    !text.ends_with('\n'),
                    "comment should not include trailing newline"
                );
                assert_eq!(text, "; hello world");
            }
            _ => panic!("expected Comment token"),
        }
        assert!(matches!(&tokens[1].token, Token::Newline));
    }

    // A comment after code on the same line is still tokenized.
    #[test]
    fn test_inline_comment_after_code() {
        let tokens = tokenize("(define x 42) ; set x").unwrap();
        let has_comment = tokens
            .iter()
            .any(|t| matches!(&t.token, Token::Comment(s) if s == "; set x"));
        assert!(has_comment, "should have inline comment token");
    }

    // Blank lines produce consecutive Newline tokens; overall order of
    // symbols, newlines, and comments matches the source exactly.
    #[test]
    fn test_trivia_order_preserved() {
        let tokens = tokenize("a\n\n; comment\nb").unwrap();
        let types: Vec<String> = tokens
            .iter()
            .map(|t| match &t.token {
                Token::Symbol(s) => format!("sym:{}", s),
                Token::Newline => "newline".to_string(),
                Token::Comment(s) => format!("comment:{}", s),
                other => format!("{:?}", other),
            })
            .collect();
        assert_eq!(
            types,
            vec![
                "sym:a",
                "newline",
                "newline",
                "comment:; comment",
                "newline",
                "sym:b"
            ]
        );
    }
}