Skip to main content

sema_reader/
lexer.rs

1use sema_core::{SemaError, Span};
2
/// One segment of an f-string literal (`f"..."`), in source order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FStringPart {
    /// Literal text between interpolations, with escape sequences already decoded.
    Literal(String),
    /// Whitespace-trimmed source text found inside a `${...}` interpolation.
    /// The lexer stores it as text; it is not tokenized here.
    Expr(String),
}
8
9#[derive(Debug, Clone, PartialEq)]
10pub enum Token {
11    LParen,
12    RParen,
13    LBracket,
14    RBracket,
15    LBrace,
16    RBrace,
17    Quote,
18    Quasiquote,
19    Unquote,
20    UnquoteSplice,
21    Int(i64),
22    Float(f64),
23    String(String),
24    FString(Vec<FStringPart>),
25    ShortLambdaStart,
26    Symbol(String),
27    Keyword(String),
28    Bool(bool),
29    Char(char),
30    BytevectorStart,
31    Dot,
32    Comment(String),
33    Newline,
34    Regex(String),
35}
36
37#[derive(Debug, Clone)]
38pub struct SpannedToken {
39    pub token: Token,
40    pub span: Span,
41    /// Byte offset of the start of this token in the source string.
42    pub byte_start: usize,
43    /// Byte offset past the end of this token in the source string.
44    pub byte_end: usize,
45}
46
47pub fn tokenize(input: &str) -> Result<Vec<SpannedToken>, SemaError> {
48    let mut tokens = Vec::new();
49    let chars: Vec<char> = input.chars().collect();
50    // Build char-index → byte-offset lookup table for string source extraction
51    let byte_offsets: Vec<usize> = {
52        let mut offsets = Vec::with_capacity(chars.len() + 1);
53        let mut pos = 0;
54        for c in &chars {
55            offsets.push(pos);
56            pos += c.len_utf8();
57        }
58        offsets.push(pos);
59        offsets
60    };
61    let mut i = 0;
62    let mut line = 1;
63    let mut col = 1;
64
65    while i < chars.len() {
66        let ch = chars[i];
67        let span = Span::point(line, col);
68
69        match ch {
70            // Whitespace
71            ' ' | '\t' | '\r' => {
72                col += 1;
73                i += 1;
74            }
75            '\n' => {
76                tokens.push(SpannedToken {
77                    token: Token::Newline,
78                    span: span.with_end(line, col + 1),
79                    byte_start: byte_offsets[i],
80                    byte_end: byte_offsets[i + 1],
81                });
82                line += 1;
83                col = 1;
84                i += 1;
85            }
86
87            // Comments
88            ';' => {
89                let start = i;
90                while i < chars.len() && chars[i] != '\n' {
91                    i += 1;
92                }
93                let text: String = chars[start..i].iter().collect();
94                let end_col = col + (i - start);
95                tokens.push(SpannedToken {
96                    token: Token::Comment(text),
97                    span: span.with_end(line, end_col),
98                    byte_start: byte_offsets[start],
99                    byte_end: byte_offsets[i],
100                });
101                col = end_col;
102            }
103
104            // Delimiters
105            '(' => {
106                col += 1;
107                i += 1;
108                tokens.push(SpannedToken {
109                    token: Token::LParen,
110                    span: span.with_end(line, col),
111                    byte_start: byte_offsets[i - 1],
112                    byte_end: byte_offsets[i],
113                });
114            }
115            ')' => {
116                col += 1;
117                i += 1;
118                tokens.push(SpannedToken {
119                    token: Token::RParen,
120                    span: span.with_end(line, col),
121                    byte_start: byte_offsets[i - 1],
122                    byte_end: byte_offsets[i],
123                });
124            }
125            '[' => {
126                col += 1;
127                i += 1;
128                tokens.push(SpannedToken {
129                    token: Token::LBracket,
130                    span: span.with_end(line, col),
131                    byte_start: byte_offsets[i - 1],
132                    byte_end: byte_offsets[i],
133                });
134            }
135            ']' => {
136                col += 1;
137                i += 1;
138                tokens.push(SpannedToken {
139                    token: Token::RBracket,
140                    span: span.with_end(line, col),
141                    byte_start: byte_offsets[i - 1],
142                    byte_end: byte_offsets[i],
143                });
144            }
145            '{' => {
146                col += 1;
147                i += 1;
148                tokens.push(SpannedToken {
149                    token: Token::LBrace,
150                    span: span.with_end(line, col),
151                    byte_start: byte_offsets[i - 1],
152                    byte_end: byte_offsets[i],
153                });
154            }
155            '}' => {
156                col += 1;
157                i += 1;
158                tokens.push(SpannedToken {
159                    token: Token::RBrace,
160                    span: span.with_end(line, col),
161                    byte_start: byte_offsets[i - 1],
162                    byte_end: byte_offsets[i],
163                });
164            }
165
166            // Quote forms
167            '\'' => {
168                col += 1;
169                i += 1;
170                tokens.push(SpannedToken {
171                    token: Token::Quote,
172                    span: span.with_end(line, col),
173                    byte_start: byte_offsets[i - 1],
174                    byte_end: byte_offsets[i],
175                });
176            }
177            '`' => {
178                col += 1;
179                i += 1;
180                tokens.push(SpannedToken {
181                    token: Token::Quasiquote,
182                    span: span.with_end(line, col),
183                    byte_start: byte_offsets[i - 1],
184                    byte_end: byte_offsets[i],
185                });
186            }
187            ',' => {
188                if i + 1 < chars.len() && chars[i + 1] == '@' {
189                    col += 2;
190                    i += 2;
191                    tokens.push(SpannedToken {
192                        token: Token::UnquoteSplice,
193                        span: span.with_end(line, col),
194                        byte_start: byte_offsets[i - 2],
195                        byte_end: byte_offsets[i],
196                    });
197                } else {
198                    col += 1;
199                    i += 1;
200                    tokens.push(SpannedToken {
201                        token: Token::Unquote,
202                        span: span.with_end(line, col),
203                        byte_start: byte_offsets[i - 1],
204                        byte_end: byte_offsets[i],
205                    });
206                }
207            }
208
209            // Strings
210            '"' => {
211                let token_start = i;
212                let mut s = String::new();
213                i += 1;
214                col += 1;
215                while i < chars.len() && chars[i] != '"' {
216                    if chars[i] == '\\' && i + 1 < chars.len() {
217                        i += 1;
218                        col += 1;
219                        read_string_escape(&chars, &mut i, &mut col, &mut s, span)?;
220                    } else {
221                        if chars[i] == '\n' {
222                            line += 1;
223                            col = 0;
224                        }
225                        s.push(chars[i]);
226                    }
227                    i += 1;
228                    col += 1;
229                }
230                if i >= chars.len() {
231                    return Err(SemaError::Reader {
232                        message: "unterminated string".to_string(),
233                        span,
234                    });
235                }
236                i += 1; // closing quote
237                col += 1;
238                tokens.push(SpannedToken {
239                    token: Token::String(s),
240                    span: span.with_end(line, col),
241                    byte_start: byte_offsets[token_start],
242                    byte_end: byte_offsets[i],
243                });
244            }
245
246            // #t, #f booleans
247            '#' => {
248                let token_start = i;
249                if i + 1 < chars.len() {
250                    match chars[i + 1] {
251                        't' => {
252                            i += 2;
253                            col += 2;
254                            tokens.push(SpannedToken {
255                                token: Token::Bool(true),
256                                span: span.with_end(line, col),
257                                byte_start: byte_offsets[token_start],
258                                byte_end: byte_offsets[i],
259                            });
260                        }
261                        'f' => {
262                            i += 2;
263                            col += 2;
264                            tokens.push(SpannedToken {
265                                token: Token::Bool(false),
266                                span: span.with_end(line, col),
267                                byte_start: byte_offsets[token_start],
268                                byte_end: byte_offsets[i],
269                            });
270                        }
271                        '\\' => {
272                            // Character literal: #\a, #\space, #\newline, etc.
273                            i += 2; // skip #\
274                            col += 2;
275                            if i >= chars.len() {
276                                return Err(SemaError::Reader {
277                                    message: "unexpected end of input after #\\".to_string(),
278                                    span,
279                                });
280                            }
281                            let start = i;
282                            if chars[i].is_alphabetic() {
283                                while i < chars.len() && is_symbol_char(chars[i]) {
284                                    i += 1;
285                                    col += 1;
286                                }
287                            } else {
288                                i += 1;
289                                col += 1;
290                            }
291                            let name: String = chars[start..i].iter().collect();
292                            let c = match name.as_str() {
293                                "space" => ' ',
294                                "newline" => '\n',
295                                "tab" => '\t',
296                                "return" => '\r',
297                                "nul" => '\0',
298                                s if s.chars().count() == 1 => s.chars().next().unwrap(),
299                                _ => {
300                                    return Err(SemaError::Reader {
301                                        message: format!("unknown character name: {name}"),
302                                        span,
303                                    });
304                                }
305                            };
306                            tokens.push(SpannedToken {
307                                token: Token::Char(c),
308                                span: span.with_end(line, col),
309                                byte_start: byte_offsets[token_start],
310                                byte_end: byte_offsets[i],
311                            });
312                        }
313                        'u' if i + 3 < chars.len()
314                            && chars[i + 2] == '8'
315                            && chars[i + 3] == '(' =>
316                        {
317                            i += 4;
318                            col += 4;
319                            tokens.push(SpannedToken {
320                                token: Token::BytevectorStart,
321                                span: span.with_end(line, col),
322                                byte_start: byte_offsets[token_start],
323                                byte_end: byte_offsets[i],
324                            });
325                        }
326                        '(' => {
327                            // Short lambda: #(+ % 1) → (lambda (%1) (+ %1 1))
328                            i += 2; // skip #(
329                            col += 2;
330                            tokens.push(SpannedToken {
331                                token: Token::ShortLambdaStart,
332                                span: span.with_end(line, col),
333                                byte_start: byte_offsets[token_start],
334                                byte_end: byte_offsets[i],
335                            });
336                        }
337                        '"' => {
338                            // Regex literal: #"pattern" — raw string (no escape processing)
339                            i += 2; // skip #"
340                            col += 2;
341                            let mut s = String::new();
342                            while i < chars.len() && chars[i] != '"' {
343                                if chars[i] == '\\' && i + 1 < chars.len() && chars[i + 1] == '"' {
344                                    s.push('"');
345                                    i += 2;
346                                    col += 2;
347                                } else {
348                                    if chars[i] == '\n' {
349                                        line += 1;
350                                        col = 0;
351                                    }
352                                    s.push(chars[i]);
353                                    i += 1;
354                                    col += 1;
355                                }
356                            }
357                            if i >= chars.len() {
358                                return Err(SemaError::Reader {
359                                    message: "unterminated regex literal".to_string(),
360                                    span,
361                                }
362                                .with_hint(
363                                    "add a closing `\"` to end the #\"...\" regex literal",
364                                ));
365                            }
366                            i += 1; // closing quote
367                            col += 1;
368                            tokens.push(SpannedToken {
369                                token: Token::Regex(s),
370                                span: span.with_end(line, col),
371                                byte_start: byte_offsets[token_start],
372                                byte_end: byte_offsets[i],
373                            });
374                        }
375                        '!' if line == 1 && col == 1 => {
376                            // Shebang line: #!/usr/bin/env sema
377                            while i < chars.len() && chars[i] != '\n' {
378                                i += 1;
379                            }
380                        }
381                        _ => {
382                            return Err(SemaError::Reader {
383                                message: format!(
384                                    "unexpected character after #: '{}'",
385                                    chars[i + 1]
386                                ),
387                                span,
388                            });
389                        }
390                    }
391                } else {
392                    return Err(SemaError::Reader {
393                        message: "unexpected end of input after `#`".to_string(),
394                        span,
395                    }
396                    .with_hint("# starts a special form: #t, #f, #\\char, #u8(...)"));
397                }
398            }
399
400            // Keywords (:foo)
401            ':' => {
402                let token_start = i;
403                i += 1;
404                col += 1;
405                let start = i;
406                while i < chars.len() && is_symbol_char(chars[i]) {
407                    i += 1;
408                    col += 1;
409                }
410                if i == start {
411                    return Err(SemaError::Reader {
412                        message: "expected keyword name after ':'".to_string(),
413                        span,
414                    });
415                }
416                let name: String = chars[start..i].iter().collect();
417                tokens.push(SpannedToken {
418                    token: Token::Keyword(name),
419                    span: span.with_end(line, col),
420                    byte_start: byte_offsets[token_start],
421                    byte_end: byte_offsets[i],
422                });
423            }
424
425            // Numbers, f-strings, and symbols
426            _ => {
427                if ch == 'f' && i + 1 < chars.len() && chars[i + 1] == '"' {
428                    // f-string: f"Hello ${name}" → FString token
429                    let token_start = i;
430                    i += 1; // skip 'f'
431                    col += 1;
432                    i += 1; // skip opening '"'
433                    col += 1;
434                    let mut parts: Vec<FStringPart> = Vec::new();
435                    let mut current = String::new();
436
437                    while i < chars.len() && chars[i] != '"' {
438                        if chars[i] == '\\' && i + 1 < chars.len() {
439                            i += 1;
440                            col += 1;
441                            read_string_escape(&chars, &mut i, &mut col, &mut current, span)?;
442                        } else if chars[i] == '$' && i + 1 < chars.len() && chars[i + 1] == '{' {
443                            // Start interpolation
444                            if !current.is_empty() {
445                                parts.push(FStringPart::Literal(std::mem::take(&mut current)));
446                            }
447                            i += 2; // skip "${"
448                            col += 2;
449                            let mut expr = String::new();
450                            let mut depth = 1;
451                            while i < chars.len() && depth > 0 {
452                                if chars[i] == '{' {
453                                    depth += 1;
454                                } else if chars[i] == '}' {
455                                    depth -= 1;
456                                    if depth == 0 {
457                                        break;
458                                    }
459                                }
460                                if chars[i] == '\n' {
461                                    line += 1;
462                                    col = 0;
463                                }
464                                expr.push(chars[i]);
465                                i += 1;
466                                col += 1;
467                            }
468                            if depth != 0 {
469                                return Err(SemaError::Reader {
470                                    message: "unterminated interpolation in f-string".to_string(),
471                                    span,
472                                }
473                                .with_hint("add a closing `}` to end the ${...} interpolation"));
474                            }
475                            let trimmed = expr.trim().to_string();
476                            if trimmed.is_empty() {
477                                return Err(SemaError::Reader {
478                                    message: "empty interpolation in f-string".to_string(),
479                                    span,
480                                }
481                                .with_hint("${} must contain an expression, e.g. ${name}"));
482                            }
483                            parts.push(FStringPart::Expr(trimmed));
484                            // i points to closing '}', outer i+=1 will skip past it
485                        } else {
486                            if chars[i] == '\n' {
487                                line += 1;
488                                col = 0;
489                            }
490                            current.push(chars[i]);
491                        }
492                        i += 1;
493                        col += 1;
494                    }
495
496                    if i >= chars.len() {
497                        return Err(SemaError::Reader {
498                            message: "unterminated f-string".to_string(),
499                            span,
500                        }
501                        .with_hint("add a closing `\"` to end the f-string"));
502                    }
503                    i += 1; // closing quote
504                    col += 1;
505
506                    if !current.is_empty() {
507                        parts.push(FStringPart::Literal(current));
508                    }
509
510                    tokens.push(SpannedToken {
511                        token: Token::FString(parts),
512                        span: span.with_end(line, col),
513                        byte_start: byte_offsets[token_start],
514                        byte_end: byte_offsets[i],
515                    });
516                } else if ch == '-' && i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
517                    // Negative number
518                    let token_start = i;
519                    let (tok, len) = read_number(&chars[i..], &span)?;
520                    i += len;
521                    col += len;
522                    tokens.push(SpannedToken {
523                        token: tok,
524                        span: span.with_end(line, col),
525                        byte_start: byte_offsets[token_start],
526                        byte_end: byte_offsets[i],
527                    });
528                } else if ch.is_ascii_digit() {
529                    let token_start = i;
530                    let (tok, len) = read_number(&chars[i..], &span)?;
531                    i += len;
532                    col += len;
533                    tokens.push(SpannedToken {
534                        token: tok,
535                        span: span.with_end(line, col),
536                        byte_start: byte_offsets[token_start],
537                        byte_end: byte_offsets[i],
538                    });
539                } else if is_symbol_start(ch) {
540                    let start = i;
541                    while i < chars.len() && is_symbol_char(chars[i]) {
542                        i += 1;
543                        col += 1;
544                    }
545                    let name: String = chars[start..i].iter().collect();
546                    let token_span = span.with_end(line, col);
547                    // Check for special symbol names
548                    let token_byte_start = byte_offsets[start];
549                    let token_byte_end = byte_offsets[i];
550                    match name.as_str() {
551                        "true" => tokens.push(SpannedToken {
552                            token: Token::Bool(true),
553                            span: token_span,
554                            byte_start: token_byte_start,
555                            byte_end: token_byte_end,
556                        }),
557                        "false" => tokens.push(SpannedToken {
558                            token: Token::Bool(false),
559                            span: token_span,
560                            byte_start: token_byte_start,
561                            byte_end: token_byte_end,
562                        }),
563                        "nil" => tokens.push(SpannedToken {
564                            token: Token::Symbol("nil".to_string()),
565                            span: token_span,
566                            byte_start: token_byte_start,
567                            byte_end: token_byte_end,
568                        }),
569                        "." => tokens.push(SpannedToken {
570                            token: Token::Dot,
571                            span: token_span,
572                            byte_start: token_byte_start,
573                            byte_end: token_byte_end,
574                        }),
575                        _ => tokens.push(SpannedToken {
576                            token: Token::Symbol(name),
577                            span: token_span,
578                            byte_start: token_byte_start,
579                            byte_end: token_byte_end,
580                        }),
581                    }
582                } else {
583                    return Err(SemaError::Reader {
584                        message: format!("unexpected character: '{ch}'"),
585                        span,
586                    });
587                }
588            }
589        }
590    }
591
592    Ok(tokens)
593}
594
595/// Process a string escape sequence. `chars[*i]` is the character after `\`.
596/// Pushes the decoded character(s) to `buf` and advances `*i`/`*col` for
597/// multi-character escapes (hex, unicode). The caller handles the final `i += 1`.
598fn read_string_escape(
599    chars: &[char],
600    i: &mut usize,
601    col: &mut usize,
602    buf: &mut String,
603    span: Span,
604) -> Result<(), SemaError> {
605    match chars[*i] {
606        'n' => buf.push('\n'),
607        't' => buf.push('\t'),
608        'r' => buf.push('\r'),
609        '\\' => buf.push('\\'),
610        '"' => buf.push('"'),
611        '0' => buf.push('\0'),
612        '$' => buf.push('$'),
613        'x' => {
614            // R7RS hex escape: \x<hex>;
615            let mut hex = String::new();
616            while *i + 1 < chars.len() && chars[*i + 1] != ';' && chars[*i + 1].is_ascii_hexdigit()
617            {
618                *i += 1;
619                *col += 1;
620                hex.push(chars[*i]);
621            }
622            if hex.is_empty() {
623                return Err(SemaError::Reader {
624                    message: "empty hex escape \\x;".to_string(),
625                    span,
626                });
627            }
628            if *i + 1 >= chars.len() || chars[*i + 1] != ';' {
629                return Err(SemaError::Reader {
630                    message: "hex escape \\x missing terminating semicolon".to_string(),
631                    span,
632                });
633            }
634            *i += 1;
635            *col += 1;
636            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
637                message: format!("invalid hex escape \\x{};", hex),
638                span,
639            })?;
640            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
641                message: format!("invalid unicode scalar value \\x{};", hex),
642                span,
643            })?;
644            buf.push(ch);
645        }
646        'u' => {
647            // \u<4 hex digits>
648            let mut hex = String::new();
649            for _ in 0..4 {
650                if *i + 1 >= chars.len() || !chars[*i + 1].is_ascii_hexdigit() {
651                    return Err(SemaError::Reader {
652                        message: "\\u escape requires exactly 4 hex digits".to_string(),
653                        span,
654                    });
655                }
656                *i += 1;
657                *col += 1;
658                hex.push(chars[*i]);
659            }
660            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
661                message: format!("invalid hex escape \\u{}", hex),
662                span,
663            })?;
664            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
665                message: format!("invalid unicode scalar value \\u{}", hex),
666                span,
667            })?;
668            buf.push(ch);
669        }
670        'U' => {
671            // \U<8 hex digits>
672            let mut hex = String::new();
673            for _ in 0..8 {
674                if *i + 1 >= chars.len() || !chars[*i + 1].is_ascii_hexdigit() {
675                    return Err(SemaError::Reader {
676                        message: "\\U escape requires exactly 8 hex digits".to_string(),
677                        span,
678                    });
679                }
680                *i += 1;
681                *col += 1;
682                hex.push(chars[*i]);
683            }
684            let code = u32::from_str_radix(&hex, 16).map_err(|_| SemaError::Reader {
685                message: format!("invalid hex escape \\U{}", hex),
686                span,
687            })?;
688            let ch = char::from_u32(code).ok_or_else(|| SemaError::Reader {
689                message: format!("invalid unicode scalar value \\U{}", hex),
690                span,
691            })?;
692            buf.push(ch);
693        }
694        other => {
695            buf.push('\\');
696            buf.push(other);
697        }
698    }
699    Ok(())
700}
701
702fn read_number(chars: &[char], span: &Span) -> Result<(Token, usize), SemaError> {
703    let mut i = 0;
704    if chars[i] == '-' {
705        i += 1;
706    }
707    while i < chars.len() && chars[i].is_ascii_digit() {
708        i += 1;
709    }
710    if i < chars.len() && chars[i] == '.' && i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
711        i += 1; // skip dot
712        while i < chars.len() && chars[i].is_ascii_digit() {
713            i += 1;
714        }
715        let s: String = chars[..i].iter().collect();
716        let f: f64 = s.parse().map_err(|_| SemaError::Reader {
717            message: format!("invalid float: {s}"),
718            span: *span,
719        })?;
720        Ok((Token::Float(f), i))
721    } else {
722        let s: String = chars[..i].iter().collect();
723        let n: i64 = s.parse().map_err(|_| SemaError::Reader {
724            message: format!("invalid integer: {s}"),
725            span: *span,
726        })?;
727        Ok((Token::Int(n), i))
728    }
729}
730
/// Returns `true` when `ch` may begin a symbol: any alphabetic character
/// (Unicode-aware) or one of the listed operator/punctuation characters.
fn is_symbol_start(ch: char) -> bool {
    match ch {
        '+' | '-' | '*' | '/' | '!' | '?' | '<' | '>' | '=' | '_' | '&' | '%' | '^' | '~'
        | '.' => true,
        other => other.is_alphabetic(),
    }
}
738
739fn is_symbol_char(ch: char) -> bool {
740    is_symbol_start(ch) || ch.is_ascii_digit() || matches!(ch, '-' | '/' | '.' | '#')
741}
742
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src` and returns only the token kinds, dropping positions.
    fn lex(src: &str) -> Vec<Token> {
        tokenize(src)
            .expect("source should tokenize")
            .into_iter()
            .map(|spanned| spanned.token)
            .collect()
    }

    fn sym(name: &str) -> Token {
        Token::Symbol(name.to_string())
    }

    fn comment(text: &str) -> Token {
        Token::Comment(text.to_string())
    }

    #[test]
    fn test_comment_token_emitted() {
        let comments: Vec<Token> = lex("(+ 1 2) ; comment")
            .into_iter()
            .filter(|t| matches!(t, Token::Comment(_)))
            .collect();
        assert_eq!(comments, vec![comment("; comment")]);
    }

    #[test]
    fn test_newline_token_emitted() {
        let tokens = lex("a\nb");
        assert_eq!(tokens[0], sym("a"), "first token should be symbol 'a'");
        assert_eq!(tokens[1], Token::Newline, "second token should be Newline");
        assert_eq!(tokens[2], sym("b"), "third token should be symbol 'b'");
    }

    #[test]
    fn test_regex_token_emitted() {
        assert_eq!(lex(r#"#"\d+""#), vec![Token::Regex(r"\d+".to_string())]);
    }

    #[test]
    fn test_regex_not_string() {
        // Regex should NOT produce Token::String
        let tokens = lex(r#"#"[a-z]+""#);
        assert_eq!(tokens.len(), 1);
        assert!(
            !matches!(tokens[0], Token::String(_)),
            "regex should not produce Token::String"
        );
        assert!(
            matches!(tokens[0], Token::Regex(_)),
            "regex should produce Token::Regex"
        );
    }

    #[test]
    fn test_multiple_comments_and_newlines_preserved() {
        let tokens = lex("; first\n; second\n42");
        let expected = [
            comment("; first"),
            Token::Newline,
            comment("; second"),
            Token::Newline,
            Token::Int(42),
        ];
        assert_eq!(tokens[..expected.len()], expected);
    }

    #[test]
    fn test_comment_does_not_include_trailing_newline() {
        let tokens = lex("; hello world\n");
        match &tokens[0] {
            Token::Comment(text) => {
                assert!(
                    !text.ends_with('\n'),
                    "comment should not include trailing newline"
                );
                assert_eq!(text, "; hello world");
            }
            _ => panic!("expected Comment token"),
        }
        // The newline should be a separate token
        assert_eq!(tokens[1], Token::Newline);
    }

    #[test]
    fn test_inline_comment_after_code() {
        let tokens = lex("(define x 42) ; set x");
        assert!(
            tokens.contains(&comment("; set x")),
            "should have inline comment token"
        );
    }

    #[test]
    fn test_trivia_order_preserved() {
        assert_eq!(
            lex("a\n\n; comment\nb"),
            vec![
                sym("a"),
                Token::Newline,
                Token::Newline,
                comment("; comment"),
                Token::Newline,
                sym("b"),
            ]
        );
    }
}