bock_lexer/lexer.rs

1//! Core lexer implementation for Bock source files.
2
3use std::collections::VecDeque;
4
5use bock_errors::{DiagnosticBag, DiagnosticCode, Span};
6use bock_source::SourceFile;
7
8use crate::token::{keyword_lookup, Token, TokenKind};
9
10/// Diagnostic code for unknown/unexpected characters.
11const E_UNEXPECTED_CHAR: DiagnosticCode = DiagnosticCode {
12    prefix: 'E',
13    number: 1001,
14};
15/// Diagnostic code for unterminated string or character literals.
16const E_UNTERMINATED_STRING: DiagnosticCode = DiagnosticCode {
17    prefix: 'E',
18    number: 1002,
19};
20/// Diagnostic code for invalid escape sequences.
21const E_INVALID_ESCAPE: DiagnosticCode = DiagnosticCode {
22    prefix: 'E',
23    number: 1003,
24};
25/// Diagnostic code for malformed character literals.
26const E_INVALID_CHAR_LITERAL: DiagnosticCode = DiagnosticCode {
27    prefix: 'E',
28    number: 1004,
29};
30/// Diagnostic code for invalid digit in a numeric literal (e.g., `0b123`).
31const E_INVALID_DIGIT: DiagnosticCode = DiagnosticCode {
32    prefix: 'E',
33    number: 1005,
34};
35/// Diagnostic code for an unterminated block comment.
36const E_UNTERMINATED_BLOCK_COMMENT: DiagnosticCode = DiagnosticCode {
37    prefix: 'E',
38    number: 1006,
39};
40
41/// Context needed to resume lexing a string after an interpolation expression ends.
42struct StringResumeCtx {
43    /// Byte offset of the opening delimiter (used for span construction on error).
44    string_start: usize,
45    is_raw: bool,
46    is_multiline: bool,
47}
48
49/// The Bock lexer: advances through a [`SourceFile`] and produces a token stream.
50pub struct Lexer<'src> {
51    source: &'src SourceFile,
52    /// Current byte position in `source.content`.
53    pos: usize,
54    diagnostics: DiagnosticBag,
55    /// Tokens buffered for emission before resuming normal lexing.
56    /// Used when a string sub-lexer produces multiple tokens at once.
57    pending: VecDeque<Token>,
58    /// Per-interpolation-level inner brace counter.
59    /// Non-empty ⟺ we are currently inside a `${...}` interpolation.
60    /// Each entry is the number of unmatched `{` seen since entering that level.
61    interp_brace_depth: Vec<u32>,
62    /// String resume contexts: one entry per active interpolation level.
63    string_resume: Vec<StringResumeCtx>,
64}
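
// Worked example (illustrative, derived from the machinery above): lexing the
// source `"a${x}b"` is expected to produce the token stream
//
//     StringLiteralPart("a")   -- text before `${`
//     InterpolationStart       -- `${`
//     Ident("x")               -- the interpolated expression
//     InterpolationEnd         -- the `}` that closes the interpolation
//     StringLiteralPart("b")   -- continuation text, always StringLiteralPart
//     Eof
//
// Braces nested inside `${...}` (e.g. a `{ ... }` block) are tracked by
// `interp_brace_depth`, so only the matching outer `}` ends the interpolation.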
65
66impl<'src> Lexer<'src> {
67    /// Create a new [`Lexer`] for the given source file.
68    #[must_use]
69    pub fn new(source: &'src SourceFile) -> Self {
70        Self {
71            source,
72            pos: 0,
73            diagnostics: DiagnosticBag::new(),
74            pending: VecDeque::new(),
75            interp_brace_depth: Vec::new(),
76            string_resume: Vec::new(),
77        }
78    }
79
80    /// Tokenize the entire source file, returning all tokens including a final [`TokenKind::Eof`].
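    ///
    /// A minimal usage sketch (it mirrors the test helpers at the bottom of this
    /// file, so the `SourceFile::new` signature shown here is taken from those
    /// tests):
    ///
    /// ```ignore
    /// let file = SourceFile::new(
    ///     bock_errors::FileId(0),
    ///     std::path::PathBuf::from("demo.bock"),
    ///     "let x".to_string(),
    /// );
    /// let mut lexer = Lexer::new(&file);
    /// let tokens = lexer.tokenize();
    /// assert_eq!(tokens.last().unwrap().kind, TokenKind::Eof);
    /// ```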
81    #[must_use]
82    pub fn tokenize(&mut self) -> Vec<Token> {
83        let mut tokens = Vec::new();
84        loop {
85            let tok = self.next_token();
86            let is_eof = tok.kind == TokenKind::Eof;
87            tokens.push(tok);
88            if is_eof {
89                break;
90            }
91        }
92        tokens
93    }
94
95    /// Access the accumulated diagnostics.
96    #[must_use]
97    pub fn diagnostics(&self) -> &DiagnosticBag {
98        &self.diagnostics
99    }
100
101    // ── Internal helpers ──────────────────────────────────────────────────────
102
103    /// Return the current character without advancing, or `None` at EOF.
104    fn peek(&self) -> Option<char> {
105        self.source.content[self.pos..].chars().next()
106    }
107
108    /// Return the character after the current one without advancing, or `None`.
109    fn peek_next(&self) -> Option<char> {
110        let mut chars = self.source.content[self.pos..].chars();
111        chars.next(); // skip current
112        chars.next()
113    }
114
115    /// Advance past the current character and return it, or `None` at EOF.
116    fn advance(&mut self) -> Option<char> {
117        let ch = self.source.content[self.pos..].chars().next()?;
118        self.pos += ch.len_utf8();
119        Some(ch)
120    }
121
122    /// Skip whitespace characters other than the newline characters `\n` and `\r`.
123    fn skip_whitespace(&mut self) {
124        while let Some(ch) = self.peek() {
125            if ch == '\n' || ch == '\r' || !ch.is_whitespace() {
126                break;
127            }
128            self.advance();
129        }
130    }
131
132    /// Build a span from `start` to the current position.
133    fn span_from(&self, start: usize) -> Span {
134        Span {
135            file: self.source.id,
136            start,
137            end: self.pos,
138        }
139    }
140
141    /// Make a simple token with no literal.
142    fn make_token(&self, kind: TokenKind, start: usize) -> Token {
143        Token::new(kind, self.span_from(start), None)
144    }
145
146    // ── Main dispatch ─────────────────────────────────────────────────────────
147
148    /// Lex the next token.
149    fn next_token(&mut self) -> Token {
150        // Drain any tokens buffered by the string sub-lexer.
151        if let Some(tok) = self.pending.pop_front() {
152            return tok;
153        }
154
155        self.skip_whitespace();
156
157        let start = self.pos;
158
159        let ch = match self.peek() {
160            None => return self.make_token(TokenKind::Eof, start),
161            Some(c) => c,
162        };
163
164        // Newline
165        if ch == '\n' {
166            self.advance();
167            return self.make_token(TokenKind::Newline, start);
168        }
169
170        // Windows-style \r\n — treat as a single newline
171        if ch == '\r' {
172            self.advance();
173            if self.peek() == Some('\n') {
174                self.advance();
175            }
176            return self.make_token(TokenKind::Newline, start);
177        }
178
179        // Comments: // or /*
180        if ch == '/' && (self.peek_next() == Some('/') || self.peek_next() == Some('*')) {
181            self.lex_comment();
182            // After a comment, recurse (skip it, get next real token)
183            return self.next_token();
184        }
185
186        // String / raw-string literals
187        if ch == '"' {
188            return self.lex_string();
189        }
190        if ch == 'r' && self.peek_next() == Some('"') {
191            return self.lex_string();
192        }
193
194        // Character literal
195        if ch == '\'' {
196            return self.lex_char();
197        }
198
199        // Numeric literal
200        if ch.is_ascii_digit() {
201            return self.lex_number();
202        }
203        // Base prefixes (0x / 0o / 0b) are also handled inside lex_number,
204        // since their leading character is always a digit.
205
206        // Identifier or keyword
207        if ch.is_alphabetic() || ch == '_' {
208            return self.lex_ident_or_keyword();
209        }
210
211        // Backslash line continuation: `\` immediately followed by newline
212        // consumes both and continues lexing on the next line.
213        if ch == '\\' {
214            if self.peek_next() == Some('\n') {
215                self.advance(); // consume '\'
216                self.advance(); // consume '\n'
217                return self.next_token();
218            }
219            if self.peek_next() == Some('\r') {
220                // Check for \r\n
221                self.advance(); // consume '\'
222                self.advance(); // consume '\r'
223                if self.peek() == Some('\n') {
224                    self.advance(); // consume '\n'
225                }
226                return self.next_token();
227            }
228            // `\` not followed by newline — fall through to lex_operator
229            // which will emit an Error token.
230        }
231
232        // Operators and punctuation
233        self.lex_operator()
234    }
235
236    // ── String lexing (P1.3) ──────────────────────────────────────────────────
237
238    /// Lex a string literal (standard, raw, or multi-line).
239    fn lex_string(&mut self) -> Token {
240        let start = self.pos;
241
242        let is_raw = self.peek() == Some('r');
243        if is_raw {
244            self.advance(); // consume 'r'
245        }
246
247        // Check for triple-quote multiline string.
248        let is_multiline = self.source.content[self.pos..].starts_with("\"\"\"");
249        if is_multiline {
250            self.pos += 3; // consume """ (each " is 1 byte)
251        } else {
252            self.advance(); // consume single "
253        }
254
255        self.process_string_body(start, is_raw, is_multiline, false)
256    }
257
258    /// Process string content starting at the current position.
259    ///
260    /// `string_start` is the byte offset of the opening delimiter (for spans/errors).
261    /// `is_continuation` is `true` when resuming after an interpolation — in that case
262    /// the returned token is always `StringLiteralPart` even if there is no further
263    /// interpolation, so the parser can see where the string ends.
264    fn process_string_body(
265        &mut self,
266        string_start: usize,
267        is_raw: bool,
268        is_multiline: bool,
269        is_continuation: bool,
270    ) -> Token {
271        let segment_start = self.pos;
272        let mut content = String::new();
273
274        loop {
275            match self.peek() {
276                // EOF before the closing delimiter.
277                None => {
278                    let span = self.span_from(string_start);
279                    self.diagnostics.error(
280                        E_UNTERMINATED_STRING,
281                        "unterminated string literal",
282                        span,
283                    );
284                    let kind = closing_kind(is_raw, is_multiline, is_continuation);
285                    return Token::new(kind, span, Some(content));
286                }
287
288                // Closing delimiter check.
289                Some('"') => {
290                    if is_multiline {
291                        if self.source.content[self.pos..].starts_with("\"\"\"") {
292                            self.pos += 3; // consume """
293                            let span = self.span_from(string_start);
294                            let processed = if !is_raw {
295                                strip_common_indent(&content)
296                            } else {
297                                content
298                            };
299                            let kind = closing_kind(is_raw, is_multiline, is_continuation);
300                            return Token::new(kind, span, Some(processed));
301                        } else {
302                            // A lone `"` inside a multiline string is just a character.
303                            content.push('"');
304                            self.advance();
305                        }
306                    } else {
307                        // Single-line string: closing `"`.
308                        self.advance();
309                        let span = self.span_from(string_start);
310                        let kind = closing_kind(is_raw, is_multiline, is_continuation);
311                        return Token::new(kind, span, Some(content));
312                    }
313                }
314
315                // Newline inside a single-line string = unterminated.
316                Some('\n') if !is_multiline => {
317                    let span = self.span_from(string_start);
318                    self.diagnostics.error(
319                        E_UNTERMINATED_STRING,
320                        "unterminated string literal (newline)",
321                        span,
322                    );
323                    let kind = closing_kind(is_raw, is_multiline, is_continuation);
324                    return Token::new(kind, span, Some(content));
325                }
326
327                // Backslash escape — only in non-raw strings.
328                Some('\\') if !is_raw => {
329                    self.advance(); // consume '\'
330                    match self.advance() {
331                        Some('n') => content.push('\n'),
332                        Some('t') => content.push('\t'),
333                        Some('r') => content.push('\r'),
334                        Some('\\') => content.push('\\'),
335                        Some('"') => content.push('"'),
336                        Some('\'') => content.push('\''),
337                        Some('0') => content.push('\0'),
338                        Some('$') => content.push('$'),
339                        Some('u') => {
340                            self.lex_unicode_escape(&mut content, string_start);
341                        }
342                        Some(other) => {
343                            let span = self.span_from(string_start);
344                            self.diagnostics.error(
345                                E_INVALID_ESCAPE,
346                                format!("unknown escape sequence: \\{other}"),
347                                span,
348                            );
349                            content.push(other);
350                        }
351                        None => {
352                            let span = self.span_from(string_start);
353                            self.diagnostics.error(
354                                E_UNTERMINATED_STRING,
355                                "unterminated string literal after backslash",
356                                span,
357                            );
358                            let kind = closing_kind(is_raw, is_multiline, is_continuation);
359                            return Token::new(kind, span, Some(content));
360                        }
361                    }
362                }
363
364                // Interpolation `${` — only in non-raw strings.
365                Some('$') if !is_raw => {
366                    if self.source.content[self.pos..].starts_with("${") {
367                        // Emit the text before the interpolation as a StringLiteralPart.
368                        let part_span = Span {
369                            file: self.source.id,
370                            start: segment_start,
371                            end: self.pos,
372                        };
373                        let part_tok =
374                            Token::new(TokenKind::StringLiteralPart, part_span, Some(content));
375
376                        let interp_start = self.pos;
377                        self.pos += 2; // consume '${'
378                        let interp_span = Span {
379                            file: self.source.id,
380                            start: interp_start,
381                            end: self.pos,
382                        };
383                        let interp_tok =
384                            Token::new(TokenKind::InterpolationStart, interp_span, None);
385
386                        // Buffer InterpolationStart; push resume context.
387                        self.pending.push_back(interp_tok);
388                        self.interp_brace_depth.push(0);
389                        self.string_resume.push(StringResumeCtx {
390                            string_start,
391                            is_raw,
392                            is_multiline,
393                        });
394
395                        return part_tok;
396                    } else if self.source.content[self.pos..].starts_with("$$") {
397                        // `$$` is an escaped dollar sign in non-raw strings.
398                        content.push('$');
399                        self.pos += 2; // consume both '$'
400                    } else {
401                        content.push('$');
402                        self.advance();
403                    }
404                }
405
406                // Any other character — include as-is.
407                Some(ch) => {
408                    content.push(ch);
409                    self.advance();
410                }
411            }
412        }
413    }
414
415    /// Called from `lex_operator` when `}` closes an interpolation.
416    /// Resumes lexing the string body and pushes the resulting token(s) to `pending`.
417    fn resume_string_lex(&mut self, ctx: StringResumeCtx) {
418        let tok = self.process_string_body(ctx.string_start, ctx.is_raw, ctx.is_multiline, true);
419        // Push to front so it comes before any InterpolationStart that may have been
420        // buffered if there is an immediately following `${` in the continued string.
421        self.pending.push_front(tok);
422    }
423
424    /// Process a `\u{HHHH}` Unicode escape, appending the decoded character to `out`.
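    ///
    /// For example (illustrative): `\u{48}` appends `'H'` to `out`, while an
    /// out-of-range codepoint such as `\u{110000}` reports `E_INVALID_ESCAPE`
    /// and appends nothing.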
425    fn lex_unicode_escape(&mut self, out: &mut String, string_start: usize) {
426        if self.peek() != Some('{') {
427            let span = self.span_from(string_start);
428            self.diagnostics.error(
429                E_INVALID_ESCAPE,
430                "expected '{' after \\u in Unicode escape",
431                span,
432            );
433            return;
434        }
435        self.advance(); // consume '{'
436
437        let hex_start = self.pos;
438        while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
439            self.advance();
440        }
441        let hex_str = &self.source.content[hex_start..self.pos];
442
443        if self.peek() != Some('}') {
444            let span = self.span_from(string_start);
445            self.diagnostics.error(
446                E_INVALID_ESCAPE,
447                "expected '}' to close Unicode escape \\u{...}",
448                span,
449            );
450            return;
451        }
452        self.advance(); // consume '}'
453
454        match u32::from_str_radix(hex_str, 16)
455            .ok()
456            .and_then(char::from_u32)
457        {
458            Some(c) => out.push(c),
459            None => {
460                let span = self.span_from(string_start);
461                self.diagnostics.error(
462                    E_INVALID_ESCAPE,
463                    format!("invalid Unicode codepoint: \\u{{{hex_str}}}"),
464                    span,
465                );
466            }
467        }
468    }
469
470    // ── Character literal (P1.3) ──────────────────────────────────────────────
471
472    /// Lex a character literal: `'c'`, `'\n'`, `'\u{1F600}'`.
473    fn lex_char(&mut self) -> Token {
474        let start = self.pos;
475        self.advance(); // consume opening '
476
477        let ch = match self.peek() {
478            None => {
479                let span = self.span_from(start);
480                self.diagnostics.error(
481                    E_INVALID_CHAR_LITERAL,
482                    "unterminated character literal",
483                    span,
484                );
485                return Token::new(TokenKind::Error, span, None);
486            }
487            Some('\'') => {
488                // Empty literal ''
489                self.advance();
490                let span = self.span_from(start);
491                self.diagnostics
492                    .error(E_INVALID_CHAR_LITERAL, "empty character literal", span);
493                return Token::new(TokenKind::Error, span, None);
494            }
495            Some('\\') => {
496                self.advance(); // consume '\'
497                match self.advance() {
498                    Some('n') => '\n',
499                    Some('t') => '\t',
500                    Some('r') => '\r',
501                    Some('\\') => '\\',
502                    Some('\'') => '\'',
503                    Some('"') => '"',
504                    Some('0') => '\0',
505                    Some('u') => {
506                        let mut buf = String::new();
507                        self.lex_unicode_escape(&mut buf, start);
508                        buf.chars().next().unwrap_or('\0')
509                    }
510                    Some(other) => {
511                        let span = self.span_from(start);
512                        self.diagnostics.error(
513                            E_INVALID_ESCAPE,
514                            format!("unknown escape sequence: \\{other}"),
515                            span,
516                        );
517                        other
518                    }
519                    None => {
520                        let span = self.span_from(start);
521                        self.diagnostics.error(
522                            E_INVALID_CHAR_LITERAL,
523                            "unterminated character literal",
524                            span,
525                        );
526                        return Token::new(TokenKind::Error, span, None);
527                    }
528                }
529            }
530            Some(c) => {
531                self.advance();
532                c
533            }
534        };
535
536        // Expect closing '
537        if self.peek() == Some('\'') {
538            self.advance();
539            let span = self.span_from(start);
540            Token::new(TokenKind::CharLiteral, span, Some(ch.to_string()))
541        } else {
542            let span = self.span_from(start);
543            self.diagnostics.error(
544                E_INVALID_CHAR_LITERAL,
545                "expected closing ' in character literal",
546                span,
547            );
548            Token::new(TokenKind::Error, span, Some(ch.to_string()))
549        }
550    }
551
552    // ── Numeric literals and comments ─────────────────────────────────────────
553
554    /// Lex a numeric literal (integer or float, all bases, optional type suffix).
555    ///
556    /// Handles:
557    /// - Decimal: `42`, `1_000_000`
558    /// - Hex: `0xFF`, `0XFF`
559    /// - Octal: `0o77`, `0O77`
560    /// - Binary: `0b1010`, `0B1010`
561    /// - Float: `3.14`, `1.0e10`, `2.5E-3`, `1e6`
562    /// - Type suffix: `42_u8`, `3.14_f64` (underscore + type ident)
563    /// - Disambiguation: `1..2` emits `IntLiteral(1)`, `DotDot`, `IntLiteral(2)`.
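    ///
    /// For example (illustrative): `3.14_f64` lexes as a single `FloatLiteral`
    /// whose literal text is `"3.14_f64"`, and `0xFF` as an `IntLiteral` whose
    /// literal text is `"0xFF"`.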
564    fn lex_number(&mut self) -> Token {
565        let start = self.pos;
566        let mut literal = String::new();
567        let mut is_float = false;
568
569        // Consume the first digit (already peeked as ascii_digit in caller).
570        let first = self.advance().expect("caller guarantees a digit");
571        literal.push(first);
572
573        // Detect base prefix: 0x / 0o / 0b (only when first digit is '0').
574        if first == '0' {
575            match self.peek() {
576                Some('x') | Some('X') => {
577                    let prefix = self.advance().expect("peek confirmed 'x'/'X'");
578                    literal.push(prefix);
579                    let digit_start = self.pos;
580                    // Consume hex digits and underscores.
581                    self.consume_digits(&mut literal, |c| c.is_ascii_hexdigit() || c == '_');
582                    if self.pos == digit_start {
583                        self.diagnostics.error(
584                            E_INVALID_DIGIT,
585                            "expected hexadecimal digit after '0x'",
586                            self.span_from(start),
587                        );
588                    }
589                    let suffix = self.try_consume_suffix();
590                    let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
591                    return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
592                }
593                Some('o') | Some('O') => {
594                    let prefix = self.advance().expect("peek confirmed 'o'/'O'");
595                    literal.push(prefix);
596                    let digit_start = self.pos;
597                    // Consume all digits/underscores, collecting the full body for validation.
598                    self.consume_digits(&mut literal, |c| c.is_ascii_digit() || c == '_');
599                    if self.pos == digit_start {
600                        self.diagnostics.error(
601                            E_INVALID_DIGIT,
602                            "expected octal digit after '0o'",
603                            self.span_from(start),
604                        );
605                    } else {
606                        let body = &self.source.content[digit_start..self.pos];
607                        for ch in body.chars() {
608                            if ch != '_' && !matches!(ch, '0'..='7') {
609                                self.diagnostics.error(
610                                    E_INVALID_DIGIT,
611                                    format!("invalid octal digit '{ch}'"),
612                                    self.span_from(start),
613                                );
614                                break;
615                            }
616                        }
617                    }
618                    let suffix = self.try_consume_suffix();
619                    let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
620                    return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
621                }
622                Some('b') | Some('B') => {
623                    let prefix = self.advance().expect("peek confirmed 'b'/'B'");
624                    literal.push(prefix);
625                    let digit_start = self.pos;
626                    // Consume all digits/underscores, collecting the full body for validation.
627                    self.consume_digits(&mut literal, |c| c.is_ascii_digit() || c == '_');
628                    if self.pos == digit_start {
629                        self.diagnostics.error(
630                            E_INVALID_DIGIT,
631                            "expected binary digit after '0b'",
632                            self.span_from(start),
633                        );
634                    } else {
635                        let body = &self.source.content[digit_start..self.pos];
636                        for ch in body.chars() {
637                            if ch != '_' && !matches!(ch, '0' | '1') {
638                                self.diagnostics.error(
639                                    E_INVALID_DIGIT,
640                                    format!("invalid binary digit '{ch}'"),
641                                    self.span_from(start),
642                                );
643                                break;
644                            }
645                        }
646                    }
647                    let suffix = self.try_consume_suffix();
648                    let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
649                    return Token::new(TokenKind::IntLiteral, self.span_from(start), Some(full));
650                }
651                _ => {}
652            }
653        }
654
655        // Decimal integer body. Underscores are digit separators, but stop before
656        // a `_` that is immediately followed by an alphabetic char (type suffix start).
657        self.consume_decimal_digits(&mut literal);
658
659        // Float detection: `.` followed by a digit (not `..`).
660        if self.peek() == Some('.') && self.peek_next().is_some_and(|c| c.is_ascii_digit()) {
661            is_float = true;
662            literal.push(self.advance().expect("peek confirmed '.'")); // consume '.'
663            self.consume_decimal_digits(&mut literal);
664        }
665
666        // Exponent: e/E followed by optional +/- and digits.
667        if matches!(self.peek(), Some('e') | Some('E')) {
668            is_float = true;
669            literal.push(self.advance().expect("peek confirmed 'e'/'E'")); // consume 'e' or 'E'
670            if matches!(self.peek(), Some('+') | Some('-')) {
671                literal.push(self.advance().expect("peek confirmed '+'/'-'"));
672            }
673            self.consume_decimal_digits(&mut literal);
674        }
675
676        // Optional type suffix: `_` followed by a type identifier (e.g., `u8`, `f64`).
677        let suffix = self.try_consume_suffix();
678        let full = format!("{}{}", literal, suffix.as_deref().unwrap_or(""));
679
680        let kind = if is_float {
681            TokenKind::FloatLiteral
682        } else {
683            TokenKind::IntLiteral
684        };
685        Token::new(kind, self.span_from(start), Some(full))
686    }
687
688    /// Consume characters matching `predicate` into `buf`.
689    fn consume_digits(&mut self, buf: &mut String, predicate: impl Fn(char) -> bool) {
690        while let Some(ch) = self.peek() {
691            if predicate(ch) {
692                buf.push(ch);
693                self.advance();
694            } else {
695                break;
696            }
697        }
698    }
699
700    /// Consume decimal digits and underscore separators, but stop before a `_`
701    /// that is immediately followed by an alphabetic character (type suffix).
702    fn consume_decimal_digits(&mut self, buf: &mut String) {
703        loop {
704            match self.peek() {
705                Some(c) if c.is_ascii_digit() => {
706                    buf.push(c);
707                    self.advance();
708                }
709                Some('_') => {
710                    // Peek ahead: if next char after `_` is alphabetic, this is a
711                    // type suffix — stop consuming.
712                    if self.peek_next().is_some_and(|c| c.is_alphabetic()) {
713                        break;
714                    }
715                    buf.push('_');
716                    self.advance();
717                }
718                _ => break,
719            }
720        }
721    }
722
723    /// Try to consume a type suffix (`_` followed by identifier chars), returning
724    /// the suffix string (including the leading `_`) if present.
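    ///
    /// For example (illustrative): after the digits of `42_u8` (the decimal consumer
    /// stops before the `_` because `u` is alphabetic), this returns `Some("_u8")`;
    /// in `1_000` the `_` is a digit separator, already consumed by
    /// `consume_decimal_digits`, so this returns `None`.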
725    fn try_consume_suffix(&mut self) -> Option<String> {
726        // A suffix starts with `_` followed immediately by an alphabetic char.
727        if self.peek() == Some('_') && self.peek_next().is_some_and(|c| c.is_alphabetic()) {
728            let mut suffix = String::new();
729            suffix.push(self.advance().expect("peek confirmed '_'")); // '_'
730            while let Some(ch) = self.peek() {
731                if ch.is_alphanumeric() || ch == '_' {
732                    suffix.push(ch);
733                    self.advance();
734                } else {
735                    break;
736                }
737            }
738            Some(suffix)
739        } else {
740            None
741        }
742    }
743
744    /// Lex a comment, advancing past it.
745    ///
746    /// Regular comments (`//`, `/* */`) produce no token.
747    /// Doc comments (`///`, `//!`) push a token into `self.pending` so that
748    /// the next call to `next_token` returns it.
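    ///
    /// For example (illustrative): `/// adds two numbers` yields a `DocComment`
    /// token whose literal is `"adds two numbers"` (surrounding whitespace
    /// trimmed), while a plain `// note` comment produces no token at all.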
749    fn lex_comment(&mut self) {
750        let start = self.pos;
751        // Consume the opening `/`
752        self.advance();
753
754        match self.peek() {
755            Some('/') => {
756                // Line comment: `//`, `///`, or `//!`
757                self.advance(); // consume second `/`
758
759                if self.peek() == Some('/') {
760                    // `///` — doc comment
761                    self.advance(); // consume third `/`
762                    let content_start = self.pos;
763                    while let Some(ch) = self.peek() {
764                        if ch == '\n' {
765                            break;
766                        }
767                        self.advance();
768                    }
769                    let content = self.source.content[content_start..self.pos]
770                        .trim()
771                        .to_owned();
772                    let span = self.span_from(start);
773                    self.pending
774                        .push_back(Token::new(TokenKind::DocComment, span, Some(content)));
775                } else if self.peek() == Some('!') {
776                    // `//!` — module doc comment
777                    self.advance(); // consume `!`
778                    let content_start = self.pos;
779                    while let Some(ch) = self.peek() {
780                        if ch == '\n' {
781                            break;
782                        }
783                        self.advance();
784                    }
785                    let content = self.source.content[content_start..self.pos]
786                        .trim()
787                        .to_owned();
788                    let span = self.span_from(start);
789                    self.pending.push_back(Token::new(
790                        TokenKind::ModuleDocComment,
791                        span,
792                        Some(content),
793                    ));
794                } else {
795                    // Regular `//` line comment — consume to end of line, emit nothing
796                    while let Some(ch) = self.peek() {
797                        if ch == '\n' {
798                            break;
799                        }
800                        self.advance();
801                    }
802                }
803            }
804            Some('*') => {
805                // Block comment: `/* ... */` (nestable)
806                self.advance(); // consume `*`
807                let mut depth: u32 = 1;
808                loop {
809                    match self.peek() {
810                        None => {
811                            let span = self.span_from(start);
812                            self.diagnostics.error(
813                                E_UNTERMINATED_BLOCK_COMMENT,
814                                "unterminated block comment",
815                                span,
816                            );
817                            break;
818                        }
819                        Some('/') => {
820                            self.advance();
821                            if self.peek() == Some('*') {
822                                self.advance();
823                                depth += 1;
824                            }
825                        }
826                        Some('*') => {
827                            self.advance();
828                            if self.peek() == Some('/') {
829                                self.advance();
830                                depth -= 1;
831                                if depth == 0 {
832                                    break;
833                                }
834                            }
835                        }
836                        Some(_) => {
837                            self.advance();
838                        }
839                    }
840                }
841            }
842            _ => {
843                // Shouldn't happen: caller checks that next char is `/` or `*`
844            }
845        }
846    }
847
848    // ── Identifier / keyword ──────────────────────────────────────────────────
849
850    /// Lex an identifier or keyword starting at the current position.
851    fn lex_ident_or_keyword(&mut self) -> Token {
852        let start = self.pos;
853
854        while let Some(ch) = self.peek() {
855            if ch.is_alphanumeric() || ch == '_' {
856                self.advance();
857            } else {
858                break;
859            }
860        }
861
862        let text = &self.source.content[start..self.pos];
863        let span = self.span_from(start);
864
865        if let Some(kw) = keyword_lookup(text) {
866            Token::new(kw, span, None)
867        } else if text.starts_with(|c: char| c.is_uppercase()) {
868            Token::new(TokenKind::TypeIdent, span, Some(text.to_owned()))
869        } else if text == "_" {
870            Token::new(TokenKind::Underscore, span, None)
871        } else {
872            Token::new(TokenKind::Ident, span, Some(text.to_owned()))
873        }
874    }
875
876    // ── Operators ─────────────────────────────────────────────────────────────
877
878    /// Lex a single operator or punctuation token.
879    #[allow(clippy::too_many_lines)]
880    fn lex_operator(&mut self) -> Token {
881        let start = self.pos;
882        let ch = self.advance().expect("called with a character available");
883
884        let kind = match ch {
885            // Single-char punctuation
886            '(' => TokenKind::LParen,
887            ')' => TokenKind::RParen,
888            '[' => TokenKind::LBracket,
889            ']' => TokenKind::RBracket,
890
891            // `{` — track inner brace depth when inside an interpolation.
892            '{' => {
893                if !self.interp_brace_depth.is_empty() {
894                    *self.interp_brace_depth.last_mut().expect("non-empty") += 1;
895                }
896                TokenKind::LBrace
897            }
898
899            // `}` — either closes an interpolation or is a normal RBrace.
900            '}' => {
901                if !self.interp_brace_depth.is_empty() {
902                    let top = *self.interp_brace_depth.last().expect("non-empty");
903                    if top == 0 {
904                        // This `}` closes the interpolation.
905                        self.interp_brace_depth.pop();
906                        let ctx = self
907                            .string_resume
908                            .pop()
909                            .expect("resume stack mirrors brace stack");
910                        self.resume_string_lex(ctx);
911                        TokenKind::InterpolationEnd
912                    } else {
913                        *self.interp_brace_depth.last_mut().expect("non-empty") -= 1;
914                        TokenKind::RBrace
915                    }
916                } else {
917                    TokenKind::RBrace
918                }
919            }
920
921            ',' => TokenKind::Comma,
922            ':' => TokenKind::Colon,
923            ';' => TokenKind::Semicolon,
924            '@' => TokenKind::At,
925            '#' => TokenKind::Hash,
926            '~' => TokenKind::BitNot,
927            '^' => TokenKind::BitXor,
928            '?' => TokenKind::Question,
929
930            // `+` or `+=`
931            '+' => {
932                if self.peek() == Some('=') {
933                    self.advance();
934                    TokenKind::PlusEq
935                } else {
936                    TokenKind::Plus
937                }
938            }
939
940            // `-` or `-=` or `->`
941            '-' => match self.peek() {
942                Some('=') => {
943                    self.advance();
944                    TokenKind::MinusEq
945                }
946                Some('>') => {
947                    self.advance();
948                    TokenKind::ThinArrow
949                }
950                _ => TokenKind::Minus,
951            },
952
953            // `*` or `*=` or `**`
954            '*' => match self.peek() {
955                Some('=') => {
956                    self.advance();
957                    TokenKind::StarEq
958                }
959                Some('*') => {
960                    self.advance();
961                    TokenKind::Power
962                }
963                _ => TokenKind::Star,
964            },
965
966            // `/` or `/=`  (comments already dispatched above)
967            '/' => {
968                if self.peek() == Some('=') {
969                    self.advance();
970                    TokenKind::SlashEq
971                } else {
972                    TokenKind::Slash
973                }
974            }
975
976            // `%` or `%=`
977            '%' => {
978                if self.peek() == Some('=') {
979                    self.advance();
980                    TokenKind::PercentEq
981                } else {
982                    TokenKind::Percent
983                }
984            }
985
986            // `=` or `==` or `=>`
987            '=' => match self.peek() {
988                Some('=') => {
989                    self.advance();
990                    TokenKind::Eq
991                }
992                Some('>') => {
993                    self.advance();
994                    TokenKind::FatArrow
995                }
996                _ => TokenKind::Assign,
997            },
998
999            // `!` or `!=`
1000            '!' => {
1001                if self.peek() == Some('=') {
1002                    self.advance();
1003                    TokenKind::Neq
1004                } else {
1005                    TokenKind::Not
1006                }
1007            }
1008
1009            // `<` or `<=` or `<<`
1010            '<' => match self.peek() {
1011                Some('=') => {
1012                    self.advance();
1013                    TokenKind::Lte
1014                }
1015                Some('<') => {
1016                    self.advance();
1017                    TokenKind::Shl
1018                }
1019                _ => TokenKind::Lt,
1020            },
1021
1022            // `>` or `>=` or `>>`
1023            '>' => match self.peek() {
1024                Some('=') => {
1025                    self.advance();
1026                    TokenKind::Gte
1027                }
1028                Some('>') => {
1029                    self.advance();
1030                    TokenKind::Shr
1031                }
1032                _ => TokenKind::Gt,
1033            },
1034
1035            // `&` or `&&`
1036            '&' => {
1037                if self.peek() == Some('&') {
1038                    self.advance();
1039                    TokenKind::And
1040                } else {
1041                    TokenKind::BitAnd
1042                }
1043            }
1044
1045            // `|` or `||` or `|>`
1046            '|' => match self.peek() {
1047                Some('|') => {
1048                    self.advance();
1049                    TokenKind::Or
1050                }
1051                Some('>') => {
1052                    self.advance();
1053                    TokenKind::Pipe
1054                }
1055                _ => TokenKind::BitOr,
1056            },
1057
1058            // `.` or `..` or `..=`
1059            '.' => {
1060                if self.peek() == Some('.') {
1061                    self.advance(); // consume second '.'
1062                    if self.peek() == Some('=') {
1063                        self.advance(); // consume '='
1064                        TokenKind::DotDotEq
1065                    } else {
1066                        TokenKind::DotDot
1067                    }
1068                } else {
1069                    TokenKind::Dot
1070                }
1071            }
1072
1073            // Unknown character
1074            other => {
1075                let span = self.span_from(start);
1076                self.diagnostics.error(
1077                    E_UNEXPECTED_CHAR,
1078                    format!("unexpected character {:?}", other),
1079                    span,
1080                );
1081                return Token::new(TokenKind::Error, span, Some(other.to_string()));
1082            }
1083        };
1084
1085        self.make_token(kind, start)
1086    }
1087}
1088
1089// ── Free helpers ──────────────────────────────────────────────────────────────
1090
1091/// Determine the closing token kind for a string based on its attributes.
1092fn closing_kind(is_raw: bool, is_multiline: bool, is_continuation: bool) -> TokenKind {
1093    if is_continuation {
1094        TokenKind::StringLiteralPart
1095    } else if is_raw && is_multiline {
1096        TokenKind::RawMultiLineStringLiteral
1097    } else if is_multiline {
1098        TokenKind::MultiLineStringLiteral
1099    } else if is_raw {
1100        TokenKind::RawStringLiteral
1101    } else {
1102        TokenKind::StringLiteral
1103    }
1104}
1105
1106/// Strip the common leading indentation from a multi-line string body.
1107///
1108/// The first line (right after the opening `"""`) is stripped if it is blank.
1109/// The last newline before the closing `"""` is also trimmed.
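///
/// For example (illustrative), a body of `"\n    one\n      two\n    "` becomes
/// `"one\n  two"`: the blank first line is dropped, the common four-space indent
/// is stripped, and the trailing newline before the closing delimiter is trimmed.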
1110fn strip_common_indent(s: &str) -> String {
1111    let raw_lines: Vec<&str> = s.split('\n').collect();
1112
1113    // Drop the first line if it's blank (the text after the opening `"""`).
1114    let lines: &[&str] = if raw_lines
1115        .first()
1116        .map(|l| l.trim().is_empty())
1117        .unwrap_or(false)
1118    {
1119        &raw_lines[1..]
1120    } else {
1121        &raw_lines
1122    };
1123
1124    // Find common indentation (only non-empty lines count).
1125    let common = lines
1126        .iter()
1127        .filter(|l| !l.trim().is_empty())
1128        .map(|l| l.len() - l.trim_start().len())
1129        .min()
1130        .unwrap_or(0);
1131
1132    // Strip common indent from each line.
1133    let stripped: Vec<&str> = lines
1134        .iter()
1135        .map(|l| if l.len() >= common { &l[common..] } else { *l })
1136        .collect();
1137
1138    let joined = stripped.join("\n");
1139    // Remove a single trailing newline that precedes the closing `"""`.
1140    joined.trim_end_matches('\n').to_string()
1141}
1142
1143// ── Tests ─────────────────────────────────────────────────────────────────────
1144
1145#[cfg(test)]
1146mod tests {
1147    use super::*;
1148    use bock_source::SourceFile;
1149    use std::path::PathBuf;
1150
1151    fn lex(src: &str) -> Vec<Token> {
1152        let file = SourceFile::new(
1153            bock_errors::FileId(0),
1154            PathBuf::from("test.bock"),
1155            src.to_string(),
1156        );
1157        let mut lexer = Lexer::new(&file);
1158        lexer.tokenize()
1159    }
1160
1161    fn kinds(src: &str) -> Vec<TokenKind> {
1162        lex(src).into_iter().map(|t| t.kind).collect()
1163    }
1164
1165    fn literals(src: &str) -> Vec<Option<String>> {
1166        lex(src).into_iter().map(|t| t.literal).collect()
1167    }
1168
1169    // ── Identifiers and keywords ───────────────────────────────────────────────
1170
1171    #[test]
1172    fn lex_simple_identifier() {
1173        let toks = kinds("foo");
1174        assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
1175    }
1176
1177    #[test]
1178    fn lex_type_identifier() {
1179        let toks = kinds("Foo");
1180        assert_eq!(toks, vec![TokenKind::TypeIdent, TokenKind::Eof]);
1181    }
1182
1183    #[test]
1184    fn lex_underscore() {
1185        let toks = kinds("_");
1186        assert_eq!(toks, vec![TokenKind::Underscore, TokenKind::Eof]);
1187    }
1188
1189    #[test]
1190    fn lex_underscore_ident() {
1191        // _foo starts with _ and has more chars → Ident
1192        let toks = kinds("_foo");
1193        assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
1194    }
1195
1196    #[test]
1197    fn lex_keywords() {
1198        let toks = kinds("fn let mut const if else match for in while loop break continue return");
1199        assert_eq!(
1200            toks,
1201            vec![
1202                TokenKind::Fn,
1203                TokenKind::Let,
1204                TokenKind::Mut,
1205                TokenKind::Const,
1206                TokenKind::If,
1207                TokenKind::Else,
1208                TokenKind::Match,
1209                TokenKind::For,
1210                TokenKind::In,
1211                TokenKind::While,
1212                TokenKind::Loop,
1213                TokenKind::Break,
1214                TokenKind::Continue,
1215                TokenKind::Return,
1216                TokenKind::Eof,
1217            ]
1218        );
1219    }
1220
1221    #[test]
1222    fn lex_true_false_as_bool_literal() {
1223        let toks = kinds("true false");
1224        assert_eq!(
1225            toks,
1226            vec![
1227                TokenKind::BoolLiteral,
1228                TokenKind::BoolLiteral,
1229                TokenKind::Eof
1230            ]
1231        );
1232    }
1233
1234    #[test]
1235    fn bool_literal_round_trip() {
1236        let src = "true false";
1237        let tokens = lex(src);
1238        // Both emit BoolLiteral
1239        assert_eq!(tokens[0].kind, TokenKind::BoolLiteral);
1240        assert_eq!(tokens[1].kind, TokenKind::BoolLiteral);
1241        // Source text is recoverable from span
1242        assert_eq!(&src[tokens[0].span.start..tokens[0].span.end], "true");
1243        assert_eq!(&src[tokens[1].span.start..tokens[1].span.end], "false");
1244    }
1245
1246    #[test]
1247    fn lex_self_keywords() {
1248        let toks = kinds("self Self");
1249        assert_eq!(
1250            toks,
1251            vec![TokenKind::SelfLower, TokenKind::SelfUpper, TokenKind::Eof]
1252        );
1253    }
1254
1255    #[test]
1256    fn lex_ok_err_some_none() {
1257        let toks = kinds("Ok Err Some None");
1258        assert_eq!(
1259            toks,
1260            vec![
1261                TokenKind::Ok_,
1262                TokenKind::Err_,
1263                TokenKind::Some_,
1264                TokenKind::None_,
1265                TokenKind::Eof,
1266            ]
1267        );
1268    }
1269
1270    // ── Operators ─────────────────────────────────────────────────────────────
1271
1272    #[test]
1273    fn lex_single_char_ops() {
1274        let toks = kinds("+ - * / % ! & | ^ ~ ? # @");
1275        assert_eq!(
1276            toks,
1277            vec![
1278                TokenKind::Plus,
1279                TokenKind::Minus,
1280                TokenKind::Star,
1281                TokenKind::Slash,
1282                TokenKind::Percent,
1283                TokenKind::Not,
1284                TokenKind::BitAnd,
1285                TokenKind::BitOr,
1286                TokenKind::BitXor,
1287                TokenKind::BitNot,
1288                TokenKind::Question,
1289                TokenKind::Hash,
1290                TokenKind::At,
1291                TokenKind::Eof,
1292            ]
1293        );
1294    }
1295
1296    #[test]
1297    fn lex_pipe_vs_bitor() {
1298        let toks = kinds("|> |");
1299        assert_eq!(
1300            toks,
1301            vec![TokenKind::Pipe, TokenKind::BitOr, TokenKind::Eof]
1302        );
1303    }
1304
1305    #[test]
1306    fn lex_compose() {
1307        // `>>` is lexed as Shr; parser re-interprets in expression context
1308        let toks = kinds(">>");
1309        assert_eq!(toks, vec![TokenKind::Shr, TokenKind::Eof]);
1310    }
1311
1312    #[test]
1313    fn lex_dotdot_dotdoteq_dot() {
1314        let toks = kinds(". .. ..=");
1315        assert_eq!(
1316            toks,
1317            vec![
1318                TokenKind::Dot,
1319                TokenKind::DotDot,
1320                TokenKind::DotDotEq,
1321                TokenKind::Eof
1322            ]
1323        );
1324    }
1325
1326    #[test]
1327    fn lex_fat_arrow_vs_eq() {
1328        let toks = kinds("=> = ==");
1329        assert_eq!(
1330            toks,
1331            vec![
1332                TokenKind::FatArrow,
1333                TokenKind::Assign,
1334                TokenKind::Eq,
1335                TokenKind::Eof
1336            ]
1337        );
1338    }
1339
1340    #[test]
1341    fn lex_thin_arrow_vs_minus() {
1342        let toks = kinds("-> - -=");
1343        assert_eq!(
1344            toks,
1345            vec![
1346                TokenKind::ThinArrow,
1347                TokenKind::Minus,
1348                TokenKind::MinusEq,
1349                TokenKind::Eof
1350            ]
1351        );
1352    }
1353
1354    #[test]
1355    fn lex_power_vs_star() {
1356        let toks = kinds("** * *=");
1357        assert_eq!(
1358            toks,
1359            vec![
1360                TokenKind::Power,
1361                TokenKind::Star,
1362                TokenKind::StarEq,
1363                TokenKind::Eof
1364            ]
1365        );
1366    }
1367
1368    #[test]
1369    fn lex_shift_ops() {
1370        let toks = kinds("<< >>");
1371        assert_eq!(toks, vec![TokenKind::Shl, TokenKind::Shr, TokenKind::Eof]);
1372    }
1373
1374    #[test]
1375    fn lex_assignment_ops() {
1376        let toks = kinds("+= -= *= /= %=");
1377        assert_eq!(
1378            toks,
1379            vec![
1380                TokenKind::PlusEq,
1381                TokenKind::MinusEq,
1382                TokenKind::StarEq,
1383                TokenKind::SlashEq,
1384                TokenKind::PercentEq,
1385                TokenKind::Eof,
1386            ]
1387        );
1388    }
1389
1390    #[test]
1391    fn lex_comparison_ops() {
1392        let toks = kinds("== != < > <= >=");
1393        assert_eq!(
1394            toks,
1395            vec![
1396                TokenKind::Eq,
1397                TokenKind::Neq,
1398                TokenKind::Lt,
1399                TokenKind::Gt,
1400                TokenKind::Lte,
1401                TokenKind::Gte,
1402                TokenKind::Eof,
1403            ]
1404        );
1405    }
1406
1407    #[test]
1408    fn lex_logical_ops() {
1409        let toks = kinds("&& || !");
1410        assert_eq!(
1411            toks,
1412            vec![
1413                TokenKind::And,
1414                TokenKind::Or,
1415                TokenKind::Not,
1416                TokenKind::Eof
1417            ]
1418        );
1419    }
1420
1421    // ── Punctuation ───────────────────────────────────────────────────────────
1422
1423    #[test]
1424    fn lex_delimiters() {
1425        let toks = kinds("( ) [ ] { }");
1426        assert_eq!(
1427            toks,
1428            vec![
1429                TokenKind::LParen,
1430                TokenKind::RParen,
1431                TokenKind::LBracket,
1432                TokenKind::RBracket,
1433                TokenKind::LBrace,
1434                TokenKind::RBrace,
1435                TokenKind::Eof,
1436            ]
1437        );
1438    }
1439
1440    #[test]
1441    fn lex_misc_punct() {
1442        let toks = kinds(", : ;");
1443        assert_eq!(
1444            toks,
1445            vec![
1446                TokenKind::Comma,
1447                TokenKind::Colon,
1448                TokenKind::Semicolon,
1449                TokenKind::Eof
1450            ]
1451        );
1452    }
1453
1454    // ── Newlines ──────────────────────────────────────────────────────────────
1455
1456    #[test]
1457    fn lex_newlines() {
1458        let toks = kinds("foo\nbar");
1459        assert_eq!(
1460            toks,
1461            vec![
1462                TokenKind::Ident,
1463                TokenKind::Newline,
1464                TokenKind::Ident,
1465                TokenKind::Eof,
1466            ]
1467        );
1468    }
1469
1470    #[test]
1471    fn lex_crlf_newline() {
1472        let toks = kinds("foo\r\nbar");
1473        assert_eq!(
1474            toks,
1475            vec![
1476                TokenKind::Ident,
1477                TokenKind::Newline,
1478                TokenKind::Ident,
1479                TokenKind::Eof,
1480            ]
1481        );
1482    }
1483
1484    #[test]
1485    fn lex_multiple_newlines() {
1486        let toks = kinds("a\n\nb");
1487        assert_eq!(
1488            toks,
1489            vec![
1490                TokenKind::Ident,
1491                TokenKind::Newline,
1492                TokenKind::Newline,
1493                TokenKind::Ident,
1494                TokenKind::Eof,
1495            ]
1496        );
1497    }
1498
1499    // ── Error tokens ──────────────────────────────────────────────────────────
1500
1501    #[test]
1502    fn lex_unknown_char_produces_error() {
1503        let file = SourceFile::new(
1504            bock_errors::FileId(0),
1505            PathBuf::from("test.bock"),
1506            "§".to_string(),
1507        );
1508        let mut lexer = Lexer::new(&file);
1509        let toks = lexer.tokenize();
1510        assert_eq!(toks[0].kind, TokenKind::Error);
1511        assert!(lexer.diagnostics().has_errors());
1512    }
1513
1514    // ── Integration: idents + keywords + operators ────────────────────────────
1515
1516    #[test]
1517    fn integration_basic_function_signature() {
1518        // fn add(x: Int) -> Int
1519        let toks = kinds("fn add(x: Int) -> Int");
1520        assert_eq!(
1521            toks,
1522            vec![
1523                TokenKind::Fn,
1524                TokenKind::Ident, // add
1525                TokenKind::LParen,
1526                TokenKind::Ident, // x
1527                TokenKind::Colon,
1528                TokenKind::TypeIdent, // Int
1529                TokenKind::RParen,
1530                TokenKind::ThinArrow,
1531                TokenKind::TypeIdent, // Int
1532                TokenKind::Eof,
1533            ]
1534        );
1535    }
1536
1537    #[test]
1538    fn integration_let_binding() {
1539        // Numeric literals are exercised separately in the P1.4 tests below,
1540        // so here we only check the `let mut x =` prefix: keywords, identifier, and assignment.
1541        let toks = kinds("let mut x =");
1542        assert_eq!(
1543            toks,
1544            vec![
1545                TokenKind::Let,
1546                TokenKind::Mut,
1547                TokenKind::Ident,
1548                TokenKind::Assign,
1549                TokenKind::Eof,
1550            ]
1551        );
1552    }
1553
1554    #[test]
1555    fn integration_match_arm() {
1556        // Ok(x) => x
1557        let toks = kinds("Ok(x) => x");
1558        assert_eq!(
1559            toks,
1560            vec![
1561                TokenKind::Ok_,
1562                TokenKind::LParen,
1563                TokenKind::Ident,
1564                TokenKind::RParen,
1565                TokenKind::FatArrow,
1566                TokenKind::Ident,
1567                TokenKind::Eof,
1568            ]
1569        );
1570    }
1571
1572    #[test]
1573    fn integration_pipe_expression() {
1574        // xs |> map |> filter
1575        let toks = kinds("xs |> map |> filter");
1576        assert_eq!(
1577            toks,
1578            vec![
1579                TokenKind::Ident,
1580                TokenKind::Pipe,
1581                TokenKind::Ident,
1582                TokenKind::Pipe,
1583                TokenKind::Ident,
1584                TokenKind::Eof,
1585            ]
1586        );
1587    }
1588
1589    #[test]
1590    fn integration_multiline() {
1591        let src = "fn foo()\n  let x = y\n  x";
1592        let toks = kinds(src);
1593        // Expected: fn foo ( ) <newline> let x = y <newline> x <eof>
1594        // The right-hand side is an identifier rather than a number, so this test stays independent of numeric lexing.
1595        assert_eq!(
1596            toks,
1597            vec![
1598                TokenKind::Fn,
1599                TokenKind::Ident,
1600                TokenKind::LParen,
1601                TokenKind::RParen,
1602                TokenKind::Newline,
1603                TokenKind::Let,
1604                TokenKind::Ident,
1605                TokenKind::Assign,
1606                TokenKind::Ident,
1607                TokenKind::Newline,
1608                TokenKind::Ident,
1609                TokenKind::Eof,
1610            ]
1611        );
1612    }
1613
1614    // ── String literals (P1.3) ────────────────────────────────────────────────
1615
1616    #[test]
1617    fn lex_plain_string() {
1618        let toks = lex(r#""hello""#);
1619        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1620        assert_eq!(toks[0].literal.as_deref(), Some("hello"));
1621        assert_eq!(toks[1].kind, TokenKind::Eof);
1622    }
1623
1624    #[test]
1625    fn lex_string_escape_sequences() {
1626        // "a\nb\tc\\"
1627        let toks = lex("\"a\\nb\\tc\\\\\"");
1628        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1629        assert_eq!(toks[0].literal.as_deref(), Some("a\nb\tc\\"));
1630    }
1631
1632    #[test]
1633    fn lex_string_escape_dollar() {
1634        let toks = lex(r#""\$""#);
1635        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1636        assert_eq!(toks[0].literal.as_deref(), Some("$"));
1637    }
1638
1639    #[test]
1640    fn lex_string_double_dollar_escape() {
1641        // $$ is an escaped dollar in non-raw strings
1642        let toks = lex(r#""$$""#);
1643        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1644        assert_eq!(toks[0].literal.as_deref(), Some("$"));
1645    }
1646
1647    #[test]
1648    fn lex_string_unicode_escape() {
1649        // "\u{41}" → "A"
1650        let toks = lex("\"\\u{41}\"");
1651        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1652        assert_eq!(toks[0].literal.as_deref(), Some("A"));
1653    }
1654
1655    #[test]
1656    fn lex_string_unicode_escape_multibyte() {
1657        // "\u{1F600}" → "😀"
1658        let toks = lex("\"\\u{1F600}\"");
1659        assert_eq!(toks[0].kind, TokenKind::StringLiteral);
1660        assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1661    }
1662
1663    #[test]
1664    fn lex_raw_string() {
1665        let toks = lex(r#"r"hello\nworld""#);
1666        assert_eq!(toks[0].kind, TokenKind::RawStringLiteral);
1667        // Raw strings don't process escapes: backslash-n is two chars
1668        assert_eq!(toks[0].literal.as_deref(), Some("hello\\nworld"));
1669    }
1670
1671    #[test]
1672    fn lex_raw_string_dollar_literal() {
1673        // In raw strings, ${ is not an interpolation
1674        let toks = lex(r#"r"${not interp}""#);
1675        assert_eq!(toks[0].kind, TokenKind::RawStringLiteral);
1676        assert_eq!(toks[0].literal.as_deref(), Some("${not interp}"));
1677    }
1678
1679    #[test]
1680    fn lex_multiline_string() {
1681        let src = "\"\"\"hello world\"\"\"";
1682        let toks = lex(src);
1683        assert_eq!(toks[0].kind, TokenKind::MultiLineStringLiteral);
1684        assert_eq!(toks[0].literal.as_deref(), Some("hello world"));
1685    }
1686
1687    #[test]
1688    fn lex_multiline_string_indent_stripping() {
1689        // """
1690        //   hello
1691        //   world
1692        // """
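        // The newline after the opening delimiter, the common two-space indentation, and the final newline are all stripped.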
1693        let src = "\"\"\"\n  hello\n  world\n\"\"\"";
1694        let toks = lex(src);
1695        assert_eq!(toks[0].kind, TokenKind::MultiLineStringLiteral);
1696        assert_eq!(toks[0].literal.as_deref(), Some("hello\nworld"));
1697    }
1698
1699    #[test]
1700    fn lex_raw_multiline_string() {
1701        let src = "r\"\"\"\nhello\\nworld\n\"\"\"";
1702        let toks = lex(src);
1703        assert_eq!(toks[0].kind, TokenKind::RawMultiLineStringLiteral);
1704        // Raw: no escape processing; the closing """ delimiter is consumed but not included in the literal.
1705        assert!(toks[0]
1706            .literal
1707            .as_deref()
1708            .unwrap()
1709            .contains("hello\\nworld"));
1710    }
1711
1712    // ── String interpolation ──────────────────────────────────────────────────
1713
1714    #[test]
1715    fn lex_interpolated_string_simple() {
1716        // "hello ${name}!"
1717        let toks = lex("\"hello ${name}!\"");
1718        // Expected: StringLiteralPart("hello "), InterpolationStart, Ident("name"),
1719        //           InterpolationEnd, StringLiteralPart("!"), Eof
1720        assert_eq!(toks[0].kind, TokenKind::StringLiteralPart);
1721        assert_eq!(toks[0].literal.as_deref(), Some("hello "));
1722        assert_eq!(toks[1].kind, TokenKind::InterpolationStart);
1723        assert_eq!(toks[2].kind, TokenKind::Ident);
1724        assert_eq!(toks[3].kind, TokenKind::InterpolationEnd);
1725        assert_eq!(toks[4].kind, TokenKind::StringLiteralPart);
1726        assert_eq!(toks[4].literal.as_deref(), Some("!"));
1727        assert_eq!(toks[5].kind, TokenKind::Eof);
1728    }
1729
1730    #[test]
1731    fn lex_interpolated_string_nested_braces() {
1732        // "${f({key: val})}"  — inner {} must not close the interpolation
1733        let toks = lex("\"${f({key: val})}\"");
1734        // StringLiteralPart(""), InterpolationStart, Ident(f), LParen,
1735        // LBrace, Ident(key), Colon, Ident(val), RBrace,
1736        // RParen, InterpolationEnd, StringLiteralPart(""), Eof
1737        let ks: Vec<_> = toks.iter().map(|t| t.kind.clone()).collect();
1738        assert_eq!(
1739            ks,
1740            vec![
1741                TokenKind::StringLiteralPart, // ""
1742                TokenKind::InterpolationStart,
1743                TokenKind::Ident, // f
1744                TokenKind::LParen,
1745                TokenKind::LBrace,
1746                TokenKind::Ident, // key
1747                TokenKind::Colon,
1748                TokenKind::Ident, // val
1749                TokenKind::RBrace,
1750                TokenKind::RParen,
1751                TokenKind::InterpolationEnd,
1752                TokenKind::StringLiteralPart, // ""
1753                TokenKind::Eof,
1754            ]
1755        );
1756    }
1757
1758    #[test]
1759    fn lex_interpolated_string_multiple_interps() {
1760        // "${a} + ${b}"
1761        let toks = lex("\"${a} + ${b}\"");
1762        let ks: Vec<_> = toks.iter().map(|t| t.kind.clone()).collect();
1763        assert_eq!(
1764            ks,
1765            vec![
1766                TokenKind::StringLiteralPart, // ""
1767                TokenKind::InterpolationStart,
1768                TokenKind::Ident, // a
1769                TokenKind::InterpolationEnd,
1770                TokenKind::StringLiteralPart, // " + "
1771                TokenKind::InterpolationStart,
1772                TokenKind::Ident, // b
1773                TokenKind::InterpolationEnd,
1774                TokenKind::StringLiteralPart, // ""
1775                TokenKind::Eof,
1776            ]
1777        );
1778        assert_eq!(toks[4].literal.as_deref(), Some(" + "));
1779    }
1780
1781    // ── Character literals ────────────────────────────────────────────────────
1782
1783    #[test]
1784    fn lex_char_simple() {
1785        let toks = lex("'a'");
1786        assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1787        assert_eq!(toks[0].literal.as_deref(), Some("a"));
1788    }
1789
1790    #[test]
1791    fn lex_char_newline_escape() {
1792        let toks = lex("'\\n'");
1793        assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1794        assert_eq!(toks[0].literal.as_deref(), Some("\n"));
1795    }
1796
1797    #[test]
1798    fn lex_char_unicode_escape() {
1799        // '\u{1F600}' → 😀
1800        let toks = lex("'\\u{1F600}'");
1801        assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1802        assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1803    }
1804
1805    #[test]
1806    fn lex_char_multibyte_unicode() {
1807        // '😀' — a directly embedded Unicode character
1808        let toks = lex("'😀'");
1809        assert_eq!(toks[0].kind, TokenKind::CharLiteral);
1810        assert_eq!(toks[0].literal.as_deref(), Some("😀"));
1811    }
1812
1813    // ── Diagnostics for invalid literals ─────────────────────────────────────
1814
1815    #[test]
1816    fn lex_unterminated_string_produces_diagnostic() {
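        // The opening quote is never closed, so the lexer must record at least one error diagnostic.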
1817        let file = SourceFile::new(
1818            bock_errors::FileId(0),
1819            PathBuf::from("test.bock"),
1820            "\"unterminated".to_string(),
1821        );
1822        let mut lexer = Lexer::new(&file);
1823        let _ = lexer.tokenize();
1824        assert!(lexer.diagnostics().has_errors());
1825    }
1826
1827    #[test]
1828    fn lex_empty_char_literal_produces_diagnostic() {
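        // An empty character literal is malformed; we expect an Error token and a diagnostic.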
1829        let file = SourceFile::new(
1830            bock_errors::FileId(0),
1831            PathBuf::from("test.bock"),
1832            "''".to_string(),
1833        );
1834        let mut lexer = Lexer::new(&file);
1835        let toks = lexer.tokenize();
1836        assert_eq!(toks[0].kind, TokenKind::Error);
1837        assert!(lexer.diagnostics().has_errors());
1838    }
1839
1840    #[test]
1841    fn lex_literals_helper() {
1842        // Smoke-test the `literals` helper used in some tests above.
1843        let lits = literals(r#""hi""#);
1844        assert_eq!(lits[0].as_deref(), Some("hi"));
1845    }
1846
1847    // ── Numeric literals (P1.4) ───────────────────────────────────────────────
1848
1849    fn lex_num(src: &str) -> Vec<Token> {
1850        lex(src)
1851    }
1852
1853    #[test]
1854    fn lex_decimal_integer() {
1855        let toks = lex_num("42");
1856        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1857        assert_eq!(toks[0].literal.as_deref(), Some("42"));
1858    }
1859
1860    #[test]
1861    fn lex_decimal_with_underscores() {
1862        let toks = lex_num("1_000_000");
1863        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1864        assert_eq!(toks[0].literal.as_deref(), Some("1_000_000"));
1865    }
1866
1867    #[test]
1868    fn lex_hex_literal() {
1869        let toks = lex_num("0xFF");
1870        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1871        assert_eq!(toks[0].literal.as_deref(), Some("0xFF"));
1872    }
1873
1874    #[test]
1875    fn lex_hex_literal_uppercase_prefix() {
1876        let toks = lex_num("0XFF");
1877        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1878        assert_eq!(toks[0].literal.as_deref(), Some("0XFF"));
1879    }
1880
1881    #[test]
1882    fn lex_octal_literal() {
1883        let toks = lex_num("0o77");
1884        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1885        assert_eq!(toks[0].literal.as_deref(), Some("0o77"));
1886    }
1887
1888    #[test]
1889    fn lex_octal_literal_uppercase_prefix() {
1890        let toks = lex_num("0O77");
1891        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1892        assert_eq!(toks[0].literal.as_deref(), Some("0O77"));
1893    }
1894
1895    #[test]
1896    fn lex_binary_literal() {
1897        let toks = lex_num("0b1010");
1898        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1899        assert_eq!(toks[0].literal.as_deref(), Some("0b1010"));
1900    }
1901
1902    #[test]
1903    fn lex_binary_literal_uppercase_prefix() {
1904        let toks = lex_num("0B1010");
1905        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1906        assert_eq!(toks[0].literal.as_deref(), Some("0B1010"));
1907    }
1908
1909    #[test]
1910    fn lex_float_simple() {
1911        let toks = lex_num("3.14");
1912        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1913        assert_eq!(toks[0].literal.as_deref(), Some("3.14"));
1914    }
1915
1916    #[test]
1917    fn lex_float_exponent_lower() {
1918        let toks = lex_num("1.0e10");
1919        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1920        assert_eq!(toks[0].literal.as_deref(), Some("1.0e10"));
1921    }
1922
1923    #[test]
1924    fn lex_float_exponent_upper() {
1925        let toks = lex_num("2.5E-3");
1926        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1927        assert_eq!(toks[0].literal.as_deref(), Some("2.5E-3"));
1928    }
1929
1930    #[test]
1931    fn lex_float_exponent_no_dot() {
1932        // `1e6` — exponent without fractional part
1933        let toks = lex_num("1e6");
1934        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1935        assert_eq!(toks[0].literal.as_deref(), Some("1e6"));
1936    }
1937
1938    #[test]
1939    fn lex_float_exponent_plus() {
1940        let toks = lex_num("1.5E+3");
1941        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1942        assert_eq!(toks[0].literal.as_deref(), Some("1.5E+3"));
1943    }
1944
1945    #[test]
1946    fn lex_int_with_type_suffix() {
1947        let toks = lex_num("42_u8");
1948        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1949        assert_eq!(toks[0].literal.as_deref(), Some("42_u8"));
1950    }
1951
1952    #[test]
1953    fn lex_float_with_type_suffix() {
1954        let toks = lex_num("3.14_f64");
1955        assert_eq!(toks[0].kind, TokenKind::FloatLiteral);
1956        assert_eq!(toks[0].literal.as_deref(), Some("3.14_f64"));
1957    }
1958
1959    #[test]
1960    fn lex_range_does_not_consume_dotdot() {
1961        // `1..2` must produce IntLiteral(1), DotDot, IntLiteral(2) — not a float.
1962        let toks = lex_num("1..2");
1963        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
1964        assert_eq!(toks[0].literal.as_deref(), Some("1"));
1965        assert_eq!(toks[1].kind, TokenKind::DotDot);
1966        assert_eq!(toks[2].kind, TokenKind::IntLiteral);
1967        assert_eq!(toks[2].literal.as_deref(), Some("2"));
1968    }
1969
1970    #[test]
1971    fn lex_invalid_binary_digit_produces_diagnostic() {
1972        let file = bock_source::SourceFile::new(
1973            bock_errors::FileId(0),
1974            std::path::PathBuf::from("test.bock"),
1975            "0b123".to_string(),
1976        );
1977        let mut lexer = Lexer::new(&file);
1978        let _ = lexer.tokenize();
1979        assert!(
1980            !lexer.diagnostics().is_empty(),
1981            "expected diagnostic for invalid binary digit"
1982        );
1983    }
1984
1985    #[test]
1986    fn lex_invalid_octal_digit_produces_diagnostic() {
1987        let file = bock_source::SourceFile::new(
1988            bock_errors::FileId(0),
1989            std::path::PathBuf::from("test.bock"),
1990            "0o89".to_string(),
1991        );
1992        let mut lexer = Lexer::new(&file);
1993        let _ = lexer.tokenize();
1994        assert!(
1995            !lexer.diagnostics().is_empty(),
1996            "expected diagnostic for invalid octal digit"
1997        );
1998    }
1999
2000    #[test]
2001    fn lex_hex_with_underscores() {
2002        let toks = lex_num("0xFF_FF");
2003        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
2004        assert_eq!(toks[0].literal.as_deref(), Some("0xFF_FF"));
2005    }
2006
2007    #[test]
2008    fn lex_zero_alone() {
2009        let toks = lex_num("0");
2010        assert_eq!(toks[0].kind, TokenKind::IntLiteral);
2011        assert_eq!(toks[0].literal.as_deref(), Some("0"));
2012    }
2013
2014    // ── Comments (P1.5) ───────────────────────────────────────────────────────
2015
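    // Lex `src` and report whether any diagnostics were recorded.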
2016    fn has_errors(src: &str) -> bool {
2017        let file = SourceFile::new(
2018            bock_errors::FileId(0),
2019            std::path::PathBuf::from("test.bock"),
2020            src.to_string(),
2021        );
2022        let mut lexer = Lexer::new(&file);
2023        let _ = lexer.tokenize();
2024        !lexer.diagnostics().is_empty()
2025    }
2026
2027    #[test]
2028    fn lex_line_comment_produces_no_token() {
2029        // A line comment produces no token of its own; the newline that ends it is still emitted.
2030        let toks = kinds("// this is a comment\nfoo");
2031        assert_eq!(
2032            toks,
2033            vec![TokenKind::Newline, TokenKind::Ident, TokenKind::Eof]
2034        );
2035    }
2036
2037    #[test]
2038    fn lex_line_comment_at_eof() {
2039        // A line comment at end of file with no trailing newline
2040        let toks = kinds("// comment at eof");
2041        assert_eq!(toks, vec![TokenKind::Eof]);
2042    }
2043
2044    #[test]
2045    fn lex_doc_comment_produces_token() {
2046        let toks = lex("/// doc comment");
2047        assert_eq!(toks[0].kind, TokenKind::DocComment);
2048        assert_eq!(toks[0].literal.as_deref(), Some("doc comment"));
2049    }
2050
2051    #[test]
2052    fn lex_doc_comment_content_trimmed() {
2053        let toks = lex("///   spaces around   ");
2054        assert_eq!(toks[0].kind, TokenKind::DocComment);
2055        assert_eq!(toks[0].literal.as_deref(), Some("spaces around"));
2056    }
2057
2058    #[test]
2059    fn lex_module_doc_comment_produces_token() {
2060        let toks = lex("//! module doc");
2061        assert_eq!(toks[0].kind, TokenKind::ModuleDocComment);
2062        assert_eq!(toks[0].literal.as_deref(), Some("module doc"));
2063    }
2064
2065    #[test]
2066    fn lex_doc_comment_then_ident() {
2067        let toks = kinds("/// docs\nfoo");
2068        assert_eq!(
2069            toks,
2070            vec![
2071                TokenKind::DocComment,
2072                TokenKind::Newline,
2073                TokenKind::Ident,
2074                TokenKind::Eof,
2075            ]
2076        );
2077    }
2078
2079    #[test]
2080    fn lex_block_comment_produces_no_token() {
2081        let toks = kinds("/* block comment */ foo");
2082        assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2083    }
2084
2085    #[test]
2086    fn lex_nested_block_comment() {
2087        // Block comments nest: the first `*/` closes only the inner comment, not the outer one.
2088        let toks = kinds("/* outer /* inner */ still outer */ foo");
2089        assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2090    }
2091
2092    #[test]
2093    fn lex_deeply_nested_block_comment() {
2094        let toks = kinds("/* a /* b /* c */ b */ a */ x");
2095        assert_eq!(toks, vec![TokenKind::Ident, TokenKind::Eof]);
2096    }
2097
2098    #[test]
2099    fn lex_unterminated_block_comment_produces_diagnostic() {
2100        assert!(
2101            has_errors("/* not closed"),
2102            "expected diagnostic for unterminated block comment"
2103        );
2104    }
2105
2106    #[test]
2107    fn lex_block_comment_inline() {
2108        // Block comment between tokens
2109        let toks = kinds("foo /* ignore */ bar");
2110        assert_eq!(
2111            toks,
2112            vec![TokenKind::Ident, TokenKind::Ident, TokenKind::Eof]
2113        );
2114    }
2115
2116    // ── M-010: Raw multiline string distinct token kind ──────────────────────
2117
2118    #[test]
2119    fn raw_multiline_string_has_distinct_kind() {
2120        let src = "r\"\"\"\nhello\n\"\"\"";
2121        let toks = lex(src);
2122        assert_eq!(toks[0].kind, TokenKind::RawMultiLineStringLiteral);
2123        // Non-raw multiline should still be MultiLineStringLiteral
2124        let toks2 = lex("\"\"\"\nhello\n\"\"\"");
2125        assert_eq!(toks2[0].kind, TokenKind::MultiLineStringLiteral);
2126    }
2127
2128    // ── M-011: Backslash line continuation ───────────────────────────────────
2129
2130    #[test]
2131    fn backslash_newline_joins_lines() {
2132        // A backslash immediately before the newline joins the lines, so this lexes like `let x = 1`.
2133        let toks = kinds("let \\\nx = 1");
2134        assert_eq!(
2135            toks,
2136            vec![
2137                TokenKind::Let,
2138                TokenKind::Ident,
2139                TokenKind::Assign,
2140                TokenKind::IntLiteral,
2141                TokenKind::Eof,
2142            ]
2143        );
2144    }
2145
2146    #[test]
2147    fn backslash_without_newline_is_error() {
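        // A backslash not followed by a newline does not form a continuation and lexes as an error.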
2148        let toks = lex("\\x");
2149        assert_eq!(toks[0].kind, TokenKind::Error);
2150    }
2151
2152    #[test]
2153    fn backslash_continuation_multiline_expr() {
2154        // Every line but the last ends with a backslash continuation, so `1 + 2 + 3` lexes as a single expression.
2155        let toks = kinds("1 + \\\n  2 + \\\n  3");
2156        assert_eq!(
2157            toks,
2158            vec![
2159                TokenKind::IntLiteral,
2160                TokenKind::Plus,
2161                TokenKind::IntLiteral,
2162                TokenKind::Plus,
2163                TokenKind::IntLiteral,
2164                TokenKind::Eof,
2165            ]
2166        );
2167    }
2168
2169    #[test]
2170    fn backslash_at_eof_is_error() {
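        // A lone backslash at end of file has no following line to continue, so it lexes as an error.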
2171        let toks = lex("\\");
2172        assert_eq!(toks[0].kind, TokenKind::Error);
2173    }
2174}