qala_compiler/
lexer.rs

1//! the hand-written scanner: [`Lexer::tokenize`] turns Qala source text into a
2//! `Vec<Token>` ending in [`TokenKind::Eof`], or the first lex error.
3//!
4//! it is a single pass over `src.as_bytes()`: identifiers, keywords, operators,
5//! punctuation, and digit prefixes are all ASCII, so byte-level peeking is enough
6//! everywhere except inside string and comment bodies (where a multi-byte char is
7//! decoded only to know how far to advance and what to put in the error). every
8//! source index is a checked accessor or a position this scanner produced, and no
9//! malformed input reaches an `unwrap`/`expect`/`panic!`/`unreachable!` -- that
10//! discipline is load-bearing for Phase 6, where this code runs in WASM and a
11//! panic aborts the module.
12//!
13//! the lexer is fail-fast: it returns `Result<Vec<Token>, QalaError>` and stops
14//! at the first error. the parser is fail-fast for the same reason -- the first
15//! error wins and parsing stops; multi-error reporting and resync points are
16//! deferred (per CONTEXT.md).
17
18use std::num::IntErrorKind;
19
20use crate::errors::QalaError;
21use crate::span::Span;
22use crate::token::{Token, TokenKind};
23
/// the scanning context the lexer is currently in. contexts live on a small
/// stack so that interpolations can nest (`"{ "{a}" }"` is a string inside an
/// interpolation inside a string).
///
/// the offsets stored in `StrText` and `Interp` exist for error reporting: they
/// record where the construct was *opened*, so an unterminated string can point
/// at its opening quote and an unterminated interpolation at its `{`, even
/// though the problem is only noticed later.
///
/// every payload is a plain integer or bool, so the enum derives `Copy` along
/// with `Debug`/`PartialEq`/`Eq`; that makes mode values printable in
/// diagnostics and directly comparable in tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Mode {
    /// ordinary token scanning. the lexer starts with exactly this on the stack.
    Normal,
    /// inside the text of a string literal. `open_quote` is the byte offset of
    /// the opening `"`. `saw_interp` becomes true once this string has produced
    /// at least one interpolation, which decides whether the next text run is
    /// emitted as `StrEnd`/`StrMid` rather than `Str`/`StrStart`.
    StrText { open_quote: usize, saw_interp: bool },
    /// inside an interpolation expression opened from a string. `brace_depth`
    /// counts `{`/`}` pairs nested *inside* the interpolation (blocks, struct
    /// literals); only a `}` seen at depth 0 closes it. `open_brace` is the byte
    /// offset of the `{` that opened the interpolation.
    Interp { brace_depth: u32, open_brace: usize },
}
46
47/// the scanner state: the source, its bytes (for cheap ASCII peeking), the
48/// current byte offset, and the mode stack.
49pub struct Lexer<'src> {
50    src: &'src str,
51    bytes: &'src [u8],
52    pos: usize,
53    /// starts as `[Mode::Normal]`; never empties (the bottom `Normal` stays).
54    mode_stack: Vec<Mode>,
55}
56
57impl<'src> Lexer<'src> {
58    /// scan `src` into a token stream ending in [`TokenKind::Eof`], or the first
59    /// lex error.
60    ///
61    /// fail-fast: the first malformed token aborts with its `QalaError`. an empty,
62    /// whitespace-only, or comment-only source yields exactly `[Eof]` and no error.
63    /// a leading UTF-8 BOM is skipped silently.
64    pub fn tokenize(src: &'src str) -> Result<Vec<Token>, QalaError> {
65        let mut lx = Lexer {
66            src,
67            bytes: src.as_bytes(),
68            pos: 0,
69            mode_stack: vec![Mode::Normal],
70        };
71        lx.skip_bom();
72        let mut out = Vec::new();
73        loop {
74            // a string body scans differently: no trivia skipping, no maximal
75            // munch -- copy/decode characters until the close quote or an
76            // interpolation `{`. the string scan pushes one or more tokens itself
77            // (Str, or StrStart/StrMid/StrEnd around the interpolation markers).
78            if matches!(lx.mode_stack.last(), Some(Mode::StrText { .. })) {
79                lx.scan_string_body(&mut out)?;
80                continue;
81            }
82            lx.skip_trivia();
83            let start = lx.pos;
84            match lx.peek() {
85                None => {
86                    // a string or interpolation still open at EOF is the cause
87                    // of an unterminated error; point at where it opened.
88                    if let Some(err) = lx.unterminated_at_eof() {
89                        return Err(err);
90                    }
91                    out.push(Token::new(TokenKind::Eof, Span::new(start, 0)));
92                    break;
93                }
94                // a `"` opens a string literal: push StrText and let the next
95                // iteration's string-body branch produce the token(s). handled
96                // here rather than in scan_token because a string is not a single
97                // token -- an interpolated one is several.
98                Some(b'"') => {
99                    lx.pos += 1; // consume the opening quote
100                    lx.mode_stack.push(Mode::StrText {
101                        open_quote: start,
102                        saw_interp: false,
103                    });
104                }
105                Some(b) => {
106                    let kind = lx.scan_token(b, start)?;
107                    out.push(Token::new(kind, Span::new(start, lx.pos - start)));
108                }
109            }
110        }
111        Ok(out)
112    }
113
114    // ---- peeking and advancing -------------------------------------------------
115
116    /// the byte at the cursor, or `None` at end of input.
117    fn peek(&self) -> Option<u8> {
118        self.bytes.get(self.pos).copied()
119    }
120
121    /// the byte one past the cursor, or `None`.
122    fn peek2(&self) -> Option<u8> {
123        self.bytes.get(self.pos + 1).copied()
124    }
125
126    /// the byte at the cursor; advances the cursor past it if there is one.
127    fn bump(&mut self) -> Option<u8> {
128        let b = self.peek();
129        if b.is_some() {
130            self.pos += 1;
131        }
132        b
133    }
134
135    // ---- trivia ---------------------------------------------------------------
136
137    /// strip exactly the 3-byte UTF-8 BOM (`EF BB BF`) if the source starts with
138    /// it. only at offset 0, only once.
139    fn skip_bom(&mut self) {
140        if self.src.starts_with('\u{FEFF}') {
141            self.pos += '\u{FEFF}'.len_utf8();
142        }
143    }
144
145    /// skip ASCII whitespace and `//` line comments until the next real byte.
146    /// `//` runs to (not including) the next `\n`; a line that is only `//` is
147    /// fine; there are no block comments, so `/x` is `Slash` then `x`.
148    fn skip_trivia(&mut self) {
149        loop {
150            match self.peek() {
151                Some(b) if b.is_ascii_whitespace() => {
152                    self.pos += 1;
153                }
154                Some(b'/') if self.peek2() == Some(b'/') => {
155                    // consume "//" then everything to end of line.
156                    self.pos += 2;
157                    while let Some(b) = self.peek() {
158                        if b == b'\n' {
159                            break;
160                        }
161                        self.pos += 1;
162                    }
163                }
164                _ => break,
165            }
166        }
167    }
168
169    // ---- the dispatcher -------------------------------------------------------
170
171    /// classify the token starting at `b` (the byte at `start == self.pos`) and
172    /// advance the cursor past it. returns the token kind, or a lex error.
173    ///
174    /// `"` is never seen here -- the main loop handles a string opening before
175    /// calling this, because an interpolated string is more than one token.
176    fn scan_token(&mut self, b: u8, start: usize) -> Result<TokenKind, QalaError> {
177        // an identifier or keyword: ASCII letter or `_`, then letters/digits/`_`.
178        if b == b'_' || b.is_ascii_alphabetic() {
179            // a `b` immediately followed by `'` is a byte literal, not an
180            // identifier; everything else starting with `b` (`by`, `b 1`, `b`)
181            // is the identifier `b`.
182            if b == b'b' && self.peek2() == Some(b'\'') {
183                return self.scan_byte_literal(start);
184            }
185            return Ok(self.scan_identifier());
186        }
187        if b.is_ascii_digit() {
188            return self.scan_number(start);
189        }
190        // a brace while an interpolation is on top does brace-depth bookkeeping.
191        if (b == b'{' || b == b'}') && matches!(self.mode_stack.last(), Some(Mode::Interp { .. })) {
192            return Ok(self.scan_brace_in_interp(b));
193        }
194        match b {
195            b'(' => {
196                self.pos += 1;
197                Ok(TokenKind::LParen)
198            }
199            b')' => {
200                self.pos += 1;
201                Ok(TokenKind::RParen)
202            }
203            b'[' => {
204                self.pos += 1;
205                Ok(TokenKind::LBracket)
206            }
207            b']' => {
208                self.pos += 1;
209                Ok(TokenKind::RBracket)
210            }
211            b'{' => {
212                self.pos += 1;
213                Ok(TokenKind::LBrace)
214            }
215            b'}' => {
216                self.pos += 1;
217                Ok(TokenKind::RBrace)
218            }
219            b',' => {
220                self.pos += 1;
221                Ok(TokenKind::Comma)
222            }
223            b':' => {
224                self.pos += 1;
225                Ok(TokenKind::Colon)
226            }
227            b';' => {
228                self.pos += 1;
229                Ok(TokenKind::Semi)
230            }
231            b'+' => {
232                self.pos += 1;
233                Ok(TokenKind::Plus)
234            }
235            b'*' => {
236                self.pos += 1;
237                Ok(TokenKind::Star)
238            }
239            b'%' => {
240                self.pos += 1;
241                Ok(TokenKind::Percent)
242            }
243            b'?' => {
244                self.pos += 1;
245                Ok(TokenKind::Question)
246            }
247            // `/` here is always division: `//` was consumed by skip_trivia.
248            b'/' => {
249                self.pos += 1;
250                Ok(TokenKind::Slash)
251            }
252            // `.` `..` `..=` -- never the start of a float (a number only starts
253            // on a digit), so this is unambiguous.
254            b'.' => {
255                self.pos += 1;
256                if self.peek() == Some(b'.') {
257                    self.pos += 1;
258                    if self.peek() == Some(b'=') {
259                        self.pos += 1;
260                        Ok(TokenKind::DotDotEq)
261                    } else {
262                        Ok(TokenKind::DotDot)
263                    }
264                } else {
265                    Ok(TokenKind::Dot)
266                }
267            }
268            b'-' => {
269                self.pos += 1;
270                if self.peek() == Some(b'>') {
271                    self.pos += 1;
272                    Ok(TokenKind::Arrow)
273                } else {
274                    Ok(TokenKind::Minus)
275                }
276            }
277            b'=' => {
278                self.pos += 1;
279                match self.peek() {
280                    Some(b'=') => {
281                        self.pos += 1;
282                        Ok(TokenKind::EqEq)
283                    }
284                    Some(b'>') => {
285                        self.pos += 1;
286                        Ok(TokenKind::FatArrow)
287                    }
288                    _ => Ok(TokenKind::Eq),
289                }
290            }
291            b'!' => {
292                self.pos += 1;
293                if self.peek() == Some(b'=') {
294                    self.pos += 1;
295                    Ok(TokenKind::BangEq)
296                } else {
297                    Ok(TokenKind::Bang)
298                }
299            }
300            b'<' => {
301                self.pos += 1;
302                if self.peek() == Some(b'=') {
303                    self.pos += 1;
304                    Ok(TokenKind::LtEq)
305                } else {
306                    Ok(TokenKind::Lt)
307                }
308            }
309            b'>' => {
310                self.pos += 1;
311                if self.peek() == Some(b'=') {
312                    self.pos += 1;
313                    Ok(TokenKind::GtEq)
314                } else {
315                    Ok(TokenKind::Gt)
316                }
317            }
318            // `&&` is the only `&` form; a lone `&` is rejected (no bitwise `&`,
319            // no reference syntax in v1).
320            b'&' => {
321                if self.peek2() == Some(b'&') {
322                    self.pos += 2;
323                    Ok(TokenKind::AmpAmp)
324                } else {
325                    let span = Span::new(start, 1);
326                    self.pos += 1;
327                    Err(QalaError::UnexpectedChar { span, ch: '&' })
328                }
329            }
330            // `||` and `|>` are the only `|` forms; a lone `|` is rejected (no
331            // bitwise `|`, no pattern alternatives in v1).
332            b'|' => match self.peek2() {
333                Some(b'|') => {
334                    self.pos += 2;
335                    Ok(TokenKind::PipePipe)
336                }
337                Some(b'>') => {
338                    self.pos += 2;
339                    Ok(TokenKind::PipeGt)
340                }
341                _ => {
342                    let span = Span::new(start, 1);
343                    self.pos += 1;
344                    Err(QalaError::UnexpectedChar { span, ch: '|' })
345                }
346            },
347            // any non-ASCII byte outside a string or comment cannot begin a
348            // token. decode the whole character (so we know how far to advance
349            // and what to report), advance past it, and error.
350            _ if b >= 0x80 => {
351                let ch = self.src[self.pos..].chars().next().unwrap_or('\u{FFFD}');
352                let len = ch.len_utf8();
353                let span = Span::new(start, len);
354                self.pos += len;
355                Err(QalaError::UnexpectedChar { span, ch })
356            }
357            // a stray ASCII control or symbol byte we do not lex (`@`, `#`, `$`,
358            // `` ` ``, `~`, `^`, `\`, `'` outside a byte literal, ...).
359            _ => {
360                let span = Span::new(start, 1);
361                self.pos += 1;
362                Err(QalaError::UnexpectedChar {
363                    span,
364                    ch: b as char,
365                })
366            }
367        }
368    }
369
370    /// scan `[A-Za-z_][A-Za-z0-9_]*` from the cursor and classify it: a reserved
371    /// word becomes its keyword kind, anything else becomes [`TokenKind::Ident`].
372    fn scan_identifier(&mut self) -> TokenKind {
373        let start = self.pos;
374        while let Some(b) = self.peek() {
375            if b == b'_' || b.is_ascii_alphanumeric() {
376                self.pos += 1;
377            } else {
378                break;
379            }
380        }
381        let text = &self.src[start..self.pos];
382        match crate::token::keyword(text) {
383            Some(kw) => kw,
384            None => TokenKind::Ident(text.to_string()),
385        }
386    }
387
388    /// brace-depth bookkeeping while an interpolation is the top mode. `{` deepens
389    /// it; `}` either un-deepens a nested block or, at depth 0, closes the
390    /// interpolation (emit [`TokenKind::InterpEnd`], pop back to the enclosing
391    /// `StrText`).
392    fn scan_brace_in_interp(&mut self, b: u8) -> TokenKind {
393        self.pos += 1; // consume the brace
394        match b {
395            b'{' => {
396                if let Some(Mode::Interp { brace_depth, .. }) = self.mode_stack.last_mut() {
397                    *brace_depth += 1;
398                }
399                TokenKind::LBrace
400            }
401            // b'}'
402            _ => {
403                let at_depth_zero = matches!(
404                    self.mode_stack.last(),
405                    Some(Mode::Interp { brace_depth: 0, .. })
406                );
407                if at_depth_zero {
408                    self.mode_stack.pop(); // leave the Interp; the StrText below resumes
409                    TokenKind::InterpEnd
410                } else {
411                    if let Some(Mode::Interp { brace_depth, .. }) = self.mode_stack.last_mut() {
412                        // depth > 0 here, so this does not underflow.
413                        *brace_depth -= 1;
414                    }
415                    TokenKind::RBrace
416                }
417            }
418        }
419    }
420
421    // ---- numeric and byte literals --------------------------------------------
422
423    /// scan a numeric literal starting at `start` (the digit at `self.pos`).
424    ///
425    /// the literal scans as a *non-negative magnitude*: a leading `-` is always a
426    /// separate `Minus` token the parser folds in later, so the bare literal
427    /// `9223372036854775808` (with no `-`) is reported as overflow -- acceptable
428    /// v1 behavior; folding `- 9223372036854775808` into `i64::MIN` is left for
429    /// the parser.
430    ///
431    /// the disambiguation that matters: a `.` is part of the number only when the
432    /// byte *after* it is an ASCII digit. that single rule keeps `.5` as `Dot`
433    /// then `Int(5)`, `0..5` as `Int(0)` `DotDot` `Int(5)`, and `x.0` as
434    /// `Ident(x)` `Dot` `Int(0)` -- field access and ranges stay unambiguous.
435    fn scan_number(&mut self, start: usize) -> Result<TokenKind, QalaError> {
436        // prefix forms: 0x / 0b. checked before the decimal path so `0x` is not
437        // mis-scanned as the decimal `0` followed by the identifier `x`.
438        if self.peek() == Some(b'0') {
439            match self.peek2() {
440                Some(b'x') | Some(b'X') => return self.scan_radix_int(start, 16),
441                Some(b'b') | Some(b'B') => return self.scan_radix_int(start, 2),
442                _ => {}
443            }
444        }
445        self.scan_decimal(start)
446    }
447
448    /// scan a `0x...` (radix 16) or `0b...` (radix 2) integer literal: the two
449    /// prefix bytes, then a run of valid digits and `_` separators with at least
450    /// one digit. an empty body (`0x`, `0b`), an out-of-range digit (`0xG`,
451    /// `0b2`), or a misplaced `_` is [`QalaError::MalformedNumber`]; a magnitude
452    /// past `i64::MAX` is [`QalaError::IntOverflow`].
453    fn scan_radix_int(&mut self, start: usize, radix: u32) -> Result<TokenKind, QalaError> {
454        self.pos += 2; // consume "0x" / "0b"
455        let body_start = self.pos;
456        let is_digit = |b: u8| match radix {
457            16 => b.is_ascii_hexdigit(),
458            2 => b == b'0' || b == b'1',
459            _ => false,
460        };
461        // scan digits + underscores, tracking the last char class for the `_` rules.
462        let mut last_was_underscore = false;
463        let mut saw_digit = false;
464        let mut malformed = false;
465        while let Some(b) = self.peek() {
466            if is_digit(b) {
467                last_was_underscore = false;
468                saw_digit = true;
469                self.pos += 1;
470            } else if b == b'_' {
471                // a `_` is valid only between digits: it cannot start the body and
472                // cannot follow another `_`.
473                if !saw_digit || last_was_underscore {
474                    malformed = true;
475                }
476                last_was_underscore = true;
477                self.pos += 1;
478            } else {
479                break;
480            }
481        }
482        // a body that ends in `_` (`0xFF_`) is malformed; an empty body (`0x`) too.
483        // also reject a trailing alphanumeric like `0xG` or `0b2`: such a byte
484        // right after the body is the user's mistake, not the start of a new
485        // identifier. consume the whole offending tail so the error span covers it.
486        if self
487            .peek()
488            .is_some_and(|b| b == b'_' || b.is_ascii_alphanumeric())
489        {
490            self.consume_alnum_tail();
491            malformed = true;
492        }
493        if malformed || !saw_digit || last_was_underscore {
494            return Err(self.malformed_number(start));
495        }
496        let digits: String = self.src[body_start..self.pos]
497            .chars()
498            .filter(|&c| c != '_')
499            .collect();
500        match i64::from_str_radix(&digits, radix) {
501            Ok(value) => Ok(TokenKind::Int(value)),
502            Err(e) => Err(self.int_parse_error(start, e.kind())),
503        }
504    }
505
506    /// scan a decimal integer or float starting at `start`. integer part: digits +
507    /// `_`. then, only if a `.` is followed by an ASCII digit, a fractional part.
508    /// then, optionally, an `e`/`E` exponent (`[+-]?` then a digit run). a
509    /// misplaced `_`, an `e` with no exponent digits, or a stray alphanumeric tail
510    /// is [`QalaError::MalformedNumber`]; an integer past `i64::MAX` is
511    /// [`QalaError::IntOverflow`].
512    fn scan_decimal(&mut self, start: usize) -> Result<TokenKind, QalaError> {
513        let mut is_float = false;
514
515        // integer part: at least one digit (we were called on a digit), then more
516        // digits and `_`. the helper rejects a `_` that does not sit between two
517        // digits within this run.
518        self.scan_digit_run(start)?;
519
520        // fractional part. a `.` is the decimal point only when something digit-
521        // like follows it: a digit (`1.0` -- a float) or a `_` (`1._5` -- a
522        // malformed fractional part, consumed so the error span covers it). a `.`
523        // followed by anything else is NOT part of the number, so `0..5` stays
524        // `Int(0) DotDot Int(5)` and `x.0` stays `Ident(x) Dot Int(0)`.
525        if self.peek() == Some(b'.') {
526            match self.peek2() {
527                Some(b) if b.is_ascii_digit() => {
528                    is_float = true;
529                    self.pos += 1; // consume the `.`
530                    self.scan_digit_run(start)?;
531                }
532                Some(b'_') => {
533                    self.pos += 1; // consume the `.`
534                    self.consume_alnum_tail();
535                    return Err(self.malformed_number(start));
536                }
537                _ => {}
538            }
539        }
540
541        // exponent: `e`/`E`, an optional sign, then a digit run that must be
542        // non-empty (`1e` is malformed, `1e10` and `1.5e-3` are floats).
543        if matches!(self.peek(), Some(b'e') | Some(b'E')) {
544            is_float = true;
545            self.pos += 1; // consume `e`/`E`
546            if matches!(self.peek(), Some(b'+') | Some(b'-')) {
547                self.pos += 1;
548            }
549            if !self.peek().is_some_and(|b| b.is_ascii_digit()) {
550                // no exponent digits: consume any alphanumeric tail for the span,
551                // then error.
552                self.consume_alnum_tail();
553                return Err(self.malformed_number(start));
554            }
555            self.scan_digit_run(start)?;
556        }
557
558        // a stray alphanumeric byte right after the literal (`1abc`, `0x` after a
559        // decimal, ...) is the user's mistake, not the start of a new token.
560        if self
561            .peek()
562            .is_some_and(|b| b == b'_' || b.is_ascii_alphanumeric())
563        {
564            self.consume_alnum_tail();
565            return Err(self.malformed_number(start));
566        }
567
568        let text: String = self.src[start..self.pos]
569            .chars()
570            .filter(|&c| c != '_')
571            .collect();
572        if is_float {
573            match text.parse::<f64>() {
574                Ok(value) => Ok(TokenKind::Float(value)),
575                // a non-round-tripping float is NOT an error (a Phase 4 precision
576                // warning handles it); only a shape that slipped past the scanner
577                // lands here, which is genuinely malformed.
578                Err(_) => Err(self.malformed_number(start)),
579            }
580        } else {
581            match text.parse::<i64>() {
582                Ok(value) => Ok(TokenKind::Int(value)),
583                Err(e) => Err(self.int_parse_error(start, e.kind())),
584            }
585        }
586    }
587
588    /// advance over a run of ASCII digits and `_` separators, where each `_` must
589    /// sit between two digits. the run must contain at least one digit (the caller
590    /// only invokes this when the next byte is a digit, or just after a `.`/`e`
591    /// that requires digits). a misplaced `_` -> [`QalaError::MalformedNumber`].
592    fn scan_digit_run(&mut self, literal_start: usize) -> Result<(), QalaError> {
593        let mut last_was_underscore = false;
594        let mut saw_digit = false;
595        // a leading `_` cannot start a digit run (e.g. `1._5`, `1e_5`).
596        if self.peek() == Some(b'_') {
597            self.consume_alnum_tail();
598            return Err(self.malformed_number(literal_start));
599        }
600        while let Some(b) = self.peek() {
601            if b.is_ascii_digit() {
602                last_was_underscore = false;
603                saw_digit = true;
604                self.pos += 1;
605            } else if b == b'_' {
606                if last_was_underscore {
607                    // a doubled `_` (`1__0`).
608                    self.consume_alnum_tail();
609                    return Err(self.malformed_number(literal_start));
610                }
611                last_was_underscore = true;
612                self.pos += 1;
613            } else {
614                break;
615            }
616        }
617        // a run that ends in `_` (`1_`, `1_.0`) is malformed -- the `_` is not
618        // between two digits.
619        if last_was_underscore || !saw_digit {
620            self.consume_alnum_tail();
621            return Err(self.malformed_number(literal_start));
622        }
623        Ok(())
624    }
625
626    /// advance past any run of `_`, ASCII alphanumeric, or `.` bytes at the
627    /// cursor, so a malformed-literal span covers the whole offending text
628    /// (`1_.0`, `0xFG`, `1abc`) rather than ending mid-word. only ever called on
629    /// an error path, so a `.` consumed here never matters to a valid token.
630    fn consume_alnum_tail(&mut self) {
631        while let Some(b) = self.peek() {
632            if b == b'_' || b.is_ascii_alphanumeric() || b == b'.' {
633                self.pos += 1;
634            } else {
635                break;
636            }
637        }
638    }
639
640    /// build a [`QalaError::MalformedNumber`] spanning the literal from
641    /// `literal_start` to the cursor, with a message echoing the offending text.
642    fn malformed_number(&self, literal_start: usize) -> QalaError {
643        let text = &self.src[literal_start..self.pos];
644        QalaError::MalformedNumber {
645            span: Span::new(literal_start, self.pos - literal_start),
646            message: format!("`{text}`"),
647        }
648    }
649
650    /// map a [`std::num::IntErrorKind`] from parsing an integer literal to the
651    /// right error: overflow (either sign, since the scanner only ever hands a
652    /// magnitude) -> [`QalaError::IntOverflow`]; anything else -> malformed.
653    fn int_parse_error(&self, literal_start: usize, kind: &IntErrorKind) -> QalaError {
654        match kind {
655            IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => QalaError::IntOverflow {
656                span: Span::new(literal_start, self.pos - literal_start),
657            },
658            _ => self.malformed_number(literal_start),
659        }
660    }
661
662    /// scan a `b'X'` byte literal starting at `start` (the cursor is on `b`, and
663    /// the byte after it is `'` -- the dispatcher checked).
664    ///
665    /// inside `b'...'` is exactly one ASCII char, or one of the one-byte escapes
666    /// `\n \t \r \0 \\ \'`. anything else -- empty (`b''`), two or more chars
667    /// (`b'ab'`), a bad escape (`b'\x'`), a non-ASCII byte (`b'é'`), or a missing
668    /// closing quote -- is [`QalaError::BadByteLiteral`] spanning from `b` to
669    /// where scanning stopped.
670    fn scan_byte_literal(&mut self, start: usize) -> Result<TokenKind, QalaError> {
671        self.pos += 2; // consume `b` and the opening `'`
672        let value: u8 = match self.peek() {
673            // empty: `b''`.
674            Some(b'\'') => {
675                self.pos += 1;
676                return Err(
677                    self.bad_byte_literal(start, "expected one character between `b'` and `'`")
678                );
679            }
680            // an escape: `b'\X'`.
681            Some(b'\\') => {
682                self.pos += 1;
683                match self.bump() {
684                    Some(b'n') => b'\n',
685                    Some(b't') => b'\t',
686                    Some(b'r') => b'\r',
687                    Some(b'0') => 0,
688                    Some(b'\\') => b'\\',
689                    Some(b'\'') => b'\'',
690                    _ => {
691                        // consume a closing quote if present so the span is tidy.
692                        if self.peek() == Some(b'\'') {
693                            self.pos += 1;
694                        }
695                        return Err(self.bad_byte_literal(start, "unknown escape in byte literal"));
696                    }
697                }
698            }
699            // a non-ASCII byte: not a valid byte-literal char.
700            Some(b) if b >= 0x80 => {
701                let ch = self.src[self.pos..].chars().next().unwrap_or('\u{FFFD}');
702                self.pos += ch.len_utf8();
703                // consume a closing quote too if it is there, so the span is tidy.
704                if self.peek() == Some(b'\'') {
705                    self.pos += 1;
706                }
707                return Err(
708                    self.bad_byte_literal(start, "byte literal must be a single ASCII character")
709                );
710            }
711            // a raw newline or EOF before any content.
712            Some(b'\n') | None => {
713                return Err(self.bad_byte_literal(start, "unterminated byte literal"));
714            }
715            // an ordinary ASCII char.
716            Some(b) => {
717                self.pos += 1;
718                b
719            }
720        };
721        // require the closing `'`. if the next byte is another char (not `'`), the
722        // literal had two or more chars; consume to the next `'` or boundary so the
723        // span covers the mistake.
724        match self.peek() {
725            Some(b'\'') => {
726                self.pos += 1;
727                Ok(TokenKind::Byte(value))
728            }
729            _ => {
730                while let Some(b) = self.peek() {
731                    if b == b'\'' {
732                        self.pos += 1;
733                        break;
734                    }
735                    if b == b'\n' {
736                        break;
737                    }
738                    self.pos += 1;
739                }
740                Err(self.bad_byte_literal(start, "byte literal must be exactly one character"))
741            }
742        }
743    }
744
745    /// build a [`QalaError::BadByteLiteral`] spanning from `literal_start` to the
746    /// cursor.
747    fn bad_byte_literal(&self, literal_start: usize, message: &str) -> QalaError {
748        QalaError::BadByteLiteral {
749            span: Span::new(literal_start, self.pos - literal_start),
750            message: message.to_string(),
751        }
752    }
753
754    // ---- string literals and interpolation ------------------------------------
755
756    /// scan a run of string-literal text once `StrText` is the top mode. the
757    /// opening `"` (or, on a resumed call, the closing `}` of the preceding
758    /// interpolation) has already been consumed, so `self.pos` is at the first
759    /// content byte.
760    ///
761    /// it decodes escapes into a buffer and stops at one of:
762    /// - the closing `"` -- pushes [`TokenKind::Str`] (no interpolation seen) or
763    ///   [`TokenKind::StrEnd`] (resuming after the last interpolation), pops the
764    ///   `StrText`. the `Str` span runs from the opening quote through the closing
765    ///   quote inclusive; the `StrEnd` span runs from just after the preceding `}`
766    ///   through the closing quote.
767    /// - an unescaped `{` -- pushes [`TokenKind::StrStart`] (first interpolation)
768    ///   or [`TokenKind::StrMid`] (a later one) for the text so far, then
769    ///   [`TokenKind::InterpStart`] for the `{`, then a [`Mode::Interp`] so the
770    ///   main loop scans the embedded expression. a literal `{` in a string is
771    ///   written `\{`, handled in the escape path, so a bare `{` always opens an
772    ///   interpolation.
773    /// - a backslash -- decodes `\n \t \r \0 \\ \" \{ \}` or `\u{1..6 hex}` into
774    ///   the buffer; anything else is [`QalaError::InvalidEscape`] whose span is
775    ///   the backslash (not the bad character after it).
776    /// - a raw newline or EOF -- [`QalaError::UnterminatedString`] whose span is
777    ///   the opening quote (no multi-line strings in v1).
778    fn scan_string_body(&mut self, out: &mut Vec<Token>) -> Result<(), QalaError> {
779        // pull the opening-quote offset and whether this string has already had an
780        // interpolation. `open_quote` is needed for the `Str` span and the
781        // unterminated-string span; `saw_interp` decides Str-vs-StrEnd and
782        // StrStart-vs-StrMid.
783        let (open_quote, saw_interp) = match self.mode_stack.last() {
784            Some(Mode::StrText {
785                open_quote,
786                saw_interp,
787            }) => (*open_quote, *saw_interp),
788            _ => return Ok(()), // not in a string; nothing to do
789        };
790        let text_start = self.pos;
791        let mut buf = String::new();
792        loop {
793            match self.peek() {
794                // closing quote: the string ends here.
795                Some(b'"') => {
796                    self.pos += 1; // consume the closing quote
797                    self.mode_stack.pop(); // leave StrText
798                    if saw_interp {
799                        // span: just after the preceding `}` through the closing
800                        // quote.
801                        let span = Span::new(text_start, self.pos - text_start);
802                        out.push(Token::new(TokenKind::StrEnd(buf), span));
803                    } else {
804                        // span: opening quote through closing quote inclusive.
805                        let span = Span::new(open_quote, self.pos - open_quote);
806                        out.push(Token::new(TokenKind::Str(buf), span));
807                    }
808                    return Ok(());
809                }
810                // unescaped `{`: opens an interpolation.
811                Some(b'{') => {
812                    let brace_pos = self.pos;
813                    // emit the literal text so far.
814                    if saw_interp {
815                        // a StrMid spans just the text between the previous `}`
816                        // and this `{`.
817                        let span = Span::new(text_start, brace_pos - text_start);
818                        out.push(Token::new(TokenKind::StrMid(buf), span));
819                    } else {
820                        // a StrStart spans the opening quote through the text
821                        // before this `{`.
822                        let span = Span::new(open_quote, brace_pos - open_quote);
823                        out.push(Token::new(TokenKind::StrStart(buf), span));
824                    }
825                    // mark the enclosing StrText so the next text run closes with
826                    // StrEnd / opens with StrMid.
827                    if let Some(Mode::StrText { saw_interp, .. }) = self.mode_stack.last_mut() {
828                        *saw_interp = true;
829                    }
830                    // emit the InterpStart for the `{` and enter Interp mode.
831                    self.pos += 1; // consume the `{`
832                    out.push(Token::new(TokenKind::InterpStart, Span::new(brace_pos, 1)));
833                    self.mode_stack.push(Mode::Interp {
834                        brace_depth: 0,
835                        open_brace: brace_pos,
836                    });
837                    return Ok(());
838                }
839                // backslash: an escape sequence.
840                Some(b'\\') => {
841                    let backslash_pos = self.pos;
842                    self.pos += 1; // consume the `\`
843                    match self.bump() {
844                        Some(b'n') => buf.push('\n'),
845                        Some(b't') => buf.push('\t'),
846                        Some(b'r') => buf.push('\r'),
847                        Some(b'0') => buf.push('\0'),
848                        Some(b'\\') => buf.push('\\'),
849                        Some(b'"') => buf.push('"'),
850                        Some(b'{') => buf.push('{'),
851                        Some(b'}') => buf.push('}'),
852                        Some(b'u') => {
853                            let ch = self.scan_unicode_escape(backslash_pos)?;
854                            buf.push(ch);
855                        }
856                        _ => {
857                            return Err(QalaError::InvalidEscape {
858                                span: Span::new(backslash_pos, 1),
859                                message: "unknown escape sequence".to_string(),
860                            });
861                        }
862                    }
863                }
864                // a raw newline before the close: no multi-line strings.
865                Some(b'\n') | None => {
866                    return Err(QalaError::UnterminatedString {
867                        span: Span::new(open_quote, 1),
868                    });
869                }
870                // any other byte (ASCII or the lead of a multi-byte char) is
871                // literal content. decode the whole char so a multi-byte char
872                // (a non-ASCII char inside a string is fine) lands in the buffer
873                // intact and the cursor advances by the right number of bytes.
874                Some(_) => {
875                    let ch = self.src[self.pos..].chars().next().unwrap_or('\u{FFFD}');
876                    self.pos += ch.len_utf8();
877                    buf.push(ch);
878                }
879            }
880        }
881    }
882
883    /// decode a `\u{1..6 hex}` escape. the cursor is just past the `u`; this
884    /// expects `{`, 1..=6 hex digits, then `}`, and validates the codepoint with
885    /// [`char::from_u32`] (which rejects surrogates `D800..DFFF` and values above
886    /// `10FFFF`). any malformation -- no `{`, no digits, too many digits, no `}`,
887    /// or a bad codepoint -- is [`QalaError::InvalidEscape`] whose span is the
888    /// backslash at `backslash_pos`.
889    fn scan_unicode_escape(&mut self, backslash_pos: usize) -> Result<char, QalaError> {
890        let bad = |message: &str| QalaError::InvalidEscape {
891            span: Span::new(backslash_pos, 1),
892            message: message.to_string(),
893        };
894        if self.peek() != Some(b'{') {
895            return Err(bad("expected `{` after `\\u`"));
896        }
897        self.pos += 1; // consume `{`
898        let hex_start = self.pos;
899        let mut count = 0usize;
900        while let Some(b) = self.peek() {
901            if b.is_ascii_hexdigit() && count < 6 {
902                count += 1;
903                self.pos += 1;
904            } else {
905                break;
906            }
907        }
908        if count == 0 {
909            return Err(bad("expected 1 to 6 hex digits in `\\u{...}`"));
910        }
911        if self.peek() != Some(b'}') {
912            return Err(bad("expected `}` to close `\\u{...}`"));
913        }
914        let hex = &self.src[hex_start..self.pos];
915        self.pos += 1; // consume `}`
916        // count <= 6 hex digits fit in u32; from_str_radix only errors on an
917        // empty string here, which count == 0 already ruled out.
918        let code = u32::from_str_radix(hex, 16).map_err(|_| bad("invalid `\\u{...}` codepoint"))?;
919        char::from_u32(code).ok_or_else(|| bad("invalid Unicode codepoint in `\\u{...}`"))
920    }
921
922    /// if EOF was reached with a string or interpolation still open on the mode
923    /// stack, return the matching unterminated error pointing at where it opened.
924    ///
925    /// a string body is always fully consumed (or errored) inside
926    /// `scan_string_body` before control returns here, so in practice this fires
927    /// for an [`Mode::Interp`] that never saw its closing `}` (`"x{y"`), reporting
928    /// at the `{`. the `StrText` arm is a defensive backstop.
929    fn unterminated_at_eof(&self) -> Option<QalaError> {
930        match self.mode_stack.last() {
931            Some(Mode::StrText { open_quote, .. }) => Some(QalaError::UnterminatedString {
932                span: Span::new(*open_quote, 1),
933            }),
934            Some(Mode::Interp { open_brace, .. }) => Some(QalaError::UnterminatedInterpolation {
935                span: Span::new(*open_brace, 1),
936            }),
937            _ => None,
938        }
939    }
940}
941
942#[cfg(test)]
943mod tests {
944    use super::*;
945
946    /// tokenize, asserting success, and return the kinds (dropping spans) for
947    /// terse stream comparisons.
948    fn kinds(src: &str) -> Vec<TokenKind> {
949        Lexer::tokenize(src)
950            .expect("expected successful tokenize")
951            .into_iter()
952            .map(|t| t.kind)
953            .collect()
954    }
955
956    /// tokenize, asserting it failed, and return the error.
957    fn err(src: &str) -> QalaError {
958        Lexer::tokenize(src).expect_err("expected a lex error")
959    }
960
961    #[test]
962    fn empty_source_is_just_eof_at_offset_zero() {
963        let toks = Lexer::tokenize("").unwrap();
964        assert_eq!(toks.len(), 1);
965        assert_eq!(toks[0].kind, TokenKind::Eof);
966        assert_eq!(toks[0].span, Span::new(0, 0));
967    }
968
969    #[test]
970    fn whitespace_only_source_is_just_eof() {
971        assert_eq!(kinds("   \n\t \r\n"), vec![TokenKind::Eof]);
972    }
973
974    #[test]
975    fn comment_only_source_is_just_eof() {
976        assert_eq!(kinds("// hi\n  \n"), vec![TokenKind::Eof]);
977        assert_eq!(kinds("// just a comment, no newline"), vec![TokenKind::Eof]);
978        // a line that is only "//" is fine.
979        assert_eq!(kinds("//\n"), vec![TokenKind::Eof]);
980    }
981
982    #[test]
983    fn whitespace_and_comments_mixed_is_just_eof() {
984        assert_eq!(kinds("   \n\t // hi\n   // more\n  "), vec![TokenKind::Eof]);
985    }
986
987    #[test]
988    fn leading_bom_is_skipped_silently() {
989        let toks = kinds("\u{FEFF}fn main(){}");
990        assert_eq!(
991            toks,
992            vec![
993                TokenKind::Fn,
994                TokenKind::Ident("main".to_string()),
995                TokenKind::LParen,
996                TokenKind::RParen,
997                TokenKind::LBrace,
998                TokenKind::RBrace,
999                TokenKind::Eof,
1000            ]
1001        );
1002        // a bare BOM file is still just Eof, no BOM token.
1003        assert_eq!(kinds("\u{FEFF}"), vec![TokenKind::Eof]);
1004    }
1005
1006    #[test]
1007    fn identifiers_lex_with_their_text() {
1008        for name in ["_x", "x1", "__", "fooBar", "_", "main", "abc123", "X"] {
1009            assert_eq!(
1010                kinds(name),
1011                vec![TokenKind::Ident(name.to_string()), TokenKind::Eof],
1012                "identifier {name:?}"
1013            );
1014        }
1015    }
1016
1017    #[test]
1018    fn reserved_words_lex_to_keyword_kinds_not_idents() {
1019        let cases: &[(&str, TokenKind)] = &[
1020            ("fn", TokenKind::Fn),
1021            ("let", TokenKind::Let),
1022            ("mut", TokenKind::Mut),
1023            ("if", TokenKind::If),
1024            ("else", TokenKind::Else),
1025            ("while", TokenKind::While),
1026            ("for", TokenKind::For),
1027            ("in", TokenKind::In),
1028            ("return", TokenKind::Return),
1029            ("break", TokenKind::Break),
1030            ("continue", TokenKind::Continue),
1031            ("defer", TokenKind::Defer),
1032            ("match", TokenKind::Match),
1033            ("struct", TokenKind::Struct),
1034            ("enum", TokenKind::Enum),
1035            ("interface", TokenKind::Interface),
1036            ("comptime", TokenKind::Comptime),
1037            ("is", TokenKind::Is),
1038            ("pure", TokenKind::Pure),
1039            ("io", TokenKind::Io),
1040            ("alloc", TokenKind::Alloc),
1041            ("panic", TokenKind::Panic),
1042            ("or", TokenKind::Or),
1043            ("self", TokenKind::SelfKw),
1044        ];
1045        for (src, kind) in cases {
1046            assert_eq!(
1047                kinds(src),
1048                vec![kind.clone(), TokenKind::Eof],
1049                "keyword {src:?}"
1050            );
1051        }
1052    }
1053
1054    #[test]
1055    fn true_and_false_are_boolean_keyword_kinds() {
1056        assert_eq!(kinds("true"), vec![TokenKind::True, TokenKind::Eof]);
1057        assert_eq!(kinds("false"), vec![TokenKind::False, TokenKind::Eof]);
1058    }
1059
1060    #[test]
1061    fn primitive_type_names_are_keyword_kinds() {
1062        assert_eq!(kinds("i64"), vec![TokenKind::I64Ty, TokenKind::Eof]);
1063        assert_eq!(kinds("f64"), vec![TokenKind::F64Ty, TokenKind::Eof]);
1064        assert_eq!(kinds("bool"), vec![TokenKind::BoolTy, TokenKind::Eof]);
1065        assert_eq!(kinds("str"), vec![TokenKind::StrTy, TokenKind::Eof]);
1066        assert_eq!(kinds("byte"), vec![TokenKind::ByteTy, TokenKind::Eof]);
1067        assert_eq!(kinds("void"), vec![TokenKind::VoidTy, TokenKind::Eof]);
1068    }
1069
1070    #[test]
1071    fn stdlib_and_result_family_names_lex_as_identifiers() {
1072        for name in [
1073            "Result", "Option", "Ok", "Err", "Some", "None", "println", "open", "map", "filter",
1074            "reduce", "print", "len", "push", "pop", "sqrt", "abs", "assert",
1075        ] {
1076            assert_eq!(
1077                kinds(name),
1078                vec![TokenKind::Ident(name.to_string()), TokenKind::Eof],
1079                "{name:?} should be an identifier"
1080            );
1081        }
1082    }
1083
1084    #[test]
1085    fn operators_and_punctuation_lex_one_per_kind() {
1086        let cases: &[(&str, TokenKind)] = &[
1087            ("+", TokenKind::Plus),
1088            ("-", TokenKind::Minus),
1089            ("*", TokenKind::Star),
1090            ("/", TokenKind::Slash),
1091            ("%", TokenKind::Percent),
1092            ("<", TokenKind::Lt),
1093            (">", TokenKind::Gt),
1094            ("!", TokenKind::Bang),
1095            ("=", TokenKind::Eq),
1096            (".", TokenKind::Dot),
1097            (",", TokenKind::Comma),
1098            (":", TokenKind::Colon),
1099            (";", TokenKind::Semi),
1100            ("(", TokenKind::LParen),
1101            (")", TokenKind::RParen),
1102            ("[", TokenKind::LBracket),
1103            ("]", TokenKind::RBracket),
1104            ("{", TokenKind::LBrace),
1105            ("}", TokenKind::RBrace),
1106            ("?", TokenKind::Question),
1107        ];
1108        for (src, kind) in cases {
1109            assert_eq!(
1110                kinds(src),
1111                vec![kind.clone(), TokenKind::Eof],
1112                "operator {src:?}"
1113            );
1114        }
1115    }
1116
1117    #[test]
1118    fn maximal_munch_pairs_and_triples() {
1119        let cases: &[(&str, TokenKind)] = &[
1120            ("->", TokenKind::Arrow),
1121            ("=>", TokenKind::FatArrow),
1122            ("|>", TokenKind::PipeGt),
1123            ("..", TokenKind::DotDot),
1124            ("..=", TokenKind::DotDotEq),
1125            ("==", TokenKind::EqEq),
1126            ("!=", TokenKind::BangEq),
1127            ("<=", TokenKind::LtEq),
1128            (">=", TokenKind::GtEq),
1129            ("&&", TokenKind::AmpAmp),
1130            ("||", TokenKind::PipePipe),
1131        ];
1132        for (src, kind) in cases {
1133            assert_eq!(
1134                kinds(src),
1135                vec![kind.clone(), TokenKind::Eof],
1136                "operator {src:?}"
1137            );
1138        }
1139        // `a -> b` is Ident, Arrow, Ident -- not Minus, Gt.
1140        assert_eq!(
1141            kinds("a -> b"),
1142            vec![
1143                TokenKind::Ident("a".to_string()),
1144                TokenKind::Arrow,
1145                TokenKind::Ident("b".to_string()),
1146                TokenKind::Eof,
1147            ]
1148        );
1149        // `a => b` is Ident, FatArrow, Ident.
1150        assert_eq!(
1151            kinds("a => b"),
1152            vec![
1153                TokenKind::Ident("a".to_string()),
1154                TokenKind::FatArrow,
1155                TokenKind::Ident("b".to_string()),
1156                TokenKind::Eof,
1157            ]
1158        );
1159    }
1160
1161    #[test]
1162    fn line_comments_are_skipped_but_slash_is_division() {
1163        // `//` to end of line skipped.
1164        assert_eq!(
1165            kinds("a // comment\nb"),
1166            vec![
1167                TokenKind::Ident("a".to_string()),
1168                TokenKind::Ident("b".to_string()),
1169                TokenKind::Eof,
1170            ]
1171        );
1172        // a lone `/` is Slash; there are no block comments.
1173        assert_eq!(
1174            kinds("/x"),
1175            vec![
1176                TokenKind::Slash,
1177                TokenKind::Ident("x".to_string()),
1178                TokenKind::Eof
1179            ]
1180        );
1181        assert_eq!(
1182            kinds("a / b"),
1183            vec![
1184                TokenKind::Ident("a".to_string()),
1185                TokenKind::Slash,
1186                TokenKind::Ident("b".to_string()),
1187                TokenKind::Eof,
1188            ]
1189        );
1190    }
1191
1192    #[test]
1193    fn a_lone_ampersand_is_unexpected_char() {
1194        match err("a & b") {
1195            QalaError::UnexpectedChar { span, ch } => {
1196                assert_eq!(ch, '&');
1197                assert_eq!(span, Span::new(2, 1));
1198            }
1199            other => panic!("expected UnexpectedChar, got {other:?}"),
1200        }
1201        // bare `&` too.
1202        assert!(matches!(
1203            err("&"),
1204            QalaError::UnexpectedChar { ch: '&', .. }
1205        ));
1206    }
1207
1208    #[test]
1209    fn a_lone_pipe_is_unexpected_char() {
1210        match err("a | b") {
1211            QalaError::UnexpectedChar { span, ch } => {
1212                assert_eq!(ch, '|');
1213                assert_eq!(span, Span::new(2, 1));
1214            }
1215            other => panic!("expected UnexpectedChar, got {other:?}"),
1216        }
1217        // bare `|` and `|x` (not `||` or `|>`).
1218        assert!(matches!(
1219            err("|"),
1220            QalaError::UnexpectedChar { ch: '|', .. }
1221        ));
1222        assert!(matches!(
1223            err("|x"),
1224            QalaError::UnexpectedChar { ch: '|', .. }
1225        ));
1226    }
1227
1228    #[test]
1229    fn a_non_ascii_byte_in_identifier_position_is_unexpected_char() {
1230        match err("let café = 1") {
1231            QalaError::UnexpectedChar { span, ch } => {
1232                assert_eq!(ch, 'é');
1233                // "let caf" is 7 bytes; the 'é' starts at byte 7 and is 2 bytes.
1234                let e_start = "let caf".len();
1235                assert_eq!(span, Span::new(e_start, 'é'.len_utf8()));
1236                assert_eq!(span.slice("let café = 1"), "é");
1237            }
1238            other => panic!("expected UnexpectedChar, got {other:?}"),
1239        }
1240        // a non-ASCII char inside a // comment is fine -> just Eof.
1241        assert_eq!(kinds("// café au lait\n"), vec![TokenKind::Eof]);
1242    }
1243
1244    #[test]
1245    fn representative_token_spans_are_exact() {
1246        // "fn main()": "main" is bytes 3..7, length 4, slices back to "main".
1247        let src = "fn main()";
1248        let toks = Lexer::tokenize(src).unwrap();
1249        let main_tok = &toks[1];
1250        assert_eq!(main_tok.kind, TokenKind::Ident("main".to_string()));
1251        assert_eq!(main_tok.span.start, 3);
1252        assert_eq!(main_tok.span.len, 4);
1253        assert_eq!(main_tok.span.slice(src), "main");
1254        // the `fn` keyword: bytes 0..2.
1255        assert_eq!(toks[0].span, Span::new(0, 2));
1256        assert_eq!(toks[0].span.slice(src), "fn");
1257        // the trailing Eof is a zero-length span at the end.
1258        let eof = toks.last().unwrap();
1259        assert_eq!(eof.kind, TokenKind::Eof);
1260        assert_eq!(eof.span, Span::new(src.len(), 0));
1261    }
1262
1263    #[test]
1264    fn span_after_trivia_starts_at_the_token_not_the_whitespace() {
1265        // "  +  ": the `+` is at byte 2, length 1.
1266        let src = "  +  ";
1267        let toks = Lexer::tokenize(src).unwrap();
1268        assert_eq!(toks[0].kind, TokenKind::Plus);
1269        assert_eq!(toks[0].span, Span::new(2, 1));
1270    }
1271
1272    // ---- Task 2: numeric and byte literals ------------------------------------
1273
1274    /// the single kind of a one-token source (plus the trailing Eof).
1275    fn one(src: &str) -> TokenKind {
1276        let mut k = kinds(src);
1277        assert_eq!(
1278            k.last(),
1279            Some(&TokenKind::Eof),
1280            "stream should end in Eof: {src:?}"
1281        );
1282        k.pop();
1283        assert_eq!(
1284            k.len(),
1285            1,
1286            "expected exactly one token in {src:?}, got {k:?}"
1287        );
1288        k.pop().unwrap()
1289    }
1290
1291    #[test]
1292    fn decimal_integers() {
1293        assert_eq!(one("0"), TokenKind::Int(0));
1294        assert_eq!(one("42"), TokenKind::Int(42));
1295        assert_eq!(one("1_000_000"), TokenKind::Int(1_000_000));
1296        assert_eq!(one("9223372036854775807"), TokenKind::Int(i64::MAX));
1297    }
1298
1299    #[test]
1300    fn hex_integers() {
1301        assert_eq!(one("0xFF"), TokenKind::Int(255));
1302        assert_eq!(one("0xFF_FF"), TokenKind::Int(65_535));
1303        assert_eq!(one("0X1a"), TokenKind::Int(26));
1304        assert_eq!(one("0x0"), TokenKind::Int(0));
1305    }
1306
1307    #[test]
1308    fn binary_integers() {
1309        assert_eq!(one("0b1010"), TokenKind::Int(10));
1310        assert_eq!(one("0b1010_0101"), TokenKind::Int(165));
1311        assert_eq!(one("0B1"), TokenKind::Int(1));
1312    }
1313
1314    #[test]
1315    fn integer_overflow_errors_at_the_digits() {
1316        // 2^63 = i64::MAX + 1.
1317        match err("9223372036854775808") {
1318            QalaError::IntOverflow { span } => {
1319                assert_eq!(span, Span::new(0, 19), "span should cover the 19 digits");
1320            }
1321            other => panic!("expected IntOverflow, got {other:?}"),
1322        }
1323        match err("0x8000000000000000") {
1324            QalaError::IntOverflow { span } => assert_eq!(span, Span::new(0, 18)),
1325            other => panic!("expected IntOverflow, got {other:?}"),
1326        }
1327        // a 50-digit number.
1328        let big = "1".repeat(50);
1329        assert!(matches!(err(&big), QalaError::IntOverflow { .. }));
1330        // overflow inside a larger source still points at the digits.
1331        match err("let x = 99999999999999999999\n") {
1332            QalaError::IntOverflow { span } => {
1333                assert_eq!(
1334                    span.slice("let x = 99999999999999999999\n"),
1335                    "99999999999999999999"
1336                );
1337            }
1338            other => panic!("expected IntOverflow, got {other:?}"),
1339        }
1340    }
1341
1342    #[test]
1343    fn malformed_numbers_span_the_literal() {
1344        for src in [
1345            "1_", "1__0", "0x", "0xG", "0b2", "1_.0", "1._5", "1e_5", "1e", "1e+", "1e-", "0b",
1346            "0x_FF", "0b_1", "0xFF_",
1347        ] {
1348            match Lexer::tokenize(src) {
1349                Err(QalaError::MalformedNumber { span, .. }) => {
1350                    assert_eq!(
1351                        span.slice(src),
1352                        src,
1353                        "MalformedNumber span should cover the whole literal {src:?}"
1354                    );
1355                }
1356                other => panic!("expected MalformedNumber for {src:?}, got {other:?}"),
1357            }
1358        }
1359        // the `1__0` span is exactly those 4 bytes even mid-source.
1360        match err("a = 1__0;") {
1361            QalaError::MalformedNumber { span, .. } => assert_eq!(span.slice("a = 1__0;"), "1__0"),
1362            other => panic!("expected MalformedNumber, got {other:?}"),
1363        }
1364    }
1365
1366    #[test]
1367    fn floats_including_exponents() {
1368        assert_eq!(one("1.0"), TokenKind::Float(1.0));
1369        assert_eq!(one("1.5e10"), TokenKind::Float(1.5e10));
1370        assert_eq!(one("1e10"), TokenKind::Float(1e10));
1371        assert_eq!(one("2.0e-3"), TokenKind::Float(2.0e-3));
1372        assert_eq!(one("1.5E+2"), TokenKind::Float(150.0));
1373        assert_eq!(one("7.25"), TokenKind::Float(7.25));
1374        // a non-round-tripping float is NOT an error.
1375        assert_eq!(one("0.1"), TokenKind::Float(0.1));
1376        // underscores inside a float are stripped.
1377        assert_eq!(one("1_000.000_5"), TokenKind::Float(1000.0005));
1378    }
1379
1380    #[test]
1381    fn the_leading_dot_rule_keeps_dot_and_ranges_unambiguous() {
1382        // a number only starts on a digit; `.` is never the start of one.
1383        assert_eq!(
1384            kinds(".5"),
1385            vec![TokenKind::Dot, TokenKind::Int(5), TokenKind::Eof]
1386        );
1387        assert_eq!(
1388            kinds("0..5"),
1389            vec![
1390                TokenKind::Int(0),
1391                TokenKind::DotDot,
1392                TokenKind::Int(5),
1393                TokenKind::Eof
1394            ]
1395        );
1396        assert_eq!(
1397            kinds("0..=5"),
1398            vec![
1399                TokenKind::Int(0),
1400                TokenKind::DotDotEq,
1401                TokenKind::Int(5),
1402                TokenKind::Eof
1403            ]
1404        );
1405        assert_eq!(
1406            kinds("x.0"),
1407            vec![
1408                TokenKind::Ident("x".to_string()),
1409                TokenKind::Dot,
1410                TokenKind::Int(0),
1411                TokenKind::Eof,
1412            ]
1413        );
1414        // the fibonacci example's range.
1415        assert_eq!(
1416            kinds("0..15"),
1417            vec![
1418                TokenKind::Int(0),
1419                TokenKind::DotDot,
1420                TokenKind::Int(15),
1421                TokenKind::Eof
1422            ]
1423        );
1424        // `1.0` is still a float -- the `.` here IS followed by a digit.
1425        assert_eq!(one("1.0"), TokenKind::Float(1.0));
1426        // `1..2` is a range over integers, not `1.` then `.2`.
1427        assert_eq!(
1428            kinds("1..2"),
1429            vec![
1430                TokenKind::Int(1),
1431                TokenKind::DotDot,
1432                TokenKind::Int(2),
1433                TokenKind::Eof
1434            ]
1435        );
1436    }
1437
1438    #[test]
1439    fn byte_literals() {
1440        assert_eq!(one("b'A'"), TokenKind::Byte(65));
1441        assert_eq!(one("b'\\n'"), TokenKind::Byte(10));
1442        assert_eq!(one("b'\\t'"), TokenKind::Byte(9));
1443        assert_eq!(one("b'\\r'"), TokenKind::Byte(13));
1444        assert_eq!(one("b'\\\\'"), TokenKind::Byte(92));
1445        assert_eq!(one("b'\\''"), TokenKind::Byte(39));
1446        assert_eq!(one("b'\\0'"), TokenKind::Byte(0));
1447        assert_eq!(one("b' '"), TokenKind::Byte(32));
1448        assert_eq!(one("b'z'"), TokenKind::Byte(122));
1449    }
1450
1451    #[test]
1452    fn bad_byte_literals() {
1453        for src in ["b''", "b'ab'", "b'\\x'", "b'é'"] {
1454            match Lexer::tokenize(src) {
1455                Err(QalaError::BadByteLiteral { span, .. }) => {
1456                    // the span starts at the `b` and covers the offending literal.
1457                    assert_eq!(span.start, 0, "span should start at `b` for {src:?}");
1458                    assert!(span.len >= 3, "span should cover the literal for {src:?}");
1459                }
1460                other => panic!("expected BadByteLiteral for {src:?}, got {other:?}"),
1461            }
1462        }
1463    }
1464
1465    #[test]
1466    fn b_not_followed_by_quote_is_the_identifier_b() {
1467        assert_eq!(
1468            kinds("by"),
1469            vec![TokenKind::Ident("by".to_string()), TokenKind::Eof]
1470        );
1471        assert_eq!(
1472            kinds("b 1"),
1473            vec![
1474                TokenKind::Ident("b".to_string()),
1475                TokenKind::Int(1),
1476                TokenKind::Eof
1477            ]
1478        );
1479        assert_eq!(
1480            kinds("b"),
1481            vec![TokenKind::Ident("b".to_string()), TokenKind::Eof]
1482        );
1483        // `byte` is still the keyword, not `b` + `yte`.
1484        assert_eq!(kinds("byte"), vec![TokenKind::ByteTy, TokenKind::Eof]);
1485    }
1486
1487    #[test]
1488    fn numbers_in_a_realistic_snippet() {
1489        // `for i in 0..15 { x = 1_000 + 0xFF }` -- a mix of range and literals.
1490        let toks = kinds("for i in 0..15 { x = 1_000 + 0xFF }");
1491        assert_eq!(
1492            toks,
1493            vec![
1494                TokenKind::For,
1495                TokenKind::Ident("i".to_string()),
1496                TokenKind::In,
1497                TokenKind::Int(0),
1498                TokenKind::DotDot,
1499                TokenKind::Int(15),
1500                TokenKind::LBrace,
1501                TokenKind::Ident("x".to_string()),
1502                TokenKind::Eq,
1503                TokenKind::Int(1_000),
1504                TokenKind::Plus,
1505                TokenKind::Int(255),
1506                TokenKind::RBrace,
1507                TokenKind::Eof,
1508            ]
1509        );
1510    }
1511
1512    #[test]
1513    fn numeric_literal_spans_are_exact() {
1514        // "x = 42": the `42` is bytes 4..6, length 2.
1515        let src = "x = 42";
1516        let toks = Lexer::tokenize(src).unwrap();
1517        let lit = &toks[2];
1518        assert_eq!(lit.kind, TokenKind::Int(42));
1519        assert_eq!(lit.span, Span::new(4, 2));
1520        assert_eq!(lit.span.slice(src), "42");
1521        // a float's span covers the whole literal including the exponent.
1522        let src2 = "y = 1.5e10;";
1523        let toks2 = Lexer::tokenize(src2).unwrap();
1524        let f = &toks2[2];
1525        assert_eq!(f.kind, TokenKind::Float(1.5e10));
1526        assert_eq!(f.span.slice(src2), "1.5e10");
1527    }
1528
1529    // ---- Task 3: strings, escapes, interpolation ------------------------------
1530    //
1531    // these tests are written against Qala source, so a `"` in the source is `\"`
1532    // in the Rust string literal and a `\` in the source is `\\`.
1533
1534    /// shorthand for `TokenKind::Str(s.to_string())` and friends, to keep the
1535    /// expected streams readable.
1536    fn s(text: &str) -> TokenKind {
1537        TokenKind::Str(text.to_string())
1538    }
1539    fn id(text: &str) -> TokenKind {
1540        TokenKind::Ident(text.to_string())
1541    }
1542
1543    #[test]
1544    fn interpolation_free_strings() {
1545        assert_eq!(one("\"abc\""), s("abc"));
1546        assert_eq!(one("\"\""), s(""));
1547        assert_eq!(one("\"hello, world\""), s("hello, world"));
1548        // a non-ASCII char inside a string is fine and lands in the payload.
1549        assert_eq!(one("\"café\""), s("café"));
1550        // the Str span covers the opening quote through the closing quote.
1551        let src = "x = \"hi\"";
1552        let toks = Lexer::tokenize(src).unwrap();
1553        assert_eq!(toks[2].kind, s("hi"));
1554        assert_eq!(toks[2].span.slice(src), "\"hi\"");
1555    }
1556
1557    #[test]
1558    fn escapes_are_decoded_into_the_payload() {
1559        assert_eq!(one("\"a\\nb\""), s("a\nb"));
1560        assert_eq!(one("\"\\t\""), s("\t"));
1561        assert_eq!(one("\"\\r\""), s("\r"));
1562        assert_eq!(one("\"\\0\""), s("\0"));
1563        assert_eq!(one("\"\\\\\""), s("\\"));
1564        assert_eq!(one("\"\\\"\""), s("\""));
1565        assert_eq!(one("\"\\{\""), s("{"));
1566        assert_eq!(one("\"\\}\""), s("}"));
1567        assert_eq!(one("\"\\u{41}\""), s("A"));
1568        assert_eq!(one("\"\\u{1F600}\""), s("\u{1F600}"));
1569        // a mix: "line1\nline2 \u{2764}" -> "line1", newline, "line2 ", heart.
1570        assert_eq!(
1571            one("\"line1\\nline2 \\u{2764}\""),
1572            s("line1\nline2 \u{2764}")
1573        );
1574        // escaped braces do NOT open an interpolation.
1575        assert_eq!(one("\"a \\{not interp\\} b\""), s("a {not interp} b"));
1576    }
1577
1578    #[test]
1579    fn bad_escapes_error_at_the_backslash() {
1580        // "a\qb": the `\` is byte 2 (after the `"` and `a`).
1581        match err("\"a\\qb\"") {
1582            QalaError::InvalidEscape { span, .. } => {
1583                assert_eq!(span, Span::new(2, 1), "span should be the backslash byte");
1584            }
1585            other => panic!("expected InvalidEscape, got {other:?}"),
1586        }
1587        // bad \u{...} forms: empty, out of range, surrogate, no closing brace.
1588        for src in [
1589            "\"\\u{}\"",
1590            "\"\\u{110000}\"",
1591            "\"\\u{D800}\"",
1592            "\"\\u{41\"",
1593            "\"\\uABCD\"",
1594        ] {
1595            assert!(
1596                matches!(Lexer::tokenize(src), Err(QalaError::InvalidEscape { .. })),
1597                "expected InvalidEscape for {src:?}"
1598            );
1599        }
1600        // the backslash position is reported even mid-string.
1601        match err("\"hello \\x world\"") {
1602            QalaError::InvalidEscape { span, .. } => {
1603                assert_eq!(span.slice("\"hello \\x world\""), "\\");
1604            }
1605            other => panic!("expected InvalidEscape, got {other:?}"),
1606        }
1607    }
1608
1609    #[test]
1610    fn unterminated_string_errors_at_the_opening_quote() {
1611        // "abc with EOF before the close.
1612        match err("\"abc") {
1613            QalaError::UnterminatedString { span } => {
1614                assert_eq!(span, Span::new(0, 1), "span should be the opening quote");
1615            }
1616            other => panic!("expected UnterminatedString, got {other:?}"),
1617        }
1618        // a real newline before the close: no multi-line strings.
1619        match err("\"abc\ndef\"") {
1620            QalaError::UnterminatedString { span } => assert_eq!(span, Span::new(0, 1)),
1621            other => panic!("expected UnterminatedString, got {other:?}"),
1622        }
1623        // the opening quote is found even when the string starts mid-source.
1624        match err("let x = \"oops") {
1625            QalaError::UnterminatedString { span } => {
1626                assert_eq!(span.slice("let x = \"oops"), "\"");
1627            }
1628            other => panic!("expected UnterminatedString, got {other:?}"),
1629        }
1630        // an empty unterminated string `"` at EOF.
1631        assert!(matches!(err("\""), QalaError::UnterminatedString { .. }));
1632    }
1633
1634    #[test]
1635    fn simple_interpolation() {
1636        // "hi {name}!"
1637        assert_eq!(
1638            kinds("\"hi {name}!\""),
1639            vec![
1640                TokenKind::StrStart("hi ".to_string()),
1641                TokenKind::InterpStart,
1642                id("name"),
1643                TokenKind::InterpEnd,
1644                TokenKind::StrEnd("!".to_string()),
1645                TokenKind::Eof,
1646            ]
1647        );
1648        // the hello.qala interpolation: println("hello, {name}!").
1649        assert_eq!(
1650            kinds("println(\"hello, {name}!\")"),
1651            vec![
1652                id("println"),
1653                TokenKind::LParen,
1654                TokenKind::StrStart("hello, ".to_string()),
1655                TokenKind::InterpStart,
1656                id("name"),
1657                TokenKind::InterpEnd,
1658                TokenKind::StrEnd("!".to_string()),
1659                TokenKind::RParen,
1660                TokenKind::Eof,
1661            ]
1662        );
1663    }
1664
1665    #[test]
1666    fn multiple_and_empty_fragment_interpolations() {
1667        // "{a}{b}"
1668        assert_eq!(
1669            kinds("\"{a}{b}\""),
1670            vec![
1671                TokenKind::StrStart(String::new()),
1672                TokenKind::InterpStart,
1673                id("a"),
1674                TokenKind::InterpEnd,
1675                TokenKind::StrMid(String::new()),
1676                TokenKind::InterpStart,
1677                id("b"),
1678                TokenKind::InterpEnd,
1679                TokenKind::StrEnd(String::new()),
1680                TokenKind::Eof,
1681            ]
1682        );
1683        // "a{x}b{y}c"
1684        assert_eq!(
1685            kinds("\"a{x}b{y}c\""),
1686            vec![
1687                TokenKind::StrStart("a".to_string()),
1688                TokenKind::InterpStart,
1689                id("x"),
1690                TokenKind::InterpEnd,
1691                TokenKind::StrMid("b".to_string()),
1692                TokenKind::InterpStart,
1693                id("y"),
1694                TokenKind::InterpEnd,
1695                TokenKind::StrEnd("c".to_string()),
1696                TokenKind::Eof,
1697            ]
1698        );
1699    }
1700
1701    #[test]
1702    fn interpolation_with_nested_braces() {
1703        // "{ {a: 1}.a }" -- the interpolation ends at the matching depth-0 `}`,
1704        // not the struct literal's `}`. the inner braces appear as LBrace/RBrace.
1705        assert_eq!(
1706            kinds("\"{ {a: 1}.a }\""),
1707            vec![
1708                TokenKind::StrStart(String::new()),
1709                TokenKind::InterpStart,
1710                TokenKind::LBrace,
1711                id("a"),
1712                TokenKind::Colon,
1713                TokenKind::Int(1),
1714                TokenKind::RBrace,
1715                TokenKind::Dot,
1716                id("a"),
1717                TokenKind::InterpEnd,
1718                TokenKind::StrEnd(String::new()),
1719                TokenKind::Eof,
1720            ]
1721        );
1722        // "{ if x { 1 } else { 2 } }" -- the if/else block braces are handled by
1723        // depth; the interpolation closes at the final `}`.
1724        assert_eq!(
1725            kinds("\"{ if x { 1 } else { 2 } }\""),
1726            vec![
1727                TokenKind::StrStart(String::new()),
1728                TokenKind::InterpStart,
1729                TokenKind::If,
1730                id("x"),
1731                TokenKind::LBrace,
1732                TokenKind::Int(1),
1733                TokenKind::RBrace,
1734                TokenKind::Else,
1735                TokenKind::LBrace,
1736                TokenKind::Int(2),
1737                TokenKind::RBrace,
1738                TokenKind::InterpEnd,
1739                TokenKind::StrEnd(String::new()),
1740                TokenKind::Eof,
1741            ]
1742        );
1743    }
1744
1745    #[test]
1746    fn interpolation_with_a_nested_string() {
1747        // "{ "{inner}" }" -- a string inside an interpolation inside a string.
1748        // entering the nested `"..."` makes its braces string-rules, not
1749        // brace-depth-rules, so the outer interpolation still closes at the right
1750        // `}`.
1751        assert_eq!(
1752            kinds("\"{ \"{inner}\" }\""),
1753            vec![
1754                TokenKind::StrStart(String::new()),
1755                TokenKind::InterpStart,
1756                // the nested string:
1757                TokenKind::StrStart(String::new()),
1758                TokenKind::InterpStart,
1759                id("inner"),
1760                TokenKind::InterpEnd,
1761                TokenKind::StrEnd(String::new()),
1762                // back in the outer interpolation:
1763                TokenKind::InterpEnd,
1764                TokenKind::StrEnd(String::new()),
1765                TokenKind::Eof,
1766            ]
1767        );
1768    }
1769
1770    #[test]
1771    fn the_fibonacci_interpolation() {
1772        // "fib({i}) = {fibonacci(i)}" -- `fib(` and `) = ` are literal text; the
1773        // `(i)` inside `{fibonacci(i)}` is a normal call sequence (LParen/RParen
1774        // do not affect brace depth, only `{`/`}` do).
1775        assert_eq!(
1776            kinds("\"fib({i}) = {fibonacci(i)}\""),
1777            vec![
1778                TokenKind::StrStart("fib(".to_string()),
1779                TokenKind::InterpStart,
1780                id("i"),
1781                TokenKind::InterpEnd,
1782                TokenKind::StrMid(") = ".to_string()),
1783                TokenKind::InterpStart,
1784                id("fibonacci"),
1785                TokenKind::LParen,
1786                id("i"),
1787                TokenKind::RParen,
1788                TokenKind::InterpEnd,
1789                TokenKind::StrEnd(String::new()),
1790                TokenKind::Eof,
1791            ]
1792        );
1793    }
1794
1795    #[test]
1796    fn unterminated_interpolation_errors_at_the_brace() {
1797        // "x{y" -- EOF before the `}`. the `{` is byte 2.
1798        match err("\"x{y") {
1799            QalaError::UnterminatedInterpolation { span } => {
1800                assert_eq!(span, Span::new(2, 1), "span should be the `{{` byte");
1801            }
1802            other => panic!("expected UnterminatedInterpolation, got {other:?}"),
1803        }
1804        // the `{` is found even when the string starts mid-source.
1805        match err("let s = \"a{b") {
1806            QalaError::UnterminatedInterpolation { span } => {
1807                assert_eq!(span.slice("let s = \"a{b"), "{");
1808            }
1809            other => panic!("expected UnterminatedInterpolation, got {other:?}"),
1810        }
1811        // a nested-brace interpolation that never closes the outermost `}`.
1812        assert!(matches!(
1813            err("\"{ {a: 1}.a "),
1814            QalaError::UnterminatedInterpolation { .. }
1815        ));
1816    }
1817
1818    #[test]
1819    fn interpolation_fragment_spans_partition_the_source() {
1820        // "hi {x}!" -- StrStart covers `"hi `, InterpStart the `{`, InterpEnd the
1821        // `}`, StrEnd `!"`. every byte is covered exactly once.
1822        let src = "\"hi {x}!\"";
1823        let toks = Lexer::tokenize(src).unwrap();
1824        // toks: [StrStart, InterpStart, Ident, InterpEnd, StrEnd, Eof]
1825        assert_eq!(toks[0].kind, TokenKind::StrStart("hi ".to_string()));
1826        assert_eq!(toks[0].span.slice(src), "\"hi ");
1827        assert_eq!(toks[1].kind, TokenKind::InterpStart);
1828        assert_eq!(toks[1].span.slice(src), "{");
1829        assert_eq!(toks[2].kind, id("x"));
1830        assert_eq!(toks[2].span.slice(src), "x");
1831        assert_eq!(toks[3].kind, TokenKind::InterpEnd);
1832        assert_eq!(toks[3].span.slice(src), "}");
1833        assert_eq!(toks[4].kind, TokenKind::StrEnd("!".to_string()));
1834        assert_eq!(toks[4].span.slice(src), "!\"");
1835        assert_eq!(toks[5].kind, TokenKind::Eof);
1836    }
1837
1838    #[test]
1839    fn the_six_examples_tokenize_to_eof_with_no_error() {
1840        // a smoke test that the token set is complete: every bundled example
1841        // lexes cleanly. the path is relative to this crate's manifest, so it
1842        // works regardless of the test runner's cwd; a missing file fails loudly.
1843        for name in [
1844            "hello",
1845            "fibonacci",
1846            "effects",
1847            "pattern-matching",
1848            "pipeline",
1849            "defer-demo",
1850        ] {
1851            let path = format!(
1852                "{}/../../playground/public/examples/{}.qala",
1853                env!("CARGO_MANIFEST_DIR"),
1854                name
1855            );
1856            let src = std::fs::read_to_string(&path)
1857                .unwrap_or_else(|e| panic!("could not read example {path}: {e}"));
1858            let toks = Lexer::tokenize(&src)
1859                .unwrap_or_else(|e| panic!("example {name}.qala failed to tokenize: {e:?}"));
1860            assert_eq!(
1861                toks.last().map(|t| &t.kind),
1862                Some(&TokenKind::Eof),
1863                "example {name}.qala should end in Eof"
1864            );
1865            assert!(
1866                toks.len() > 1,
1867                "example {name}.qala should produce real tokens"
1868            );
1869        }
1870    }
1871
1872    #[test]
1873    fn tokenizing_is_deterministic() {
1874        // the lexer holds no global mutable state, so the same source always
1875        // produces the same token vector.
1876        let src = "fn main() is io {\n  let name = \"world\"\n  println(\"hello, {name}!\")\n}\n";
1877        let a = Lexer::tokenize(src).unwrap();
1878        let b = Lexer::tokenize(src).unwrap();
1879        assert_eq!(a, b);
1880        // also stable across an error case.
1881        let bad = "let x = \"unterminated";
1882        assert_eq!(Lexer::tokenize(bad), Lexer::tokenize(bad));
1883    }
1884}
qala_compiler/lexer.rs

qala_compiler/
lexer.rs