Skip to main content

cljrs_reader/
lexer.rs

1// CljxError embeds NamedSource<String> for miette diagnostics, which is
2// unavoidably large. Suppress the false-positive for every returning function.
3#![allow(clippy::result_large_err)]
4
5use std::sync::Arc;
6
7use miette::NamedSource;
8
9use cljrs_types::error::{CljxError, CljxResult};
10use cljrs_types::span::Span;
11
12use crate::token::Token;
13
14// ─── Character classification ─────────────────────────────────────────────────
15
/// Reports whether `ch` may appear inside a symbol or keyword name.
///
/// The rule is a deny-list: whitespace (including `,`), the collection
/// delimiters, and the reader-syntax characters listed below are rejected;
/// every other character is accepted.
fn is_symbol_char(ch: char) -> bool {
    // Characters that can never be part of a symbol.
    const NON_CONSTITUENTS: [char; 20] = [
        ' ', '\t', '\n', '\r', ',', // whitespace
        '(', ')', '[', ']', '{', '}', // collection delimiters
        '"', ';', '`', '~', '^', '@', '#', '\\', ':', // reader syntax
    ];
    !NON_CONSTITUENTS.contains(&ch)
}
43
44/// Returns `true` if `ch` can *start* a symbol (not a digit, not `+`/`-` when
45/// the following char is a digit — but the caller handles the `+`/`-` case).
46fn is_symbol_start(ch: char) -> bool {
47    is_symbol_char(ch) && !ch.is_ascii_digit()
48}
49
50// ─── Lexer ───────────────────────────────────────────────────────────────────
51
/// Lexer state: the source text, the file name attached to spans and
/// diagnostics, and a cursor (`pos`) together with its line/column position.
pub struct Lexer {
    /// Full source text being tokenized.
    source: Arc<String>,
    /// File name; cloned into every span and used to label error reports.
    file: Arc<String>,
    pos: usize, // byte offset, always on a char boundary
    line: u32,  // 1-based
    col: u32,   // 1-based byte offset from line start
}
59
60impl Lexer {
61    pub fn new(source: String, file: String) -> Self {
62        Self {
63            source: Arc::new(source),
64            file: Arc::new(file),
65            pos: 0,
66            line: 1,
67            col: 1,
68        }
69    }
70
71    // ── Public getters ────────────────────────────────────────────────────
72
    /// Returns the shared handle to the source text being lexed.
    pub fn source(&self) -> &Arc<String> {
        &self.source
    }
76
    /// Returns the shared handle to the file name given at construction.
    pub fn file(&self) -> &Arc<String> {
        &self.file
    }
80
81    // ── Low-level helpers ─────────────────────────────────────────────────
82
83    fn peek(&self) -> Option<char> {
84        self.source[self.pos..].chars().next()
85    }
86
87    fn peek_next(&self) -> Option<char> {
88        let mut chars = self.source[self.pos..].chars();
89        chars.next(); // skip current
90        chars.next()
91    }
92
93    fn advance(&mut self) -> Option<char> {
94        let ch = self.peek()?;
95        self.pos += ch.len_utf8();
96        if ch == '\n' {
97            self.line += 1;
98            self.col = 1;
99        } else {
100            self.col += ch.len_utf8() as u32;
101        }
102        Some(ch)
103    }
104
105    fn span_from(&self, start_pos: usize, start_line: u32, start_col: u32) -> Span {
106        Span::new(
107            Arc::clone(&self.file),
108            start_pos,
109            self.pos,
110            start_line,
111            start_col,
112        )
113    }
114
115    fn make_error(&self, msg: impl Into<String>, span: Span) -> CljxError {
116        CljxError::ReadError {
117            message: msg.into(),
118            span: Some(miette::SourceSpan::from(span)),
119            src: NamedSource::new((*self.file).clone(), (*self.source).clone()),
120        }
121    }
122
123    /// Consume characters while `is_symbol_char` holds, returning the collected
124    /// string.
125    fn read_symbol_chars(&mut self) -> String {
126        let mut buf = String::new();
127        while let Some(ch) = self.peek() {
128            if is_symbol_char(ch) {
129                buf.push(ch);
130                self.advance();
131            } else {
132                break;
133            }
134        }
135        buf
136    }
137
138    // ── Whitespace / comment skipping ─────────────────────────────────────
139
140    fn skip_whitespace_and_comments(&mut self) {
141        loop {
142            match self.peek() {
143                // Shebang: only recognised at the very start of the file.
144                Some('#') if self.pos == 0 => {
145                    if self.peek_next() == Some('!') {
146                        // skip to end of line
147                        while let Some(ch) = self.advance() {
148                            if ch == '\n' {
149                                break;
150                            }
151                        }
152                    } else {
153                        break; // '#' is meaningful, stop skipping
154                    }
155                }
156                Some(' ') | Some('\t') | Some('\r') | Some('\n') | Some(',') => {
157                    self.advance();
158                }
159                Some(';') => {
160                    while let Some(ch) = self.advance() {
161                        if ch == '\n' {
162                            break;
163                        }
164                    }
165                }
166                _ => break,
167            }
168        }
169    }
170
171    // ── `~` (unquote / unquote-splicing) ─────────────────────────────────
172
173    fn lex_unquote(
174        &mut self,
175        start_pos: usize,
176        start_line: u32,
177        start_col: u32,
178    ) -> CljxResult<(Token, Span)> {
179        self.advance(); // consume '~'
180        if self.peek() == Some('@') {
181            self.advance();
182            Ok((
183                Token::UnquoteSplice,
184                self.span_from(start_pos, start_line, start_col),
185            ))
186        } else {
187            Ok((
188                Token::Unquote,
189                self.span_from(start_pos, start_line, start_col),
190            ))
191        }
192    }
193
194    // ── `#` dispatch ──────────────────────────────────────────────────────
195
196    fn lex_hash(
197        &mut self,
198        start_pos: usize,
199        start_line: u32,
200        start_col: u32,
201    ) -> CljxResult<(Token, Span)> {
202        self.advance(); // consume '#'
203        match self.peek() {
204            Some('(') => {
205                self.advance();
206                Ok((
207                    Token::HashFn,
208                    self.span_from(start_pos, start_line, start_col),
209                ))
210            }
211            Some('{') => {
212                self.advance();
213                Ok((
214                    Token::HashSet,
215                    self.span_from(start_pos, start_line, start_col),
216                ))
217            }
218            Some('\'') => {
219                self.advance();
220                Ok((
221                    Token::HashVar,
222                    self.span_from(start_pos, start_line, start_col),
223                ))
224            }
225            Some('_') => {
226                self.advance();
227                Ok((
228                    Token::HashDiscard,
229                    self.span_from(start_pos, start_line, start_col),
230                ))
231            }
232            Some('"') => self.lex_regex(start_pos, start_line, start_col),
233            Some('?') => {
234                self.advance(); // consume '?'
235                if self.peek() == Some('@') {
236                    self.advance();
237                    Ok((
238                        Token::ReaderCondSplice,
239                        self.span_from(start_pos, start_line, start_col),
240                    ))
241                } else {
242                    Ok((
243                        Token::ReaderCond,
244                        self.span_from(start_pos, start_line, start_col),
245                    ))
246                }
247            }
248            Some('#') => self.lex_symbolic(start_pos, start_line, start_col),
249            Some(c) if is_symbol_start(c) => {
250                let name = self.read_symbol_chars();
251                Ok((
252                    Token::TaggedLiteral(name),
253                    self.span_from(start_pos, start_line, start_col),
254                ))
255            }
256            other => {
257                let span = self.span_from(start_pos, start_line, start_col);
258                Err(self.make_error(format!("unknown # dispatch character: {:?}", other), span))
259            }
260        }
261    }
262
263    fn lex_regex(
264        &mut self,
265        start_pos: usize,
266        start_line: u32,
267        start_col: u32,
268    ) -> CljxResult<(Token, Span)> {
269        self.advance(); // consume opening '"'
270        let mut buf = String::new();
271        loop {
272            match self.advance() {
273                None => {
274                    let span = self.span_from(start_pos, start_line, start_col);
275                    return Err(self.make_error("unterminated regex literal", span));
276                }
277                Some('"') => break,
278                Some('\\') => {
279                    // Store escape verbatim (two chars) — no processing.
280                    buf.push('\\');
281                    match self.advance() {
282                        Some(c) => buf.push(c),
283                        None => {
284                            let span = self.span_from(start_pos, start_line, start_col);
285                            return Err(self.make_error("unterminated regex literal", span));
286                        }
287                    }
288                }
289                Some(c) => buf.push(c),
290            }
291        }
292        Ok((
293            Token::Regex(buf),
294            self.span_from(start_pos, start_line, start_col),
295        ))
296    }
297
298    fn lex_symbolic(
299        &mut self,
300        start_pos: usize,
301        start_line: u32,
302        start_col: u32,
303    ) -> CljxResult<(Token, Span)> {
304        self.advance(); // consume second '#'
305        let name = self.read_symbol_chars();
306        match name.as_str() {
307            "Inf" | "-Inf" | "NaN" => Ok((
308                Token::Symbolic(name),
309                self.span_from(start_pos, start_line, start_col),
310            )),
311            _ => {
312                let span = self.span_from(start_pos, start_line, start_col);
313                Err(self.make_error(format!("unknown symbolic value: ##{name}"), span))
314            }
315        }
316    }
317
318    // ── String literal ────────────────────────────────────────────────────
319
320    fn lex_string(
321        &mut self,
322        start_pos: usize,
323        start_line: u32,
324        start_col: u32,
325    ) -> CljxResult<(Token, Span)> {
326        self.advance(); // consume opening '"'
327        let mut buf = String::new();
328        loop {
329            match self.advance() {
330                None => {
331                    let span = self.span_from(start_pos, start_line, start_col);
332                    return Err(self.make_error("unterminated string literal", span));
333                }
334                Some('"') => break,
335                Some('\\') => match self.advance() {
336                    Some('n') => buf.push('\n'),
337                    Some('t') => buf.push('\t'),
338                    Some('r') => buf.push('\r'),
339                    Some('b') => buf.push('\x08'),
340                    Some('f') => buf.push('\x0C'),
341                    Some('\\') => buf.push('\\'),
342                    Some('"') => buf.push('"'),
343                    Some('u') => {
344                        let ch = self.read_unicode_escape(start_pos, start_line, start_col)?;
345                        buf.push(ch);
346                    }
347                    Some(c) => {
348                        let span = self.span_from(start_pos, start_line, start_col);
349                        return Err(self.make_error(format!("unknown string escape: \\{c}"), span));
350                    }
351                    None => {
352                        let span = self.span_from(start_pos, start_line, start_col);
353                        return Err(self.make_error("unterminated string literal", span));
354                    }
355                },
356                Some(c) => buf.push(c),
357            }
358        }
359        Ok((
360            Token::Str(buf),
361            self.span_from(start_pos, start_line, start_col),
362        ))
363    }
364
365    /// Read exactly 4 hex digits after `\u` and return the corresponding char.
366    fn read_unicode_escape(
367        &mut self,
368        start_pos: usize,
369        start_line: u32,
370        start_col: u32,
371    ) -> CljxResult<char> {
372        let mut hex = String::with_capacity(4);
373        for _ in 0..4 {
374            match self.advance() {
375                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
376                Some(c) => {
377                    let span = self.span_from(start_pos, start_line, start_col);
378                    return Err(self.make_error(
379                        format!("invalid \\u escape: expected hex digit, got {c:?}"),
380                        span,
381                    ));
382                }
383                None => {
384                    let span = self.span_from(start_pos, start_line, start_col);
385                    return Err(self.make_error("unterminated \\u escape", span));
386                }
387            }
388        }
389        let code = u32::from_str_radix(&hex, 16).unwrap();
390        char::from_u32(code).ok_or_else(|| {
391            let span = self.span_from(start_pos, start_line, start_col);
392            self.make_error(format!("invalid unicode code point: \\u{hex}"), span)
393        })
394    }
395
396    // ── Character literal `\X` ────────────────────────────────────────────
397
398    fn lex_char_literal(
399        &mut self,
400        start_pos: usize,
401        start_line: u32,
402        start_col: u32,
403    ) -> CljxResult<(Token, Span)> {
404        self.advance(); // consume '\'
405
406        // Peek ahead at all symbol-constituent chars to figure out the name.
407        let rest_start = self.pos;
408        let rest: String = self.source[rest_start..]
409            .chars()
410            .take_while(|&c| c.is_alphanumeric() || c == '-')
411            .collect();
412
413        let ch = match rest.as_str() {
414            "newline" => {
415                self.pos += "newline".len();
416                self.col += "newline".len() as u32;
417                '\n'
418            }
419            "space" => {
420                self.pos += "space".len();
421                self.col += "space".len() as u32;
422                ' '
423            }
424            "tab" => {
425                self.pos += "tab".len();
426                self.col += "tab".len() as u32;
427                '\t'
428            }
429            "backspace" => {
430                self.pos += "backspace".len();
431                self.col += "backspace".len() as u32;
432                '\x08'
433            }
434            "formfeed" => {
435                self.pos += "formfeed".len();
436                self.col += "formfeed".len() as u32;
437                '\x0C'
438            }
439            "return" => {
440                self.pos += "return".len();
441                self.col += "return".len() as u32;
442                '\r'
443            }
444            _ if rest.starts_with('u') && rest.len() >= 5 => {
445                // Try \uXXXX
446                let hex_part = &rest[1..5];
447                if hex_part.chars().all(|c| c.is_ascii_hexdigit()) {
448                    let code = u32::from_str_radix(hex_part, 16).unwrap();
449                    let c = char::from_u32(code).ok_or_else(|| {
450                        let span = self.span_from(start_pos, start_line, start_col);
451                        self.make_error(
452                            format!("invalid unicode code point in char literal: \\u{hex_part}"),
453                            span,
454                        )
455                    })?;
456                    // advance 5 bytes: 'u' + 4 hex digits
457                    self.pos += 5;
458                    self.col += 5;
459                    c
460                } else {
461                    let span = self.span_from(start_pos, start_line, start_col);
462                    return Err(self.make_error(format!("unknown character name: {rest}"), span));
463                }
464            }
465            _ if rest.len() == 1 => {
466                // Single ASCII or first char
467                let c = self.source[rest_start..].chars().next().unwrap();
468                self.pos += c.len_utf8();
469                self.col += c.len_utf8() as u32;
470                c
471            }
472            _ if rest.is_empty() => {
473                // Nothing after backslash — try a single non-alphanumeric char
474                match self.source[rest_start..].chars().next() {
475                    Some(c) => {
476                        self.pos += c.len_utf8();
477                        self.col += c.len_utf8() as u32;
478                        c
479                    }
480                    None => {
481                        let span = self.span_from(start_pos, start_line, start_col);
482                        return Err(self.make_error("unexpected end of file after \\", span));
483                    }
484                }
485            }
486            _ => {
487                let span = self.span_from(start_pos, start_line, start_col);
488                return Err(self.make_error(format!("unknown character name: {rest}"), span));
489            }
490        };
491
492        Ok((
493            Token::Char(ch),
494            self.span_from(start_pos, start_line, start_col),
495        ))
496    }
497
498    // ── Keyword ───────────────────────────────────────────────────────────
499
500    fn lex_keyword(
501        &mut self,
502        start_pos: usize,
503        start_line: u32,
504        start_col: u32,
505    ) -> CljxResult<(Token, Span)> {
506        self.advance(); // consume first ':'
507        if self.peek() == Some(':') {
508            self.advance(); // consume second ':'
509            let name = self.read_symbol_chars();
510            if name.is_empty() {
511                let span = self.span_from(start_pos, start_line, start_col);
512                return Err(self.make_error("empty auto-resolved keyword", span));
513            }
514            Ok((
515                Token::AutoKeyword(name),
516                self.span_from(start_pos, start_line, start_col),
517            ))
518        } else {
519            let name = self.read_symbol_chars();
520            if name.is_empty() {
521                let span = self.span_from(start_pos, start_line, start_col);
522                return Err(self.make_error("empty keyword", span));
523            }
524            Ok((
525                Token::Keyword(name),
526                self.span_from(start_pos, start_line, start_col),
527            ))
528        }
529    }
530
531    // ── Symbol (and nil/true/false) ────────────────────────────────────────
532
533    fn lex_symbol(
534        &mut self,
535        start_pos: usize,
536        start_line: u32,
537        start_col: u32,
538    ) -> CljxResult<(Token, Span)> {
539        let mut name = self.read_symbol_chars();
540
541        // Peek for a version suffix: `@<commit-hash>`.  We only consume the `@`
542        // when it is immediately followed by 7–40 hex characters so that a
543        // standalone `@expr` (deref reader macro) is never affected — deref
544        // always starts a *new* form where `@` is the first character, not a
545        // mid-symbol suffix.
546        if self.peek() == Some('@') {
547            let version_candidate = self.peek_version_hash();
548            if let Some(hash) = version_candidate {
549                self.advance(); // consume '@'
550                for _ in 0..hash.len() {
551                    self.advance();
552                }
553                name.push('@');
554                name.push_str(&hash);
555            }
556        }
557
558        let tok = match name.as_str() {
559            "nil" => Token::Nil,
560            "true" => Token::Bool(true),
561            "false" => Token::Bool(false),
562            _ => Token::Symbol(name),
563        };
564        Ok((tok, self.span_from(start_pos, start_line, start_col)))
565    }
566
567    /// Look ahead past the `@` that `peek()` just returned and collect
568    /// characters as long as they are ASCII hex digits, up to 40.  Returns
569    /// `Some(hash)` if the candidate is 7–40 hex chars followed by a
570    /// non-hex-digit (or EOF), `None` otherwise.  Does **not** advance the
571    /// cursor.
572    fn peek_version_hash(&self) -> Option<String> {
573        // Start one byte past the current `@`.
574        let at_byte = self.pos + 1; // '@' is single-byte ASCII
575        let rest = &self.source[at_byte..];
576        let hash: String = rest
577            .chars()
578            .take(40)
579            .take_while(|c| c.is_ascii_hexdigit())
580            .collect();
581        if hash.len() >= 7 {
582            // Make sure the character after the hash is a delimiter (or EOF).
583            let after = rest[hash.len()..].chars().next();
584            let is_delimited = after.is_none_or(|c| !c.is_ascii_hexdigit());
585            if is_delimited {
586                return Some(hash);
587            }
588        }
589        None
590    }
591
592    // ── Number ────────────────────────────────────────────────────────────
593
594    fn lex_number(
595        &mut self,
596        start_pos: usize,
597        start_line: u32,
598        start_col: u32,
599    ) -> CljxResult<(Token, Span)> {
600        // Optional sign
601        let negative = match self.peek() {
602            Some('-') => {
603                self.advance();
604                true
605            }
606            Some('+') => {
607                self.advance();
608                false
609            }
610            _ => false,
611        };
612        let sign_str = if negative { "-" } else { "" };
613
614        // Integer part (decimal digits)
615        let mut int_part = String::new();
616        while let Some(c) = self.peek() {
617            if c.is_ascii_digit() {
618                int_part.push(c);
619                self.advance();
620            } else {
621                break;
622            }
623        }
624
625        // Hex literal: 0x / 0X  (also -0x…)
626        if int_part == "0" && matches!(self.peek(), Some('x') | Some('X')) {
627            self.advance(); // consume 'x'/'X'
628            let mut hex = String::new();
629            while let Some(c) = self.peek() {
630                if c.is_ascii_hexdigit() {
631                    hex.push(c);
632                    self.advance();
633                } else {
634                    break;
635                }
636            }
637            if hex.is_empty() {
638                let span = self.span_from(start_pos, start_line, start_col);
639                return Err(self.make_error("expected hex digits after 0x", span));
640            }
641            let value = u128::from_str_radix(&hex, 16).unwrap_or(u128::MAX);
642            let span = self.span_from(start_pos, start_line, start_col);
643            return if negative {
644                // -0x8000000000000000 == i64::MIN is valid; anything larger overflows.
645                if value <= (i64::MAX as u128) + 1 {
646                    Ok((Token::Int(0i64.wrapping_sub(value as i64)), span))
647                } else {
648                    // Store as signed decimal string for BigInt.
649                    Ok((Token::BigInt(format!("-{value}")), span))
650                }
651            } else if value <= i64::MAX as u128 {
652                Ok((Token::Int(value as i64), span))
653            } else {
654                Ok((Token::BigInt(value.to_string()), span))
655            };
656        }
657
658        // Radix literal: NNrDIGITS
659        if matches!(self.peek(), Some('r') | Some('R')) {
660            let radix: u32 = int_part.parse().unwrap_or(0);
661            self.advance(); // consume 'r'/'R'
662            let mut digits = String::new();
663            while let Some(c) = self.peek() {
664                if c.is_ascii_alphanumeric() {
665                    digits.push(c);
666                    self.advance();
667                } else {
668                    break;
669                }
670            }
671            let mut value: u128 = 0;
672            for c in digits.chars() {
673                let d = c.to_digit(radix).ok_or_else(|| {
674                    let span = self.span_from(start_pos, start_line, start_col);
675                    self.make_error(format!("invalid digit {c:?} for radix {radix}"), span)
676                })?;
677                value = value.wrapping_mul(radix as u128).wrapping_add(d as u128);
678            }
679            if negative {
680                // Check if it fits as negative i64
681                if value <= (i64::MAX as u128) + 1 {
682                    let signed = -(value as i64);
683                    return Ok((
684                        Token::Int(signed),
685                        self.span_from(start_pos, start_line, start_col),
686                    ));
687                } else {
688                    // Store as decimal string with sign
689                    return Ok((
690                        Token::BigInt(format!("-{value}")),
691                        self.span_from(start_pos, start_line, start_col),
692                    ));
693                }
694            } else if value <= i64::MAX as u128 {
695                return Ok((
696                    Token::Int(value as i64),
697                    self.span_from(start_pos, start_line, start_col),
698                ));
699            } else {
700                return Ok((
701                    Token::BigInt(value.to_string()),
702                    self.span_from(start_pos, start_line, start_col),
703                ));
704            }
705        }
706
707        // BigInt suffix 'N'
708        if self.peek() == Some('N') {
709            self.advance();
710            return Ok((
711                Token::BigInt(format!("{sign_str}{int_part}")),
712                self.span_from(start_pos, start_line, start_col),
713            ));
714        }
715
716        // BigDecimal suffix 'M' on integer literal (e.g. 4M)
717        if self.peek() == Some('M') {
718            self.advance();
719            return Ok((
720                Token::BigDecimal(format!("{sign_str}{int_part}")),
721                self.span_from(start_pos, start_line, start_col),
722            ));
723        }
724
725        // Float: decimal point or exponent
726        if matches!(self.peek(), Some('.') | Some('e') | Some('E')) {
727            let mut raw = format!("{sign_str}{int_part}");
728            if self.peek() == Some('.') {
729                raw.push('.');
730                self.advance();
731                while let Some(c) = self.peek() {
732                    if c.is_ascii_digit() {
733                        raw.push(c);
734                        self.advance();
735                    } else {
736                        break;
737                    }
738                }
739            }
740            if matches!(self.peek(), Some('e') | Some('E')) {
741                raw.push('e');
742                self.advance();
743                if matches!(self.peek(), Some('+') | Some('-')) {
744                    raw.push(self.peek().unwrap());
745                    self.advance();
746                }
747                while let Some(c) = self.peek() {
748                    if c.is_ascii_digit() {
749                        raw.push(c);
750                        self.advance();
751                    } else {
752                        break;
753                    }
754                }
755            }
756            // BigDecimal suffix 'M'
757            if self.peek() == Some('M') {
758                self.advance();
759                return Ok((
760                    Token::BigDecimal(raw),
761                    self.span_from(start_pos, start_line, start_col),
762                ));
763            }
764            let val: f64 = raw.parse().map_err(|_| {
765                let span = self.span_from(start_pos, start_line, start_col);
766                self.make_error(format!("invalid float: {raw}"), span)
767            })?;
768            return Ok((
769                Token::Float(val),
770                self.span_from(start_pos, start_line, start_col),
771            ));
772        }
773
774        // Ratio: INT/DIGITS — only if next char after '/' is a digit
775        if self.peek() == Some('/') && matches!(self.peek_next(), Some(c) if c.is_ascii_digit()) {
776            self.advance(); // consume '/'
777            let mut denom = String::new();
778            while let Some(c) = self.peek() {
779                if c.is_ascii_digit() {
780                    denom.push(c);
781                    self.advance();
782                } else {
783                    break;
784                }
785            }
786            return Ok((
787                Token::Ratio(format!("{sign_str}{int_part}/{denom}")),
788                self.span_from(start_pos, start_line, start_col),
789            ));
790        }
791
792        // Plain integer
793        let full = format!("{sign_str}{int_part}");
794        match full.parse::<i64>() {
795            Ok(n) => Ok((
796                Token::Int(n),
797                self.span_from(start_pos, start_line, start_col),
798            )),
799            Err(_) => {
800                // Overflow: store decimal string
801                Ok((
802                    Token::BigInt(full),
803                    self.span_from(start_pos, start_line, start_col),
804                ))
805            }
806        }
807    }
808
809    // ── Top-level token dispatch ──────────────────────────────────────────
810
811    pub fn next_token(&mut self) -> CljxResult<(Token, Span)> {
812        self.skip_whitespace_and_comments();
813
814        let start_pos = self.pos;
815        let start_line = self.line;
816        let start_col = self.col;
817
818        let ch = match self.peek() {
819            None => {
820                return Ok((Token::Eof, self.span_from(start_pos, start_line, start_col)));
821            }
822            Some(c) => c,
823        };
824
825        match ch {
826            '(' => {
827                self.advance();
828                Ok((
829                    Token::LParen,
830                    self.span_from(start_pos, start_line, start_col),
831                ))
832            }
833            ')' => {
834                self.advance();
835                Ok((
836                    Token::RParen,
837                    self.span_from(start_pos, start_line, start_col),
838                ))
839            }
840            '[' => {
841                self.advance();
842                Ok((
843                    Token::LBracket,
844                    self.span_from(start_pos, start_line, start_col),
845                ))
846            }
847            ']' => {
848                self.advance();
849                Ok((
850                    Token::RBracket,
851                    self.span_from(start_pos, start_line, start_col),
852                ))
853            }
854            '{' => {
855                self.advance();
856                Ok((
857                    Token::LBrace,
858                    self.span_from(start_pos, start_line, start_col),
859                ))
860            }
861            '}' => {
862                self.advance();
863                Ok((
864                    Token::RBrace,
865                    self.span_from(start_pos, start_line, start_col),
866                ))
867            }
868            '\'' => {
869                self.advance();
870                Ok((
871                    Token::Quote,
872                    self.span_from(start_pos, start_line, start_col),
873                ))
874            }
875            '`' => {
876                self.advance();
877                Ok((
878                    Token::SyntaxQuote,
879                    self.span_from(start_pos, start_line, start_col),
880                ))
881            }
882            '@' => {
883                self.advance();
884                Ok((
885                    Token::Deref,
886                    self.span_from(start_pos, start_line, start_col),
887                ))
888            }
889            '^' => {
890                self.advance();
891                Ok((
892                    Token::Meta,
893                    self.span_from(start_pos, start_line, start_col),
894                ))
895            }
896            '~' => self.lex_unquote(start_pos, start_line, start_col),
897            '#' => self.lex_hash(start_pos, start_line, start_col),
898            '"' => self.lex_string(start_pos, start_line, start_col),
899            '\\' => self.lex_char_literal(start_pos, start_line, start_col),
900            ':' => self.lex_keyword(start_pos, start_line, start_col),
901            c if c.is_ascii_digit() => self.lex_number(start_pos, start_line, start_col),
902            '+' | '-' if matches!(self.peek_next(), Some(d) if d.is_ascii_digit()) => {
903                self.lex_number(start_pos, start_line, start_col)
904            }
905            c if is_symbol_start(c) => self.lex_symbol(start_pos, start_line, start_col),
906            // '+' and '-' alone (or before non-digit) are symbols
907            '+' | '-' => self.lex_symbol(start_pos, start_line, start_col),
908            c => {
909                self.advance();
910                let span = self.span_from(start_pos, start_line, start_col);
911                Err(self.make_error(format!("unexpected character: {c:?}"), span))
912            }
913        }
914    }
915}
916
917impl Iterator for Lexer {
918    type Item = CljxResult<(Token, Span)>;
919
920    fn next(&mut self) -> Option<Self::Item> {
921        match self.next_token() {
922            Ok((Token::Eof, _)) => None,
923            result => Some(result),
924        }
925    }
926}
927
928// ─── Tests ────────────────────────────────────────────────────────────────────
929
930#[cfg(test)]
931mod tests {
932    use super::*;
933
934    fn lex_all(src: &str) -> Vec<Token> {
935        Lexer::new(src.to_string(), "<test>".to_string())
936            .map(|r: CljxResult<(Token, Span)>| r.expect("lex error").0)
937            .collect()
938    }
939
940    fn lex_one(src: &str) -> Token {
941        let mut l = Lexer::new(src.to_string(), "<test>".to_string());
942        l.next_token().expect("lex error").0
943    }
944
945    fn lex_err(src: &str) -> String {
946        let mut l = Lexer::new(src.to_string(), "<test>".to_string());
947        loop {
948            match l.next_token() {
949                Err(CljxError::ReadError { message, .. }) => return message,
950                Err(e) => panic!("unexpected error type: {e}"),
951                Ok((Token::Eof, _)) => panic!("expected an error but got Eof"),
952                Ok(_) => {}
953            }
954        }
955    }
956
957    // ── nil / bool ────────────────────────────────────────────────────────
958
959    #[test]
960    fn test_nil() {
961        assert_eq!(lex_one("nil"), Token::Nil);
962    }
963
964    #[test]
965    fn test_bool() {
966        assert_eq!(lex_one("true"), Token::Bool(true));
967        assert_eq!(lex_one("false"), Token::Bool(false));
968    }
969
970    // ── Integers ──────────────────────────────────────────────────────────
971
972    #[test]
973    fn test_int_plain() {
974        assert_eq!(lex_one("42"), Token::Int(42));
975        assert_eq!(lex_one("-42"), Token::Int(-42));
976        assert_eq!(lex_one("+42"), Token::Int(42));
977        assert_eq!(lex_one("0"), Token::Int(0));
978    }
979
980    #[test]
981    fn test_bigint_suffix() {
982        assert_eq!(lex_one("42N"), Token::BigInt("42".to_string()));
983        assert_eq!(lex_one("-42N"), Token::BigInt("-42".to_string()));
984    }
985
986    #[test]
987    fn test_hex_literal() {
988        assert_eq!(lex_one("0xff"), Token::Int(255));
989        assert_eq!(lex_one("0xFF"), Token::Int(255));
990        assert_eq!(lex_one("0x0"), Token::Int(0));
991        assert_eq!(lex_one("0x7FFFFFFFFFFFFFFF"), Token::Int(i64::MAX));
992        assert_eq!(lex_one("-0x8000000000000000"), Token::Int(i64::MIN));
993        assert_eq!(lex_one("-0xff"), Token::Int(-255));
994        // Overflow → BigInt
995        match lex_one("0xFFFFFFFFFFFFFFFF") {
996            Token::BigInt(_) => {}
997            other => panic!("expected BigInt for 0xFFFF…, got {other:?}"),
998        }
999    }
1000
1001    #[test]
1002    fn test_radix() {
1003        assert_eq!(lex_one("2r1010"), Token::Int(10));
1004        assert_eq!(lex_one("8r77"), Token::Int(63));
1005        assert_eq!(lex_one("16rFF"), Token::Int(255));
1006        assert_eq!(lex_one("16rff"), Token::Int(255));
1007        assert_eq!(lex_one("36rZ"), Token::Int(35));
1008    }
1009
1010    #[test]
1011    fn test_radix_overflow() {
1012        // 2^64 fits in u128 but not i64
1013        let tok = lex_one("10r18446744073709551616");
1014        match tok {
1015            Token::BigInt(_) => {}
1016            other => panic!("expected BigInt, got {other:?}"),
1017        }
1018    }
1019
1020    // ── Floats ────────────────────────────────────────────────────────────
1021
1022    #[test]
1023    #[allow(clippy::approx_constant)]
1024    fn test_floats() {
1025        assert_eq!(lex_one("3.14"), Token::Float(3.14));
1026        assert_eq!(lex_one("1e10"), Token::Float(1e10));
1027        assert_eq!(lex_one("1.5e-3"), Token::Float(1.5e-3));
1028        assert_eq!(lex_one("-0.5"), Token::Float(-0.5));
1029    }
1030
1031    #[test]
1032    fn test_bigdecimal() {
1033        assert_eq!(lex_one("3.14M"), Token::BigDecimal("3.14".to_string()));
1034        assert_eq!(lex_one("1e5M"), Token::BigDecimal("1e5".to_string()));
1035    }
1036
1037    // ── Ratio ────────────────────────────────────────────────────────────
1038
1039    #[test]
1040    fn test_ratio() {
1041        assert_eq!(lex_one("3/4"), Token::Ratio("3/4".to_string()));
1042        assert_eq!(lex_one("-1/2"), Token::Ratio("-1/2".to_string()));
1043    }
1044
1045    #[test]
1046    fn test_ratio_vs_symbol() {
1047        // "3/foo" should lex as Int(3) then Symbol("/foo") — not a ratio
1048        let toks = lex_all("3/foo");
1049        assert_eq!(toks[0], Token::Int(3));
1050        assert_eq!(toks[1], Token::Symbol("/foo".to_string()));
1051    }
1052
1053    // ── Char literals ────────────────────────────────────────────────────
1054
1055    #[test]
1056    fn test_char_simple() {
1057        assert_eq!(lex_one("\\a"), Token::Char('a'));
1058    }
1059
1060    #[test]
1061    fn test_char_named() {
1062        assert_eq!(lex_one("\\newline"), Token::Char('\n'));
1063        assert_eq!(lex_one("\\space"), Token::Char(' '));
1064        assert_eq!(lex_one("\\tab"), Token::Char('\t'));
1065        assert_eq!(lex_one("\\backspace"), Token::Char('\x08'));
1066        assert_eq!(lex_one("\\formfeed"), Token::Char('\x0C'));
1067        assert_eq!(lex_one("\\return"), Token::Char('\r'));
1068    }
1069
1070    #[test]
1071    fn test_char_unicode() {
1072        assert_eq!(lex_one("\\u0041"), Token::Char('A'));
1073        assert_eq!(lex_one("\\u00e9"), Token::Char('é'));
1074    }
1075
1076    // ── Strings ──────────────────────────────────────────────────────────
1077
1078    #[test]
1079    fn test_string_basic() {
1080        assert_eq!(lex_one("\"hello\""), Token::Str("hello".to_string()));
1081    }
1082
1083    #[test]
1084    fn test_string_escapes() {
1085        assert_eq!(
1086            lex_one(r#""\n\t\r\b\f\\\"" "#),
1087            Token::Str("\n\t\r\x08\x0C\\\"".to_string())
1088        );
1089    }
1090
1091    #[test]
1092    fn test_string_unicode_escape() {
1093        assert_eq!(lex_one("\"\\u0041\""), Token::Str("A".to_string()));
1094    }
1095
1096    // ── Symbols ──────────────────────────────────────────────────────────
1097
1098    #[test]
1099    fn test_symbols() {
1100        assert_eq!(lex_one("foo"), Token::Symbol("foo".to_string()));
1101        assert_eq!(lex_one("ns/name"), Token::Symbol("ns/name".to_string()));
1102        assert_eq!(lex_one("/"), Token::Symbol("/".to_string()));
1103        assert_eq!(lex_one(".."), Token::Symbol("..".to_string()));
1104        assert_eq!(lex_one(".method"), Token::Symbol(".method".to_string()));
1105        assert_eq!(lex_one("+"), Token::Symbol("+".to_string()));
1106        assert_eq!(lex_one("-"), Token::Symbol("-".to_string()));
1107        assert_eq!(lex_one("+foo"), Token::Symbol("+foo".to_string()));
1108    }
1109
1110    // ── Keywords ─────────────────────────────────────────────────────────
1111
1112    #[test]
1113    fn test_keyword() {
1114        assert_eq!(lex_one(":foo"), Token::Keyword("foo".to_string()));
1115        assert_eq!(lex_one(":ns/name"), Token::Keyword("ns/name".to_string()));
1116    }
1117
1118    #[test]
1119    fn test_auto_keyword() {
1120        assert_eq!(lex_one("::foo"), Token::AutoKeyword("foo".to_string()));
1121        assert_eq!(
1122            lex_one("::ns/alias"),
1123            Token::AutoKeyword("ns/alias".to_string())
1124        );
1125    }
1126
1127    // ── Delimiters ───────────────────────────────────────────────────────
1128
1129    #[test]
1130    fn test_delimiters() {
1131        assert_eq!(
1132            lex_all("([{}])"),
1133            vec![
1134                Token::LParen,
1135                Token::LBracket,
1136                Token::LBrace,
1137                Token::RBrace,
1138                Token::RBracket,
1139                Token::RParen,
1140            ]
1141        );
1142    }
1143
1144    // ── Reader macros ────────────────────────────────────────────────────
1145
1146    #[test]
1147    fn test_reader_macros() {
1148        assert_eq!(lex_one("'x"), Token::Quote);
1149        assert_eq!(lex_one("`x"), Token::SyntaxQuote);
1150        assert_eq!(lex_one("~x"), Token::Unquote);
1151        assert_eq!(lex_one("~@x"), Token::UnquoteSplice);
1152        assert_eq!(lex_one("@x"), Token::Deref);
1153        assert_eq!(lex_one("^x"), Token::Meta);
1154    }
1155
1156    // ── `#` dispatch ─────────────────────────────────────────────────────
1157
1158    #[test]
1159    fn test_hash_dispatch() {
1160        assert_eq!(lex_one("#("), Token::HashFn);
1161        assert_eq!(lex_one("#{"), Token::HashSet);
1162        assert_eq!(lex_one("#'"), Token::HashVar);
1163        assert_eq!(lex_one("#_"), Token::HashDiscard);
1164        assert_eq!(lex_one("#?"), Token::ReaderCond);
1165        assert_eq!(lex_one("#?@"), Token::ReaderCondSplice);
1166    }
1167
1168    #[test]
1169    fn test_regex() {
1170        assert_eq!(lex_one("#\"[a-z]+\""), Token::Regex("[a-z]+".to_string()));
1171    }
1172
1173    #[test]
1174    fn test_symbolic() {
1175        assert_eq!(lex_one("##Inf"), Token::Symbolic("Inf".to_string()));
1176        assert_eq!(lex_one("##-Inf"), Token::Symbolic("-Inf".to_string()));
1177        assert_eq!(lex_one("##NaN"), Token::Symbolic("NaN".to_string()));
1178    }
1179
1180    #[test]
1181    fn test_tagged_literal() {
1182        assert_eq!(lex_one("#mytag"), Token::TaggedLiteral("mytag".to_string()));
1183    }
1184
1185    // ── Multi-token ──────────────────────────────────────────────────────
1186
1187    #[test]
1188    fn test_multi_token() {
1189        let toks = lex_all("(+ 1 2)");
1190        assert_eq!(
1191            toks,
1192            vec![
1193                Token::LParen,
1194                Token::Symbol("+".to_string()),
1195                Token::Int(1),
1196                Token::Int(2),
1197                Token::RParen,
1198            ]
1199        );
1200    }
1201
1202    // ── Whitespace / comments ────────────────────────────────────────────
1203
1204    #[test]
1205    fn test_comma_skipped() {
1206        assert_eq!(lex_all("{,,,}"), vec![Token::LBrace, Token::RBrace]);
1207    }
1208
1209    #[test]
1210    fn test_comment_skipped() {
1211        assert_eq!(lex_all("; this is a comment\n42"), vec![Token::Int(42)]);
1212    }
1213
1214    #[test]
1215    fn test_shebang_skipped() {
1216        assert_eq!(lex_all("#!/usr/bin/env cljx\n42"), vec![Token::Int(42)]);
1217    }
1218
1219    // ── Span tracking ────────────────────────────────────────────────────
1220
1221    #[test]
1222    fn test_span_col() {
1223        let mut l = Lexer::new("  foo".to_string(), "<test>".to_string());
1224        let (_tok, span) = l.next_token().unwrap();
1225        assert_eq!(span.start, 2);
1226        assert_eq!(span.col, 3);
1227    }
1228
1229    #[test]
1230    fn test_span_newline() {
1231        let mut l = Lexer::new("a\nb".to_string(), "<test>".to_string());
1232        l.next_token().unwrap(); // consume 'a'
1233        let (_tok, span) = l.next_token().unwrap(); // 'b'
1234        assert_eq!(span.line, 2);
1235        assert_eq!(span.col, 1);
1236    }
1237
1238    // ── Errors ───────────────────────────────────────────────────────────
1239
1240    #[test]
1241    fn test_error_unterminated_string() {
1242        let msg = lex_err("\"unterminated");
1243        assert!(msg.contains("unterminated string"));
1244    }
1245
1246    #[test]
1247    fn test_error_bad_hash_dispatch() {
1248        // '#1' is invalid: '1' is not a symbol start and not a special dispatch char
1249        let msg = lex_err("#1");
1250        assert!(msg.contains("unknown # dispatch"));
1251    }
1252
1253    #[test]
1254    fn test_error_bad_unicode_escape_in_string() {
1255        let msg = lex_err("\"\\uGHIJ\"");
1256        assert!(msg.contains("invalid") || msg.contains("hex"));
1257    }
1258
1259    #[test]
1260    fn test_error_unknown_char_name() {
1261        let msg = lex_err("\\bogus");
1262        assert!(msg.contains("unknown character name"));
1263    }
1264
1265    #[test]
1266    fn test_error_unknown_symbolic() {
1267        let msg = lex_err("##Bogus");
1268        assert!(msg.contains("unknown symbolic value"));
1269    }
1270
1271    #[test]
1272    fn test_error_bad_string_escape() {
1273        let msg = lex_err("\"\\q\"");
1274        assert!(msg.contains("unknown string escape"));
1275    }
1276}