// cljrs_reader/lexer.rs — tokenizer for the Clojure-flavoured reader.

1// CljxError embeds NamedSource<String> for miette diagnostics, which is
2// unavoidably large. Suppress the false-positive for every returning function.
3#![allow(clippy::result_large_err)]
4
5use std::sync::Arc;
6
7use miette::NamedSource;
8
9use cljrs_types::error::{CljxError, CljxResult};
10use cljrs_types::span::Span;
11
12use crate::token::Token;
13
14// ─── Character classification ─────────────────────────────────────────────────
15
/// Returns `true` if `ch` is a valid constituent character for a symbol or
/// keyword.
///
/// Defined by *exclusion*: any character that is not whitespace, a comma,
/// a collection delimiter, or reserved reader syntax is a constituent.
fn is_symbol_char(ch: char) -> bool {
    const EXCLUDED: &[char] = &[
        ' ', '\t', '\n', '\r', ',', '(', ')', '[', ']', '{', '}', '"', ';', '`', '~', '^', '@',
        '#', '\\', ':',
    ];
    !EXCLUDED.contains(&ch)
}
43
44/// Returns `true` if `ch` can *start* a symbol (not a digit, not `+`/`-` when
45/// the following char is a digit — but the caller handles the `+`/`-` case).
46fn is_symbol_start(ch: char) -> bool {
47    is_symbol_char(ch) && !ch.is_ascii_digit()
48}
49
50// ─── Lexer ───────────────────────────────────────────────────────────────────
51
/// Streaming tokenizer over a single in-memory source string.
pub struct Lexer {
    source: Arc<String>, // full source text; shared with spans and error reports
    file: Arc<String>,   // file name used in diagnostics
    pos: usize, // byte offset, always on a char boundary
    line: u32,  // 1-based
    col: u32,   // 1-based byte offset from line start
}
59
60impl Lexer {
61    pub fn new(source: String, file: String) -> Self {
62        Self {
63            source: Arc::new(source),
64            file: Arc::new(file),
65            pos: 0,
66            line: 1,
67            col: 1,
68        }
69    }
70
71    // ── Public getters ────────────────────────────────────────────────────
72
    /// Shared handle to the complete source text.
    pub fn source(&self) -> &Arc<String> {
        &self.source
    }
76
    /// Shared handle to the file name used in diagnostics.
    pub fn file(&self) -> &Arc<String> {
        &self.file
    }
80
81    // ── Low-level helpers ─────────────────────────────────────────────────
82
    /// The character at the current position, without consuming it.
    /// `None` at end of input.
    fn peek(&self) -> Option<char> {
        self.source[self.pos..].chars().next()
    }
86
87    fn peek_next(&self) -> Option<char> {
88        let mut chars = self.source[self.pos..].chars();
89        chars.next(); // skip current
90        chars.next()
91    }
92
93    fn advance(&mut self) -> Option<char> {
94        let ch = self.peek()?;
95        self.pos += ch.len_utf8();
96        if ch == '\n' {
97            self.line += 1;
98            self.col = 1;
99        } else {
100            self.col += ch.len_utf8() as u32;
101        }
102        Some(ch)
103    }
104
    /// Build a `Span` from a remembered start position up to the lexer's
    /// current position.
    fn span_from(&self, start_pos: usize, start_line: u32, start_col: u32) -> Span {
        Span::new(
            Arc::clone(&self.file),
            start_pos,
            self.pos,
            start_line,
            start_col,
        )
    }
114
    /// Construct a `ReadError` carrying `span` plus a copy of the source so
    /// miette can render the offending snippet.
    fn make_error(&self, msg: impl Into<String>, span: Span) -> CljxError {
        CljxError::ReadError {
            message: msg.into(),
            span: Some(miette::SourceSpan::from(span)),
            src: NamedSource::new((*self.file).clone(), (*self.source).clone()),
        }
    }
122
123    /// Consume characters while `is_symbol_char` holds, returning the collected
124    /// string.
125    fn read_symbol_chars(&mut self) -> String {
126        let mut buf = String::new();
127        while let Some(ch) = self.peek() {
128            if is_symbol_char(ch) {
129                buf.push(ch);
130                self.advance();
131            } else {
132                break;
133            }
134        }
135        buf
136    }
137
138    // ── Whitespace / comment skipping ─────────────────────────────────────
139
    /// Skip whitespace (commas count as whitespace, Clojure-style), line
    /// comments (`;` through end of line), and an optional shebang line at
    /// the very start of the file.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            match self.peek() {
                // Shebang: only recognised at the very start of the file.
                Some('#') if self.pos == 0 => {
                    if self.peek_next() == Some('!') {
                        // skip to end of line
                        while let Some(ch) = self.advance() {
                            if ch == '\n' {
                                break;
                            }
                        }
                    } else {
                        break; // '#' is meaningful, stop skipping
                    }
                }
                // Insignificant separators.
                Some(' ') | Some('\t') | Some('\r') | Some('\n') | Some(',') => {
                    self.advance();
                }
                // Line comment: consume through the terminating newline.
                Some(';') => {
                    while let Some(ch) = self.advance() {
                        if ch == '\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
        }
    }
170
171    // ── `~` (unquote / unquote-splicing) ─────────────────────────────────
172
173    fn lex_unquote(
174        &mut self,
175        start_pos: usize,
176        start_line: u32,
177        start_col: u32,
178    ) -> CljxResult<(Token, Span)> {
179        self.advance(); // consume '~'
180        if self.peek() == Some('@') {
181            self.advance();
182            Ok((
183                Token::UnquoteSplice,
184                self.span_from(start_pos, start_line, start_col),
185            ))
186        } else {
187            Ok((
188                Token::Unquote,
189                self.span_from(start_pos, start_line, start_col),
190            ))
191        }
192    }
193
194    // ── `#` dispatch ──────────────────────────────────────────────────────
195
196    fn lex_hash(
197        &mut self,
198        start_pos: usize,
199        start_line: u32,
200        start_col: u32,
201    ) -> CljxResult<(Token, Span)> {
202        self.advance(); // consume '#'
203        match self.peek() {
204            Some('(') => {
205                self.advance();
206                Ok((
207                    Token::HashFn,
208                    self.span_from(start_pos, start_line, start_col),
209                ))
210            }
211            Some('{') => {
212                self.advance();
213                Ok((
214                    Token::HashSet,
215                    self.span_from(start_pos, start_line, start_col),
216                ))
217            }
218            Some('\'') => {
219                self.advance();
220                Ok((
221                    Token::HashVar,
222                    self.span_from(start_pos, start_line, start_col),
223                ))
224            }
225            Some('_') => {
226                self.advance();
227                Ok((
228                    Token::HashDiscard,
229                    self.span_from(start_pos, start_line, start_col),
230                ))
231            }
232            Some('"') => self.lex_regex(start_pos, start_line, start_col),
233            Some('?') => {
234                self.advance(); // consume '?'
235                if self.peek() == Some('@') {
236                    self.advance();
237                    Ok((
238                        Token::ReaderCondSplice,
239                        self.span_from(start_pos, start_line, start_col),
240                    ))
241                } else {
242                    Ok((
243                        Token::ReaderCond,
244                        self.span_from(start_pos, start_line, start_col),
245                    ))
246                }
247            }
248            Some('#') => self.lex_symbolic(start_pos, start_line, start_col),
249            Some(c) if is_symbol_start(c) => {
250                let name = self.read_symbol_chars();
251                Ok((
252                    Token::TaggedLiteral(name),
253                    self.span_from(start_pos, start_line, start_col),
254                ))
255            }
256            other => {
257                let span = self.span_from(start_pos, start_line, start_col);
258                Err(self.make_error(format!("unknown # dispatch character: {:?}", other), span))
259            }
260        }
261    }
262
263    fn lex_regex(
264        &mut self,
265        start_pos: usize,
266        start_line: u32,
267        start_col: u32,
268    ) -> CljxResult<(Token, Span)> {
269        self.advance(); // consume opening '"'
270        let mut buf = String::new();
271        loop {
272            match self.advance() {
273                None => {
274                    let span = self.span_from(start_pos, start_line, start_col);
275                    return Err(self.make_error("unterminated regex literal", span));
276                }
277                Some('"') => break,
278                Some('\\') => {
279                    // Store escape verbatim (two chars) — no processing.
280                    buf.push('\\');
281                    match self.advance() {
282                        Some(c) => buf.push(c),
283                        None => {
284                            let span = self.span_from(start_pos, start_line, start_col);
285                            return Err(self.make_error("unterminated regex literal", span));
286                        }
287                    }
288                }
289                Some(c) => buf.push(c),
290            }
291        }
292        Ok((
293            Token::Regex(buf),
294            self.span_from(start_pos, start_line, start_col),
295        ))
296    }
297
298    fn lex_symbolic(
299        &mut self,
300        start_pos: usize,
301        start_line: u32,
302        start_col: u32,
303    ) -> CljxResult<(Token, Span)> {
304        self.advance(); // consume second '#'
305        let name = self.read_symbol_chars();
306        match name.as_str() {
307            "Inf" | "-Inf" | "NaN" => Ok((
308                Token::Symbolic(name),
309                self.span_from(start_pos, start_line, start_col),
310            )),
311            _ => {
312                let span = self.span_from(start_pos, start_line, start_col);
313                Err(self.make_error(format!("unknown symbolic value: ##{name}"), span))
314            }
315        }
316    }
317
318    // ── String literal ────────────────────────────────────────────────────
319
    /// Lex a double-quoted string literal, translating the escape sequences
    /// `\n`, `\t`, `\r`, `\b`, `\f`, `\\`, `\"`, and `\uXXXX`.
    fn lex_string(
        &mut self,
        start_pos: usize,
        start_line: u32,
        start_col: u32,
    ) -> CljxResult<(Token, Span)> {
        self.advance(); // consume opening '"'
        let mut buf = String::new();
        loop {
            match self.advance() {
                None => {
                    let span = self.span_from(start_pos, start_line, start_col);
                    return Err(self.make_error("unterminated string literal", span));
                }
                Some('"') => break,
                // Escape sequence: translate to the escaped character.
                Some('\\') => match self.advance() {
                    Some('n') => buf.push('\n'),
                    Some('t') => buf.push('\t'),
                    Some('r') => buf.push('\r'),
                    Some('b') => buf.push('\x08'), // backspace
                    Some('f') => buf.push('\x0C'), // form feed
                    Some('\\') => buf.push('\\'),
                    Some('"') => buf.push('"'),
                    Some('u') => {
                        let ch = self.read_unicode_escape(start_pos, start_line, start_col)?;
                        buf.push(ch);
                    }
                    Some(c) => {
                        let span = self.span_from(start_pos, start_line, start_col);
                        return Err(self.make_error(format!("unknown string escape: \\{c}"), span));
                    }
                    None => {
                        let span = self.span_from(start_pos, start_line, start_col);
                        return Err(self.make_error("unterminated string literal", span));
                    }
                },
                Some(c) => buf.push(c),
            }
        }
        Ok((
            Token::Str(buf),
            self.span_from(start_pos, start_line, start_col),
        ))
    }
364
365    /// Read exactly 4 hex digits after `\u` and return the corresponding char.
366    fn read_unicode_escape(
367        &mut self,
368        start_pos: usize,
369        start_line: u32,
370        start_col: u32,
371    ) -> CljxResult<char> {
372        let mut hex = String::with_capacity(4);
373        for _ in 0..4 {
374            match self.advance() {
375                Some(c) if c.is_ascii_hexdigit() => hex.push(c),
376                Some(c) => {
377                    let span = self.span_from(start_pos, start_line, start_col);
378                    return Err(self.make_error(
379                        format!("invalid \\u escape: expected hex digit, got {c:?}"),
380                        span,
381                    ));
382                }
383                None => {
384                    let span = self.span_from(start_pos, start_line, start_col);
385                    return Err(self.make_error("unterminated \\u escape", span));
386                }
387            }
388        }
389        let code = u32::from_str_radix(&hex, 16).unwrap();
390        char::from_u32(code).ok_or_else(|| {
391            let span = self.span_from(start_pos, start_line, start_col);
392            self.make_error(format!("invalid unicode code point: \\u{hex}"), span)
393        })
394    }
395
396    // ── Character literal `\X` ────────────────────────────────────────────
397
398    fn lex_char_literal(
399        &mut self,
400        start_pos: usize,
401        start_line: u32,
402        start_col: u32,
403    ) -> CljxResult<(Token, Span)> {
404        self.advance(); // consume '\'
405
406        // Peek ahead at all symbol-constituent chars to figure out the name.
407        let rest_start = self.pos;
408        let rest: String = self.source[rest_start..]
409            .chars()
410            .take_while(|&c| c.is_alphanumeric() || c == '-')
411            .collect();
412
413        let ch = match rest.as_str() {
414            "newline" => {
415                self.pos += "newline".len();
416                self.col += "newline".len() as u32;
417                '\n'
418            }
419            "space" => {
420                self.pos += "space".len();
421                self.col += "space".len() as u32;
422                ' '
423            }
424            "tab" => {
425                self.pos += "tab".len();
426                self.col += "tab".len() as u32;
427                '\t'
428            }
429            "backspace" => {
430                self.pos += "backspace".len();
431                self.col += "backspace".len() as u32;
432                '\x08'
433            }
434            "formfeed" => {
435                self.pos += "formfeed".len();
436                self.col += "formfeed".len() as u32;
437                '\x0C'
438            }
439            "return" => {
440                self.pos += "return".len();
441                self.col += "return".len() as u32;
442                '\r'
443            }
444            _ if rest.starts_with('u') && rest.len() >= 5 => {
445                // Try \uXXXX
446                let hex_part = &rest[1..5];
447                if hex_part.chars().all(|c| c.is_ascii_hexdigit()) {
448                    let code = u32::from_str_radix(hex_part, 16).unwrap();
449                    let c = char::from_u32(code).ok_or_else(|| {
450                        let span = self.span_from(start_pos, start_line, start_col);
451                        self.make_error(
452                            format!("invalid unicode code point in char literal: \\u{hex_part}"),
453                            span,
454                        )
455                    })?;
456                    // advance 5 bytes: 'u' + 4 hex digits
457                    self.pos += 5;
458                    self.col += 5;
459                    c
460                } else {
461                    let span = self.span_from(start_pos, start_line, start_col);
462                    return Err(self.make_error(format!("unknown character name: {rest}"), span));
463                }
464            }
465            _ if rest.len() == 1 => {
466                // Single ASCII or first char
467                let c = self.source[rest_start..].chars().next().unwrap();
468                self.pos += c.len_utf8();
469                self.col += c.len_utf8() as u32;
470                c
471            }
472            _ if rest.is_empty() => {
473                // Nothing after backslash — try a single non-alphanumeric char
474                match self.source[rest_start..].chars().next() {
475                    Some(c) => {
476                        self.pos += c.len_utf8();
477                        self.col += c.len_utf8() as u32;
478                        c
479                    }
480                    None => {
481                        let span = self.span_from(start_pos, start_line, start_col);
482                        return Err(self.make_error("unexpected end of file after \\", span));
483                    }
484                }
485            }
486            _ => {
487                let span = self.span_from(start_pos, start_line, start_col);
488                return Err(self.make_error(format!("unknown character name: {rest}"), span));
489            }
490        };
491
492        Ok((
493            Token::Char(ch),
494            self.span_from(start_pos, start_line, start_col),
495        ))
496    }
497
498    // ── Keyword ───────────────────────────────────────────────────────────
499
500    fn lex_keyword(
501        &mut self,
502        start_pos: usize,
503        start_line: u32,
504        start_col: u32,
505    ) -> CljxResult<(Token, Span)> {
506        self.advance(); // consume first ':'
507        if self.peek() == Some(':') {
508            self.advance(); // consume second ':'
509            let name = self.read_symbol_chars();
510            if name.is_empty() {
511                let span = self.span_from(start_pos, start_line, start_col);
512                return Err(self.make_error("empty auto-resolved keyword", span));
513            }
514            Ok((
515                Token::AutoKeyword(name),
516                self.span_from(start_pos, start_line, start_col),
517            ))
518        } else {
519            let name = self.read_symbol_chars();
520            if name.is_empty() {
521                let span = self.span_from(start_pos, start_line, start_col);
522                return Err(self.make_error("empty keyword", span));
523            }
524            Ok((
525                Token::Keyword(name),
526                self.span_from(start_pos, start_line, start_col),
527            ))
528        }
529    }
530
531    // ── Symbol (and nil/true/false) ────────────────────────────────────────
532
533    fn lex_symbol(
534        &mut self,
535        start_pos: usize,
536        start_line: u32,
537        start_col: u32,
538    ) -> CljxResult<(Token, Span)> {
539        let name = self.read_symbol_chars();
540        let tok = match name.as_str() {
541            "nil" => Token::Nil,
542            "true" => Token::Bool(true),
543            "false" => Token::Bool(false),
544            _ => Token::Symbol(name),
545        };
546        Ok((tok, self.span_from(start_pos, start_line, start_col)))
547    }
548
549    // ── Number ────────────────────────────────────────────────────────────
550
551    fn lex_number(
552        &mut self,
553        start_pos: usize,
554        start_line: u32,
555        start_col: u32,
556    ) -> CljxResult<(Token, Span)> {
557        // Optional sign
558        let negative = match self.peek() {
559            Some('-') => {
560                self.advance();
561                true
562            }
563            Some('+') => {
564                self.advance();
565                false
566            }
567            _ => false,
568        };
569        let sign_str = if negative { "-" } else { "" };
570
571        // Integer part (decimal digits)
572        let mut int_part = String::new();
573        while let Some(c) = self.peek() {
574            if c.is_ascii_digit() {
575                int_part.push(c);
576                self.advance();
577            } else {
578                break;
579            }
580        }
581
582        // Hex literal: 0x / 0X  (also -0x…)
583        if int_part == "0" && matches!(self.peek(), Some('x') | Some('X')) {
584            self.advance(); // consume 'x'/'X'
585            let mut hex = String::new();
586            while let Some(c) = self.peek() {
587                if c.is_ascii_hexdigit() {
588                    hex.push(c);
589                    self.advance();
590                } else {
591                    break;
592                }
593            }
594            if hex.is_empty() {
595                let span = self.span_from(start_pos, start_line, start_col);
596                return Err(self.make_error("expected hex digits after 0x", span));
597            }
598            let value = u128::from_str_radix(&hex, 16).unwrap_or(u128::MAX);
599            let span = self.span_from(start_pos, start_line, start_col);
600            return if negative {
601                // -0x8000000000000000 == i64::MIN is valid; anything larger overflows.
602                if value <= (i64::MAX as u128) + 1 {
603                    Ok((Token::Int(0i64.wrapping_sub(value as i64)), span))
604                } else {
605                    // Store as signed decimal string for BigInt.
606                    Ok((Token::BigInt(format!("-{value}")), span))
607                }
608            } else if value <= i64::MAX as u128 {
609                Ok((Token::Int(value as i64), span))
610            } else {
611                Ok((Token::BigInt(value.to_string()), span))
612            };
613        }
614
615        // Radix literal: NNrDIGITS
616        if matches!(self.peek(), Some('r') | Some('R')) {
617            let radix: u32 = int_part.parse().unwrap_or(0);
618            self.advance(); // consume 'r'/'R'
619            let mut digits = String::new();
620            while let Some(c) = self.peek() {
621                if c.is_ascii_alphanumeric() {
622                    digits.push(c);
623                    self.advance();
624                } else {
625                    break;
626                }
627            }
628            let mut value: u128 = 0;
629            for c in digits.chars() {
630                let d = c.to_digit(radix).ok_or_else(|| {
631                    let span = self.span_from(start_pos, start_line, start_col);
632                    self.make_error(format!("invalid digit {c:?} for radix {radix}"), span)
633                })?;
634                value = value.wrapping_mul(radix as u128).wrapping_add(d as u128);
635            }
636            if negative {
637                // Check if it fits as negative i64
638                if value <= (i64::MAX as u128) + 1 {
639                    let signed = -(value as i64);
640                    return Ok((
641                        Token::Int(signed),
642                        self.span_from(start_pos, start_line, start_col),
643                    ));
644                } else {
645                    // Store as decimal string with sign
646                    return Ok((
647                        Token::BigInt(format!("-{value}")),
648                        self.span_from(start_pos, start_line, start_col),
649                    ));
650                }
651            } else if value <= i64::MAX as u128 {
652                return Ok((
653                    Token::Int(value as i64),
654                    self.span_from(start_pos, start_line, start_col),
655                ));
656            } else {
657                return Ok((
658                    Token::BigInt(value.to_string()),
659                    self.span_from(start_pos, start_line, start_col),
660                ));
661            }
662        }
663
664        // BigInt suffix 'N'
665        if self.peek() == Some('N') {
666            self.advance();
667            return Ok((
668                Token::BigInt(format!("{sign_str}{int_part}")),
669                self.span_from(start_pos, start_line, start_col),
670            ));
671        }
672
673        // BigDecimal suffix 'M' on integer literal (e.g. 4M)
674        if self.peek() == Some('M') {
675            self.advance();
676            return Ok((
677                Token::BigDecimal(format!("{sign_str}{int_part}")),
678                self.span_from(start_pos, start_line, start_col),
679            ));
680        }
681
682        // Float: decimal point or exponent
683        if matches!(self.peek(), Some('.') | Some('e') | Some('E')) {
684            let mut raw = format!("{sign_str}{int_part}");
685            if self.peek() == Some('.') {
686                raw.push('.');
687                self.advance();
688                while let Some(c) = self.peek() {
689                    if c.is_ascii_digit() {
690                        raw.push(c);
691                        self.advance();
692                    } else {
693                        break;
694                    }
695                }
696            }
697            if matches!(self.peek(), Some('e') | Some('E')) {
698                raw.push('e');
699                self.advance();
700                if matches!(self.peek(), Some('+') | Some('-')) {
701                    raw.push(self.peek().unwrap());
702                    self.advance();
703                }
704                while let Some(c) = self.peek() {
705                    if c.is_ascii_digit() {
706                        raw.push(c);
707                        self.advance();
708                    } else {
709                        break;
710                    }
711                }
712            }
713            // BigDecimal suffix 'M'
714            if self.peek() == Some('M') {
715                self.advance();
716                return Ok((
717                    Token::BigDecimal(raw),
718                    self.span_from(start_pos, start_line, start_col),
719                ));
720            }
721            let val: f64 = raw.parse().map_err(|_| {
722                let span = self.span_from(start_pos, start_line, start_col);
723                self.make_error(format!("invalid float: {raw}"), span)
724            })?;
725            return Ok((
726                Token::Float(val),
727                self.span_from(start_pos, start_line, start_col),
728            ));
729        }
730
731        // Ratio: INT/DIGITS — only if next char after '/' is a digit
732        if self.peek() == Some('/') && matches!(self.peek_next(), Some(c) if c.is_ascii_digit()) {
733            self.advance(); // consume '/'
734            let mut denom = String::new();
735            while let Some(c) = self.peek() {
736                if c.is_ascii_digit() {
737                    denom.push(c);
738                    self.advance();
739                } else {
740                    break;
741                }
742            }
743            return Ok((
744                Token::Ratio(format!("{sign_str}{int_part}/{denom}")),
745                self.span_from(start_pos, start_line, start_col),
746            ));
747        }
748
749        // Plain integer
750        let full = format!("{sign_str}{int_part}");
751        match full.parse::<i64>() {
752            Ok(n) => Ok((
753                Token::Int(n),
754                self.span_from(start_pos, start_line, start_col),
755            )),
756            Err(_) => {
757                // Overflow: store decimal string
758                Ok((
759                    Token::BigInt(full),
760                    self.span_from(start_pos, start_line, start_col),
761                ))
762            }
763        }
764    }
765
766    // ── Top-level token dispatch ──────────────────────────────────────────
767
768    pub fn next_token(&mut self) -> CljxResult<(Token, Span)> {
769        self.skip_whitespace_and_comments();
770
771        let start_pos = self.pos;
772        let start_line = self.line;
773        let start_col = self.col;
774
775        let ch = match self.peek() {
776            None => {
777                return Ok((Token::Eof, self.span_from(start_pos, start_line, start_col)));
778            }
779            Some(c) => c,
780        };
781
782        match ch {
783            '(' => {
784                self.advance();
785                Ok((
786                    Token::LParen,
787                    self.span_from(start_pos, start_line, start_col),
788                ))
789            }
790            ')' => {
791                self.advance();
792                Ok((
793                    Token::RParen,
794                    self.span_from(start_pos, start_line, start_col),
795                ))
796            }
797            '[' => {
798                self.advance();
799                Ok((
800                    Token::LBracket,
801                    self.span_from(start_pos, start_line, start_col),
802                ))
803            }
804            ']' => {
805                self.advance();
806                Ok((
807                    Token::RBracket,
808                    self.span_from(start_pos, start_line, start_col),
809                ))
810            }
811            '{' => {
812                self.advance();
813                Ok((
814                    Token::LBrace,
815                    self.span_from(start_pos, start_line, start_col),
816                ))
817            }
818            '}' => {
819                self.advance();
820                Ok((
821                    Token::RBrace,
822                    self.span_from(start_pos, start_line, start_col),
823                ))
824            }
825            '\'' => {
826                self.advance();
827                Ok((
828                    Token::Quote,
829                    self.span_from(start_pos, start_line, start_col),
830                ))
831            }
832            '`' => {
833                self.advance();
834                Ok((
835                    Token::SyntaxQuote,
836                    self.span_from(start_pos, start_line, start_col),
837                ))
838            }
839            '@' => {
840                self.advance();
841                Ok((
842                    Token::Deref,
843                    self.span_from(start_pos, start_line, start_col),
844                ))
845            }
846            '^' => {
847                self.advance();
848                Ok((
849                    Token::Meta,
850                    self.span_from(start_pos, start_line, start_col),
851                ))
852            }
853            '~' => self.lex_unquote(start_pos, start_line, start_col),
854            '#' => self.lex_hash(start_pos, start_line, start_col),
855            '"' => self.lex_string(start_pos, start_line, start_col),
856            '\\' => self.lex_char_literal(start_pos, start_line, start_col),
857            ':' => self.lex_keyword(start_pos, start_line, start_col),
858            c if c.is_ascii_digit() => self.lex_number(start_pos, start_line, start_col),
859            '+' | '-' if matches!(self.peek_next(), Some(d) if d.is_ascii_digit()) => {
860                self.lex_number(start_pos, start_line, start_col)
861            }
862            c if is_symbol_start(c) => self.lex_symbol(start_pos, start_line, start_col),
863            // '+' and '-' alone (or before non-digit) are symbols
864            '+' | '-' => self.lex_symbol(start_pos, start_line, start_col),
865            c => {
866                self.advance();
867                let span = self.span_from(start_pos, start_line, start_col);
868                Err(self.make_error(format!("unexpected character: {c:?}"), span))
869            }
870        }
871    }
872}
873
874impl Iterator for Lexer {
875    type Item = CljxResult<(Token, Span)>;
876
877    fn next(&mut self) -> Option<Self::Item> {
878        match self.next_token() {
879            Ok((Token::Eof, _)) => None,
880            result => Some(result),
881        }
882    }
883}
884
885// ─── Tests ────────────────────────────────────────────────────────────────────
886
/// Unit tests for the lexer: one section per token family, plus span
/// tracking and error reporting. Each test drives the lexer through the
/// three helpers below and asserts on the exact tokens (or error messages)
/// produced.
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes all of `src`, panicking on any lex error, and returns just the
    /// tokens (spans discarded). `Eof` is not included — the `Iterator` impl
    /// ends the stream there.
    fn lex_all(src: &str) -> Vec<Token> {
        Lexer::new(src.to_string(), "<test>".to_string())
            .map(|r: CljxResult<(Token, Span)>| r.expect("lex error").0)
            .collect()
    }

    /// Lexes and returns only the first token of `src`, panicking on error.
    fn lex_one(src: &str) -> Token {
        let mut l = Lexer::new(src.to_string(), "<test>".to_string());
        l.next_token().expect("lex error").0
    }

    /// Lexes `src` until a `ReadError` occurs and returns its message.
    /// Panics if the input lexes cleanly to `Eof`, or if a different error
    /// variant is produced — both indicate a test expectation mismatch.
    fn lex_err(src: &str) -> String {
        let mut l = Lexer::new(src.to_string(), "<test>".to_string());
        loop {
            match l.next_token() {
                Err(CljxError::ReadError { message, .. }) => return message,
                Err(e) => panic!("unexpected error type: {e}"),
                Ok((Token::Eof, _)) => panic!("expected an error but got Eof"),
                Ok(_) => {}
            }
        }
    }

    // ── nil / bool ────────────────────────────────────────────────────────

    #[test]
    fn test_nil() {
        assert_eq!(lex_one("nil"), Token::Nil);
    }

    #[test]
    fn test_bool() {
        assert_eq!(lex_one("true"), Token::Bool(true));
        assert_eq!(lex_one("false"), Token::Bool(false));
    }

    // ── Integers ──────────────────────────────────────────────────────────

    #[test]
    fn test_int_plain() {
        assert_eq!(lex_one("42"), Token::Int(42));
        assert_eq!(lex_one("-42"), Token::Int(-42));
        assert_eq!(lex_one("+42"), Token::Int(42));
        assert_eq!(lex_one("0"), Token::Int(0));
    }

    #[test]
    fn test_bigint_suffix() {
        // The `N` suffix forces arbitrary precision; the digits are kept as
        // a string for the reader to convert.
        assert_eq!(lex_one("42N"), Token::BigInt("42".to_string()));
        assert_eq!(lex_one("-42N"), Token::BigInt("-42".to_string()));
    }

    #[test]
    fn test_hex_literal() {
        assert_eq!(lex_one("0xff"), Token::Int(255));
        assert_eq!(lex_one("0xFF"), Token::Int(255));
        assert_eq!(lex_one("0x0"), Token::Int(0));
        assert_eq!(lex_one("0x7FFFFFFFFFFFFFFF"), Token::Int(i64::MAX));
        assert_eq!(lex_one("-0x8000000000000000"), Token::Int(i64::MIN));
        assert_eq!(lex_one("-0xff"), Token::Int(-255));
        // Overflow → BigInt
        match lex_one("0xFFFFFFFFFFFFFFFF") {
            Token::BigInt(_) => {}
            other => panic!("expected BigInt for 0xFFFF…, got {other:?}"),
        }
    }

    #[test]
    fn test_radix() {
        // Clojure-style `NrDIGITS` literals, radix 2–36, case-insensitive.
        assert_eq!(lex_one("2r1010"), Token::Int(10));
        assert_eq!(lex_one("8r77"), Token::Int(63));
        assert_eq!(lex_one("16rFF"), Token::Int(255));
        assert_eq!(lex_one("16rff"), Token::Int(255));
        assert_eq!(lex_one("36rZ"), Token::Int(35));
    }

    #[test]
    fn test_radix_overflow() {
        // 2^64 fits in u128 but not i64
        let tok = lex_one("10r18446744073709551616");
        match tok {
            Token::BigInt(_) => {}
            other => panic!("expected BigInt, got {other:?}"),
        }
    }

    // ── Floats ────────────────────────────────────────────────────────────

    #[test]
    fn test_floats() {
        assert_eq!(lex_one("3.14"), Token::Float(3.14));
        assert_eq!(lex_one("1e10"), Token::Float(1e10));
        assert_eq!(lex_one("1.5e-3"), Token::Float(1.5e-3));
        assert_eq!(lex_one("-0.5"), Token::Float(-0.5));
    }

    #[test]
    fn test_bigdecimal() {
        // The `M` suffix marks arbitrary-precision decimals; digits kept as
        // a string, suffix stripped.
        assert_eq!(lex_one("3.14M"), Token::BigDecimal("3.14".to_string()));
        assert_eq!(lex_one("1e5M"), Token::BigDecimal("1e5".to_string()));
    }

    // ── Ratio ────────────────────────────────────────────────────────────

    #[test]
    fn test_ratio() {
        assert_eq!(lex_one("3/4"), Token::Ratio("3/4".to_string()));
        assert_eq!(lex_one("-1/2"), Token::Ratio("-1/2".to_string()));
    }

    #[test]
    fn test_ratio_vs_symbol() {
        // "3/foo" should lex as Int(3) then Symbol("/foo") — not a ratio
        let toks = lex_all("3/foo");
        assert_eq!(toks[0], Token::Int(3));
        assert_eq!(toks[1], Token::Symbol("/foo".to_string()));
    }

    // ── Char literals ────────────────────────────────────────────────────

    #[test]
    fn test_char_simple() {
        assert_eq!(lex_one("\\a"), Token::Char('a'));
    }

    #[test]
    fn test_char_named() {
        assert_eq!(lex_one("\\newline"), Token::Char('\n'));
        assert_eq!(lex_one("\\space"), Token::Char(' '));
        assert_eq!(lex_one("\\tab"), Token::Char('\t'));
        assert_eq!(lex_one("\\backspace"), Token::Char('\x08'));
        assert_eq!(lex_one("\\formfeed"), Token::Char('\x0C'));
        assert_eq!(lex_one("\\return"), Token::Char('\r'));
    }

    #[test]
    fn test_char_unicode() {
        assert_eq!(lex_one("\\u0041"), Token::Char('A'));
        assert_eq!(lex_one("\\u00e9"), Token::Char('é'));
    }

    // ── Strings ──────────────────────────────────────────────────────────

    #[test]
    fn test_string_basic() {
        assert_eq!(lex_one("\"hello\""), Token::Str("hello".to_string()));
    }

    #[test]
    fn test_string_escapes() {
        // Raw string on the input side so each `\x` reaches the lexer as a
        // two-character escape sequence, not a Rust escape.
        assert_eq!(
            lex_one(r#""\n\t\r\b\f\\\"" "#),
            Token::Str("\n\t\r\x08\x0C\\\"".to_string())
        );
    }

    #[test]
    fn test_string_unicode_escape() {
        assert_eq!(lex_one("\"\\u0041\""), Token::Str("A".to_string()));
    }

    // ── Symbols ──────────────────────────────────────────────────────────

    #[test]
    fn test_symbols() {
        assert_eq!(lex_one("foo"), Token::Symbol("foo".to_string()));
        assert_eq!(lex_one("ns/name"), Token::Symbol("ns/name".to_string()));
        assert_eq!(lex_one("/"), Token::Symbol("/".to_string()));
        assert_eq!(lex_one(".."), Token::Symbol("..".to_string()));
        assert_eq!(lex_one(".method"), Token::Symbol(".method".to_string()));
        assert_eq!(lex_one("+"), Token::Symbol("+".to_string()));
        assert_eq!(lex_one("-"), Token::Symbol("-".to_string()));
        assert_eq!(lex_one("+foo"), Token::Symbol("+foo".to_string()));
    }

    // ── Keywords ─────────────────────────────────────────────────────────

    #[test]
    fn test_keyword() {
        // The leading `:` is consumed; only the name is stored.
        assert_eq!(lex_one(":foo"), Token::Keyword("foo".to_string()));
        assert_eq!(lex_one(":ns/name"), Token::Keyword("ns/name".to_string()));
    }

    #[test]
    fn test_auto_keyword() {
        // `::` produces an AutoKeyword; namespace resolution happens later.
        assert_eq!(lex_one("::foo"), Token::AutoKeyword("foo".to_string()));
        assert_eq!(
            lex_one("::ns/alias"),
            Token::AutoKeyword("ns/alias".to_string())
        );
    }

    // ── Delimiters ───────────────────────────────────────────────────────

    #[test]
    fn test_delimiters() {
        assert_eq!(
            lex_all("([{}])"),
            vec![
                Token::LParen,
                Token::LBracket,
                Token::LBrace,
                Token::RBrace,
                Token::RBracket,
                Token::RParen,
            ]
        );
    }

    // ── Reader macros ────────────────────────────────────────────────────

    #[test]
    fn test_reader_macros() {
        // Only the macro token itself is checked; the trailing `x` would be
        // the next token.
        assert_eq!(lex_one("'x"), Token::Quote);
        assert_eq!(lex_one("`x"), Token::SyntaxQuote);
        assert_eq!(lex_one("~x"), Token::Unquote);
        assert_eq!(lex_one("~@x"), Token::UnquoteSplice);
        assert_eq!(lex_one("@x"), Token::Deref);
        assert_eq!(lex_one("^x"), Token::Meta);
    }

    // ── `#` dispatch ─────────────────────────────────────────────────────

    #[test]
    fn test_hash_dispatch() {
        assert_eq!(lex_one("#("), Token::HashFn);
        assert_eq!(lex_one("#{"), Token::HashSet);
        assert_eq!(lex_one("#'"), Token::HashVar);
        assert_eq!(lex_one("#_"), Token::HashDiscard);
        assert_eq!(lex_one("#?"), Token::ReaderCond);
        assert_eq!(lex_one("#?@"), Token::ReaderCondSplice);
    }

    #[test]
    fn test_regex() {
        // Regex body is stored verbatim — no escape processing.
        assert_eq!(lex_one("#\"[a-z]+\""), Token::Regex("[a-z]+".to_string()));
    }

    #[test]
    fn test_symbolic() {
        assert_eq!(lex_one("##Inf"), Token::Symbolic("Inf".to_string()));
        assert_eq!(lex_one("##-Inf"), Token::Symbolic("-Inf".to_string()));
        assert_eq!(lex_one("##NaN"), Token::Symbolic("NaN".to_string()));
    }

    #[test]
    fn test_tagged_literal() {
        assert_eq!(lex_one("#mytag"), Token::TaggedLiteral("mytag".to_string()));
    }

    // ── Multi-token ──────────────────────────────────────────────────────

    #[test]
    fn test_multi_token() {
        let toks = lex_all("(+ 1 2)");
        assert_eq!(
            toks,
            vec![
                Token::LParen,
                Token::Symbol("+".to_string()),
                Token::Int(1),
                Token::Int(2),
                Token::RParen,
            ]
        );
    }

    // ── Whitespace / comments ────────────────────────────────────────────

    #[test]
    fn test_comma_skipped() {
        // Commas are whitespace in Clojure syntax.
        assert_eq!(lex_all("{,,,}"), vec![Token::LBrace, Token::RBrace]);
    }

    #[test]
    fn test_comment_skipped() {
        assert_eq!(lex_all("; this is a comment\n42"), vec![Token::Int(42)]);
    }

    #[test]
    fn test_shebang_skipped() {
        assert_eq!(lex_all("#!/usr/bin/env cljx\n42"), vec![Token::Int(42)]);
    }

    // ── Span tracking ────────────────────────────────────────────────────

    #[test]
    fn test_span_col() {
        // `start` is a 0-based byte offset; `col` is 1-based.
        let mut l = Lexer::new("  foo".to_string(), "<test>".to_string());
        let (_tok, span) = l.next_token().unwrap();
        assert_eq!(span.start, 2);
        assert_eq!(span.col, 3);
    }

    #[test]
    fn test_span_newline() {
        let mut l = Lexer::new("a\nb".to_string(), "<test>".to_string());
        l.next_token().unwrap(); // consume 'a'
        let (_tok, span) = l.next_token().unwrap(); // 'b'
        assert_eq!(span.line, 2);
        assert_eq!(span.col, 1);
    }

    // ── Errors ───────────────────────────────────────────────────────────

    #[test]
    fn test_error_unterminated_string() {
        let msg = lex_err("\"unterminated");
        assert!(msg.contains("unterminated string"));
    }

    #[test]
    fn test_error_bad_hash_dispatch() {
        // '#1' is invalid: '1' is not a symbol start and not a special dispatch char
        let msg = lex_err("#1");
        assert!(msg.contains("unknown # dispatch"));
    }

    #[test]
    fn test_error_bad_unicode_escape_in_string() {
        let msg = lex_err("\"\\uGHIJ\"");
        assert!(msg.contains("invalid") || msg.contains("hex"));
    }

    #[test]
    fn test_error_unknown_char_name() {
        let msg = lex_err("\\bogus");
        assert!(msg.contains("unknown character name"));
    }

    #[test]
    fn test_error_unknown_symbolic() {
        let msg = lex_err("##Bogus");
        assert!(msg.contains("unknown symbolic value"));
    }

    #[test]
    fn test_error_bad_string_escape() {
        let msg = lex_err("\"\\q\"");
        assert!(msg.contains("unknown string escape"));
    }
}