Skip to main content

asm_rs/
lexer.rs

1//! Lexer for assembly source text.
2//!
3//! The lexer tokenizes assembly source into a stream of [`Token`](crate::lexer::Token)s, each
4//! carrying its [`Span`](crate::error::Span) (source position) so that error messages can
5//! point back to the exact location in the original input.
6
7use alloc::borrow::Cow;
8use alloc::string::String;
9#[allow(unused_imports)]
10use alloc::vec;
11use alloc::vec::Vec;
12use core::str;
13
14use crate::error::{AsmError, Span};
15
/// A token produced by the lexer.
///
/// Token text is stored in a [`Cow`] so that it can borrow directly from the
/// source string (`Cow::Borrowed`) and avoid a per-token heap allocation.
/// Tokens whose text is assembled by the lexer instead of being sliced from
/// the input (for example string literals after escape processing) carry
/// `Cow::Owned` text.
#[derive(Debug, Clone, PartialEq)]
pub struct Token<'src> {
    /// Token classification.
    pub kind: TokenKind,
    /// Text of the token. Borrowed from the input where possible, otherwise
    /// owned. For string literals this is the unescaped content without the
    /// surrounding quotes.
    pub text: Cow<'src, str>,
    /// Source location.
    pub span: Span,
}
30
31impl<'src> Token<'src> {
32    /// Returns the token text as a `&str`.
33    #[inline]
34    pub fn text(&self) -> &str {
35        &self.text
36    }
37}
38
/// The type of a token.
///
/// Variants that carry a payload hold the already-decoded value of the token
/// (numeric value, character byte, or numeric label digit). Every payload is
/// an integer, so the enum supports total equality (`Eq`) in addition to
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier: mnemonic, register name, or label reference.
    Ident,
    /// A numeric literal (integer).
    Number(i128),
    /// A string literal (content without quotes).
    StringLit,
    /// A character literal (e.g., 'A').
    CharLit(u8),
    /// A directive (starts with `.`).
    Directive,
    /// Label definition (`name:`).
    LabelDef,
    /// Numeric label definition (`1:`).
    NumericLabelDef(u32),
    /// Numeric label forward reference (`1f`).
    NumericLabelFwd(u32),
    /// Numeric label backward reference (`1b`).
    NumericLabelBwd(u32),
    /// Comma separator.
    Comma,
    /// Open bracket `[`.
    OpenBracket,
    /// Close bracket `]`.
    CloseBracket,
    /// Plus `+`.
    Plus,
    /// Minus `-`.
    Minus,
    /// Asterisk `*` (for scale in memory operands).
    Star,
    /// Colon `:` (segment override: `fs:`).
    Colon,
    /// Equals `=` (constant assignment: `name = value`).
    Equals,
    /// Open brace `{` (ARM register list).
    OpenBrace,
    /// Close brace `}` (ARM register list).
    CloseBrace,
    /// Open parenthesis `(` (RISC-V memory operand).
    OpenParen,
    /// Close parenthesis `)` (RISC-V memory operand).
    CloseParen,
    /// Exclamation mark `!` (ARM writeback).
    Bang,
    /// Percent sign `%` (AT&T register prefix).
    Percent,
    /// Dollar sign `$` (AT&T immediate prefix).
    Dollar,
    /// Forward slash `/` (SVE predicate qualifier: p0/m, p0/z).
    Slash,
    /// Ampersand `&` (bitwise AND in constant expressions).
    Ampersand,
    /// Pipe `|` (bitwise OR in constant expressions).
    Pipe,
    /// Caret `^` (bitwise XOR in constant expressions).
    Caret,
    /// Tilde `~` (bitwise NOT in constant expressions).
    Tilde,
    /// Left shift `<<`.
    LShift,
    /// Right shift `>>`.
    RShift,
    /// A newline or `;` (statement separator).
    Newline,
    /// End of input.
    Eof,
}
109
110/// Tokenize assembly source text into a vector of tokens.
111///
112/// The lexer recognizes:
113/// - Identifiers (mnemonics, registers, label references)
114/// - Numeric literals (decimal, hex `0x`, binary `0b`, octal `0o`)
115/// - String literals (`"..."`)
116/// - Character literals (`'A'`)
117/// - Directives (`.byte`, `.equ`, etc.)
118/// - Label definitions (`name:`)
119/// - Numeric labels (`1:`, `1b`, `1f`)
120/// - Punctuation: `,`, `[`, `]`, `+`, `-`, `*`, `:`
121/// - Comments: `#` to end of line
122/// - Newlines and semicolons as statement separators
123///
124/// # Errors
125///
126/// Returns `Err(AsmError::Syntax)` if the input contains an unrecognised
127/// character or a malformed token (e.g. an unterminated string literal).
128pub fn tokenize<'s>(source: &'s str) -> Result<Vec<Token<'s>>, AsmError> {
129    // Heuristic: ~4 chars per token on average (mnemonics, registers, punctuation).
130    let mut tokens = Vec::with_capacity(source.len() / 3 + 1);
131    let bytes = source.as_bytes();
132    let len = bytes.len();
133    let mut pos = 0;
134    let mut line: u32 = 1;
135    let mut col: u32 = 1;
136    let mut line_start = 0usize;
137
138    while pos < len {
139        let ch = bytes[pos];
140
141        // Skip whitespace (but not newlines)
142        if ch == b' ' || ch == b'\t' || ch == b'\r' {
143            pos += 1;
144            col += 1;
145            continue;
146        }
147
148        // Newline
149        if ch == b'\n' {
150            tokens.push(Token {
151                kind: TokenKind::Newline,
152                text: Cow::Borrowed("\n"),
153                span: Span::new(line, col, pos, 1),
154            });
155            pos += 1;
156            line += 1;
157            col = 1;
158            line_start = pos;
159            continue;
160        }
161
162        // Semicolon as statement separator
163        if ch == b';' {
164            let start = pos;
165            tokens.push(Token {
166                kind: TokenKind::Newline,
167                text: Cow::Borrowed(";"),
168                span: Span::new(line, col, start, 1),
169            });
170            pos += 1;
171            col += 1;
172            continue;
173        }
174
175        // Comment: # to EOL
176        if ch == b'#' {
177            pos += 1;
178            while pos < len && bytes[pos] != b'\n' {
179                pos += 1;
180            }
181            col = (pos - line_start) as u32 + 1;
182            continue;
183        }
184
185        // Comma
186        if ch == b',' {
187            tokens.push(Token {
188                kind: TokenKind::Comma,
189                text: Cow::Borrowed(","),
190                span: Span::new(line, col, pos, 1),
191            });
192            pos += 1;
193            col += 1;
194            continue;
195        }
196
197        // Brackets
198        if ch == b'[' {
199            tokens.push(Token {
200                kind: TokenKind::OpenBracket,
201                text: Cow::Borrowed("["),
202                span: Span::new(line, col, pos, 1),
203            });
204            pos += 1;
205            col += 1;
206            continue;
207        }
208        if ch == b']' {
209            tokens.push(Token {
210                kind: TokenKind::CloseBracket,
211                text: Cow::Borrowed("]"),
212                span: Span::new(line, col, pos, 1),
213            });
214            pos += 1;
215            col += 1;
216            continue;
217        }
218
219        // Plus
220        if ch == b'+' {
221            tokens.push(Token {
222                kind: TokenKind::Plus,
223                text: Cow::Borrowed("+"),
224                span: Span::new(line, col, pos, 1),
225            });
226            pos += 1;
227            col += 1;
228            continue;
229        }
230
231        // Minus (standalone, not part of negative number if preceded by identifier or number)
232        if ch == b'-' {
233            // Check if this is a negative sign for a number
234            let is_unary = tokens.is_empty()
235                || matches!(
236                    tokens.last().map(|t| &t.kind),
237                    Some(
238                        TokenKind::Comma
239                            | TokenKind::OpenBracket
240                            | TokenKind::OpenBrace
241                            | TokenKind::Plus
242                            | TokenKind::Minus
243                            | TokenKind::Star
244                            | TokenKind::Newline
245                            | TokenKind::Equals
246                    )
247                );
248
249            if is_unary && pos + 1 < len && bytes[pos + 1].is_ascii_digit() {
250                // Parse as negative number
251                let start = pos;
252                let start_col = col;
253                pos += 1; // skip '-'
254                let value = parse_number_at(bytes, &mut pos, line, start_col)?;
255                let token_len = pos - start;
256                let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
257                tokens.push(Token {
258                    kind: TokenKind::Number(-value),
259                    text,
260                    span: Span::new(line, start_col, start, token_len),
261                });
262                col = (pos - line_start) as u32 + 1;
263                continue;
264            }
265
266            tokens.push(Token {
267                kind: TokenKind::Minus,
268                text: Cow::Borrowed("-"),
269                span: Span::new(line, col, pos, 1),
270            });
271            pos += 1;
272            col += 1;
273            continue;
274        }
275
276        // Star
277        if ch == b'*' {
278            tokens.push(Token {
279                kind: TokenKind::Star,
280                text: Cow::Borrowed("*"),
281                span: Span::new(line, col, pos, 1),
282            });
283            pos += 1;
284            col += 1;
285            continue;
286        }
287
288        // Colon (standalone, used for segment overrides)
289        if ch == b':' {
290            tokens.push(Token {
291                kind: TokenKind::Colon,
292                text: Cow::Borrowed(":"),
293                span: Span::new(line, col, pos, 1),
294            });
295            pos += 1;
296            col += 1;
297            continue;
298        }
299
300        // Equals sign (constant assignment)
301        if ch == b'=' {
302            tokens.push(Token {
303                kind: TokenKind::Equals,
304                text: Cow::Borrowed("="),
305                span: Span::new(line, col, pos, 1),
306            });
307            pos += 1;
308            col += 1;
309            continue;
310        }
311
312        // String literal
313        if ch == b'"' {
314            let start = pos;
315            let start_col = col;
316            pos += 1;
317            col += 1;
318            let mut content = Vec::new();
319            while pos < len && bytes[pos] != b'"' {
320                if bytes[pos] == b'\\' && pos + 1 < len {
321                    pos += 1;
322                    col += 1;
323                    match bytes[pos] {
324                        b'n' => content.push(b'\n'),
325                        b't' => content.push(b'\t'),
326                        b'\\' => content.push(b'\\'),
327                        b'"' => content.push(b'"'),
328                        b'0' => content.push(0),
329                        b'x' => {
330                            // \xHH
331                            if pos + 2 < len {
332                                let hi = hex_digit(bytes[pos + 1]);
333                                let lo = hex_digit(bytes[pos + 2]);
334                                if let (Some(h), Some(l)) = (hi, lo) {
335                                    content.push(h * 16 + l);
336                                    pos += 2;
337                                    col += 2;
338                                } else {
339                                    return Err(AsmError::Syntax {
340                                        msg: String::from("invalid \\xHH escape sequence"),
341                                        span: Span::new(line, col, pos, 3),
342                                    });
343                                }
344                            }
345                        }
346                        _ => {
347                            return Err(AsmError::Syntax {
348                                msg: alloc::format!(
349                                    "unknown escape sequence '\\{}'",
350                                    bytes[pos] as char
351                                ),
352                                span: Span::new(line, col, pos - 1, 2),
353                            });
354                        }
355                    }
356                } else if bytes[pos] == b'\n' {
357                    return Err(AsmError::Syntax {
358                        msg: String::from("unterminated string literal"),
359                        span: Span::new(line, start_col, start, pos - start),
360                    });
361                } else {
362                    content.push(bytes[pos]);
363                }
364                pos += 1;
365                col += 1;
366            }
367            if pos >= len {
368                return Err(AsmError::Syntax {
369                    msg: String::from("unterminated string literal"),
370                    span: Span::new(line, start_col, start, pos - start),
371                });
372            }
373            pos += 1; // skip closing quote
374            col += 1;
375            let text_str = Cow::Owned(String::from_utf8(content).unwrap_or_default());
376            tokens.push(Token {
377                kind: TokenKind::StringLit,
378                text: text_str,
379                span: Span::new(line, start_col, start, pos - start),
380            });
381            continue;
382        }
383
384        // Character literal
385        if ch == b'\'' {
386            let start = pos;
387            let start_col = col;
388            pos += 1;
389            col += 1;
390            if pos >= len {
391                return Err(AsmError::Syntax {
392                    msg: String::from("unterminated character literal"),
393                    span: Span::new(line, start_col, start, 1),
394                });
395            }
396            let ch_val = if bytes[pos] == b'\\' && pos + 1 < len {
397                pos += 1;
398                col += 1;
399                match bytes[pos] {
400                    b'n' => b'\n',
401                    b't' => b'\t',
402                    b'\\' => b'\\',
403                    b'\'' => b'\'',
404                    b'0' => 0,
405                    _ => {
406                        return Err(AsmError::Syntax {
407                            msg: "unknown escape in character literal".into(),
408                            span: Span::new(line, col, pos - 1, 2),
409                        });
410                    }
411                }
412            } else {
413                bytes[pos]
414            };
415            pos += 1;
416            col += 1;
417            if pos >= len || bytes[pos] != b'\'' {
418                return Err(AsmError::Syntax {
419                    msg: String::from("unterminated character literal"),
420                    span: Span::new(line, start_col, start, pos - start),
421                });
422            }
423            pos += 1;
424            col += 1;
425            tokens.push(Token {
426                kind: TokenKind::CharLit(ch_val),
427                text: Cow::Owned(alloc::format!("'{}'", ch_val as char)),
428                span: Span::new(line, start_col, start, pos - start),
429            });
430            continue;
431        }
432
433        // Directive (starts with '.')
434        if ch == b'.' {
435            let start = pos;
436            let start_col = col;
437            pos += 1;
438            col += 1;
439            while pos < len && (bytes[pos].is_ascii_alphanumeric() || bytes[pos] == b'_') {
440                pos += 1;
441                col += 1;
442            }
443            let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
444            tokens.push(Token {
445                kind: TokenKind::Directive,
446                text,
447                span: Span::new(line, start_col, start, pos - start),
448            });
449            continue;
450        }
451
452        // Number
453        if ch.is_ascii_digit() {
454            let start = pos;
455            let start_col = col;
456
457            // Check for numeric label: digit(s) followed by `:`, `b`, or `f`
458            // but NOT hex prefix 0x, 0b (binary), 0o
459            let mut temp = pos;
460            while temp < len && bytes[temp].is_ascii_digit() {
461                temp += 1;
462            }
463            // Check for numeric label def: `1:`
464            // Only single-digit labels (0-9) are valid, matching GAS convention.
465            // Multi-digit numbers followed by `:` are rejected to avoid
466            // silent mismatch with references (which must be single-digit).
467            if temp < len && bytes[temp] == b':' && (temp + 1 >= len || bytes[temp + 1] != b':') {
468                // Must be all digits
469                let num_str = str::from_utf8(&bytes[start..temp]).unwrap_or("0");
470                if let Ok(n) = num_str.parse::<u32>() {
471                    if temp != start + 1 {
472                        return Err(AsmError::Syntax {
473                            msg: alloc::format!(
474                                "numeric labels must be a single digit (0-9), got `{}`",
475                                n
476                            ),
477                            span: Span::new(line, start_col, start, temp - start + 1),
478                        });
479                    }
480                    pos = temp + 1; // past the ':'
481                    col = (pos - line_start) as u32 + 1;
482                    tokens.push(Token {
483                        kind: TokenKind::NumericLabelDef(n),
484                        text: Cow::Owned(alloc::format!("{}:", n)),
485                        span: Span::new(line, start_col, start, pos - start),
486                    });
487                    continue;
488                }
489            }
490            // Check for numeric label ref: `1b` or `1f` (only single digit before b/f)
491            if temp < len && temp == start + 1 && (bytes[temp] == b'b' || bytes[temp] == b'f') {
492                // Make sure it's not '0b' (binary prefix) — '0b' followed by 0/1 is binary
493                let digit = bytes[start] - b'0';
494                let suffix = bytes[temp];
495                if !(digit == 0
496                    && suffix == b'b'
497                    && temp + 1 < len
498                    && (bytes[temp + 1] == b'0' || bytes[temp + 1] == b'1'))
499                {
500                    pos = temp + 1;
501                    col = (pos - line_start) as u32 + 1;
502                    let kind = if suffix == b'b' {
503                        TokenKind::NumericLabelBwd(digit as u32)
504                    } else {
505                        TokenKind::NumericLabelFwd(digit as u32)
506                    };
507                    tokens.push(Token {
508                        kind,
509                        text: Cow::Owned(alloc::format!("{}{}", digit, suffix as char)),
510                        span: Span::new(line, start_col, start, pos - start),
511                    });
512                    continue;
513                }
514            }
515
516            let value = parse_number_at(bytes, &mut pos, line, start_col)?;
517            let token_len = pos - start;
518            let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
519            tokens.push(Token {
520                kind: TokenKind::Number(value),
521                text,
522                span: Span::new(line, start_col, start, token_len),
523            });
524            col = (pos - line_start) as u32 + 1;
525            continue;
526        }
527
528        // Identifier or keyword (including register names)
529        if ch.is_ascii_alphabetic() || ch == b'_' {
530            let start = pos;
531            let start_col = col;
532            while pos < len
533                && (bytes[pos].is_ascii_alphanumeric() || bytes[pos] == b'_' || bytes[pos] == b'.')
534            {
535                pos += 1;
536            }
537            let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
538            let token_len = pos - start;
539
540            // Check if followed by ':' → label definition
541            // But NOT if it's a segment register (cs, ds, es, fs, gs, ss)
542            if pos < len && bytes[pos] == b':' {
543                let is_segment_reg = text.eq_ignore_ascii_case("cs")
544                    || text.eq_ignore_ascii_case("ds")
545                    || text.eq_ignore_ascii_case("es")
546                    || text.eq_ignore_ascii_case("fs")
547                    || text.eq_ignore_ascii_case("gs")
548                    || text.eq_ignore_ascii_case("ss");
549                if is_segment_reg {
550                    // Emit as Ident; the ':' will be consumed next iteration
551                    tokens.push(Token {
552                        kind: TokenKind::Ident,
553                        text,
554                        span: Span::new(line, start_col, start, token_len),
555                    });
556                    col = (pos - line_start) as u32 + 1;
557                    continue;
558                }
559                pos += 1; // consume ':'
560                tokens.push(Token {
561                    kind: TokenKind::LabelDef,
562                    text,
563                    span: Span::new(line, start_col, start, pos - start),
564                });
565                col = (pos - line_start) as u32 + 1;
566                continue;
567            }
568
569            tokens.push(Token {
570                kind: TokenKind::Ident,
571                text,
572                span: Span::new(line, start_col, start, token_len),
573            });
574            col = (pos - line_start) as u32 + 1;
575            continue;
576        }
577
578        // Open brace (ARM register lists)
579        if ch == b'{' {
580            tokens.push(Token {
581                kind: TokenKind::OpenBrace,
582                text: Cow::Borrowed("{"),
583                span: Span::new(line, col, pos, 1),
584            });
585            pos += 1;
586            col += 1;
587            continue;
588        }
589
590        // Close brace (ARM register lists)
591        if ch == b'}' {
592            tokens.push(Token {
593                kind: TokenKind::CloseBrace,
594                text: Cow::Borrowed("}"),
595                span: Span::new(line, col, pos, 1),
596            });
597            pos += 1;
598            col += 1;
599            continue;
600        }
601
602        // Open parenthesis (RISC-V memory operands)
603        if ch == b'(' {
604            tokens.push(Token {
605                kind: TokenKind::OpenParen,
606                text: Cow::Borrowed("("),
607                span: Span::new(line, col, pos, 1),
608            });
609            pos += 1;
610            col += 1;
611            continue;
612        }
613
614        // Close parenthesis (RISC-V memory operands)
615        if ch == b')' {
616            tokens.push(Token {
617                kind: TokenKind::CloseParen,
618                text: Cow::Borrowed(")"),
619                span: Span::new(line, col, pos, 1),
620            });
621            pos += 1;
622            col += 1;
623            continue;
624        }
625
626        // Bang (ARM writeback)
627        if ch == b'!' {
628            tokens.push(Token {
629                kind: TokenKind::Bang,
630                text: Cow::Borrowed("!"),
631                span: Span::new(line, col, pos, 1),
632            });
633            pos += 1;
634            col += 1;
635            continue;
636        }
637
638        // Percent (AT&T register prefix)
639        if ch == b'%' {
640            tokens.push(Token {
641                kind: TokenKind::Percent,
642                text: Cow::Borrowed("%"),
643                span: Span::new(line, col, pos, 1),
644            });
645            pos += 1;
646            col += 1;
647            continue;
648        }
649
650        // Dollar (AT&T immediate prefix)
651        if ch == b'$' {
652            tokens.push(Token {
653                kind: TokenKind::Dollar,
654                text: Cow::Borrowed("$"),
655                span: Span::new(line, col, pos, 1),
656            });
657            pos += 1;
658            col += 1;
659            continue;
660        }
661
662        // Slash (SVE predicate qualifier)
663        if ch == b'/' {
664            // Check for C-style comments first
665            if pos + 1 < len && bytes[pos + 1] == b'/' {
666                // Line comment: skip to end of line
667                pos += 2;
668                while pos < len && bytes[pos] != b'\n' {
669                    pos += 1;
670                }
671                col = (pos - line_start) as u32 + 1;
672                continue;
673            }
674            if pos + 1 < len && bytes[pos + 1] == b'*' {
675                // Block comment: skip to matching */
676                let comment_start_line = line;
677                let comment_start_col = col;
678                let comment_start_pos = pos;
679                pos += 2;
680                col += 2;
681                while pos + 1 < len && !(bytes[pos] == b'*' && bytes[pos + 1] == b'/') {
682                    if bytes[pos] == b'\n' {
683                        line += 1;
684                        col = 1;
685                        line_start = pos + 1;
686                    } else {
687                        col += 1;
688                    }
689                    pos += 1;
690                }
691                if pos + 1 < len {
692                    pos += 2; // skip */
693                    col += 2;
694                } else {
695                    // Reached EOF without finding */
696                    return Err(AsmError::Syntax {
697                        msg: String::from("unterminated block comment"),
698                        span: Span::new(
699                            comment_start_line,
700                            comment_start_col,
701                            comment_start_pos,
702                            2,
703                        ),
704                    });
705                }
706                continue;
707            }
708            tokens.push(Token {
709                kind: TokenKind::Slash,
710                text: Cow::Borrowed("/"),
711                span: Span::new(line, col, pos, 1),
712            });
713            pos += 1;
714            col += 1;
715            continue;
716        }
717
718        // Ampersand (bitwise AND)
719        if ch == b'&' {
720            tokens.push(Token {
721                kind: TokenKind::Ampersand,
722                text: Cow::Borrowed("&"),
723                span: Span::new(line, col, pos, 1),
724            });
725            pos += 1;
726            col += 1;
727            continue;
728        }
729
730        // Pipe (bitwise OR)
731        if ch == b'|' {
732            tokens.push(Token {
733                kind: TokenKind::Pipe,
734                text: Cow::Borrowed("|"),
735                span: Span::new(line, col, pos, 1),
736            });
737            pos += 1;
738            col += 1;
739            continue;
740        }
741
742        // Caret (bitwise XOR)
743        if ch == b'^' {
744            tokens.push(Token {
745                kind: TokenKind::Caret,
746                text: Cow::Borrowed("^"),
747                span: Span::new(line, col, pos, 1),
748            });
749            pos += 1;
750            col += 1;
751            continue;
752        }
753
754        // Tilde (bitwise NOT)
755        if ch == b'~' {
756            tokens.push(Token {
757                kind: TokenKind::Tilde,
758                text: Cow::Borrowed("~"),
759                span: Span::new(line, col, pos, 1),
760            });
761            pos += 1;
762            col += 1;
763            continue;
764        }
765
766        // Shift operators << >>
767        if ch == b'<' && pos + 1 < len && bytes[pos + 1] == b'<' {
768            tokens.push(Token {
769                kind: TokenKind::LShift,
770                text: Cow::Borrowed("<<"),
771                span: Span::new(line, col, pos, 2),
772            });
773            pos += 2;
774            col += 2;
775            continue;
776        }
777        if ch == b'>' && pos + 1 < len && bytes[pos + 1] == b'>' {
778            tokens.push(Token {
779                kind: TokenKind::RShift,
780                text: Cow::Borrowed(">>"),
781                span: Span::new(line, col, pos, 2),
782            });
783            pos += 2;
784            col += 2;
785            continue;
786        }
787
788        // Unknown character
789        return Err(AsmError::Syntax {
790            msg: alloc::format!("unexpected character '{}'", ch as char),
791            span: Span::new(line, col, pos, 1),
792        });
793    }
794
795    tokens.push(Token {
796        kind: TokenKind::Eof,
797        text: Cow::Borrowed(""),
798        span: Span::new(line, col, pos, 0),
799    });
800
801    Ok(tokens)
802}
803
804/// Parse a number starting at `pos` in `bytes`. Advances `pos` past the number.
805#[inline]
806fn parse_number_at(
807    bytes: &[u8],
808    pos: &mut usize,
809    span_line: u32,
810    span_col: u32,
811) -> Result<i128, AsmError> {
812    let start = *pos;
813    let len = bytes.len();
814
815    if *pos >= len {
816        return Err(AsmError::Syntax {
817            msg: String::from("expected number"),
818            span: Span::new(span_line, span_col, start, 0),
819        });
820    }
821
822    // Check for hex, binary, octal prefix
823    if bytes[*pos] == b'0' && *pos + 1 < len {
824        match bytes[*pos + 1] {
825            b'x' | b'X' => {
826                *pos += 2;
827                let num_start = *pos;
828                while *pos < len && bytes[*pos].is_ascii_hexdigit() {
829                    *pos += 1;
830                }
831                if *pos == num_start {
832                    return Err(AsmError::Syntax {
833                        msg: String::from("expected hex digits after '0x'"),
834                        span: Span::new(span_line, span_col, start, *pos - start),
835                    });
836                }
837                let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
838                return i128::from_str_radix(s, 16).map_err(|_| AsmError::Syntax {
839                    msg: alloc::format!("invalid hex number '0x{}'", s),
840                    span: Span::new(span_line, span_col, start, *pos - start),
841                });
842            }
843            b'b' | b'B' => {
844                // Could be binary 0b prefix — check if next chars are 0 or 1
845                if *pos + 2 < len && (bytes[*pos + 2] == b'0' || bytes[*pos + 2] == b'1') {
846                    *pos += 2;
847                    let num_start = *pos;
848                    while *pos < len && (bytes[*pos] == b'0' || bytes[*pos] == b'1') {
849                        *pos += 1;
850                    }
851                    let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
852                    return i128::from_str_radix(s, 2).map_err(|_| AsmError::Syntax {
853                        msg: alloc::format!("invalid binary number '0b{}'", s),
854                        span: Span::new(span_line, span_col, start, *pos - start),
855                    });
856                }
857                // Otherwise, just '0' followed by 'b' which is not a binary prefix
858            }
859            b'o' | b'O' => {
860                *pos += 2;
861                let num_start = *pos;
862                while *pos < len && bytes[*pos] >= b'0' && bytes[*pos] <= b'7' {
863                    *pos += 1;
864                }
865                if *pos == num_start {
866                    return Err(AsmError::Syntax {
867                        msg: String::from("expected octal digits after '0o'"),
868                        span: Span::new(span_line, span_col, start, *pos - start),
869                    });
870                }
871                let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
872                return i128::from_str_radix(s, 8).map_err(|_| AsmError::Syntax {
873                    msg: alloc::format!("invalid octal number '0o{}'", s),
874                    span: Span::new(span_line, span_col, start, *pos - start),
875                });
876            }
877            _ => {}
878        }
879    }
880
881    // Decimal
882    while *pos < len && bytes[*pos].is_ascii_digit() {
883        *pos += 1;
884    }
885    // Check for hex suffix (e.g., 0FFh) — common in NASM/MASM
886    if *pos < len && (bytes[*pos] == b'h' || bytes[*pos] == b'H') {
887        let s = str::from_utf8(&bytes[start..*pos]).unwrap_or("0");
888        *pos += 1; // consume 'h'
889        return i128::from_str_radix(s, 16).map_err(|_| AsmError::Syntax {
890            msg: alloc::format!("invalid hex number '{}h'", s),
891            span: Span::new(span_line, span_col, start, *pos - start),
892        });
893    }
894    let s = str::from_utf8(&bytes[start..*pos]).unwrap_or("0");
895    s.parse::<i128>().map_err(|_| AsmError::Syntax {
896        msg: alloc::format!("invalid number '{}'", s),
897        span: Span::new(span_line, span_col, start, *pos - start),
898    })
899}
900
/// Converts a single ASCII hexadecimal digit into its numeric value.
///
/// Accepts `0`-`9`, `a`-`f` and `A`-`F` and returns the corresponding value
/// in the range `0..=15`.  Any other byte yields `None`.
#[inline]
fn hex_digit(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        // Letters start at value 10, regardless of case.
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
910
#[cfg(test)]
mod tests {
    //! Unit tests for the lexer: token kinds, token text, span tracking and
    //! the syntax errors reported for malformed input.

    use super::*;

    /// Tokenizes `src` (panicking on a lexer error) and returns only the
    /// token kinds, in order.
    fn tok_kinds(src: &str) -> Vec<TokenKind> {
        tokenize(src).unwrap().into_iter().map(|t| t.kind).collect()
    }

    /// Tokenizes `src` (panicking on a lexer error) and returns the owned
    /// text of every token, in order.
    fn tok_texts(src: &str) -> Vec<String> {
        tokenize(src)
            .unwrap()
            .into_iter()
            .map(|t| t.text.into_owned())
            .collect()
    }

    // ── Trivial inputs ────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        let tokens = tokenize("").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Eof);
    }

    #[test]
    fn only_whitespace() {
        let tokens = tokenize("   \t  ").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Eof);
    }

    #[test]
    fn only_comment() {
        // Hash is the comment marker; semicolons are statement separators
        let tokens = tokenize("# this is a comment").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Eof);
    }

    #[test]
    fn hash_comment() {
        let tokens = tokenize("# comment").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Eof);
    }

    // ── Instructions and numeric literals ─────────────────────────────

    #[test]
    fn simple_instruction() {
        let kinds = tok_kinds("mov rax, rbx");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident, // mov
                TokenKind::Ident, // rax
                TokenKind::Comma,
                TokenKind::Ident, // rbx
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn instruction_with_immediate() {
        let tokens = tokenize("mov rax, 42").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(42));
    }

    #[test]
    fn hex_immediate() {
        let tokens = tokenize("mov rax, 0xFF").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(255));
    }

    #[test]
    fn hex_uppercase() {
        let tokens = tokenize("mov rax, 0XAB").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(0xAB));
    }

    #[test]
    fn binary_immediate() {
        let tokens = tokenize("mov rax, 0b1010").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(10));
    }

    #[test]
    fn octal_immediate() {
        let tokens = tokenize("mov rax, 0o77").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(63));
    }

    #[test]
    fn negative_immediate() {
        let tokens = tokenize("mov rax, -1").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(-1));
    }

    #[test]
    fn negative_hex() {
        let tokens = tokenize("add rsp, -0x10").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(-16));
    }

    // ── Labels ────────────────────────────────────────────────────────

    #[test]
    fn label_definition() {
        let tokens = tokenize("entry_point:").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::LabelDef);
        assert_eq!(tokens[0].text, "entry_point");
    }

    #[test]
    fn label_definition_with_instruction() {
        let kinds = tok_kinds("loop: dec rcx");
        assert_eq!(kinds[0], TokenKind::LabelDef);
        assert_eq!(kinds[1], TokenKind::Ident); // dec
        assert_eq!(kinds[2], TokenKind::Ident); // rcx
    }

    #[test]
    fn numeric_label_def() {
        let tokens = tokenize("1:").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::NumericLabelDef(1));
    }

    #[test]
    fn numeric_label_backward_ref() {
        let tokens = tokenize("jnz 1b").unwrap();
        assert_eq!(tokens[1].kind, TokenKind::NumericLabelBwd(1));
    }

    #[test]
    fn numeric_label_forward_ref() {
        let tokens = tokenize("jmp 2f").unwrap();
        assert_eq!(tokens[1].kind, TokenKind::NumericLabelFwd(2));
    }

    // ── Directives ────────────────────────────────────────────────────

    #[test]
    fn directive() {
        let tokens = tokenize(".byte 0x90").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Directive);
        assert_eq!(tokens[0].text, ".byte");
        assert_eq!(tokens[1].kind, TokenKind::Number(0x90));
    }

    #[test]
    fn equ_directive() {
        let tokens = tokenize(".equ SYS_WRITE, 1").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Directive);
        assert_eq!(tokens[0].text, ".equ");
        assert_eq!(tokens[1].kind, TokenKind::Ident);
        assert_eq!(tokens[1].text, "SYS_WRITE");
    }

    // ── Operands, strings and characters ──────────────────────────────

    #[test]
    fn memory_operand_tokens() {
        let kinds = tok_kinds("[rax + rbx*4 + 8]");
        assert_eq!(
            kinds,
            vec![
                TokenKind::OpenBracket,
                TokenKind::Ident, // rax
                TokenKind::Plus,
                TokenKind::Ident, // rbx
                TokenKind::Star,
                TokenKind::Number(4),
                TokenKind::Plus,
                TokenKind::Number(8),
                TokenKind::CloseBracket,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn string_literal() {
        let tokens = tokenize(".asciz \"hello\"").unwrap();
        assert_eq!(tokens[1].kind, TokenKind::StringLit);
        assert_eq!(tokens[1].text, "hello");
    }

    #[test]
    fn string_escape_sequences() {
        let tokens = tokenize(".ascii \"a\\nb\\t\\\\c\\0\\x41\"").unwrap();
        assert_eq!(tokens[1].kind, TokenKind::StringLit);
        assert_eq!(tokens[1].text, "a\nb\t\\c\0A");
    }

    #[test]
    fn character_literal() {
        let tokens = tokenize("mov al, 'A'").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::CharLit(b'A'));
    }

    // ── Statement separators ──────────────────────────────────────────

    #[test]
    fn semicolon_separator() {
        let kinds = tok_kinds("nop; ret");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,   // nop
                TokenKind::Newline, // ;
                TokenKind::Ident,   // ret
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn newline_separator() {
        let kinds = tok_kinds("nop\nret");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident, // nop
                TokenKind::Newline,
                TokenKind::Ident, // ret
                TokenKind::Eof,
            ]
        );
    }

    // ── Multi-token operand forms ─────────────────────────────────────

    #[test]
    fn segment_override_tokens() {
        let kinds = tok_kinds("fs:[rax]");
        assert_eq!(kinds[0], TokenKind::Ident); // fs
        assert_eq!(kinds[1], TokenKind::Colon);
        assert_eq!(kinds[2], TokenKind::OpenBracket);
        assert_eq!(kinds[3], TokenKind::Ident); // rax
        assert_eq!(kinds[4], TokenKind::CloseBracket);
    }

    #[test]
    fn size_hint_tokens() {
        let kinds = tok_kinds("byte ptr [rax]");
        assert_eq!(kinds[0], TokenKind::Ident); // byte
        assert_eq!(kinds[1], TokenKind::Ident); // ptr
        assert_eq!(kinds[2], TokenKind::OpenBracket);
    }

    #[test]
    fn prefix_and_instruction() {
        let kinds = tok_kinds("lock add [rax], 1");
        assert_eq!(kinds[0], TokenKind::Ident); // lock
        assert_eq!(kinds[1], TokenKind::Ident); // add
    }

    // ── Span tracking ─────────────────────────────────────────────────

    #[test]
    fn span_tracking() {
        let tokens = tokenize("mov rax, 1").unwrap();
        assert_eq!(tokens[0].span, Span::new(1, 1, 0, 3)); // "mov"
        assert_eq!(tokens[1].span, Span::new(1, 5, 4, 3)); // "rax"
        assert_eq!(tokens[2].span, Span::new(1, 8, 7, 1)); // ","
    }

    #[test]
    fn multiline_span_tracking() {
        let tokens = tokenize("nop\nmov rax, 1").unwrap();
        assert_eq!(tokens[0].span.line, 1); // nop
        assert_eq!(tokens[2].span.line, 2); // mov (after newline)
    }

    // ── Error reporting ───────────────────────────────────────────────

    #[test]
    fn unknown_character_error() {
        let err = tokenize("mov rax, @").unwrap_err();
        match err {
            AsmError::Syntax { msg, .. } => {
                assert!(msg.contains("unexpected character '@'"));
            }
            _ => panic!("expected Syntax error"),
        }
    }

    #[test]
    fn unterminated_string() {
        let err = tokenize(".ascii \"hello").unwrap_err();
        match err {
            AsmError::Syntax { msg, .. } => {
                assert!(msg.contains("unterminated string"));
            }
            _ => panic!("expected Syntax error"),
        }
    }

    #[test]
    fn unterminated_block_comment() {
        let err = tokenize("nop /* this is never closed").unwrap_err();
        match err {
            AsmError::Syntax { msg, span } => {
                assert!(
                    msg.contains("unterminated block comment"),
                    "expected 'unterminated block comment', got: {msg}"
                );
                // Span should point to the start of the comment, not (0,0)
                assert!(span.line > 0 || span.col > 0, "span should not be (0,0)");
            }
            _ => panic!("expected Syntax error"),
        }
    }

    // ── Whole-line and miscellaneous cases ────────────────────────────

    #[test]
    fn complex_instruction() {
        // The trailing empty string is the text of the final Eof token.
        let texts = tok_texts("mov qword ptr [rbp - 0x10], rax");
        assert_eq!(
            texts,
            vec!["mov", "qword", "ptr", "[", "rbp", "-", "0x10", "]", ",", "rax", ""]
        );
    }

    #[test]
    fn all_punctuation() {
        let kinds = tok_kinds(", [ ] + - * :");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Comma,
                TokenKind::OpenBracket,
                TokenKind::CloseBracket,
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Colon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn trailing_whitespace() {
        let tokens = tokenize("nop   ").unwrap();
        assert_eq!(tokens.len(), 2); // nop + Eof
    }

    #[test]
    fn zero_immediate() {
        let tokens = tokenize("xor eax, 0").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(0));
    }

    #[test]
    fn large_hex_immediate() {
        let tokens = tokenize("mov rdi, 0x68732f2f6e69622f").unwrap();
        assert_eq!(tokens[3].kind, TokenKind::Number(0x68732f2f6e69622f));
    }

    #[test]
    fn minus_in_memory_operand_is_not_unary() {
        // After identifier, '-' should be an operator, not unary
        let kinds = tok_kinds("[rbp - 0x10]");
        assert_eq!(
            kinds,
            vec![
                TokenKind::OpenBracket,
                TokenKind::Ident, // rbp
                TokenKind::Minus,
                TokenKind::Number(0x10),
                TokenKind::CloseBracket,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn equals_token() {
        let kinds = tok_kinds("EXIT = 60");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident, // EXIT
                TokenKind::Equals,
                TokenKind::Number(60),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn equals_with_negative() {
        let kinds = tok_kinds("NEG = -1");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,
                TokenKind::Equals,
                TokenKind::Number(-1),
                TokenKind::Eof,
            ]
        );
    }
}