1use alloc::borrow::Cow;
8use alloc::string::String;
9#[allow(unused_imports)]
10use alloc::vec;
11use alloc::vec::Vec;
12use core::str;
13
14use crate::error::{AsmError, Span};
15
16#[derive(Debug, Clone, PartialEq)]
22pub struct Token<'src> {
23 pub kind: TokenKind,
25 pub text: Cow<'src, str>,
27 pub span: Span,
29}
30
31impl<'src> Token<'src> {
32 #[inline]
34 pub fn text(&self) -> &str {
35 &self.text
36 }
37}
38
/// Classification of a [`Token`].
///
/// Every payload type is totally comparable, so the enum derives `Eq` in
/// addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// Identifier: starts with an ASCII letter or `_`, continues with ASCII
    /// alphanumerics, `_` or `.`. Segment-register names followed by `:`
    /// (`cs`, `ds`, `es`, `fs`, `gs`, `ss`) also stay identifiers.
    Ident,
    /// Numeric literal with its decoded value. A leading `-` is folded into
    /// the value when the lexer treats it as a sign.
    Number(i128),
    /// Double-quoted string literal; the unescaped content is in the token text.
    StringLit,
    /// Single-quoted character literal with its byte value.
    CharLit(u8),
    /// `.` followed by ASCII alphanumerics / `_` (e.g. `.byte`).
    Directive,
    /// Identifier immediately followed by `:`; the text excludes the colon.
    LabelDef,
    /// Single decimal digit followed by `:` (e.g. `1:`).
    NumericLabelDef(u32),
    /// Forward reference to a numeric label (e.g. `1f`).
    NumericLabelFwd(u32),
    /// Backward reference to a numeric label (e.g. `1b`).
    NumericLabelBwd(u32),
    /// `,`
    Comma,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `+`
    Plus,
    /// `-` (when not folded into a negative number)
    Minus,
    /// `*`
    Star,
    /// `:`
    Colon,
    /// `=`
    Equals,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `!`
    Bang,
    /// `%`
    Percent,
    /// `$`
    Dollar,
    /// `/` (when it does not start a `//` or `/*` comment)
    Slash,
    /// `&`
    Ampersand,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `<<`
    LShift,
    /// `>>`
    RShift,
    /// Statement separator: emitted for a line feed and for `;`.
    Newline,
    /// End of input; always the final token of a successful lex.
    Eof,
}
109
110pub fn tokenize<'s>(source: &'s str) -> Result<Vec<Token<'s>>, AsmError> {
129 let mut tokens = Vec::with_capacity(source.len() / 3 + 1);
131 let bytes = source.as_bytes();
132 let len = bytes.len();
133 let mut pos = 0;
134 let mut line: u32 = 1;
135 let mut col: u32 = 1;
136 let mut line_start = 0usize;
137
138 while pos < len {
139 let ch = bytes[pos];
140
141 if ch == b' ' || ch == b'\t' || ch == b'\r' {
143 pos += 1;
144 col += 1;
145 continue;
146 }
147
148 if ch == b'\n' {
150 tokens.push(Token {
151 kind: TokenKind::Newline,
152 text: Cow::Borrowed("\n"),
153 span: Span::new(line, col, pos, 1),
154 });
155 pos += 1;
156 line += 1;
157 col = 1;
158 line_start = pos;
159 continue;
160 }
161
162 if ch == b';' {
164 let start = pos;
165 tokens.push(Token {
166 kind: TokenKind::Newline,
167 text: Cow::Borrowed(";"),
168 span: Span::new(line, col, start, 1),
169 });
170 pos += 1;
171 col += 1;
172 continue;
173 }
174
175 if ch == b'#' {
177 pos += 1;
178 while pos < len && bytes[pos] != b'\n' {
179 pos += 1;
180 }
181 col = (pos - line_start) as u32 + 1;
182 continue;
183 }
184
185 if ch == b',' {
187 tokens.push(Token {
188 kind: TokenKind::Comma,
189 text: Cow::Borrowed(","),
190 span: Span::new(line, col, pos, 1),
191 });
192 pos += 1;
193 col += 1;
194 continue;
195 }
196
197 if ch == b'[' {
199 tokens.push(Token {
200 kind: TokenKind::OpenBracket,
201 text: Cow::Borrowed("["),
202 span: Span::new(line, col, pos, 1),
203 });
204 pos += 1;
205 col += 1;
206 continue;
207 }
208 if ch == b']' {
209 tokens.push(Token {
210 kind: TokenKind::CloseBracket,
211 text: Cow::Borrowed("]"),
212 span: Span::new(line, col, pos, 1),
213 });
214 pos += 1;
215 col += 1;
216 continue;
217 }
218
219 if ch == b'+' {
221 tokens.push(Token {
222 kind: TokenKind::Plus,
223 text: Cow::Borrowed("+"),
224 span: Span::new(line, col, pos, 1),
225 });
226 pos += 1;
227 col += 1;
228 continue;
229 }
230
231 if ch == b'-' {
233 let is_unary = tokens.is_empty()
235 || matches!(
236 tokens.last().map(|t| &t.kind),
237 Some(
238 TokenKind::Comma
239 | TokenKind::OpenBracket
240 | TokenKind::OpenBrace
241 | TokenKind::Plus
242 | TokenKind::Minus
243 | TokenKind::Star
244 | TokenKind::Newline
245 | TokenKind::Equals
246 )
247 );
248
249 if is_unary && pos + 1 < len && bytes[pos + 1].is_ascii_digit() {
250 let start = pos;
252 let start_col = col;
253 pos += 1; let value = parse_number_at(bytes, &mut pos, line, start_col)?;
255 let token_len = pos - start;
256 let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
257 tokens.push(Token {
258 kind: TokenKind::Number(-value),
259 text,
260 span: Span::new(line, start_col, start, token_len),
261 });
262 col = (pos - line_start) as u32 + 1;
263 continue;
264 }
265
266 tokens.push(Token {
267 kind: TokenKind::Minus,
268 text: Cow::Borrowed("-"),
269 span: Span::new(line, col, pos, 1),
270 });
271 pos += 1;
272 col += 1;
273 continue;
274 }
275
276 if ch == b'*' {
278 tokens.push(Token {
279 kind: TokenKind::Star,
280 text: Cow::Borrowed("*"),
281 span: Span::new(line, col, pos, 1),
282 });
283 pos += 1;
284 col += 1;
285 continue;
286 }
287
288 if ch == b':' {
290 tokens.push(Token {
291 kind: TokenKind::Colon,
292 text: Cow::Borrowed(":"),
293 span: Span::new(line, col, pos, 1),
294 });
295 pos += 1;
296 col += 1;
297 continue;
298 }
299
300 if ch == b'=' {
302 tokens.push(Token {
303 kind: TokenKind::Equals,
304 text: Cow::Borrowed("="),
305 span: Span::new(line, col, pos, 1),
306 });
307 pos += 1;
308 col += 1;
309 continue;
310 }
311
312 if ch == b'"' {
314 let start = pos;
315 let start_col = col;
316 pos += 1;
317 col += 1;
318 let mut content = Vec::new();
319 while pos < len && bytes[pos] != b'"' {
320 if bytes[pos] == b'\\' && pos + 1 < len {
321 pos += 1;
322 col += 1;
323 match bytes[pos] {
324 b'n' => content.push(b'\n'),
325 b't' => content.push(b'\t'),
326 b'\\' => content.push(b'\\'),
327 b'"' => content.push(b'"'),
328 b'0' => content.push(0),
329 b'x' => {
330 if pos + 2 < len {
332 let hi = hex_digit(bytes[pos + 1]);
333 let lo = hex_digit(bytes[pos + 2]);
334 if let (Some(h), Some(l)) = (hi, lo) {
335 content.push(h * 16 + l);
336 pos += 2;
337 col += 2;
338 } else {
339 return Err(AsmError::Syntax {
340 msg: String::from("invalid \\xHH escape sequence"),
341 span: Span::new(line, col, pos, 3),
342 });
343 }
344 }
345 }
346 _ => {
347 return Err(AsmError::Syntax {
348 msg: alloc::format!(
349 "unknown escape sequence '\\{}'",
350 bytes[pos] as char
351 ),
352 span: Span::new(line, col, pos - 1, 2),
353 });
354 }
355 }
356 } else if bytes[pos] == b'\n' {
357 return Err(AsmError::Syntax {
358 msg: String::from("unterminated string literal"),
359 span: Span::new(line, start_col, start, pos - start),
360 });
361 } else {
362 content.push(bytes[pos]);
363 }
364 pos += 1;
365 col += 1;
366 }
367 if pos >= len {
368 return Err(AsmError::Syntax {
369 msg: String::from("unterminated string literal"),
370 span: Span::new(line, start_col, start, pos - start),
371 });
372 }
373 pos += 1; col += 1;
375 let text_str = Cow::Owned(String::from_utf8(content).unwrap_or_default());
376 tokens.push(Token {
377 kind: TokenKind::StringLit,
378 text: text_str,
379 span: Span::new(line, start_col, start, pos - start),
380 });
381 continue;
382 }
383
384 if ch == b'\'' {
386 let start = pos;
387 let start_col = col;
388 pos += 1;
389 col += 1;
390 if pos >= len {
391 return Err(AsmError::Syntax {
392 msg: String::from("unterminated character literal"),
393 span: Span::new(line, start_col, start, 1),
394 });
395 }
396 let ch_val = if bytes[pos] == b'\\' && pos + 1 < len {
397 pos += 1;
398 col += 1;
399 match bytes[pos] {
400 b'n' => b'\n',
401 b't' => b'\t',
402 b'\\' => b'\\',
403 b'\'' => b'\'',
404 b'0' => 0,
405 _ => {
406 return Err(AsmError::Syntax {
407 msg: "unknown escape in character literal".into(),
408 span: Span::new(line, col, pos - 1, 2),
409 });
410 }
411 }
412 } else {
413 bytes[pos]
414 };
415 pos += 1;
416 col += 1;
417 if pos >= len || bytes[pos] != b'\'' {
418 return Err(AsmError::Syntax {
419 msg: String::from("unterminated character literal"),
420 span: Span::new(line, start_col, start, pos - start),
421 });
422 }
423 pos += 1;
424 col += 1;
425 tokens.push(Token {
426 kind: TokenKind::CharLit(ch_val),
427 text: Cow::Owned(alloc::format!("'{}'", ch_val as char)),
428 span: Span::new(line, start_col, start, pos - start),
429 });
430 continue;
431 }
432
433 if ch == b'.' {
435 let start = pos;
436 let start_col = col;
437 pos += 1;
438 col += 1;
439 while pos < len && (bytes[pos].is_ascii_alphanumeric() || bytes[pos] == b'_') {
440 pos += 1;
441 col += 1;
442 }
443 let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
444 tokens.push(Token {
445 kind: TokenKind::Directive,
446 text,
447 span: Span::new(line, start_col, start, pos - start),
448 });
449 continue;
450 }
451
452 if ch.is_ascii_digit() {
454 let start = pos;
455 let start_col = col;
456
457 let mut temp = pos;
460 while temp < len && bytes[temp].is_ascii_digit() {
461 temp += 1;
462 }
463 if temp < len && bytes[temp] == b':' && (temp + 1 >= len || bytes[temp + 1] != b':') {
468 let num_str = str::from_utf8(&bytes[start..temp]).unwrap_or("0");
470 if let Ok(n) = num_str.parse::<u32>() {
471 if temp != start + 1 {
472 return Err(AsmError::Syntax {
473 msg: alloc::format!(
474 "numeric labels must be a single digit (0-9), got `{}`",
475 n
476 ),
477 span: Span::new(line, start_col, start, temp - start + 1),
478 });
479 }
480 pos = temp + 1; col = (pos - line_start) as u32 + 1;
482 tokens.push(Token {
483 kind: TokenKind::NumericLabelDef(n),
484 text: Cow::Owned(alloc::format!("{}:", n)),
485 span: Span::new(line, start_col, start, pos - start),
486 });
487 continue;
488 }
489 }
490 if temp < len && temp == start + 1 && (bytes[temp] == b'b' || bytes[temp] == b'f') {
492 let digit = bytes[start] - b'0';
494 let suffix = bytes[temp];
495 if !(digit == 0
496 && suffix == b'b'
497 && temp + 1 < len
498 && (bytes[temp + 1] == b'0' || bytes[temp + 1] == b'1'))
499 {
500 pos = temp + 1;
501 col = (pos - line_start) as u32 + 1;
502 let kind = if suffix == b'b' {
503 TokenKind::NumericLabelBwd(digit as u32)
504 } else {
505 TokenKind::NumericLabelFwd(digit as u32)
506 };
507 tokens.push(Token {
508 kind,
509 text: Cow::Owned(alloc::format!("{}{}", digit, suffix as char)),
510 span: Span::new(line, start_col, start, pos - start),
511 });
512 continue;
513 }
514 }
515
516 let value = parse_number_at(bytes, &mut pos, line, start_col)?;
517 let token_len = pos - start;
518 let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
519 tokens.push(Token {
520 kind: TokenKind::Number(value),
521 text,
522 span: Span::new(line, start_col, start, token_len),
523 });
524 col = (pos - line_start) as u32 + 1;
525 continue;
526 }
527
528 if ch.is_ascii_alphabetic() || ch == b'_' {
530 let start = pos;
531 let start_col = col;
532 while pos < len
533 && (bytes[pos].is_ascii_alphanumeric() || bytes[pos] == b'_' || bytes[pos] == b'.')
534 {
535 pos += 1;
536 }
537 let text = Cow::Borrowed(str::from_utf8(&bytes[start..pos]).unwrap_or(""));
538 let token_len = pos - start;
539
540 if pos < len && bytes[pos] == b':' {
543 let is_segment_reg = text.eq_ignore_ascii_case("cs")
544 || text.eq_ignore_ascii_case("ds")
545 || text.eq_ignore_ascii_case("es")
546 || text.eq_ignore_ascii_case("fs")
547 || text.eq_ignore_ascii_case("gs")
548 || text.eq_ignore_ascii_case("ss");
549 if is_segment_reg {
550 tokens.push(Token {
552 kind: TokenKind::Ident,
553 text,
554 span: Span::new(line, start_col, start, token_len),
555 });
556 col = (pos - line_start) as u32 + 1;
557 continue;
558 }
559 pos += 1; tokens.push(Token {
561 kind: TokenKind::LabelDef,
562 text,
563 span: Span::new(line, start_col, start, pos - start),
564 });
565 col = (pos - line_start) as u32 + 1;
566 continue;
567 }
568
569 tokens.push(Token {
570 kind: TokenKind::Ident,
571 text,
572 span: Span::new(line, start_col, start, token_len),
573 });
574 col = (pos - line_start) as u32 + 1;
575 continue;
576 }
577
578 if ch == b'{' {
580 tokens.push(Token {
581 kind: TokenKind::OpenBrace,
582 text: Cow::Borrowed("{"),
583 span: Span::new(line, col, pos, 1),
584 });
585 pos += 1;
586 col += 1;
587 continue;
588 }
589
590 if ch == b'}' {
592 tokens.push(Token {
593 kind: TokenKind::CloseBrace,
594 text: Cow::Borrowed("}"),
595 span: Span::new(line, col, pos, 1),
596 });
597 pos += 1;
598 col += 1;
599 continue;
600 }
601
602 if ch == b'(' {
604 tokens.push(Token {
605 kind: TokenKind::OpenParen,
606 text: Cow::Borrowed("("),
607 span: Span::new(line, col, pos, 1),
608 });
609 pos += 1;
610 col += 1;
611 continue;
612 }
613
614 if ch == b')' {
616 tokens.push(Token {
617 kind: TokenKind::CloseParen,
618 text: Cow::Borrowed(")"),
619 span: Span::new(line, col, pos, 1),
620 });
621 pos += 1;
622 col += 1;
623 continue;
624 }
625
626 if ch == b'!' {
628 tokens.push(Token {
629 kind: TokenKind::Bang,
630 text: Cow::Borrowed("!"),
631 span: Span::new(line, col, pos, 1),
632 });
633 pos += 1;
634 col += 1;
635 continue;
636 }
637
638 if ch == b'%' {
640 tokens.push(Token {
641 kind: TokenKind::Percent,
642 text: Cow::Borrowed("%"),
643 span: Span::new(line, col, pos, 1),
644 });
645 pos += 1;
646 col += 1;
647 continue;
648 }
649
650 if ch == b'$' {
652 tokens.push(Token {
653 kind: TokenKind::Dollar,
654 text: Cow::Borrowed("$"),
655 span: Span::new(line, col, pos, 1),
656 });
657 pos += 1;
658 col += 1;
659 continue;
660 }
661
662 if ch == b'/' {
664 if pos + 1 < len && bytes[pos + 1] == b'/' {
666 pos += 2;
668 while pos < len && bytes[pos] != b'\n' {
669 pos += 1;
670 }
671 col = (pos - line_start) as u32 + 1;
672 continue;
673 }
674 if pos + 1 < len && bytes[pos + 1] == b'*' {
675 let comment_start_line = line;
677 let comment_start_col = col;
678 let comment_start_pos = pos;
679 pos += 2;
680 col += 2;
681 while pos + 1 < len && !(bytes[pos] == b'*' && bytes[pos + 1] == b'/') {
682 if bytes[pos] == b'\n' {
683 line += 1;
684 col = 1;
685 line_start = pos + 1;
686 } else {
687 col += 1;
688 }
689 pos += 1;
690 }
691 if pos + 1 < len {
692 pos += 2; col += 2;
694 } else {
695 return Err(AsmError::Syntax {
697 msg: String::from("unterminated block comment"),
698 span: Span::new(
699 comment_start_line,
700 comment_start_col,
701 comment_start_pos,
702 2,
703 ),
704 });
705 }
706 continue;
707 }
708 tokens.push(Token {
709 kind: TokenKind::Slash,
710 text: Cow::Borrowed("/"),
711 span: Span::new(line, col, pos, 1),
712 });
713 pos += 1;
714 col += 1;
715 continue;
716 }
717
718 if ch == b'&' {
720 tokens.push(Token {
721 kind: TokenKind::Ampersand,
722 text: Cow::Borrowed("&"),
723 span: Span::new(line, col, pos, 1),
724 });
725 pos += 1;
726 col += 1;
727 continue;
728 }
729
730 if ch == b'|' {
732 tokens.push(Token {
733 kind: TokenKind::Pipe,
734 text: Cow::Borrowed("|"),
735 span: Span::new(line, col, pos, 1),
736 });
737 pos += 1;
738 col += 1;
739 continue;
740 }
741
742 if ch == b'^' {
744 tokens.push(Token {
745 kind: TokenKind::Caret,
746 text: Cow::Borrowed("^"),
747 span: Span::new(line, col, pos, 1),
748 });
749 pos += 1;
750 col += 1;
751 continue;
752 }
753
754 if ch == b'~' {
756 tokens.push(Token {
757 kind: TokenKind::Tilde,
758 text: Cow::Borrowed("~"),
759 span: Span::new(line, col, pos, 1),
760 });
761 pos += 1;
762 col += 1;
763 continue;
764 }
765
766 if ch == b'<' && pos + 1 < len && bytes[pos + 1] == b'<' {
768 tokens.push(Token {
769 kind: TokenKind::LShift,
770 text: Cow::Borrowed("<<"),
771 span: Span::new(line, col, pos, 2),
772 });
773 pos += 2;
774 col += 2;
775 continue;
776 }
777 if ch == b'>' && pos + 1 < len && bytes[pos + 1] == b'>' {
778 tokens.push(Token {
779 kind: TokenKind::RShift,
780 text: Cow::Borrowed(">>"),
781 span: Span::new(line, col, pos, 2),
782 });
783 pos += 2;
784 col += 2;
785 continue;
786 }
787
788 return Err(AsmError::Syntax {
790 msg: alloc::format!("unexpected character '{}'", ch as char),
791 span: Span::new(line, col, pos, 1),
792 });
793 }
794
795 tokens.push(Token {
796 kind: TokenKind::Eof,
797 text: Cow::Borrowed(""),
798 span: Span::new(line, col, pos, 0),
799 });
800
801 Ok(tokens)
802}
803
804#[inline]
806fn parse_number_at(
807 bytes: &[u8],
808 pos: &mut usize,
809 span_line: u32,
810 span_col: u32,
811) -> Result<i128, AsmError> {
812 let start = *pos;
813 let len = bytes.len();
814
815 if *pos >= len {
816 return Err(AsmError::Syntax {
817 msg: String::from("expected number"),
818 span: Span::new(span_line, span_col, start, 0),
819 });
820 }
821
822 if bytes[*pos] == b'0' && *pos + 1 < len {
824 match bytes[*pos + 1] {
825 b'x' | b'X' => {
826 *pos += 2;
827 let num_start = *pos;
828 while *pos < len && bytes[*pos].is_ascii_hexdigit() {
829 *pos += 1;
830 }
831 if *pos == num_start {
832 return Err(AsmError::Syntax {
833 msg: String::from("expected hex digits after '0x'"),
834 span: Span::new(span_line, span_col, start, *pos - start),
835 });
836 }
837 let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
838 return i128::from_str_radix(s, 16).map_err(|_| AsmError::Syntax {
839 msg: alloc::format!("invalid hex number '0x{}'", s),
840 span: Span::new(span_line, span_col, start, *pos - start),
841 });
842 }
843 b'b' | b'B' => {
844 if *pos + 2 < len && (bytes[*pos + 2] == b'0' || bytes[*pos + 2] == b'1') {
846 *pos += 2;
847 let num_start = *pos;
848 while *pos < len && (bytes[*pos] == b'0' || bytes[*pos] == b'1') {
849 *pos += 1;
850 }
851 let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
852 return i128::from_str_radix(s, 2).map_err(|_| AsmError::Syntax {
853 msg: alloc::format!("invalid binary number '0b{}'", s),
854 span: Span::new(span_line, span_col, start, *pos - start),
855 });
856 }
857 }
859 b'o' | b'O' => {
860 *pos += 2;
861 let num_start = *pos;
862 while *pos < len && bytes[*pos] >= b'0' && bytes[*pos] <= b'7' {
863 *pos += 1;
864 }
865 if *pos == num_start {
866 return Err(AsmError::Syntax {
867 msg: String::from("expected octal digits after '0o'"),
868 span: Span::new(span_line, span_col, start, *pos - start),
869 });
870 }
871 let s = str::from_utf8(&bytes[num_start..*pos]).unwrap_or("0");
872 return i128::from_str_radix(s, 8).map_err(|_| AsmError::Syntax {
873 msg: alloc::format!("invalid octal number '0o{}'", s),
874 span: Span::new(span_line, span_col, start, *pos - start),
875 });
876 }
877 _ => {}
878 }
879 }
880
881 while *pos < len && bytes[*pos].is_ascii_digit() {
883 *pos += 1;
884 }
885 if *pos < len && (bytes[*pos] == b'h' || bytes[*pos] == b'H') {
887 let s = str::from_utf8(&bytes[start..*pos]).unwrap_or("0");
888 *pos += 1; return i128::from_str_radix(s, 16).map_err(|_| AsmError::Syntax {
890 msg: alloc::format!("invalid hex number '{}h'", s),
891 span: Span::new(span_line, span_col, start, *pos - start),
892 });
893 }
894 let s = str::from_utf8(&bytes[start..*pos]).unwrap_or("0");
895 s.parse::<i128>().map_err(|_| AsmError::Syntax {
896 msg: alloc::format!("invalid number '{}'", s),
897 span: Span::new(span_line, span_col, start, *pos - start),
898 })
899}
900
/// Returns the value (0..=15) of an ASCII hexadecimal digit in either case,
/// or `None` if `b` is not one.
#[inline]
fn hex_digit(b: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly 0-9, a-f and A-F; the result is below 16,
    // so narrowing to `u8` is lossless.
    char::from(b).to_digit(16).map(|value| value as u8)
}
910
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` (panicking on failure) and keeps only the token kinds.
    fn tok_kinds(src: &str) -> Vec<TokenKind> {
        let tokens = tokenize(src).unwrap();
        tokens.into_iter().map(|token| token.kind).collect()
    }

    /// Lexes `src` (panicking on failure) and keeps only the token texts.
    fn tok_texts(src: &str) -> Vec<String> {
        let tokens = tokenize(src).unwrap();
        tokens.into_iter().map(|token| token.text.into_owned()).collect()
    }

    /// Lexes `src`, which must fail with `AsmError::Syntax`, and returns the
    /// error's message and span.
    fn syntax_error(src: &str) -> (String, Span) {
        match tokenize(src).unwrap_err() {
            AsmError::Syntax { msg, span } => (msg, span),
            _ => panic!("expected Syntax error"),
        }
    }

    /// Asserts that `src` lexes to nothing but the end-of-input marker.
    fn assert_only_eof(src: &str) {
        let tokens = tokenize(src).unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Eof);
    }

    // ---- inputs without real tokens ----

    #[test]
    fn empty_input() {
        assert_only_eof("");
    }

    #[test]
    fn only_whitespace() {
        assert_only_eof(" \t ");
    }

    #[test]
    fn only_comment() {
        assert_only_eof("# this is a comment");
    }

    #[test]
    fn hash_comment() {
        assert_only_eof("# comment");
    }

    // ---- instructions and immediates ----

    #[test]
    fn simple_instruction() {
        assert_eq!(
            tok_kinds("mov rax, rbx"),
            [
                TokenKind::Ident,
                TokenKind::Ident,
                TokenKind::Comma,
                TokenKind::Ident,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn instruction_with_immediate() {
        assert_eq!(tok_kinds("mov rax, 42")[3], TokenKind::Number(42));
    }

    #[test]
    fn hex_immediate() {
        assert_eq!(tok_kinds("mov rax, 0xFF")[3], TokenKind::Number(255));
    }

    #[test]
    fn hex_uppercase() {
        assert_eq!(tok_kinds("mov rax, 0XAB")[3], TokenKind::Number(0xAB));
    }

    #[test]
    fn binary_immediate() {
        assert_eq!(tok_kinds("mov rax, 0b1010")[3], TokenKind::Number(10));
    }

    #[test]
    fn octal_immediate() {
        assert_eq!(tok_kinds("mov rax, 0o77")[3], TokenKind::Number(63));
    }

    #[test]
    fn negative_immediate() {
        assert_eq!(tok_kinds("mov rax, -1")[3], TokenKind::Number(-1));
    }

    #[test]
    fn negative_hex() {
        assert_eq!(tok_kinds("add rsp, -0x10")[3], TokenKind::Number(-16));
    }

    // ---- labels ----

    #[test]
    fn label_definition() {
        let tokens = tokenize("entry_point:").unwrap();
        let label = &tokens[0];
        assert_eq!(label.kind, TokenKind::LabelDef);
        assert_eq!(label.text(), "entry_point");
    }

    #[test]
    fn label_definition_with_instruction() {
        let kinds = tok_kinds("loop: dec rcx");
        assert_eq!(
            kinds[..3],
            [TokenKind::LabelDef, TokenKind::Ident, TokenKind::Ident]
        );
    }

    #[test]
    fn numeric_label_def() {
        assert_eq!(tok_kinds("1:")[0], TokenKind::NumericLabelDef(1));
    }

    #[test]
    fn numeric_label_backward_ref() {
        assert_eq!(tok_kinds("jnz 1b")[1], TokenKind::NumericLabelBwd(1));
    }

    #[test]
    fn numeric_label_forward_ref() {
        assert_eq!(tok_kinds("jmp 2f")[1], TokenKind::NumericLabelFwd(2));
    }

    // ---- directives ----

    #[test]
    fn directive() {
        let tokens = tokenize(".byte 0x90").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Directive);
        assert_eq!(tokens[0].text(), ".byte");
        assert_eq!(tokens[1].kind, TokenKind::Number(0x90));
    }

    #[test]
    fn equ_directive() {
        let tokens = tokenize(".equ SYS_WRITE, 1").unwrap();
        let (directive, name) = (&tokens[0], &tokens[1]);
        assert_eq!(directive.kind, TokenKind::Directive);
        assert_eq!(directive.text(), ".equ");
        assert_eq!(name.kind, TokenKind::Ident);
        assert_eq!(name.text(), "SYS_WRITE");
    }

    // ---- operands ----

    #[test]
    fn memory_operand_tokens() {
        assert_eq!(
            tok_kinds("[rax + rbx*4 + 8]"),
            [
                TokenKind::OpenBracket,
                TokenKind::Ident,
                TokenKind::Plus,
                TokenKind::Ident,
                TokenKind::Star,
                TokenKind::Number(4),
                TokenKind::Plus,
                TokenKind::Number(8),
                TokenKind::CloseBracket,
                TokenKind::Eof,
            ]
        );
    }

    // ---- literals ----

    #[test]
    fn string_literal() {
        let tokens = tokenize(".asciz \"hello\"").unwrap();
        let literal = &tokens[1];
        assert_eq!(literal.kind, TokenKind::StringLit);
        assert_eq!(literal.text(), "hello");
    }

    #[test]
    fn string_escape_sequences() {
        let tokens = tokenize(".ascii \"a\\nb\\t\\\\c\\0\\x41\"").unwrap();
        let literal = &tokens[1];
        assert_eq!(literal.kind, TokenKind::StringLit);
        assert_eq!(literal.text(), "a\nb\t\\c\0A");
    }

    #[test]
    fn character_literal() {
        assert_eq!(tok_kinds("mov al, 'A'")[3], TokenKind::CharLit(b'A'));
    }

    // ---- statement separators ----

    #[test]
    fn semicolon_separator() {
        assert_eq!(
            tok_kinds("nop; ret"),
            [
                TokenKind::Ident,
                TokenKind::Newline,
                TokenKind::Ident,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn newline_separator() {
        assert_eq!(
            tok_kinds("nop\nret"),
            [
                TokenKind::Ident,
                TokenKind::Newline,
                TokenKind::Ident,
                TokenKind::Eof,
            ]
        );
    }

    // ---- prefixes, overrides and size hints ----

    #[test]
    fn segment_override_tokens() {
        let kinds = tok_kinds("fs:[rax]");
        assert_eq!(
            kinds[..5],
            [
                TokenKind::Ident,
                TokenKind::Colon,
                TokenKind::OpenBracket,
                TokenKind::Ident,
                TokenKind::CloseBracket,
            ]
        );
    }

    #[test]
    fn size_hint_tokens() {
        let kinds = tok_kinds("byte ptr [rax]");
        assert_eq!(
            kinds[..3],
            [TokenKind::Ident, TokenKind::Ident, TokenKind::OpenBracket]
        );
    }

    #[test]
    fn prefix_and_instruction() {
        let kinds = tok_kinds("lock add [rax], 1");
        assert_eq!(kinds[..2], [TokenKind::Ident, TokenKind::Ident]);
    }

    // ---- spans ----

    #[test]
    fn span_tracking() {
        let tokens = tokenize("mov rax, 1").unwrap();
        let expected = [
            Span::new(1, 1, 0, 3),
            Span::new(1, 5, 4, 3),
            Span::new(1, 8, 7, 1),
        ];
        for (token, span) in tokens.iter().zip(expected) {
            assert_eq!(token.span, span);
        }
    }

    #[test]
    fn multiline_span_tracking() {
        let tokens = tokenize("nop\nmov rax, 1").unwrap();
        assert_eq!(tokens[0].span.line, 1);
        assert_eq!(tokens[2].span.line, 2);
    }

    // ---- errors ----

    #[test]
    fn unknown_character_error() {
        let (msg, _) = syntax_error("mov rax, @");
        assert!(msg.contains("unexpected character '@'"));
    }

    #[test]
    fn unterminated_string() {
        let (msg, _) = syntax_error(".ascii \"hello");
        assert!(msg.contains("unterminated string"));
    }

    #[test]
    fn unterminated_block_comment() {
        let (msg, span) = syntax_error("nop /* this is never closed");
        assert!(
            msg.contains("unterminated block comment"),
            "expected 'unterminated block comment', got: {msg}"
        );
        assert!(span.line > 0 || span.col > 0, "span should not be (0,0)");
    }

    // ---- miscellaneous ----

    #[test]
    fn complex_instruction() {
        assert_eq!(
            tok_texts("mov qword ptr [rbp - 0x10], rax"),
            ["mov", "qword", "ptr", "[", "rbp", "-", "0x10", "]", ",", "rax", ""]
        );
    }

    #[test]
    fn all_punctuation() {
        assert_eq!(
            tok_kinds(", [ ] + - * :"),
            [
                TokenKind::Comma,
                TokenKind::OpenBracket,
                TokenKind::CloseBracket,
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Colon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn trailing_whitespace() {
        assert_eq!(tokenize("nop ").unwrap().len(), 2);
    }

    #[test]
    fn zero_immediate() {
        assert_eq!(tok_kinds("xor eax, 0")[3], TokenKind::Number(0));
    }

    #[test]
    fn large_hex_immediate() {
        assert_eq!(
            tok_kinds("mov rdi, 0x68732f2f6e69622f")[3],
            TokenKind::Number(0x68732f2f6e69622f)
        );
    }

    #[test]
    fn minus_in_memory_operand_is_not_unary() {
        assert_eq!(
            tok_kinds("[rbp - 0x10]"),
            [
                TokenKind::OpenBracket,
                TokenKind::Ident,
                TokenKind::Minus,
                TokenKind::Number(0x10),
                TokenKind::CloseBracket,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn equals_token() {
        assert_eq!(
            tok_kinds("EXIT = 60"),
            [
                TokenKind::Ident,
                TokenKind::Equals,
                TokenKind::Number(60),
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn equals_with_negative() {
        assert_eq!(
            tok_kinds("NEG = -1"),
            [
                TokenKind::Ident,
                TokenKind::Equals,
                TokenKind::Number(-1),
                TokenKind::Eof,
            ]
        );
    }
}