1use std::fmt;
6
/// Hand-written lexer that turns a source string into [`Token`]s on demand.
pub struct Lexer<'src> {
    /// Original input; token text is sliced out of it by byte offset.
    source: &'src str,
    /// `source` decoded into `char`s so a character can be read by index.
    data: Vec<char>,
    /// Index into `data` of the next character to read.
    pos: usize,
    /// Byte offset into `source`, advanced by the UTF-8 length of each consumed character.
    byte_pos: usize,
    /// Line/column tracker, updated for every character consumed.
    loc: Loc,
    /// Token produced by `peek` and not yet handed out by `next`.
    peeked: Option<Token>,
    /// Identifiers that are reported as `TokenKind::Keyword`.
    keywords: Vec<&'src str>,
}
20
21impl<'src> Lexer<'src> {
22 pub fn new(source: &'src str) -> Self {
24 Self {
25 source,
26 data: source.chars().collect(),
27 loc: Loc::new(1, 1),
28 pos: 0,
29 byte_pos: 0,
30 peeked: None,
31 keywords: Vec::new(),
32 }
33 }
34
35 pub fn with_keywords(mut self, keywords: &[&'src str]) -> Self {
37 self.keywords = keywords.to_vec();
38 self
39 }
40
41 pub fn next(&mut self) -> Token {
44 if let Some(peek) = self.peeked.take() {
45 peek
46 } else {
47 self.next_token()
48 }
49 }
50
51 pub fn peek(&mut self) -> &Token {
54 if self.peeked.is_none() {
55 self.peeked = Some(self.next_token());
56 }
57 self.peeked.as_ref().unwrap()
58 }
59
60 fn advance(&mut self) -> char {
61 let ch = self.read_char();
62 self.byte_pos += ch.len_utf8();
63 self.pos += 1;
64 self.loc.next(ch);
65 ch
66 }
67
68 fn read_char(&mut self) -> char {
69 let pos = self.pos;
70 if pos >= self.data.len() {
71 '\0'
72 } else {
73 self.data[pos]
74 }
75 }
76
77 fn next_token(&mut self) -> Token {
78 while self.pos <= self.data.len() {
79 let begin_byte = self.byte_pos;
80 let ch = self.advance();
81 let loc = self.loc;
82
83 let tok = match ch {
84 '/' if self.read_char() == '/' => {
85 while self.advance() != '\n' {}
86 continue;
87 }
88 '#' => {
89 let ch = self.read_char();
90 if self.byte_pos == 1 && ch == '!' {
91 while self.advance() != '\n' {}
92 continue;
93 }
94 loop {
95 let ch = self.read_char();
96 if ch.is_alphanumeric() || ch == '_' {
97 self.advance();
98 } else {
99 break;
100 }
101 }
102 Token::new(
103 TokenKind::Directive,
104 loc,
105 self.source[begin_byte..self.byte_pos].into(),
106 )
107 }
108 '-' if self.read_char() == '>' => {
109 self.advance();
110 Token::new(
111 TokenKind::Arrow,
112 loc,
113 self.source[begin_byte..self.byte_pos].into(),
114 )
115 }
116 '=' if self.read_char() == '=' => {
117 self.advance();
118 Token::new(
119 TokenKind::EqEq,
120 loc,
121 self.source[begin_byte..self.byte_pos].into(),
122 )
123 }
124 ':' if self.read_char() == '=' => {
125 self.advance();
126 Token::new(
127 TokenKind::Assign,
128 loc,
129 self.source[begin_byte..self.byte_pos].into(),
130 )
131 }
132 '<' if self.read_char() == '=' => {
133 self.advance();
134 Token::new(
135 TokenKind::LtEq,
136 loc,
137 self.source[begin_byte..self.byte_pos].into(),
138 )
139 }
140 '>' if self.read_char() == '=' => {
141 self.advance();
142 Token::new(
143 TokenKind::GtEq,
144 loc,
145 self.source[begin_byte..self.byte_pos].into(),
146 )
147 }
148 '!' if self.read_char() == '=' => {
149 self.advance();
150 Token::new(
151 TokenKind::NotEq,
152 loc,
153 self.source[begin_byte..self.byte_pos].into(),
154 )
155 }
156 '&' if self.read_char() == '&' => {
157 self.advance();
158 Token::new(
159 TokenKind::DoubleAmpersand,
160 loc,
161 self.source[begin_byte..self.byte_pos].into(),
162 )
163 }
164 '|' if self.read_char() == '|' => {
165 self.advance();
166 Token::new(
167 TokenKind::DoublePipe,
168 loc,
169 self.source[begin_byte..self.byte_pos].into(),
170 )
171 }
172 ':' if self.read_char() == ':' => {
173 self.advance();
174 Token::new(
175 TokenKind::DoubleColon,
176 loc,
177 self.source[begin_byte..self.byte_pos].into(),
178 )
179 }
180 '.' if self.read_char() == '.' && self.read_char() == '.' => {
181 self.advance();
182 self.advance();
183 Token::new(
184 TokenKind::Ellipsis,
185 loc,
186 self.source[begin_byte..self.byte_pos].into(),
187 )
188 }
189 ch if ch.is_alphabetic() || ch == '_' => return self.lex_identifier(begin_byte),
190 '0'..='9' => return self.lex_number(begin_byte),
191 '"' => return self.lex_string(begin_byte),
192
193 ',' => Token::new(
194 TokenKind::Comma,
195 loc,
196 self.source[begin_byte..self.byte_pos].into(),
197 ),
198 ';' => Token::new(
199 TokenKind::SemiColon,
200 loc,
201 self.source[begin_byte..self.byte_pos].into(),
202 ),
203 ':' => Token::new(
204 TokenKind::Colon,
205 loc,
206 self.source[begin_byte..self.byte_pos].into(),
207 ),
208 '\\' => Token::new(
209 TokenKind::BackSlash,
210 loc,
211 self.source[begin_byte..self.byte_pos].into(),
212 ),
213 '=' => Token::new(
214 TokenKind::Eq,
215 loc,
216 self.source[begin_byte..self.byte_pos].into(),
217 ),
218 '<' => Token::new(
219 TokenKind::Lt,
220 loc,
221 self.source[begin_byte..self.byte_pos].into(),
222 ),
223 '>' => Token::new(
224 TokenKind::Gt,
225 loc,
226 self.source[begin_byte..self.byte_pos].into(),
227 ),
228 '!' => Token::new(
229 TokenKind::Bang,
230 loc,
231 self.source[begin_byte..self.byte_pos].into(),
232 ),
233 '+' => {
234 let next = self.read_char();
235 if next == '+' {
236 self.advance();
237 Token::new(
238 TokenKind::Concat,
239 loc,
240 self.source[begin_byte..self.byte_pos].into(),
241 )
242 } else if next == '=' {
243 self.advance();
244 Token::new(
245 TokenKind::PlusEq,
246 loc,
247 self.source[begin_byte..self.byte_pos].into(),
248 )
249 } else {
250 Token::new(
251 TokenKind::Plus,
252 loc,
253 self.source[begin_byte..self.byte_pos].into(),
254 )
255 }
256 }
257 '-' => {
258 let next = self.read_char();
259 if next == '>' {
260 self.advance();
261 Token::new(
262 TokenKind::Arrow,
263 loc,
264 self.source[begin_byte..self.byte_pos].into(),
265 )
266 } else if next == '=' {
267 self.advance();
268 Token::new(
269 TokenKind::MinusEq,
270 loc,
271 self.source[begin_byte..self.byte_pos].into(),
272 )
273 } else {
274 Token::new(
275 TokenKind::Minus,
276 loc,
277 self.source[begin_byte..self.byte_pos].into(),
278 )
279 }
280 }
281 '.' => Token::new(
282 TokenKind::Dot,
283 loc,
284 self.source[begin_byte..self.byte_pos].into(),
285 ),
286 '*' => {
287 let next = self.read_char();
288 if next == '=' {
289 self.advance();
290 Token::new(
291 TokenKind::AsteriskEq,
292 loc,
293 self.source[begin_byte..self.byte_pos].into(),
294 )
295 } else {
296 Token::new(
297 TokenKind::Asterisk,
298 loc,
299 self.source[begin_byte..self.byte_pos].into(),
300 )
301 }
302 }
303 '/' => {
304 let next = self.read_char();
305 if next == '=' {
306 self.advance();
307 Token::new(
308 TokenKind::SlashEq,
309 loc,
310 self.source[begin_byte..self.byte_pos].into(),
311 )
312 } else {
313 Token::new(
314 TokenKind::Slash,
315 loc,
316 self.source[begin_byte..self.byte_pos].into(),
317 )
318 }
319 }
320 '%' => {
321 let next = self.read_char();
322 if next == '=' {
323 self.advance();
324 Token::new(
325 TokenKind::ModEq,
326 loc,
327 self.source[begin_byte..self.byte_pos].into(),
328 )
329 } else {
330 Token::new(
331 TokenKind::Mod,
332 loc,
333 self.source[begin_byte..self.byte_pos].into(),
334 )
335 }
336 }
337 '$' => Token::new(
338 TokenKind::Dollar,
339 loc,
340 self.source[begin_byte..self.byte_pos].into(),
341 ),
342 '&' => Token::new(
343 TokenKind::Ampersand,
344 loc,
345 self.source[begin_byte..self.byte_pos].into(),
346 ),
347 '^' => Token::new(
348 TokenKind::Caret,
349 loc,
350 self.source[begin_byte..self.byte_pos].into(),
351 ),
352 '|' => Token::new(
353 TokenKind::Pipe,
354 loc,
355 self.source[begin_byte..self.byte_pos].into(),
356 ),
357 '(' => Token::new(
358 TokenKind::OpenParen,
359 loc,
360 self.source[begin_byte..self.byte_pos].into(),
361 ),
362 ')' => Token::new(
363 TokenKind::CloseParen,
364 loc,
365 self.source[begin_byte..self.byte_pos].into(),
366 ),
367 '[' => Token::new(
368 TokenKind::OpenBracket,
369 loc,
370 self.source[begin_byte..self.byte_pos].into(),
371 ),
372 ']' => Token::new(
373 TokenKind::CloseBracket,
374 loc,
375 self.source[begin_byte..self.byte_pos].into(),
376 ),
377 '{' => Token::new(
378 TokenKind::OpenCurly,
379 loc,
380 self.source[begin_byte..self.byte_pos].into(),
381 ),
382 '}' => Token::new(
383 TokenKind::CloseCurly,
384 loc,
385 self.source[begin_byte..self.byte_pos].into(),
386 ),
387
388 ch if ch.is_whitespace() => continue,
389 '\0' => return Token::new(TokenKind::EOF, self.loc, "\0".into()),
390 _ => {
391 return Token::new(
392 TokenKind::UnexpectedCharacter,
393 self.loc,
394 self.source[begin_byte..self.byte_pos].into(),
395 );
396 }
397 };
398 return tok;
399 }
400
401 Token::new(TokenKind::EOF, self.loc, "".into())
402 }
403
404 fn lex_identifier(&mut self, begin_byte: usize) -> Token {
405 let loc = self.loc;
406 #[allow(unused_mut)]
407 let mut kind = TokenKind::Identifier;
408 loop {
409 let ch = self.read_char();
410 if ch.is_alphanumeric() || ch == '_' {
411 self.advance();
412 } else {
413 break;
414 }
415 }
416 let ident = &self.source[begin_byte..self.byte_pos];
417
418 if self.keywords.contains(&ident) {
419 kind = TokenKind::Keyword;
420 }
421
422 Token::new(kind, loc, ident.into())
423 }
424
425 fn lex_number(&mut self, begin_byte: usize) -> Token {
426 let loc = self.loc;
427 let end;
428 let mut base = 10;
429
430 let next = self.read_char();
432 match next {
433 'x' | 'X' => {
434 base = 16;
435 self.advance(); self.advance(); }
438 'b' | 'B' => {
439 base = 2;
440 self.advance(); self.advance(); }
443 'o' | 'O' => {
444 base = 8;
445 self.advance(); self.advance(); }
448 _ => {}
449 }
450
451 loop {
453 let c = self.read_char();
454 let valid = match base {
455 2 => matches!(c, '0' | '1'),
456 8 => matches!(c, '0'..='7'),
457 10 if c == '.' => {
458 self.advance();
459 loop {
460 let c = self.read_char();
461 if !c.is_ascii_digit() {
462 break;
463 }
464 self.advance();
465 }
466 end = self.byte_pos;
467 let num_str = &self.source[begin_byte..end];
468 return Token::new(TokenKind::RealNumber, loc, (*num_str).into());
469 }
470 10 => c.is_ascii_digit(),
471 16 => c.is_ascii_hexdigit(),
472 _ => false,
473 };
474 if !valid {
475 break;
476 }
477 self.advance();
478 }
479
480 end = self.byte_pos;
481
482 let num_str = &self.source[begin_byte..end]
483 .trim_start_matches("0x")
484 .trim_start_matches("0X")
485 .trim_start_matches("0b")
486 .trim_start_matches("0B")
487 .trim_start_matches("0o")
488 .trim_start_matches("0O");
489 let kind = TokenKind::Number(NumberBase::from(base));
490
491 Token::new(kind, loc, (*num_str).into())
492 }
493
494 fn lex_string(&mut self, begin_byte: usize) -> Token {
495 let loc = self.loc;
497 loop {
498 let ch = self.read_char();
499 match ch {
500 '"' => {
501 self.advance();
502 break;
503 }
504 '\0' => {
505 return Token::new(
506 TokenKind::UnterminatedStringLiteral,
507 loc,
508 self.source[begin_byte..self.byte_pos].into(),
509 );
510 }
511 '\\' => {
512 self.advance();
513 let esc = self.read_char();
514 match esc {
515 'r' => {} 'n' => {} '"' => {} '\'' => {} '\\' => {} '0' => {} _ => {
522 return Token::new(
523 TokenKind::InvalidEscapeSequence,
524 loc,
525 self.source[begin_byte..self.byte_pos].into(),
526 );
527 }
528 }
529 }
530 _ => {} }
532 self.advance();
533 }
534
535 Token::new(
536 TokenKind::StringLiteral,
537 loc,
538 self.source[begin_byte..self.byte_pos].into(),
539 )
540 }
541}
542
/// Text of a token.
///
/// With the `interning` feature the text is a `&'static str` obtained from
/// `intern`; without it the text is an owned `String`. Either way the value
/// dereferences to `str`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenSource(
    #[cfg(feature = "interning")] pub &'static str,
    #[cfg(not(feature = "interning"))] pub String,
);

impl std::ops::Deref for TokenSource {
    type Target = str;

    /// Borrows the token text as a string slice.
    #[inline]
    fn deref(&self) -> &str {
        // Works for both representations: `&'static str` and `String` both deref to `str`.
        &*self.0
    }
}

impl fmt::Display for TokenSource {
    /// Formats exactly like the underlying `str`.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <str as fmt::Display>::fmt(self, f)
    }
}

impl From<&str> for TokenSource {
    /// Builds a token text from a borrowed string (interned or copied,
    /// depending on the `interning` feature).
    #[inline]
    fn from(s: &str) -> Self {
        #[cfg(feature = "interning")]
        let inner = intern(s);
        #[cfg(not(feature = "interning"))]
        let inner = s.to_owned();
        Self(inner)
    }
}

impl From<String> for TokenSource {
    /// Builds a token text from an owned string; without `interning` the
    /// string is stored as-is.
    #[inline]
    fn from(s: String) -> Self {
        #[cfg(feature = "interning")]
        let inner = intern(&s);
        #[cfg(not(feature = "interning"))]
        let inner = s;
        Self(inner)
    }
}

impl From<&String> for TokenSource {
    /// Builds a token text from a borrowed `String`, same as converting its `&str`.
    #[inline]
    fn from(s: &String) -> Self {
        Self::from(s.as_str())
    }
}
615
/// Table of every string interned so far, created on first use.
#[cfg(feature = "interning")]
static INTERNER: std::sync::OnceLock<std::sync::Mutex<std::collections::HashSet<&'static str>>> =
    std::sync::OnceLock::new();

/// Returns the `'static` copy of `s` held in `INTERNER`, adding one if needed.
///
/// A string seen for the first time is copied to the heap and deliberately
/// leaked, so equal inputs always yield the same pointer.
///
/// # Panics
///
/// Panics if the interner mutex is poisoned.
#[cfg(feature = "interning")]
fn intern(s: &str) -> &'static str {
    let table =
        INTERNER.get_or_init(|| std::sync::Mutex::new(std::collections::HashSet::new()));
    let mut entries = table.lock().unwrap();

    match entries.get(s).copied() {
        Some(existing) => existing,
        None => {
            let leaked: &'static str = Box::leak(s.to_string().into_boxed_str());
            entries.insert(leaked);
            leaked
        }
    }
}
634
/// A lexed token: what it is, where the lexer recorded it, and its text.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token {
    /// Classification of the token.
    pub kind: TokenKind,
    /// Line/column the lexer recorded for this token.
    pub loc: Loc,
    /// Text associated with the token.
    pub source: TokenSource,
}
643
644impl fmt::Display for Token {
645 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
646 match self.kind {
647 TokenKind::EOF => write!(f, "EOF"),
648 TokenKind::UnexpectedCharacter => {
649 write!(f, "Unexpected Character `{}`", self.source.escape_default())
650 }
651 TokenKind::InvalidEscapeSequence => {
652 write!(
653 f,
654 "Invalid Escape Sequence `{}`",
655 self.source.escape_default()
656 )
657 }
658 TokenKind::UnterminatedStringLiteral => {
659 write!(
660 f,
661 "Unterminated String Literal `{}`",
662 self.source.escape_default()
663 )
664 }
665 TokenKind::StringLiteral => write!(f, "{}", self.source.escape_default()),
666 TokenKind::CharacterLiteral => write!(f, "{}", self.source.escape_default()),
667 _ => write!(f, "{}", self.source),
668 }
669 }
670}
671
672impl Token {
673 pub fn source(&self) -> &str {
675 &self.source
677 }
678
679 pub fn new(kind: TokenKind, loc: Loc, source: TokenSource) -> Self {
681 Self {
682 kind,
683 loc,
684 source,
686 }
687 }
688
689 pub fn is_eof(&self) -> bool {
691 matches!(self.kind, TokenKind::EOF)
692 }
693
694 pub fn unescape(&self) -> String {
696 match self.kind {
697 TokenKind::StringLiteral => token_string_unescape(self.source()),
698 _ => todo!(),
699 }
700 }
701}
/// Resolves the escapes in the text of a string literal.
///
/// The first character of `source` (the opening quote) is skipped
/// unconditionally. Decoding then stops at the first unescaped `"`, at an
/// escape other than `\r \n \" \' \\ \0`, or at the end of `source`, and
/// whatever was decoded up to that point is returned.
pub fn token_string_unescape(source: &str) -> String {
    let mut out = String::new();
    let mut chars = source.chars().skip(1);

    while let Some(ch) = chars.next() {
        match ch {
            '"' => break,
            '\\' => {
                // A trailing backslash has no escape code: stop with what we have.
                let Some(code) = chars.next() else { break };
                let decoded = match code {
                    'r' => '\r',
                    'n' => '\n',
                    '"' => '"',
                    '\'' => '\'',
                    '\\' => '\\',
                    '0' => '\0',
                    _ => break,
                };
                out.push(decoded);
            }
            plain => out.push(plain),
        }
    }
    out
}
731
/// Classification of a [`Token`]. Defaults to `EOF`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// End of input.
    #[default]
    EOF,
    /// A character that does not start any token.
    UnexpectedCharacter,
    /// A string literal containing a backslash followed by an unsupported character.
    InvalidEscapeSequence,
    /// A string literal whose closing quote was never found.
    UnterminatedStringLiteral,

    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `{`
    OpenCurly,
    /// `}`
    CloseCurly,

    /// A name made of alphanumeric characters and `_`, starting with a letter or `_`.
    Identifier,
    /// An identifier whose text is in the lexer's keyword list.
    Keyword,

    /// `#` followed by zero or more alphanumeric/`_` characters.
    Directive,

    /// Decimal digits followed by `.` and optional further digits.
    RealNumber,
    /// A double-quoted string literal, quotes included in the token text.
    StringLiteral,
    // NOTE(review): no lexer path visible in this file produces this kind — confirm its producer.
    CharacterLiteral,

    /// `.`
    Dot,
    /// `...`
    Ellipsis,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    SemiColon,
    /// `->`
    Arrow,
    /// `\`
    BackSlash,

    /// `:=`
    Assign,
    /// `+=`
    PlusEq,
    /// `-=`
    MinusEq,
    /// `*=`
    AsteriskEq,
    /// `/=`
    SlashEq,
    /// `%=`
    ModEq,
    /// `!`
    Bang,
    /// `+`
    Plus,
    /// `++`
    Concat,
    /// `-`
    Minus,
    /// `*`
    Asterisk,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `!=`
    NotEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `%`
    Mod,
    /// `&`
    Ampersand,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `&&`
    DoubleAmpersand,
    /// `||`
    DoublePipe,

    /// `$`
    Dollar,
    // NOTE(review): presumably a malformed numeric literal — confirm against the number lexer.
    InvalidNumber,

    /// An integer literal together with the base it was written in.
    Number(NumberBase),
}
797
/// Base in which an integer literal was written.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberBase {
    /// Binary (radix 2).
    B,
    /// Octal (radix 8).
    O,
    /// Decimal (radix 10).
    D,
    /// Hexadecimal (radix 16).
    X,
}

impl NumberBase {
    /// Returns the numeric radix of this base: 2, 8, 10 or 16.
    pub fn radix(&self) -> u32 {
        match *self {
            Self::B => 2,
            Self::O => 8,
            Self::D => 10,
            Self::X => 16,
        }
    }
}

impl From<u32> for NumberBase {
    /// Maps a radix back to its base.
    ///
    /// # Panics
    ///
    /// Panics when `value` is not 2, 8, 10 or 16.
    fn from(value: u32) -> Self {
        [Self::B, Self::O, Self::D, Self::X]
            .iter()
            .copied()
            .find(|base| base.radix() == value)
            .unwrap_or_else(|| panic!("Unkwon base"))
    }
}

impl From<NumberBase> for u32 {
    /// Same value as [`NumberBase::radix`].
    fn from(val: NumberBase) -> Self {
        val.radix()
    }
}
839
840impl TokenKind {
841 pub fn is_assign_kind(&self) -> bool {
842 matches!(
843 self,
844 Self::Assign
845 | Self::Eq
846 | Self::PlusEq
847 | Self::MinusEq
848 | Self::AsteriskEq
849 | Self::SlashEq
850 | Self::ModEq
851 )
852 }
853}
854
/// A line/column position in the source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Loc {
    /// Line number.
    pub line: usize,
    /// Column number.
    pub col: usize,
}

impl fmt::Display for Loc {
    /// Formats the position as `line:col`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Loc { line, col } = *self;
        write!(f, "{line}:{col}")
    }
}

impl Loc {
    /// Creates a position from a line and a column.
    pub fn new(line: usize, col: usize) -> Self {
        Loc { line, col }
    }

    /// Moves one column to the right.
    pub fn next_column(&mut self) {
        self.col += 1;
    }

    /// Moves to column 1 of the following line.
    pub fn next_line(&mut self) {
        *self = Loc {
            line: self.line + 1,
            col: 1,
        };
    }

    /// Updates the position for one consumed character.
    ///
    /// `'\n'` starts a new line, `'\t'` jumps to the next multiple of 8,
    /// other control characters leave the position untouched, and any other
    /// character moves one column to the right.
    pub fn next(&mut self, c: char) {
        const TAB_STOP: usize = 8;

        if c == '\n' {
            self.next_line();
        } else if c == '\t' {
            self.col = (self.col / TAB_STOP + 1) * TAB_STOP;
        } else if !c.is_control() {
            self.next_column();
        }
    }
}
899
/// Unit tests for the lexer, the token types and the location tracker.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields `EOF`, and keeps yielding `EOF` on later calls.
    #[test]
    fn test_lexer_init_and_eof() {
        let mut lexer = Lexer::new("");
        let tok = lexer.next();
        assert_eq!(tok.kind, TokenKind::EOF);
        assert!(tok.is_eof());

        let tok2 = lexer.next();
        assert_eq!(tok2.kind, TokenKind::EOF);
    }

    /// `peek` does not consume: the following `next` returns the same token.
    #[test]
    fn test_lexer_peek() {
        let mut lexer = Lexer::new("abc");
        let peeked = lexer.peek().clone();
        assert_eq!(peeked.kind, TokenKind::Identifier);
        assert_eq!(peeked.source(), "abc");
        let next = lexer.next();
        assert_eq!(next, peeked);
        assert_eq!(lexer.next().kind, TokenKind::EOF);
    }

    /// A `//` comment is skipped up to its newline.
    #[test]
    fn test_comment_skipping() {
        let source = " // this is a line comment\n identifier";
        let mut lexer = Lexer::new(source);
        let tok = lexer.next();
        assert_eq!(tok.kind, TokenKind::Identifier);
        assert_eq!(tok.source(), "identifier");
        assert_eq!(tok.loc.line, 2);
        assert_eq!(lexer.next().kind, TokenKind::EOF);
    }

    /// A `#!` line at the very start of the input is skipped.
    #[test]
    fn test_shebang_skipping() {
        let source = "#!/usr/bin/env rust\nidentifier";
        let mut lexer = Lexer::new(source);
        let tok = lexer.next();
        assert_eq!(tok.kind, TokenKind::Identifier);
        assert_eq!(tok.source(), "identifier");
        assert_eq!(tok.loc.line, 2);
    }

    /// Identifiers listed via `with_keywords` come back as `Keyword`.
    #[test]
    fn test_keywords() {
        let source = "var let my_ident";
        let mut lexer = Lexer::new(source).with_keywords(&["var", "let"]);
        let t1 = lexer.next();
        assert_eq!(t1.kind, TokenKind::Keyword);
        assert_eq!(t1.source(), "var");
        let t2 = lexer.next();
        assert_eq!(t2.kind, TokenKind::Keyword);
        assert_eq!(t2.source(), "let");
        let t3 = lexer.next();
        assert_eq!(t3.kind, TokenKind::Identifier);
        assert_eq!(t3.source(), "my_ident");
    }

    /// Identifiers may start with `_` and contain digits after the first character.
    #[test]
    fn test_identifiers() {
        let source = "a _a a123 _123_abc";
        let mut lexer = Lexer::new(source);
        let idents = ["a", "_a", "a123", "_123_abc"];
        for expected in idents {
            let tok = lexer.next();
            assert_eq!(tok.kind, TokenKind::Identifier);
            assert_eq!(tok.source(), expected);
        }
    }

    /// Pins the recorded locations: a token's `loc` is sampled after its first
    /// character is consumed, and a tab moves the column to a multiple of 8.
    #[test]
    fn test_location_tracking() {
        let source = "a\n\tb";
        let mut lexer = Lexer::new(source);
        let t1 = lexer.next();
        assert_eq!(t1.source(), "a");
        assert_eq!(t1.loc, Loc::new(1, 2));
        let t2 = lexer.next();
        assert_eq!(t2.source(), "b");
        assert_eq!(t2.loc, Loc::new(2, 9));
    }

    /// Every multi-character operator is recognised as a single token.
    #[test]
    fn test_multi_char_operators() {
        let source = "-> == := <= >= != && || :: ...";
        let mut lex = Lexer::new(source);
        assert_eq!(lex.next().kind, TokenKind::Arrow);
        assert_eq!(lex.next().kind, TokenKind::EqEq);
        assert_eq!(lex.next().kind, TokenKind::Assign);
        assert_eq!(lex.next().kind, TokenKind::LtEq);
        assert_eq!(lex.next().kind, TokenKind::GtEq);
        assert_eq!(lex.next().kind, TokenKind::NotEq);
        assert_eq!(lex.next().kind, TokenKind::DoubleAmpersand);
        assert_eq!(lex.next().kind, TokenKind::DoublePipe);
        assert_eq!(lex.next().kind, TokenKind::DoubleColon);
        assert_eq!(lex.next().kind, TokenKind::Ellipsis);
    }

    /// Single-character operators and their compound (`x=`, `++`) forms, in source order.
    #[test]
    fn test_single_and_compound_operators() {
        let source = ", ; : \\ = < > ! + ++ += - -= . * *= / /= % %= $ & ^ | ( ) [ ] { }";
        let mut lex = Lexer::new(source);
        assert_eq!(lex.next().kind, TokenKind::Comma);
        assert_eq!(lex.next().kind, TokenKind::SemiColon);
        assert_eq!(lex.next().kind, TokenKind::Colon);
        assert_eq!(lex.next().kind, TokenKind::BackSlash);
        assert_eq!(lex.next().kind, TokenKind::Eq);
        assert_eq!(lex.next().kind, TokenKind::Lt);
        assert_eq!(lex.next().kind, TokenKind::Gt);
        assert_eq!(lex.next().kind, TokenKind::Bang);
        assert_eq!(lex.next().kind, TokenKind::Plus);
        assert_eq!(lex.next().kind, TokenKind::Concat);
        assert_eq!(lex.next().kind, TokenKind::PlusEq);
        assert_eq!(lex.next().kind, TokenKind::Minus);
        assert_eq!(lex.next().kind, TokenKind::MinusEq);
        assert_eq!(lex.next().kind, TokenKind::Dot);
        assert_eq!(lex.next().kind, TokenKind::Asterisk);
        assert_eq!(lex.next().kind, TokenKind::AsteriskEq);
        assert_eq!(lex.next().kind, TokenKind::Slash);
        assert_eq!(lex.next().kind, TokenKind::SlashEq);
        assert_eq!(lex.next().kind, TokenKind::Mod);
        assert_eq!(lex.next().kind, TokenKind::ModEq);
        assert_eq!(lex.next().kind, TokenKind::Dollar);
        assert_eq!(lex.next().kind, TokenKind::Ampersand);
        assert_eq!(lex.next().kind, TokenKind::Caret);
        assert_eq!(lex.next().kind, TokenKind::Pipe);
        assert_eq!(lex.next().kind, TokenKind::OpenParen);
        assert_eq!(lex.next().kind, TokenKind::CloseParen);
        assert_eq!(lex.next().kind, TokenKind::OpenBracket);
        assert_eq!(lex.next().kind, TokenKind::CloseBracket);
        assert_eq!(lex.next().kind, TokenKind::OpenCurly);
        assert_eq!(lex.next().kind, TokenKind::CloseCurly);
    }

    /// `#name` is a directive; `#!` is only a skipped line when it opens the
    /// input, otherwise `#` is an (empty-named) directive.
    #[test]
    fn test_directives() {
        let mut lex = Lexer::new("#define ABC");
        let tok = lex.next();
        assert_eq!(tok.kind, TokenKind::Directive);
        assert_eq!(tok.source(), "#define");

        let mut lex2 = Lexer::new("#!/bin/bash\n#include");
        let tok2 = lex2.next();
        assert_eq!(tok2.kind, TokenKind::Directive);
        assert_eq!(tok2.source(), "#include");

        let mut lex3 = Lexer::new(" #!");
        let tok3 = lex3.next();
        assert_eq!(tok3.kind, TokenKind::Directive);
        assert_eq!(tok3.source(), "#");
    }

    /// Integer literals report their base and drop the `0b`/`0o`/`0x` prefix
    /// from the token text; reals keep their full text.
    #[test]
    fn test_numeric_bases() {
        let source = "123 0b101 0o755 0xFF 1.23";
        let mut lex = Lexer::new(source);

        let t1 = lex.next();
        assert_eq!(t1.kind, TokenKind::Number(NumberBase::D));
        assert_eq!(t1.source(), "123");

        let t2 = lex.next();
        assert_eq!(t2.kind, TokenKind::Number(NumberBase::B));
        assert_eq!(t2.source(), "101");

        let t3 = lex.next();
        assert_eq!(t3.kind, TokenKind::Number(NumberBase::O));
        assert_eq!(t3.source(), "755");

        let t4 = lex.next();
        assert_eq!(t4.kind, TokenKind::Number(NumberBase::X));
        assert_eq!(t4.source(), "FF");

        let t5 = lex.next();
        assert_eq!(t5.kind, TokenKind::RealNumber);
        assert_eq!(t5.source(), "1.23");
    }

    /// `radix`, `From<u32>` and `From<NumberBase> for u32` agree with each other.
    #[test]
    fn test_number_base_conversions() {
        assert_eq!(NumberBase::B.radix(), 2);
        assert_eq!(NumberBase::O.radix(), 8);
        assert_eq!(NumberBase::D.radix(), 10);
        assert_eq!(NumberBase::X.radix(), 16);

        assert_eq!(NumberBase::from(2), NumberBase::B);
        assert_eq!(NumberBase::from(8), NumberBase::O);
        assert_eq!(NumberBase::from(10), NumberBase::D);
        assert_eq!(NumberBase::from(16), NumberBase::X);

        assert_eq!(u32::from(NumberBase::B), 2);
        assert_eq!(u32::from(NumberBase::O), 8);
        assert_eq!(u32::from(NumberBase::D), 10);
        assert_eq!(u32::from(NumberBase::X), 16);
    }

    /// An unsupported radix panics; the expected text pins the exact panic message.
    #[test]
    #[should_panic(expected = "Unkwon base")]
    fn test_number_base_panic() {
        let _ = NumberBase::from(3);
    }

    /// Well-formed strings, a bad escape and a missing closing quote.
    #[test]
    fn test_string_literals() {
        let mut lex = Lexer::new("\"hello\"");
        let t = lex.next();
        assert_eq!(t.kind, TokenKind::StringLiteral);
        assert_eq!(t.source(), "\"hello\"");
        assert_eq!(t.unescape(), "hello");

        let mut lex = Lexer::new("\"hello\\nworld\"");
        let t = lex.next();
        assert_eq!(t.kind, TokenKind::StringLiteral);
        assert_eq!(t.unescape(), "hello\nworld");

        // The error token's text ends right after the offending backslash.
        let mut lex = Lexer::new("\"hello\\x\"");
        let t = lex.next();
        assert_eq!(t.kind, TokenKind::InvalidEscapeSequence);
        assert_eq!(t.source(), "\"hello\\");

        let mut lex = Lexer::new("\"hello");
        let t = lex.next();
        assert_eq!(t.kind, TokenKind::UnterminatedStringLiteral);
        assert_eq!(t.source(), "\"hello");
    }

    /// `Display` output for locations and for each specially formatted token kind.
    #[test]
    fn test_token_helpers_and_display() {
        let loc = Loc::new(5, 10);
        let token = Token::new(TokenKind::Identifier, loc, "foo".into());
        assert!(!token.is_eof());
        assert_eq!(format!("{}", loc), "5:10");
        assert_eq!(format!("{}", token), "foo");

        let eof_token = Token::new(TokenKind::EOF, loc, "".into());
        assert!(eof_token.is_eof());
        assert_eq!(format!("{}", eof_token), "EOF");

        let err_token = Token::new(TokenKind::UnexpectedCharacter, loc, "@".into());
        assert_eq!(format!("{}", err_token), "Unexpected Character `@`");

        let esc_err = Token::new(TokenKind::InvalidEscapeSequence, loc, "\\x".into());
        assert_eq!(format!("{}", esc_err), "Invalid Escape Sequence `\\\\x`");

        let unterminated = Token::new(TokenKind::UnterminatedStringLiteral, loc, "\"abc".into());
        assert_eq!(
            format!("{}", unterminated),
            "Unterminated String Literal `\\\"abc`"
        );

        let str_tok = Token::new(TokenKind::StringLiteral, loc, "\"abc\"".into());
        assert_eq!(format!("{}", str_tok), "\\\"abc\\\"");

        let char_tok = Token::new(TokenKind::CharacterLiteral, loc, "'a'".into());
        assert_eq!(format!("{}", char_tok), "\\'a\\'");
    }

    /// Exactly the seven assignment operators count as assign kinds.
    #[test]
    fn test_is_assign_kind() {
        assert!(TokenKind::Assign.is_assign_kind());
        assert!(TokenKind::Eq.is_assign_kind());
        assert!(TokenKind::PlusEq.is_assign_kind());
        assert!(TokenKind::MinusEq.is_assign_kind());
        assert!(TokenKind::AsteriskEq.is_assign_kind());
        assert!(TokenKind::SlashEq.is_assign_kind());
        assert!(TokenKind::ModEq.is_assign_kind());
        assert!(!TokenKind::Plus.is_assign_kind());
        assert!(!TokenKind::Identifier.is_assign_kind());
    }

    /// A character that starts no token is reported with its own text.
    #[test]
    fn test_unexpected_character() {
        let mut lex = Lexer::new("@");
        let t = lex.next();
        assert_eq!(t.kind, TokenKind::UnexpectedCharacter);
        assert_eq!(t.source(), "@");
    }

    /// With `interning`, equal token texts share one pointer; without it,
    /// each token owns a separate allocation.
    #[test]
    fn test_string_interning_pointer_equality() {
        let source = "my_var my_var";
        let mut lex = Lexer::new(source);
        let t1 = lex.next();
        let t2 = lex.next();
        assert_eq!(t1.source(), "my_var");
        assert_eq!(t2.source(), "my_var");

        #[cfg(feature = "interning")]
        {
            let p1 = t1.source.0;
            let p2 = t2.source.0;
            assert!(std::ptr::eq(p1, p2));
        }

        #[cfg(not(feature = "interning"))]
        {
            let p1 = t1.source.0.as_ptr();
            let p2 = t2.source.0.as_ptr();
            assert!(!std::ptr::eq(p1, p2));
        }
    }
}