1use std::fmt;
6
7pub struct Lexer<'src> {
12 source: &'src str,
13 data: Vec<char>,
14 pos: usize,
15 byte_pos: usize,
16 loc: Loc,
17 peeked: Option<Token>,
18 keywords: Vec<&'src str>,
19}
20
21impl<'src> Lexer<'src> {
22 pub fn new(source: &'src str) -> Self {
24 Self {
25 source,
26 data: source.chars().collect(),
27 loc: Loc::new(1, 1),
28 pos: 0,
29 byte_pos: 0,
30 peeked: None,
31 keywords: Vec::new(),
32 }
33 }
34
35 pub fn with_keywords(mut self, keywords: &[&'src str]) -> Self {
37 self.keywords = keywords.to_vec();
38 self
39 }
40
41 pub fn next(&mut self) -> Token {
44 if let Some(peek) = self.peeked.take() {
45 peek
46 } else {
47 self.next_token()
48 }
49 }
50
51 pub fn peek(&mut self) -> &Token {
54 if self.peeked.is_none() {
55 self.peeked = Some(self.next_token());
56 }
57 self.peeked.as_ref().unwrap()
58 }
59
60 fn advance(&mut self) -> char {
61 let ch = self.read_char();
62 self.byte_pos += ch.len_utf8();
63 self.pos += 1;
64 self.loc.next(ch);
65 ch
66 }
67
68 fn read_char(&mut self) -> char {
69 let pos = self.pos;
70 if pos >= self.data.len() {
71 '\0'
72 } else {
73 self.data[pos]
74 }
75 }
76
77 fn next_token(&mut self) -> Token {
78 while self.pos <= self.data.len() {
79 let begin_byte = self.byte_pos;
80 let ch = self.advance();
81 let loc = self.loc;
82
83 let tok = match ch {
84 '/' if self.read_char() == '/' => {
85 while self.advance() != '\n' {}
86 continue;
87 }
88 '#' => {
89 let ch = self.read_char();
90 if self.byte_pos == 1 && ch == '!' {
91 while self.advance() != '\n' {}
92 continue;
93 }
94 loop {
95 let ch = self.read_char();
96 if ch.is_alphanumeric() || ch == '_' {
97 self.advance();
98 } else {
99 break;
100 }
101 }
102 Token::new(
103 TokenKind::Directive,
104 loc,
105 self.source[begin_byte..self.byte_pos].into(),
106 )
107 }
108 '-' if self.read_char() == '>' => {
109 self.advance();
110 Token::new(
111 TokenKind::Arrow,
112 loc,
113 self.source[begin_byte..self.byte_pos].into(),
114 )
115 }
116 '=' if self.read_char() == '=' => {
117 self.advance();
118 Token::new(
119 TokenKind::EqEq,
120 loc,
121 self.source[begin_byte..self.byte_pos].into(),
122 )
123 }
124 ':' if self.read_char() == '=' => {
125 self.advance();
126 Token::new(
127 TokenKind::Assign,
128 loc,
129 self.source[begin_byte..self.byte_pos].into(),
130 )
131 }
132 '<' if self.read_char() == '=' => {
133 self.advance();
134 Token::new(
135 TokenKind::LtEq,
136 loc,
137 self.source[begin_byte..self.byte_pos].into(),
138 )
139 }
140 '>' if self.read_char() == '=' => {
141 self.advance();
142 Token::new(
143 TokenKind::GtEq,
144 loc,
145 self.source[begin_byte..self.byte_pos].into(),
146 )
147 }
148 '!' if self.read_char() == '=' => {
149 self.advance();
150 Token::new(
151 TokenKind::NotEq,
152 loc,
153 self.source[begin_byte..self.byte_pos].into(),
154 )
155 }
156 '&' if self.read_char() == '&' => {
157 self.advance();
158 Token::new(
159 TokenKind::DoubleAmpersand,
160 loc,
161 self.source[begin_byte..self.byte_pos].into(),
162 )
163 }
164 '|' if self.read_char() == '|' => {
165 self.advance();
166 Token::new(
167 TokenKind::DoublePipe,
168 loc,
169 self.source[begin_byte..self.byte_pos].into(),
170 )
171 }
172 ':' if self.read_char() == ':' => {
173 self.advance();
174 Token::new(
175 TokenKind::DoubleColon,
176 loc,
177 self.source[begin_byte..self.byte_pos].into(),
178 )
179 }
180 '.' if self.read_char() == '.' && self.read_char() == '.' => {
181 self.advance();
182 self.advance();
183 Token::new(
184 TokenKind::Ellipsis,
185 loc,
186 self.source[begin_byte..self.byte_pos].into(),
187 )
188 }
189 ch if ch.is_alphabetic() || ch == '_' => return self.lex_identifier(begin_byte),
190 '0'..='9' => return self.lex_number(begin_byte),
191 '"' => return self.lex_string(begin_byte),
192
193 ',' => Token::new(
194 TokenKind::Comma,
195 loc,
196 self.source[begin_byte..self.byte_pos].into(),
197 ),
198 ';' => Token::new(
199 TokenKind::SemiColon,
200 loc,
201 self.source[begin_byte..self.byte_pos].into(),
202 ),
203 ':' => Token::new(
204 TokenKind::Colon,
205 loc,
206 self.source[begin_byte..self.byte_pos].into(),
207 ),
208 '\\' => Token::new(
209 TokenKind::BackSlash,
210 loc,
211 self.source[begin_byte..self.byte_pos].into(),
212 ),
213 '=' => Token::new(
214 TokenKind::Eq,
215 loc,
216 self.source[begin_byte..self.byte_pos].into(),
217 ),
218 '<' => Token::new(
219 TokenKind::Lt,
220 loc,
221 self.source[begin_byte..self.byte_pos].into(),
222 ),
223 '>' => Token::new(
224 TokenKind::Gt,
225 loc,
226 self.source[begin_byte..self.byte_pos].into(),
227 ),
228 '!' => Token::new(
229 TokenKind::Bang,
230 loc,
231 self.source[begin_byte..self.byte_pos].into(),
232 ),
233 '+' => {
234 let next = self.read_char();
235 if next == '+' {
236 self.advance();
237 Token::new(
238 TokenKind::Concat,
239 loc,
240 self.source[begin_byte..self.byte_pos].into(),
241 )
242 } else if next == '=' {
243 self.advance();
244 Token::new(
245 TokenKind::PlusEq,
246 loc,
247 self.source[begin_byte..self.byte_pos].into(),
248 )
249 } else {
250 Token::new(
251 TokenKind::Plus,
252 loc,
253 self.source[begin_byte..self.byte_pos].into(),
254 )
255 }
256 }
257 '-' => {
258 let next = self.read_char();
259 if next == '>' {
260 self.advance();
261 Token::new(
262 TokenKind::Arrow,
263 loc,
264 self.source[begin_byte..self.byte_pos].into(),
265 )
266 } else if next == '=' {
267 self.advance();
268 Token::new(
269 TokenKind::MinusEq,
270 loc,
271 self.source[begin_byte..self.byte_pos].into(),
272 )
273 } else {
274 Token::new(
275 TokenKind::Minus,
276 loc,
277 self.source[begin_byte..self.byte_pos].into(),
278 )
279 }
280 }
281 '.' => Token::new(
282 TokenKind::Dot,
283 loc,
284 self.source[begin_byte..self.byte_pos].into(),
285 ),
286 '*' => {
287 let next = self.read_char();
288 if next == '=' {
289 self.advance();
290 Token::new(
291 TokenKind::AsteriskEq,
292 loc,
293 self.source[begin_byte..self.byte_pos].into(),
294 )
295 } else {
296 Token::new(
297 TokenKind::Asterisk,
298 loc,
299 self.source[begin_byte..self.byte_pos].into(),
300 )
301 }
302 }
303 '/' => {
304 let next = self.read_char();
305 if next == '=' {
306 self.advance();
307 Token::new(
308 TokenKind::SlashEq,
309 loc,
310 self.source[begin_byte..self.byte_pos].into(),
311 )
312 } else {
313 Token::new(
314 TokenKind::Slash,
315 loc,
316 self.source[begin_byte..self.byte_pos].into(),
317 )
318 }
319 }
320 '%' => {
321 let next = self.read_char();
322 if next == '=' {
323 self.advance();
324 Token::new(
325 TokenKind::ModEq,
326 loc,
327 self.source[begin_byte..self.byte_pos].into(),
328 )
329 } else {
330 Token::new(
331 TokenKind::Mod,
332 loc,
333 self.source[begin_byte..self.byte_pos].into(),
334 )
335 }
336 }
337 '$' => Token::new(
338 TokenKind::Dollar,
339 loc,
340 self.source[begin_byte..self.byte_pos].into(),
341 ),
342 '&' => Token::new(
343 TokenKind::Ampersand,
344 loc,
345 self.source[begin_byte..self.byte_pos].into(),
346 ),
347 '^' => Token::new(
348 TokenKind::Caret,
349 loc,
350 self.source[begin_byte..self.byte_pos].into(),
351 ),
352 '|' => Token::new(
353 TokenKind::Pipe,
354 loc,
355 self.source[begin_byte..self.byte_pos].into(),
356 ),
357 '(' => Token::new(
358 TokenKind::OpenParen,
359 loc,
360 self.source[begin_byte..self.byte_pos].into(),
361 ),
362 ')' => Token::new(
363 TokenKind::CloseParen,
364 loc,
365 self.source[begin_byte..self.byte_pos].into(),
366 ),
367 '[' => Token::new(
368 TokenKind::OpenBracket,
369 loc,
370 self.source[begin_byte..self.byte_pos].into(),
371 ),
372 ']' => Token::new(
373 TokenKind::CloseBracket,
374 loc,
375 self.source[begin_byte..self.byte_pos].into(),
376 ),
377 '{' => Token::new(
378 TokenKind::OpenCurly,
379 loc,
380 self.source[begin_byte..self.byte_pos].into(),
381 ),
382 '}' => Token::new(
383 TokenKind::CloseCurly,
384 loc,
385 self.source[begin_byte..self.byte_pos].into(),
386 ),
387
388 ch if ch.is_whitespace() => continue,
389 '\0' => return Token::new(TokenKind::EOF, self.loc, "\0".into()),
390 _ => {
391 return Token::new(
392 TokenKind::UnexpectedCharacter,
393 self.loc,
394 self.source[begin_byte..self.byte_pos].into(),
395 );
396 }
397 };
398 return tok;
399 }
400
401 Token::new(TokenKind::EOF, self.loc, "".into())
402 }
403
404 fn lex_identifier(&mut self, begin_byte: usize) -> Token {
405 let loc = self.loc;
406 #[allow(unused_mut)]
407 let mut kind = TokenKind::Identifier;
408 loop {
409 let ch = self.read_char();
410 if ch.is_alphanumeric() || ch == '_' {
411 self.advance();
412 } else {
413 break;
414 }
415 }
416 let ident = &self.source[begin_byte..self.byte_pos];
417
418 if self.keywords.contains(&ident) {
419 kind = TokenKind::Keyword;
420 }
421
422 Token::new(kind, loc, ident.into())
423 }
424
425 fn lex_number(&mut self, begin_byte: usize) -> Token {
426 let loc = self.loc;
427 let end; let mut base = 10;
429
430 let next = self.read_char();
433 match next {
434 'x' | 'X' => {
435 base = 16;
436 self.advance(); self.advance(); }
439 'b' | 'B' => {
440 base = 2;
441 self.advance(); self.advance(); }
444 'o' | 'O' => {
445 base = 8;
446 self.advance(); self.advance(); }
449 _ => {}
450 }
451 loop {
455 let c = self.read_char();
456 let valid = match base {
457 2 => matches!(c, '0' | '1'),
458 8 => matches!(c, '0'..='7'),
459 10 => c.is_ascii_digit(),
460 16 => c.is_ascii_hexdigit(),
461 _ => false,
462 };
463 if !valid {
464 break;
465 }
466 self.advance();
467 }
468
469 end = self.byte_pos;
470
471 let mut suffix = String::new();
473 loop {
474 let c = self.read_char();
475 if c.is_ascii_alphanumeric() {
476 suffix.push(c);
477 self.advance();
478 } else {
479 break;
480 }
481 }
482
483 let num_str = &self.source[begin_byte..end]
484 .trim_start_matches("0x")
485 .trim_start_matches("0X")
486 .trim_start_matches("0b")
487 .trim_start_matches("0B")
488 .trim_start_matches("0o")
489 .trim_start_matches("0O");
490 let kind = match (base, suffix.as_str()) {
491 (2 | 8 | 10 | 16, "" | "i32") => TokenKind::Int(NumberBase::from(base)),
492 (2 | 8 | 10 | 16, "i64") => TokenKind::Int64(NumberBase::from(base)),
493 (2 | 8 | 10 | 16, "u32") => TokenKind::UInt(NumberBase::from(base)),
494 (2 | 8 | 10 | 16, "u64") => TokenKind::UInt64(NumberBase::from(base)),
495 _ => TokenKind::InvalidNumber,
496 };
497
498 Token::new(kind, loc, (*num_str).into())
499 }
500
501 fn lex_string(&mut self, begin_byte: usize) -> Token {
502 let loc = self.loc;
504 loop {
505 let ch = self.read_char();
506 match ch {
507 '"' => {
508 self.advance();
509 break;
510 }
511 '\0' => {
512 return Token::new(
513 TokenKind::UnterminatedStringLiteral,
514 loc,
515 self.source[begin_byte..self.byte_pos].into(),
516 );
517 }
518 '\\' => {
519 self.advance();
520 let esc = self.read_char();
521 match esc {
522 'r' => {} 'n' => {} '"' => {} '\'' => {} '\\' => {} '0' => {} _ => {
529 return Token::new(
530 TokenKind::InvalidEscapeSequence,
531 loc,
532 self.source[begin_byte..self.byte_pos].into(),
533 );
534 }
535 }
536 }
537 _ => {} }
539 self.advance();
540 }
541
542 Token::new(
543 TokenKind::StringLiteral,
544 loc,
545 self.source[begin_byte..self.byte_pos].into(),
546 )
547 }
548}
549
550#[derive(Debug, Clone, PartialEq, Eq, Hash)]
552pub struct Token {
553 pub kind: TokenKind,
554 pub loc: Loc,
555 pub source: String,
557}
558
559impl fmt::Display for Token {
560 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
561 match self.kind {
562 TokenKind::EOF => write!(f, "EOF"),
563 TokenKind::UnexpectedCharacter => {
564 write!(f, "Unexpected Character `{}`", self.source.escape_default())
565 }
566 TokenKind::InvalidEscapeSequence => {
567 write!(
568 f,
569 "Invalid Escape Sequence `{}`",
570 self.source.escape_default()
571 )
572 }
573 TokenKind::UnterminatedStringLiteral => {
574 write!(
575 f,
576 "Unterminated String Literal `{}`",
577 self.source.escape_default()
578 )
579 }
580 TokenKind::StringLiteral => write!(f, "{}", self.source.escape_default()),
581 TokenKind::CharacterLiteral => write!(f, "{}", self.source.escape_default()),
582 _ => write!(f, "{}", self.source),
583 }
584 }
585}
586
587impl Token {
588 pub fn source(&self) -> &str {
590 &self.source
592 }
593
594 pub fn new(kind: TokenKind, loc: Loc, source: String) -> Self {
596 Self {
597 kind,
598 loc,
599 source,
601 }
602 }
603
604 pub fn is_eof(&self) -> bool {
606 matches!(self.kind, TokenKind::EOF)
607 }
608
609 pub fn unescape(&self) -> String {
611 match self.kind {
612 TokenKind::StringLiteral => token_string_unescape(self.source()),
613 _ => todo!(),
614 }
615 }
616}
/// Decodes the body of a double-quoted string literal.
///
/// The first character of `source` is skipped unconditionally (it is expected
/// to be the opening quote). Decoding stops at the first unescaped `"`, at the
/// end of `source`, or at the first unsupported escape — in every case the text
/// decoded so far is returned.
///
/// Supported escapes: `\r`, `\n`, `\"`, `\'`, `\\`, `\0`.
pub fn token_string_unescape(source: &str) -> String {
    let mut decoded = String::new();
    let mut chars = source.chars().skip(1);

    while let Some(c) = chars.next() {
        let out = match c {
            '"' => break,
            '\\' => match chars.next() {
                Some('r') => '\r',
                Some('n') => '\n',
                Some('"') => '"',
                Some('\'') => '\'',
                Some('\\') => '\\',
                Some('0') => '\0',
                // Unknown escape, or a backslash at the very end: give up and
                // return what has been decoded so far.
                _ => break,
            },
            plain => plain,
        };
        decoded.push(out);
    }

    decoded
}
646
647#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
649pub enum TokenKind {
650 #[default]
651 EOF,
652 UnexpectedCharacter,
653 InvalidEscapeSequence,
654 UnterminatedStringLiteral,
655
656 OpenParen,
657 CloseParen,
658 OpenBracket,
659 CloseBracket,
660 OpenCurly,
661 CloseCurly,
662
663 Identifier,
664 Keyword,
665
666 Directive,
667
668 RealNumber,
669 StringLiteral,
670 CharacterLiteral,
671
672 Dot,
673 Ellipsis,
674 Comma,
675 Colon,
676 DoubleColon,
677 SemiColon,
678 Arrow,
679 BackSlash,
680
681 Assign,
682 PlusEq,
683 MinusEq,
684 AsteriskEq,
685 SlashEq,
686 ModEq,
687 Bang,
688 Plus,
689 Concat,
690 Minus,
691 Asterisk,
692 Slash,
693 Eq,
694 EqEq,
695 NotEq,
696 Gt,
697 GtEq,
698 Lt,
699 LtEq,
700 Mod,
701 Ampersand,
702 Pipe,
703 Caret,
704 DoubleAmpersand,
705 DoublePipe,
706
707 Dollar,
708 InvalidNumber,
709
710 Int64(NumberBase),
711 UInt(NumberBase),
712 UInt64(NumberBase),
713 Int(NumberBase),
714}
715
/// Radix in which an integer literal was written.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberBase {
    /// Binary (radix 2).
    B,
    /// Octal (radix 8).
    O,
    /// Decimal (radix 10).
    D,
    /// Hexadecimal (radix 16).
    X,
}

impl NumberBase {
    /// Returns the numeric radix (2, 8, 10 or 16) for this base.
    ///
    /// This is the single source of truth for the base-to-radix mapping; the
    /// `u32` conversion below delegates to it.
    pub fn radix(&self) -> u32 {
        match self {
            NumberBase::B => 2,
            NumberBase::O => 8,
            NumberBase::D => 10,
            NumberBase::X => 16,
        }
    }
}

/// Converts a radix into its [`NumberBase`].
///
/// # Panics
///
/// Panics when `value` is not one of 2, 8, 10 or 16.
impl From<u32> for NumberBase {
    fn from(value: u32) -> Self {
        match value {
            2 => Self::B,
            8 => Self::O,
            10 => Self::D,
            16 => Self::X,
            _ => panic!("Unknown base: {}", value),
        }
    }
}

/// Converts a [`NumberBase`] back into its radix; same value as `radix()`.
impl From<NumberBase> for u32 {
    fn from(val: NumberBase) -> Self {
        val.radix()
    }
}
757
758impl TokenKind {
759 pub fn is_int_num(&self) -> bool {
760 matches!(
761 self,
762 Self::Int(_) | Self::Int64(_) | Self::UInt(_) | Self::UInt64(_)
763 )
764 }
765
766 pub fn is_assign_kind(&self) -> bool {
767 matches!(
768 self,
769 Self::Assign
770 | Self::Eq
771 | Self::PlusEq
772 | Self::MinusEq
773 | Self::AsteriskEq
774 | Self::SlashEq
775 | Self::ModEq
776 )
777 }
778}
779
/// A line/column position in the source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Loc {
    /// Line number.
    pub line: usize,
    /// Column number.
    pub col: usize,
}

/// Formats the position as `line:col`.
impl fmt::Display for Loc {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Loc { line, col } = *self;
        write!(f, "{}:{}", line, col)
    }
}

impl Loc {
    /// Creates a position from a line and a column.
    pub fn new(line: usize, col: usize) -> Self {
        Loc { line, col }
    }

    /// Moves one column to the right.
    pub fn next_column(&mut self) {
        self.col += 1;
    }

    /// Moves to column 1 of the following line.
    pub fn next_line(&mut self) {
        self.col = 1;
        self.line += 1;
    }

    /// Updates the position for one consumed character `c`.
    ///
    /// * `'\n'` starts a new line;
    /// * `'\t'` jumps to the next multiple of 8;
    /// * any other control character leaves the position untouched;
    /// * everything else moves one column to the right.
    pub fn next(&mut self, c: char) {
        const TAB_STOP: usize = 8;

        if c == '\n' {
            self.next_line();
        } else if c == '\t' {
            // NOTE(review): this rounds as if columns were 0-based while lines
            // start at column 1 — confirm which convention is intended.
            self.col = (self.col / TAB_STOP + 1) * TAB_STOP;
        } else if !c.is_control() {
            self.next_column();
        }
    }
}
823}