1use std::fmt;
6
7pub struct Lexer<'src> {
12 source: &'src str,
13 data: Vec<char>,
14 pos: usize,
15 byte_pos: usize,
16 loc: Loc,
17 peeked: Option<Token>,
18 keywords: Vec<&'src str>,
19}
20
21impl<'src> Lexer<'src> {
22 pub fn new(source: &'src str) -> Self {
24 Self {
25 source,
26 data: source.chars().collect(),
27 loc: Loc::new(1, 1),
28 pos: 0,
29 byte_pos: 0,
30 peeked: None,
31 keywords: Vec::new(),
32 }
33 }
34
35 pub fn with_keywords(mut self, keywords: &[&'src str]) -> Self {
37 self.keywords = keywords.to_vec();
38 self
39 }
40
41 pub fn next(&mut self) -> Token {
44 if let Some(peek) = self.peeked.take() {
45 peek
46 } else {
47 self.next_token()
48 }
49 }
50
51 pub fn peek(&mut self) -> &Token {
54 if self.peeked.is_none() {
55 self.peeked = Some(self.next_token());
56 }
57 self.peeked.as_ref().unwrap()
58 }
59
60 fn advance(&mut self) -> char {
61 let ch = self.read_char();
62 self.byte_pos += ch.len_utf8();
63 self.pos += 1;
64 self.loc.next(ch);
65 ch
66 }
67
68 fn read_char(&mut self) -> char {
69 let pos = self.pos;
70 if pos >= self.data.len() {
71 '\0'
72 } else {
73 self.data[pos]
74 }
75 }
76
77 fn next_token(&mut self) -> Token {
78 while self.pos <= self.data.len() {
79 let begin_byte = self.byte_pos;
80 let ch = self.advance();
81 let loc = self.loc;
82
83 let tok = match ch {
84 '/' if self.read_char() == '/' => {
85 while self.advance() != '\n' {}
86 continue;
87 }
88 '#' => {
89 let ch = self.read_char();
90 if self.byte_pos == 1 && ch == '!' {
91 while self.advance() != '\n' {}
92 continue;
93 }
94 loop {
95 let ch = self.read_char();
96 if ch.is_alphanumeric() || ch == '_' {
97 self.advance();
98 } else {
99 break;
100 }
101 }
102 Token::new(
103 TokenKind::Directive,
104 loc,
105 self.source[begin_byte..self.byte_pos].into(),
106 )
107 }
108 '-' if self.read_char() == '>' => {
109 self.advance();
110 Token::new(
111 TokenKind::Arrow,
112 loc,
113 self.source[begin_byte..self.byte_pos].into(),
114 )
115 }
116 '=' if self.read_char() == '=' => {
117 self.advance();
118 Token::new(
119 TokenKind::EqEq,
120 loc,
121 self.source[begin_byte..self.byte_pos].into(),
122 )
123 }
124 ':' if self.read_char() == '=' => {
125 self.advance();
126 Token::new(
127 TokenKind::Assign,
128 loc,
129 self.source[begin_byte..self.byte_pos].into(),
130 )
131 }
132 '<' if self.read_char() == '=' => {
133 self.advance();
134 Token::new(
135 TokenKind::LtEq,
136 loc,
137 self.source[begin_byte..self.byte_pos].into(),
138 )
139 }
140 '>' if self.read_char() == '=' => {
141 self.advance();
142 Token::new(
143 TokenKind::GtEq,
144 loc,
145 self.source[begin_byte..self.byte_pos].into(),
146 )
147 }
148 '!' if self.read_char() == '=' => {
149 self.advance();
150 Token::new(
151 TokenKind::NotEq,
152 loc,
153 self.source[begin_byte..self.byte_pos].into(),
154 )
155 }
156 '&' if self.read_char() == '&' => {
157 self.advance();
158 Token::new(
159 TokenKind::DoubleAmpersand,
160 loc,
161 self.source[begin_byte..self.byte_pos].into(),
162 )
163 }
164 '|' if self.read_char() == '|' => {
165 self.advance();
166 Token::new(
167 TokenKind::DoublePipe,
168 loc,
169 self.source[begin_byte..self.byte_pos].into(),
170 )
171 }
172 ':' if self.read_char() == ':' => {
173 self.advance();
174 Token::new(
175 TokenKind::DoubleColon,
176 loc,
177 self.source[begin_byte..self.byte_pos].into(),
178 )
179 }
180 '.' if self.read_char() == '.' && self.read_char() == '.' => {
181 self.advance();
182 self.advance();
183 Token::new(
184 TokenKind::Ellipsis,
185 loc,
186 self.source[begin_byte..self.byte_pos].into(),
187 )
188 }
189 ch if ch.is_alphabetic() || ch == '_' => return self.lex_identifier(begin_byte),
190 '0'..='9' => return self.lex_number(begin_byte),
191 '"' => return self.lex_string(begin_byte),
192
193 ',' => Token::new(
194 TokenKind::Comma,
195 loc,
196 self.source[begin_byte..self.byte_pos].into(),
197 ),
198 ';' => Token::new(
199 TokenKind::SemiColon,
200 loc,
201 self.source[begin_byte..self.byte_pos].into(),
202 ),
203 ':' => Token::new(
204 TokenKind::Colon,
205 loc,
206 self.source[begin_byte..self.byte_pos].into(),
207 ),
208 '\\' => Token::new(
209 TokenKind::BackSlash,
210 loc,
211 self.source[begin_byte..self.byte_pos].into(),
212 ),
213 '=' => Token::new(
214 TokenKind::Eq,
215 loc,
216 self.source[begin_byte..self.byte_pos].into(),
217 ),
218 '<' => Token::new(
219 TokenKind::Lt,
220 loc,
221 self.source[begin_byte..self.byte_pos].into(),
222 ),
223 '>' => Token::new(
224 TokenKind::Gt,
225 loc,
226 self.source[begin_byte..self.byte_pos].into(),
227 ),
228 '!' => Token::new(
229 TokenKind::Bang,
230 loc,
231 self.source[begin_byte..self.byte_pos].into(),
232 ),
233 '+' => {
234 let next = self.read_char();
235 if next == '+' {
236 self.advance();
237 Token::new(
238 TokenKind::Concat,
239 loc,
240 self.source[begin_byte..self.byte_pos].into(),
241 )
242 } else if next == '=' {
243 self.advance();
244 Token::new(
245 TokenKind::PlusEq,
246 loc,
247 self.source[begin_byte..self.byte_pos].into(),
248 )
249 } else {
250 Token::new(
251 TokenKind::Plus,
252 loc,
253 self.source[begin_byte..self.byte_pos].into(),
254 )
255 }
256 }
257 '-' => {
258 let next = self.read_char();
259 if next == '>' {
260 self.advance();
261 Token::new(
262 TokenKind::Arrow,
263 loc,
264 self.source[begin_byte..self.byte_pos].into(),
265 )
266 } else if next == '=' {
267 self.advance();
268 Token::new(
269 TokenKind::MinusEq,
270 loc,
271 self.source[begin_byte..self.byte_pos].into(),
272 )
273 } else {
274 Token::new(
275 TokenKind::Minus,
276 loc,
277 self.source[begin_byte..self.byte_pos].into(),
278 )
279 }
280 }
281 '.' => Token::new(
282 TokenKind::Dot,
283 loc,
284 self.source[begin_byte..self.byte_pos].into(),
285 ),
286 '*' => {
287 let next = self.read_char();
288 if next == '=' {
289 self.advance();
290 Token::new(
291 TokenKind::AsteriskEq,
292 loc,
293 self.source[begin_byte..self.byte_pos].into(),
294 )
295 } else {
296 Token::new(
297 TokenKind::Asterisk,
298 loc,
299 self.source[begin_byte..self.byte_pos].into(),
300 )
301 }
302 }
303 '/' => {
304 let next = self.read_char();
305 if next == '=' {
306 self.advance();
307 Token::new(
308 TokenKind::SlashEq,
309 loc,
310 self.source[begin_byte..self.byte_pos].into(),
311 )
312 } else {
313 Token::new(
314 TokenKind::Slash,
315 loc,
316 self.source[begin_byte..self.byte_pos].into(),
317 )
318 }
319 }
320 '%' => {
321 let next = self.read_char();
322 if next == '=' {
323 self.advance();
324 Token::new(
325 TokenKind::ModEq,
326 loc,
327 self.source[begin_byte..self.byte_pos].into(),
328 )
329 } else {
330 Token::new(
331 TokenKind::Mod,
332 loc,
333 self.source[begin_byte..self.byte_pos].into(),
334 )
335 }
336 }
337 '$' => Token::new(
338 TokenKind::Dollar,
339 loc,
340 self.source[begin_byte..self.byte_pos].into(),
341 ),
342 '&' => Token::new(
343 TokenKind::Ampersand,
344 loc,
345 self.source[begin_byte..self.byte_pos].into(),
346 ),
347 '^' => Token::new(
348 TokenKind::Caret,
349 loc,
350 self.source[begin_byte..self.byte_pos].into(),
351 ),
352 '|' => Token::new(
353 TokenKind::Pipe,
354 loc,
355 self.source[begin_byte..self.byte_pos].into(),
356 ),
357 '(' => Token::new(
358 TokenKind::OpenParen,
359 loc,
360 self.source[begin_byte..self.byte_pos].into(),
361 ),
362 ')' => Token::new(
363 TokenKind::CloseParen,
364 loc,
365 self.source[begin_byte..self.byte_pos].into(),
366 ),
367 '[' => Token::new(
368 TokenKind::OpenBracket,
369 loc,
370 self.source[begin_byte..self.byte_pos].into(),
371 ),
372 ']' => Token::new(
373 TokenKind::CloseBracket,
374 loc,
375 self.source[begin_byte..self.byte_pos].into(),
376 ),
377 '{' => Token::new(
378 TokenKind::OpenCurly,
379 loc,
380 self.source[begin_byte..self.byte_pos].into(),
381 ),
382 '}' => Token::new(
383 TokenKind::CloseCurly,
384 loc,
385 self.source[begin_byte..self.byte_pos].into(),
386 ),
387
388 ch if ch.is_whitespace() => continue,
389 '\0' => return Token::new(TokenKind::EOF, self.loc, "\0".into()),
390 _ => {
391 return Token::new(
392 TokenKind::UnexpectedCharacter,
393 self.loc,
394 self.source[begin_byte..self.byte_pos].into(),
395 );
396 }
397 };
398 return tok;
399 }
400
401 Token::new(TokenKind::EOF, self.loc, "".into())
402 }
403
404 fn lex_identifier(&mut self, begin_byte: usize) -> Token {
405 let loc = self.loc;
406 #[allow(unused_mut)]
407 let mut kind = TokenKind::Identifier;
408 loop {
409 let ch = self.read_char();
410 if ch.is_alphanumeric() || ch == '_' {
411 self.advance();
412 } else {
413 break;
414 }
415 }
416 let ident = &self.source[begin_byte..self.byte_pos];
417
418 if self.keywords.contains(&ident) {
419 kind = TokenKind::Keyword;
420 }
421
422 Token::new(kind, loc, ident.into())
423 }
424
425 fn lex_number(&mut self, begin_byte: usize) -> Token {
426 let loc = self.loc;
427 let end;
428 let mut base = 10;
429
430 let next = self.read_char();
432 match next {
433 'x' | 'X' => {
434 base = 16;
435 self.advance(); self.advance(); }
438 'b' | 'B' => {
439 base = 2;
440 self.advance(); self.advance(); }
443 'o' | 'O' => {
444 base = 8;
445 self.advance(); self.advance(); }
448 _ => {}
449 }
450
451 loop {
453 let c = self.read_char();
454 let valid = match base {
455 2 => matches!(c, '0' | '1'),
456 8 => matches!(c, '0'..='7'),
457 10 if c == '.' => {
458 self.advance();
459 loop {
460 let c = self.read_char();
461 if !c.is_ascii_digit() {
462 break;
463 }
464 self.advance();
465 }
466 end = self.byte_pos;
467 let num_str = &self.source[begin_byte..end];
468 return Token::new(TokenKind::RealNumber, loc, (*num_str).into());
469 }
470 10 => c.is_ascii_digit(),
471 16 => c.is_ascii_hexdigit(),
472 _ => false,
473 };
474 if !valid {
475 break;
476 }
477 self.advance();
478 }
479
480 end = self.byte_pos;
481
482 let num_str = &self.source[begin_byte..end]
483 .trim_start_matches("0x")
484 .trim_start_matches("0X")
485 .trim_start_matches("0b")
486 .trim_start_matches("0B")
487 .trim_start_matches("0o")
488 .trim_start_matches("0O");
489 let kind = TokenKind::Number(NumberBase::from(base));
490
491 Token::new(kind, loc, (*num_str).into())
492 }
493
494 fn lex_string(&mut self, begin_byte: usize) -> Token {
495 let loc = self.loc;
497 loop {
498 let ch = self.read_char();
499 match ch {
500 '"' => {
501 self.advance();
502 break;
503 }
504 '\0' => {
505 return Token::new(
506 TokenKind::UnterminatedStringLiteral,
507 loc,
508 self.source[begin_byte..self.byte_pos].into(),
509 );
510 }
511 '\\' => {
512 self.advance();
513 let esc = self.read_char();
514 match esc {
515 'r' => {} 'n' => {} '"' => {} '\'' => {} '\\' => {} '0' => {} _ => {
522 return Token::new(
523 TokenKind::InvalidEscapeSequence,
524 loc,
525 self.source[begin_byte..self.byte_pos].into(),
526 );
527 }
528 }
529 }
530 _ => {} }
532 self.advance();
533 }
534
535 Token::new(
536 TokenKind::StringLiteral,
537 loc,
538 self.source[begin_byte..self.byte_pos].into(),
539 )
540 }
541}
542
543#[derive(Debug, Clone, PartialEq, Eq, Hash)]
545pub struct Token {
546 pub kind: TokenKind,
547 pub loc: Loc,
548 pub source: String,
550}
551
552impl fmt::Display for Token {
553 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
554 match self.kind {
555 TokenKind::EOF => write!(f, "EOF"),
556 TokenKind::UnexpectedCharacter => {
557 write!(f, "Unexpected Character `{}`", self.source.escape_default())
558 }
559 TokenKind::InvalidEscapeSequence => {
560 write!(
561 f,
562 "Invalid Escape Sequence `{}`",
563 self.source.escape_default()
564 )
565 }
566 TokenKind::UnterminatedStringLiteral => {
567 write!(
568 f,
569 "Unterminated String Literal `{}`",
570 self.source.escape_default()
571 )
572 }
573 TokenKind::StringLiteral => write!(f, "{}", self.source.escape_default()),
574 TokenKind::CharacterLiteral => write!(f, "{}", self.source.escape_default()),
575 _ => write!(f, "{}", self.source),
576 }
577 }
578}
579
580impl Token {
581 pub fn source(&self) -> &str {
583 &self.source
585 }
586
587 pub fn new(kind: TokenKind, loc: Loc, source: String) -> Self {
589 Self {
590 kind,
591 loc,
592 source,
594 }
595 }
596
597 pub fn is_eof(&self) -> bool {
599 matches!(self.kind, TokenKind::EOF)
600 }
601
602 pub fn unescape(&self) -> String {
604 match self.kind {
605 TokenKind::StringLiteral => token_string_unescape(self.source()),
606 _ => todo!(),
607 }
608 }
609}
/// Resolves the escape sequences of a quoted string literal.
///
/// The first character of `source` (the opening quote) is skipped
/// unconditionally. Characters are then copied until an unescaped `"` or the
/// end of `source`. The escapes `\r`, `\n`, `\"`, `\'`, `\\` and `\0` are
/// replaced by the characters they denote; any other escape, or a trailing
/// backslash, ends processing and returns what has been collected so far.
pub fn token_string_unescape(source: &str) -> String {
    let mut out = String::new();
    let mut chars = source.chars().skip(1);

    while let Some(ch) = chars.next() {
        match ch {
            '"' => break,
            '\\' => {
                let resolved = match chars.next() {
                    Some('r') => '\r',
                    Some('n') => '\n',
                    Some('"') => '"',
                    Some('\'') => '\'',
                    Some('\\') => '\\',
                    Some('0') => '\0',
                    // Unknown escape or input ended right after the backslash.
                    Some(_) | None => break,
                };
                out.push(resolved);
            }
            plain => out.push(plain),
        }
    }

    out
}
639
640#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
642pub enum TokenKind {
643 #[default]
644 EOF,
645 UnexpectedCharacter,
646 InvalidEscapeSequence,
647 UnterminatedStringLiteral,
648
649 OpenParen,
650 CloseParen,
651 OpenBracket,
652 CloseBracket,
653 OpenCurly,
654 CloseCurly,
655
656 Identifier,
657 Keyword,
658
659 Directive,
660
661 RealNumber,
662 StringLiteral,
663 CharacterLiteral,
664
665 Dot,
666 Ellipsis,
667 Comma,
668 Colon,
669 DoubleColon,
670 SemiColon,
671 Arrow,
672 BackSlash,
673
674 Assign,
675 PlusEq,
676 MinusEq,
677 AsteriskEq,
678 SlashEq,
679 ModEq,
680 Bang,
681 Plus,
682 Concat,
683 Minus,
684 Asterisk,
685 Slash,
686 Eq,
687 EqEq,
688 NotEq,
689 Gt,
690 GtEq,
691 Lt,
692 LtEq,
693 Mod,
694 Ampersand,
695 Pipe,
696 Caret,
697 DoubleAmpersand,
698 DoublePipe,
699
700 Dollar,
701 InvalidNumber,
702
703 Number(NumberBase),
704}
705
/// The base an integer literal was written in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberBase {
    /// Binary (radix 2).
    B,
    /// Octal (radix 8).
    O,
    /// Decimal (radix 10).
    D,
    /// Hexadecimal (radix 16).
    X,
}

impl NumberBase {
    /// Returns the numeric radix of this base (2, 8, 10 or 16).
    pub fn radix(&self) -> u32 {
        match self {
            NumberBase::B => 2,
            NumberBase::O => 8,
            NumberBase::D => 10,
            NumberBase::X => 16,
        }
    }
}

impl From<u32> for NumberBase {
    /// Converts a radix into the matching base.
    ///
    /// # Panics
    ///
    /// Panics if `value` is not one of 2, 8, 10 or 16.
    fn from(value: u32) -> Self {
        match value {
            2 => Self::B,
            8 => Self::O,
            10 => Self::D,
            16 => Self::X,
            _ => panic!("Unknown base: {}", value),
        }
    }
}

impl From<NumberBase> for u32 {
    /// Converts a base into its radix; same value as [`NumberBase::radix`].
    fn from(val: NumberBase) -> Self {
        val.radix()
    }
}
747
748impl TokenKind {
749 pub fn is_assign_kind(&self) -> bool {
750 matches!(
751 self,
752 Self::Assign
753 | Self::Eq
754 | Self::PlusEq
755 | Self::MinusEq
756 | Self::AsteriskEq
757 | Self::SlashEq
758 | Self::ModEq
759 )
760 }
761}
762
/// A line/column position in the source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Loc {
    /// Line number.
    pub line: usize,
    /// Column number.
    pub col: usize,
}

impl fmt::Display for Loc {
    /// Formats the location as `line:col`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Loc { line, col } = *self;
        write!(f, "{}:{}", line, col)
    }
}

impl Loc {
    /// Creates a location from a line and a column.
    pub fn new(line: usize, col: usize) -> Self {
        Loc { line, col }
    }

    /// Moves one column to the right.
    pub fn next_column(&mut self) {
        self.col += 1;
    }

    /// Moves to column 1 of the following line.
    pub fn next_line(&mut self) {
        self.col = 1;
        self.line += 1;
    }

    /// Updates the location for having consumed the character `c`.
    ///
    /// A newline starts a new line, a tab jumps to the next multiple of 8,
    /// any other control character leaves the location unchanged, and every
    /// remaining character moves one column to the right.
    pub fn next(&mut self, c: char) {
        const TAB_STOP: usize = 8;

        if c == '\n' {
            self.next_line();
        } else if c == '\t' {
            // Round down to a multiple of the tab stop, then step one stop.
            // NOTE(review): this treats `col` as zero-based although
            // `next_line` resets it to 1 — confirm the intended convention.
            self.col = self.col - self.col % TAB_STOP + TAB_STOP;
        } else if !c.is_control() {
            self.next_column();
        }
    }
}