1use crate::diagnostic::Diagnostic;
2use crate::token::SourceLocation;
3use crate::token::Token;
4use crate::token::TokenKind;
5
/// Character-level scanner state used to turn query text into tokens.
///
/// Besides the cursor into the input, the scanner tracks the line/column span
/// of the token that is currently being read.
pub struct Tokenizer {
    /// Input text, stored one `char` per element.
    pub(crate) content: Vec<char>,
    /// Number of characters in `content`; `new` sets it to `content.len()`.
    pub(crate) content_len: usize,
    /// Position in `content` of the next character to read.
    pub(crate) index: usize,

    /// Line on which the current token begins (`new` starts counting at 1).
    pub(crate) line_start: u32,
    /// Line the cursor is currently on.
    pub(crate) line_end: u32,
    /// Column at which the current token begins (`new` starts counting at 0).
    pub(crate) column_start: u32,
    /// Column the cursor is currently at.
    pub(crate) column_end: u32,
}
16
17impl Tokenizer {
18 pub(crate) fn new(chars: Vec<char>) -> Tokenizer {
19 let content_len = chars.len();
20 Tokenizer {
21 content: chars,
22 content_len,
23 index: 0,
24
25 line_start: 1,
26 line_end: 1,
27 column_start: 0,
28 column_end: 0,
29 }
30 }
31
32 pub fn tokenize(content: String) -> Result<Vec<Token>, Box<Diagnostic>> {
33 let mut tokenizer = Tokenizer::new(content.chars().collect());
34 tokenizer.tokenize_characters()
35 }
36
37 fn current_source_location(&self) -> SourceLocation {
38 SourceLocation {
39 line_start: self.line_start,
40 line_end: self.line_end,
41 column_start: self.column_start,
42 column_end: self.column_end,
43 }
44 }
45
46 fn tokenize_characters(&mut self) -> Result<Vec<Token>, Box<Diagnostic>> {
47 let mut tokens: Vec<Token> = Vec::new();
48 let len = self.content_len;
49
50 while self.has_next() {
51 self.column_start = self.column_end;
52 self.line_start = self.line_end;
53
54 let char = self.content[self.index];
55
56 if char.is_alphabetic() {
58 tokens.push(self.consume_identifier());
59 continue;
60 }
61
62 if char == '@' {
64 if self.index + 1 < len && self.content[self.index + 1] == '>' {
66 self.index += 2;
67 let location = self.current_source_location();
68 tokens.push(Token::new(TokenKind::AtRightArrow, location));
69 continue;
70 }
71
72 tokens.push(self.consume_global_variable_name()?);
73 continue;
74 }
75
76 if char.is_numeric() {
78 if char == '0' && self.index + 1 < len {
79 if self.content[self.index + 1] == 'x' {
80 self.index += 2;
81 self.column_start += 2;
82 tokens.push(self.consume_hex_number()?);
83 continue;
84 }
85
86 if self.content[self.index + 1] == 'b' {
87 self.index += 2;
88 self.column_start += 2;
89 tokens.push(self.consume_binary_number()?);
90 continue;
91 }
92
93 if self.content[self.index + 1] == 'o' {
94 self.index += 2;
95 self.column_start += 2;
96 tokens.push(self.consume_octal_number()?);
97 continue;
98 }
99 }
100
101 tokens.push(self.consume_number()?);
102 continue;
103 }
104
105 if char == '\'' {
107 tokens.push(self.consume_string_in_single_quotes()?);
108 continue;
109 }
110
111 if char == '"' {
113 tokens.push(self.consume_string_in_double_quotes()?);
114 continue;
115 }
116
117 if char == '`' {
119 tokens.push(self.consume_backticks_identifier()?);
120 continue;
121 }
122
123 if char == '+' {
125 let location = self.current_source_location();
126 tokens.push(Token::new(TokenKind::Plus, location));
127 self.advance();
128 continue;
129 }
130
131 if char == '-' {
133 if self.index + 1 < self.content_len && self.content[self.index + 1] == '-' {
135 self.ignore_single_line_comment();
136 continue;
137 }
138
139 let location = self.current_source_location();
140 tokens.push(Token::new(TokenKind::Minus, location));
141 self.advance();
142 continue;
143 }
144
145 if char == '*' {
147 let location = self.current_source_location();
148 tokens.push(Token::new(TokenKind::Star, location));
149 self.advance();
150 continue;
151 }
152
153 if char == '/' {
155 if self.index + 1 < self.content_len && self.content[self.index + 1] == '*' {
157 self.ignore_c_style_comment()?;
158 continue;
159 }
160
161 let location = self.current_source_location();
162 tokens.push(Token::new(TokenKind::Slash, location));
163 self.advance();
164 continue;
165 }
166
167 if char == '%' {
169 let location = self.current_source_location();
170 tokens.push(Token::new(TokenKind::Percentage, location));
171 self.advance();
172 continue;
173 }
174
175 if char == '^' {
177 let location = self.current_source_location();
178 tokens.push(Token::new(TokenKind::Caret, location));
179 self.advance();
180 continue;
181 }
182
183 if char == '~' {
185 let location = self.current_source_location();
186 tokens.push(Token::new(TokenKind::BitwiseNot, location));
187 self.advance();
188 continue;
189 }
190
191 if char == '|' {
193 let location = self.current_source_location();
194
195 self.advance();
196 let kind = if self.index < len && self.content[self.index] == '|' {
197 self.advance();
198 TokenKind::OrOr
199 } else {
200 TokenKind::BitwiseOr
201 };
202
203 tokens.push(Token::new(kind, location));
204 continue;
205 }
206
207 if char == '&' {
209 let location = self.current_source_location();
210
211 self.advance();
212 let kind = if self.index < len && self.content[self.index] == '&' {
213 self.advance();
214 TokenKind::AndAnd
215 } else {
216 TokenKind::BitwiseAnd
217 };
218
219 tokens.push(Token::new(kind, location));
220 continue;
221 }
222
223 if char == '#' {
225 let location = self.current_source_location();
226 tokens.push(Token::new(TokenKind::BitwiseXor, location));
227 self.advance();
228 continue;
229 }
230
231 if char == ',' {
233 let location = self.current_source_location();
234 tokens.push(Token::new(TokenKind::Comma, location));
235 self.advance();
236 continue;
237 }
238
239 if char == '.' {
241 let location = self.current_source_location();
242 tokens.push(Token::new(TokenKind::Dot, location));
243 self.advance();
244 continue;
245 }
246
247 if char == '>' {
249 let location = self.current_source_location();
250
251 self.advance();
252 let kind = if self.index < len && self.content[self.index] == '=' {
253 self.advance();
254 TokenKind::GreaterEqual
255 } else if self.index < len && self.content[self.index] == '>' {
256 self.advance();
257 TokenKind::BitwiseRightShift
258 } else {
259 TokenKind::Greater
260 };
261
262 tokens.push(Token::new(kind, location));
263 continue;
264 }
265
266 if char == '<' {
268 let location = self.current_source_location();
269
270 self.advance();
271 let kind = if self.index < len && self.content[self.index] == '=' {
272 self.advance();
273 if self.index < len && self.content[self.index] == '>' {
274 self.advance();
275 TokenKind::NullSafeEqual
276 } else {
277 TokenKind::LessEqual
278 }
279 } else if self.index < len && self.content[self.index] == '<' {
280 self.advance();
281 TokenKind::BitwiseLeftShift
282 } else if self.index < len && self.content[self.index] == '>' {
283 self.advance();
284 TokenKind::BangEqual
285 } else if self.index < len && self.content[self.index] == '@' {
286 self.advance();
287 TokenKind::ArrowRightAt
288 } else {
289 TokenKind::Less
290 };
291
292 tokens.push(Token::new(kind, location));
293 continue;
294 }
295
296 if char == '=' {
298 let location = self.current_source_location();
299 tokens.push(Token::new(TokenKind::Equal, location));
300 self.advance();
301 continue;
302 }
303
304 if char == ':' {
306 let location = self.current_source_location();
307
308 if self.index + 1 < len && self.content[self.index + 1] == '=' {
310 tokens.push(Token::new(TokenKind::ColonEqual, location));
311 self.advance_n(2);
313 continue;
314 }
315
316 if self.index + 1 < len && self.content[self.index + 1] == ':' {
318 tokens.push(Token::new(TokenKind::ColonColon, location));
319 self.advance_n(2);
321 continue;
322 }
323
324 tokens.push(Token::new(TokenKind::Colon, location));
325 self.advance();
326 continue;
327 }
328
329 if char == '!' {
331 let location = self.current_source_location();
332
333 self.advance();
335 let kind = if self.index < len && self.content[self.index] == '=' {
336 self.advance();
338 TokenKind::BangEqual
339 } else {
340 TokenKind::Bang
341 };
342
343 tokens.push(Token::new(kind, location));
344 continue;
345 }
346
347 if char == '(' {
349 let location = self.current_source_location();
350 tokens.push(Token::new(TokenKind::LeftParen, location));
351 self.advance();
352 continue;
353 }
354
355 if char == ')' {
357 let location = self.current_source_location();
358 tokens.push(Token::new(TokenKind::RightParen, location));
359 self.advance();
360 continue;
361 }
362
363 if char == '[' {
365 let location = self.current_source_location();
366 tokens.push(Token::new(TokenKind::LeftBracket, location));
367 self.advance();
368 continue;
369 }
370
371 if char == ']' {
373 let location = self.current_source_location();
374 tokens.push(Token::new(TokenKind::RightBracket, location));
375 self.advance();
376 continue;
377 }
378
379 if char == ';' {
381 let location = self.current_source_location();
382 tokens.push(Token::new(TokenKind::Semicolon, location));
383 self.advance();
384 continue;
385 }
386
387 if char == ' ' || char == '\t' {
389 self.advance();
390 continue;
391 }
392
393 if char == '\n' {
394 self.advance();
395 self.column_end = 0;
396 self.line_end += 1;
397 continue;
398 }
399
400 return Err(Diagnostic::error("Unexpected character")
401 .with_location(self.current_source_location())
402 .as_boxed());
403 }
404
405 Ok(tokens)
406 }
407
408 fn consume_global_variable_name(&mut self) -> Result<Token, Box<Diagnostic>> {
409 let start_index = self.index;
410
411 self.advance();
413
414 if self.has_next() && !self.content[self.index].is_alphabetic() {
416 return Err(Diagnostic::error(
417 "Global variable name must start with alphabetic character",
418 )
419 .add_help("Add at least one alphabetic character after @")
420 .with_location(self.current_source_location())
421 .as_boxed());
422 }
423
424 while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
425 self.advance();
426 }
427
428 let literal = &self.content[start_index..self.index];
430 let mut string: String = literal.iter().collect();
431 string = string.to_lowercase();
432
433 let location = self.current_source_location();
434 Ok(Token::new(TokenKind::GlobalVariable(string), location))
435 }
436
437 fn consume_identifier(&mut self) -> Token {
438 let start_index = self.index;
439
440 while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
441 self.advance();
442 }
443
444 let literal = &self.content[start_index..self.index];
446 let mut string: String = literal.iter().collect();
447 string = string.to_lowercase();
448
449 let location = self.current_source_location();
450 Token::new_symbol(string, location)
451 }
452
453 fn consume_backticks_identifier(&mut self) -> Result<Token, Box<Diagnostic>> {
454 let start_index = self.index;
455
456 self.advance();
458
459 while self.has_next() && !self.is_current_char('`') {
460 self.advance();
461 }
462
463 if self.index >= self.content_len {
464 return Err(Diagnostic::error("Unterminated backticks")
465 .add_help("Add ` at the end of the identifier")
466 .with_location(self.current_source_location())
467 .as_boxed());
468 }
469
470 self.advance();
472
473 let literal = &self.content[start_index + 1..self.index - 1];
474 let identifier: String = literal.iter().collect();
475 let location = self.current_source_location();
476 Ok(Token::new(TokenKind::Symbol(identifier), location))
477 }
478
479 fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
480 let start_index = self.index;
481
482 while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
483 self.advance();
484 }
485
486 let mut is_float_value = false;
487 if self.has_next() && self.is_current_char('.') {
488 self.advance();
489
490 is_float_value = true;
491 while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
492 self.advance();
493 }
494 }
495
496 let literal = &self.content[start_index..self.index];
497 let string: String = literal.iter().collect();
498 let literal_num = string.replace('_', "");
499 let location = self.current_source_location();
500
501 if is_float_value {
502 return match literal_num.parse::<f64>() {
503 Ok(float) => Ok(Token::new(TokenKind::Float(float), location)),
504 Err(parse_float_error) => Err(Diagnostic::error(&parse_float_error.to_string())
505 .add_note(&format!(
506 "Value must be between {} and {}",
507 f64::MIN,
508 f64::MAX
509 ))
510 .with_location(self.current_source_location())
511 .as_boxed()),
512 };
513 }
514
515 match literal_num.parse::<i64>() {
516 Ok(integer) => Ok(Token::new(TokenKind::Integer(integer), location)),
517 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
518 .add_note(&format!(
519 "Value must be between {} and {}",
520 i64::MIN,
521 i64::MAX
522 ))
523 .with_location(self.current_source_location())
524 .as_boxed()),
525 }
526 }
527
528 fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
529 let start_index = self.index;
530 let mut has_digit = false;
531
532 while self.has_next() && self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
533 self.advance();
534 has_digit = true;
535 }
536
537 if !has_digit {
538 return Err(
539 Diagnostic::error("Missing digits after the integer base prefix")
540 .add_help("Expect at least one binary digits after the prefix 0b")
541 .add_help("Binary digit mean 0 or 1")
542 .with_location(self.current_source_location())
543 .as_boxed(),
544 );
545 }
546
547 let literal = &self.content[start_index..self.index];
548 let string: String = literal.iter().collect();
549 let literal_num = string.replace('_', "");
550
551 const BINARY_RADIX: u32 = 2;
552 match i64::from_str_radix(&literal_num, BINARY_RADIX) {
553 Ok(integer) => {
554 let location = self.current_source_location();
555 Ok(Token::new(TokenKind::Integer(integer), location))
556 }
557 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
558 .add_note(&format!(
559 "Value must be between {} and {}",
560 i64::MIN,
561 i64::MAX
562 ))
563 .with_location(self.current_source_location())
564 .as_boxed()),
565 }
566 }
567
568 fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
569 let start_index = self.index;
570 let mut has_digit = false;
571
572 while self.has_next() && self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c))
573 {
574 self.advance();
575 has_digit = true;
576 }
577
578 if !has_digit {
579 return Err(
580 Diagnostic::error("Missing digits after the integer base prefix")
581 .add_help("Expect at least one octal digits after the prefix 0o")
582 .add_help("Octal digit mean 0 to 8 number")
583 .with_location(self.current_source_location())
584 .as_boxed(),
585 );
586 }
587
588 let literal = &self.content[start_index..self.index];
589 let string: String = literal.iter().collect();
590 let literal_num = string.replace('_', "");
591
592 const OCTAL_RADIX: u32 = 2;
593 match i64::from_str_radix(&literal_num, OCTAL_RADIX) {
594 Ok(integer) => {
595 let location = self.current_source_location();
596 Ok(Token::new(TokenKind::Integer(integer), location))
597 }
598 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
599 .add_note(&format!(
600 "Value must be between {} and {}",
601 i64::MIN,
602 i64::MAX
603 ))
604 .with_location(self.current_source_location())
605 .as_boxed()),
606 }
607 }
608
609 fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
610 let start_index = self.index;
611 let mut has_digit = false;
612
613 while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
614 self.advance();
615 has_digit = true;
616 }
617
618 if !has_digit {
619 return Err(
620 Diagnostic::error("Missing digits after the integer base prefix")
621 .add_help("Expect at least one hex digits after the prefix 0x")
622 .add_help("Hex digit mean 0 to 9 and a to f")
623 .with_location(self.current_source_location())
624 .as_boxed(),
625 );
626 }
627
628 let literal = &self.content[start_index..self.index];
629 let string: String = literal.iter().collect();
630 let literal_num = string.replace('_', "");
631
632 const HEX_RADIX: u32 = 16;
633 match i64::from_str_radix(&literal_num, HEX_RADIX) {
634 Ok(integer) => {
635 let location = self.current_source_location();
636 Ok(Token::new(TokenKind::Integer(integer), location))
637 }
638 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
639 .add_note(&format!(
640 "Value must be between {} and {}",
641 i64::MIN,
642 i64::MAX
643 ))
644 .with_location(self.current_source_location())
645 .as_boxed()),
646 }
647 }
648
649 fn consume_string_in_single_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
650 let buffer = self.consume_string_with_around('\'')?;
651
652 if self.index >= self.content_len {
653 return Err(Diagnostic::error("Unterminated single quote string")
654 .add_help("Add \' at the end of the String literal")
655 .with_location(self.current_source_location())
656 .as_boxed());
657 }
658
659 self.advance();
661 let location = self.current_source_location();
662 Ok(Token::new(TokenKind::String(buffer), location))
663 }
664
665 fn consume_string_in_double_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
666 let buffer = self.consume_string_with_around('"')?;
667
668 if self.index >= self.content_len {
669 return Err(Diagnostic::error("Unterminated double quote string")
670 .add_help("Add \" at the end of the String literal")
671 .with_location(self.current_source_location())
672 .as_boxed());
673 }
674
675 self.advance();
677 let location = self.current_source_location();
678 Ok(Token::new(TokenKind::String(buffer), location))
679 }
680
681 fn consume_string_with_around(&mut self, around: char) -> Result<String, Box<Diagnostic>> {
682 self.advance();
684
685 let mut buffer = String::new();
686 while self.has_next() && self.content[self.index] != around {
687 if !self.is_current_char('\\') {
688 buffer.push(self.content[self.index]);
689 self.advance();
690 continue;
691 }
692
693 if self.is_last() {
695 buffer.push(self.content[self.index]);
696 self.advance();
697 continue;
698 }
699
700 self.advance();
702
703 let next_char = self.content[self.index];
705 let character_with_escape_handled = match next_char {
706 '\'' => {
708 self.advance();
709 '\''
710 }
711 '\"' => {
713 self.advance();
714 '\"'
715 }
716 '\\' => {
718 self.advance();
719 '\\'
720 }
721 'n' => {
723 self.advance();
724 '\n'
725 }
726 'r' => {
728 self.advance();
729 '\r'
730 }
731 't' => {
733 self.advance();
734 '\t'
735 }
736 _ => self.content[self.index - 1],
737 };
738
739 buffer.push(character_with_escape_handled);
740 }
741
742 Ok(buffer)
743 }
744
745 fn ignore_single_line_comment(&mut self) {
746 self.advance_n(2);
748
749 while self.has_next() && !self.is_current_char('\n') {
750 self.advance();
751 }
752
753 self.advance();
755 self.line_end += 1;
756 self.column_end = 0;
757 }
758
759 fn ignore_c_style_comment(&mut self) -> Result<(), Box<Diagnostic>> {
760 self.advance_n(2);
762
763 while self.index + 1 < self.content_len
764 && (!self.is_current_char('*') && self.content[self.index + 1] != '/')
765 {
766 self.advance();
768 }
769
770 if self.index + 2 > self.content_len {
771 return Err(Diagnostic::error("C Style comment must end with */")
772 .add_help("Add */ at the end of C Style comments")
773 .with_location(self.current_source_location())
774 .as_boxed());
775 }
776
777 self.advance_n(2);
779 Ok(())
780 }
781
782 fn advance(&mut self) {
783 self.index += 1;
784 self.column_end += 1;
785 }
786
787 fn advance_n(&mut self, n: usize) {
788 self.index += n;
789 self.column_end += n as u32;
790 }
791
792 fn is_current_char(&self, ch: char) -> bool {
793 self.content[self.index] == ch
794 }
795
796 fn is_current_char_func(&self, func: fn(char) -> bool) -> bool {
797 func(self.content[self.index])
798 }
799
800 fn has_next(&self) -> bool {
801 self.index < self.content_len
802 }
803
804 fn is_last(&self) -> bool {
805 self.index == self.content_len - 1
806 }
807}