1use crate::diagnostic::Diagnostic;
2use crate::token::SourceLocation;
3use crate::token::Token;
4use crate::token::TokenKind;
5use crate::token::GITQL_RESERVED_KEYWORDS;
6
/// Character-level scanner that turns input text into a list of [`Token`]s.
///
/// The four line/column fields are copied into every `SourceLocation` the
/// tokenizer produces; the `*_start` pair is reset to the `*_end` pair at the
/// beginning of each token.
pub struct Tokenizer<'a> {
    /// Characters of the input being tokenized.
    content: &'a [char],
    /// Number of characters in `content`, cached when the tokenizer is created.
    content_len: usize,
    /// Position in `content` of the character currently being examined.
    index: usize,

    /// Line on which the token being scanned starts (the first line is 1).
    line_start: u32,
    /// Line the scanner is currently on.
    line_end: u32,
    /// Column at which the token being scanned starts (the first column is 0).
    column_start: u32,
    /// Column the scanner is currently at; bumped on every consumed character.
    column_end: u32,
}
17
18impl<'a> Tokenizer<'a> {
19 pub(crate) fn new(chars: &'a [char]) -> Tokenizer<'a> {
20 Tokenizer {
21 content: chars,
22 content_len: chars.len(),
23 index: 0,
24
25 line_start: 1,
26 line_end: 1,
27 column_start: 0,
28 column_end: 0,
29 }
30 }
31
32 pub fn tokenize(chars: &'a str) -> Result<Vec<Token>, Box<Diagnostic>> {
33 let chars: Vec<char> = chars.chars().collect();
34 Tokenizer::new(&chars).tokenize_characters()
35 }
36
37 fn current_source_location(&self) -> SourceLocation {
38 SourceLocation {
39 line_start: self.line_start,
40 line_end: self.line_end,
41 column_start: self.column_start,
42 column_end: self.column_end,
43 }
44 }
45
46 fn tokenize_characters(&mut self) -> Result<Vec<Token>, Box<Diagnostic>> {
47 let mut tokens: Vec<Token> = Vec::new();
48 let len = self.content_len;
49
50 while self.has_next() {
51 self.column_start = self.column_end;
52 self.line_start = self.line_end;
53
54 let char = self.content[self.index];
55
56 if char.is_alphabetic() {
58 tokens.push(self.consume_identifier());
59 continue;
60 }
61
62 if char == '@' {
64 if self.is_next_char('>') {
66 self.index += 2;
67 let location = self.current_source_location();
68 tokens.push(Token::new(TokenKind::AtRightArrow, location));
69 continue;
70 }
71
72 tokens.push(self.consume_global_variable_name()?);
73 continue;
74 }
75
76 if char.is_numeric() {
78 if char == '0' && self.index + 1 < len {
79 match self.content[self.index + 1] {
80 'b' | 'B' => {
82 self.index += 2;
83 self.column_start += 2;
84 tokens.push(self.consume_binary_number()?);
85 continue;
86 }
87 'x' | 'X' => {
89 self.index += 2;
90 self.column_start += 2;
91 tokens.push(self.consume_hex_number()?);
92 continue;
93 }
94 'o' | 'O' => {
96 self.index += 2;
97 self.column_start += 2;
98 tokens.push(self.consume_octal_number()?);
99 continue;
100 }
101 _ => {
102 tokens.push(self.consume_number()?);
103 continue;
104 }
105 }
106 }
107
108 tokens.push(self.consume_number()?);
109 continue;
110 }
111
112 if char == '\'' {
114 tokens.push(self.consume_string_in_single_quotes()?);
115 continue;
116 }
117
118 if char == '"' {
120 tokens.push(self.consume_string_in_double_quotes()?);
121 continue;
122 }
123
124 if char == '`' {
126 tokens.push(self.consume_backticks_identifier()?);
127 continue;
128 }
129
130 if char == '+' {
132 let location = self.current_source_location();
133 tokens.push(Token::new(TokenKind::Plus, location));
134 self.advance();
135 continue;
136 }
137
138 if char == '-' {
140 if self.is_next_char('-') {
142 self.consume_single_line_comment();
143 continue;
144 }
145
146 let location = self.current_source_location();
147 tokens.push(Token::new(TokenKind::Minus, location));
148 self.advance();
149 continue;
150 }
151
152 if char == '*' {
154 let location = self.current_source_location();
155 tokens.push(Token::new(TokenKind::Star, location));
156 self.advance();
157 continue;
158 }
159
160 if char == '/' {
162 if self.is_next_char('*') {
164 self.consume_c_style_block_comment()?;
165 continue;
166 }
167
168 let location = self.current_source_location();
169 tokens.push(Token::new(TokenKind::Slash, location));
170 self.advance();
171 continue;
172 }
173
174 if char == '%' {
176 let location = self.current_source_location();
177 tokens.push(Token::new(TokenKind::Percentage, location));
178 self.advance();
179 continue;
180 }
181
182 if char == '^' {
184 let location = self.current_source_location();
185 tokens.push(Token::new(TokenKind::Caret, location));
186 self.advance();
187 continue;
188 }
189
190 if char == '~' {
192 let location = self.current_source_location();
193 tokens.push(Token::new(TokenKind::BitwiseNot, location));
194 self.advance();
195 continue;
196 }
197
198 if char == '|' {
200 let location = self.current_source_location();
201
202 self.advance();
203 let kind = if self.is_current_char('|') {
204 self.advance();
205 TokenKind::OrOr
206 } else {
207 TokenKind::BitwiseOr
208 };
209
210 tokens.push(Token::new(kind, location));
211 continue;
212 }
213
214 if char == '&' {
216 let location = self.current_source_location();
217
218 self.advance();
219 let kind = if self.is_current_char('&') {
220 self.advance();
221 TokenKind::AndAnd
222 } else {
223 TokenKind::BitwiseAnd
224 };
225
226 tokens.push(Token::new(kind, location));
227 continue;
228 }
229
230 if char == '#' {
232 let location = self.current_source_location();
233 tokens.push(Token::new(TokenKind::BitwiseXor, location));
234 self.advance();
235 continue;
236 }
237
238 if char == ',' {
240 let location = self.current_source_location();
241 tokens.push(Token::new(TokenKind::Comma, location));
242 self.advance();
243 continue;
244 }
245
246 if char == '.' {
248 let location = self.current_source_location();
249 tokens.push(Token::new(TokenKind::Dot, location));
250 self.advance();
251 continue;
252 }
253
254 if char == '>' {
256 let location = self.current_source_location();
257
258 self.advance();
259 let kind = if self.is_current_char('=') {
260 self.advance();
261 TokenKind::GreaterEqual
262 } else if self.is_current_char('>') {
263 self.advance();
264 TokenKind::BitwiseRightShift
265 } else {
266 TokenKind::Greater
267 };
268
269 tokens.push(Token::new(kind, location));
270 continue;
271 }
272
273 if char == '<' {
275 let location = self.current_source_location();
276
277 self.advance();
278 let kind = if self.is_current_char('=') {
279 self.advance();
280 if self.is_current_char('>') {
281 self.advance();
282 TokenKind::NullSafeEqual
283 } else {
284 TokenKind::LessEqual
285 }
286 } else if self.is_current_char('<') {
287 self.advance();
288 TokenKind::BitwiseLeftShift
289 } else if self.is_current_char('>') {
290 self.advance();
291 TokenKind::BangEqual
292 } else if self.is_current_char('@') {
293 self.advance();
294 TokenKind::ArrowRightAt
295 } else {
296 TokenKind::Less
297 };
298
299 tokens.push(Token::new(kind, location));
300 continue;
301 }
302
303 if char == '=' {
305 let location = self.current_source_location();
306 tokens.push(Token::new(TokenKind::Equal, location));
307 self.advance();
308 continue;
309 }
310
311 if char == ':' {
313 let location = self.current_source_location();
314
315 if self.is_next_char('=') {
317 tokens.push(Token::new(TokenKind::ColonEqual, location));
318 self.advance_n(2);
320 continue;
321 }
322
323 if self.is_next_char(':') {
325 tokens.push(Token::new(TokenKind::ColonColon, location));
326 self.advance_n(2);
328 continue;
329 }
330
331 tokens.push(Token::new(TokenKind::Colon, location));
332 self.advance();
333 continue;
334 }
335
336 if char == '!' {
338 let location = self.current_source_location();
339
340 self.advance();
342 let kind = if self.is_current_char('=') {
343 self.advance();
345 TokenKind::BangEqual
346 } else {
347 TokenKind::Bang
348 };
349
350 tokens.push(Token::new(kind, location));
351 continue;
352 }
353
354 if char == '(' {
356 let location = self.current_source_location();
357 tokens.push(Token::new(TokenKind::LeftParen, location));
358 self.advance();
359 continue;
360 }
361
362 if char == ')' {
364 let location = self.current_source_location();
365 tokens.push(Token::new(TokenKind::RightParen, location));
366 self.advance();
367 continue;
368 }
369
370 if char == '[' {
372 let location = self.current_source_location();
373 tokens.push(Token::new(TokenKind::LeftBracket, location));
374 self.advance();
375 continue;
376 }
377
378 if char == ']' {
380 let location = self.current_source_location();
381 tokens.push(Token::new(TokenKind::RightBracket, location));
382 self.advance();
383 continue;
384 }
385
386 if char == ';' {
388 let location = self.current_source_location();
389 tokens.push(Token::new(TokenKind::Semicolon, location));
390 self.advance();
391 continue;
392 }
393
394 if char == ' ' || char == '\t' {
396 self.advance();
397 continue;
398 }
399
400 if char == '\n' {
401 self.advance();
402 self.column_end = 0;
403 self.line_end += 1;
404 continue;
405 }
406
407 return Err(Diagnostic::error("Unexpected character")
408 .with_location(self.current_source_location())
409 .as_boxed());
410 }
411
412 Ok(tokens)
413 }
414
415 fn consume_global_variable_name(&mut self) -> Result<Token, Box<Diagnostic>> {
416 let start_index = self.index;
417
418 self.advance();
420
421 if !self.is_current_char_func(|c| c.is_alphanumeric()) {
423 return Err(Diagnostic::error(
424 "Global variable name must start with alphabetic character",
425 )
426 .add_help("Add at least one alphabetic character after @")
427 .with_location(self.current_source_location())
428 .as_boxed());
429 }
430
431 while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
432 self.advance();
433 }
434
435 let literal = &self.content[start_index..self.index];
437 let mut string: String = literal.iter().collect();
438 string = string.to_lowercase();
439
440 let location = self.current_source_location();
441 Ok(Token::new(TokenKind::GlobalVariable(string), location))
442 }
443
444 fn consume_identifier(&mut self) -> Token {
445 let start_index = self.index;
446 while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
447 self.advance();
448 }
449
450 let literal = &self.content[start_index..self.index];
452 let mut string: String = literal.iter().collect();
453 string = string.to_lowercase();
454
455 let kind = GITQL_RESERVED_KEYWORDS
456 .get(string.as_str())
457 .cloned()
458 .unwrap_or(TokenKind::Symbol(string));
459 Token::new(kind, self.current_source_location())
460 }
461
462 fn consume_backticks_identifier(&mut self) -> Result<Token, Box<Diagnostic>> {
463 let start_index = self.index;
464
465 self.advance();
467
468 while !self.is_current_char('`') {
469 self.advance();
470 }
471
472 if self.index >= self.content_len {
473 return Err(Diagnostic::error("Unterminated backticks")
474 .add_help("Add ` at the end of the identifier")
475 .with_location(self.current_source_location())
476 .as_boxed());
477 }
478
479 self.advance();
481
482 let literal = &self.content[start_index + 1..self.index - 1];
483 let identifier: String = literal.iter().collect();
484 let location = self.current_source_location();
485 Ok(Token::new(TokenKind::Symbol(identifier), location))
486 }
487
488 fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
489 let start_index = self.index;
490
491 while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
492 self.advance();
493 }
494
495 let mut is_float_value = false;
496 if self.is_current_char('.') {
497 self.advance();
498
499 is_float_value = true;
500 while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
501 self.advance();
502 }
503 }
504
505 let literal = &self.content[start_index..self.index];
506 let string: String = literal.iter().collect();
507 let literal_num = string.replace('_', "");
508 let location = self.current_source_location();
509
510 if is_float_value {
511 return match literal_num.parse::<f64>() {
512 Ok(float) => Ok(Token::new(TokenKind::Float(float), location)),
513 Err(parse_float_error) => Err(Diagnostic::error(&parse_float_error.to_string())
514 .add_note(&format!(
515 "Value must be between {} and {}",
516 f64::MIN,
517 f64::MAX
518 ))
519 .with_location(self.current_source_location())
520 .as_boxed()),
521 };
522 }
523
524 match literal_num.parse::<i64>() {
525 Ok(integer) => Ok(Token::new(TokenKind::Integer(integer), location)),
526 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
527 .add_note(&format!(
528 "Value must be between {} and {}",
529 i64::MIN,
530 i64::MAX
531 ))
532 .with_location(self.current_source_location())
533 .as_boxed()),
534 }
535 }
536
537 fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
538 let start_index = self.index;
539 while self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
540 self.advance();
541 }
542
543 if start_index == self.index {
544 return Err(
545 Diagnostic::error("Missing digits after the integer base prefix")
546 .add_help("Expect at least one binary digits after the prefix 0b")
547 .add_help("Binary digit mean 0 or 1")
548 .with_location(self.current_source_location())
549 .as_boxed(),
550 );
551 }
552
553 let literal = &self.content[start_index..self.index];
554 let string: String = literal.iter().collect();
555 let literal_num = string.replace('_', "");
556
557 const BINARY_RADIX: u32 = 2;
558 match i64::from_str_radix(&literal_num, BINARY_RADIX) {
559 Ok(integer) => {
560 let location = self.current_source_location();
561 Ok(Token::new(TokenKind::Integer(integer), location))
562 }
563 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
564 .add_note(&format!(
565 "Value must be between {} and {}",
566 i64::MIN,
567 i64::MAX
568 ))
569 .with_location(self.current_source_location())
570 .as_boxed()),
571 }
572 }
573
574 fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
575 let start_index = self.index;
576 while self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c)) {
577 self.advance();
578 }
579
580 if start_index == self.index {
581 return Err(
582 Diagnostic::error("Missing digits after the integer base prefix")
583 .add_help("Expect at least one octal digits after the prefix 0o")
584 .add_help("Octal digit mean 0 to 8 number")
585 .with_location(self.current_source_location())
586 .as_boxed(),
587 );
588 }
589
590 let literal = &self.content[start_index..self.index];
591 let string: String = literal.iter().collect();
592 let literal_num = string.replace('_', "");
593
594 const OCTAL_RADIX: u32 = 8;
595 match i64::from_str_radix(&literal_num, OCTAL_RADIX) {
596 Ok(integer) => {
597 let location = self.current_source_location();
598 Ok(Token::new(TokenKind::Integer(integer), location))
599 }
600 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
601 .add_note(&format!(
602 "Value must be between {} and {}",
603 i64::MIN,
604 i64::MAX
605 ))
606 .with_location(self.current_source_location())
607 .as_boxed()),
608 }
609 }
610
611 fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
612 let start_index = self.index;
613 while self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
614 self.advance();
615 }
616
617 if start_index == self.index {
618 return Err(
619 Diagnostic::error("Missing digits after the integer base prefix")
620 .add_help("Expect at least one hex digits after the prefix 0x")
621 .add_help("Hex digit mean 0 to 9 and a to f")
622 .with_location(self.current_source_location())
623 .as_boxed(),
624 );
625 }
626
627 let literal = &self.content[start_index..self.index];
628 let string: String = literal.iter().collect();
629 let literal_num = string.replace('_', "");
630
631 const HEX_RADIX: u32 = 16;
632 match i64::from_str_radix(&literal_num, HEX_RADIX) {
633 Ok(integer) => {
634 let location = self.current_source_location();
635 Ok(Token::new(TokenKind::Integer(integer), location))
636 }
637 Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
638 .add_note(&format!(
639 "Value must be between {} and {}",
640 i64::MIN,
641 i64::MAX
642 ))
643 .with_location(self.current_source_location())
644 .as_boxed()),
645 }
646 }
647
648 fn consume_string_in_single_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
649 let buffer = self.consume_string_with_around('\'')?;
650
651 if self.index >= self.content_len {
652 return Err(Diagnostic::error("Unterminated single quote string")
653 .add_help("Add \' at the end of the String literal")
654 .with_location(self.current_source_location())
655 .as_boxed());
656 }
657
658 self.advance();
660 let location = self.current_source_location();
661 Ok(Token::new(TokenKind::String(buffer), location))
662 }
663
664 fn consume_string_in_double_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
665 let buffer = self.consume_string_with_around('"')?;
666
667 if self.index >= self.content_len {
668 return Err(Diagnostic::error("Unterminated double quote string")
669 .add_help("Add \" at the end of the String literal")
670 .with_location(self.current_source_location())
671 .as_boxed());
672 }
673
674 self.advance();
676 let location = self.current_source_location();
677 Ok(Token::new(TokenKind::String(buffer), location))
678 }
679
680 fn consume_string_with_around(&mut self, around: char) -> Result<String, Box<Diagnostic>> {
681 self.advance();
683
684 let mut buffer = String::new();
685 while !self.is_current_char(around) {
686 if !self.is_current_char('\\') {
687 buffer.push(self.content[self.index]);
688 self.advance();
689 continue;
690 }
691
692 if self.is_last() {
694 buffer.push(self.content[self.index]);
695 self.advance();
696 continue;
697 }
698
699 self.advance();
701
702 let next_char = self.content[self.index];
704 let character_with_escape_handled = match next_char {
705 '\'' => {
707 self.advance();
708 '\''
709 }
710 '\"' => {
712 self.advance();
713 '\"'
714 }
715 '\\' => {
717 self.advance();
718 '\\'
719 }
720 'n' => {
722 self.advance();
723 '\n'
724 }
725 'r' => {
727 self.advance();
728 '\r'
729 }
730 't' => {
732 self.advance();
733 '\t'
734 }
735 _ => self.content[self.index - 1],
736 };
737
738 buffer.push(character_with_escape_handled);
739 }
740
741 Ok(buffer)
742 }
743
744 fn consume_single_line_comment(&mut self) {
745 self.advance_n(2);
747
748 while !self.is_current_char('\n') {
749 self.advance();
750 }
751
752 self.advance();
754 self.line_end += 1;
755 self.column_end = 0;
756 }
757
758 fn consume_c_style_block_comment(&mut self) -> Result<(), Box<Diagnostic>> {
759 self.advance_n(2);
761
762 let mut number_nested_block_start = 0;
763 loop {
764 if self.is_current_char('/') && self.is_next_char('*') {
765 number_nested_block_start += 1;
766 }
767
768 self.advance();
770
771 if self.is_current_char('*') && self.is_next_char('/') {
772 number_nested_block_start -= 1;
773 if number_nested_block_start < 0 {
774 break;
775 }
776 }
777 }
778
779 if self.index + 2 > self.content_len {
780 return Err(Diagnostic::error("C Style comment must end with */")
781 .add_help("Add */ at the end of C Style comments")
782 .with_location(self.current_source_location())
783 .as_boxed());
784 }
785
786 self.advance_n(2);
788 Ok(())
789 }
790
791 fn advance(&mut self) {
792 self.index += 1;
793 self.column_end += 1;
794 }
795
796 fn advance_n(&mut self, n: usize) {
797 self.index += n;
798 self.column_end += n as u32;
799 }
800
801 fn is_current_char(&self, ch: char) -> bool {
802 self.index < self.content_len && self.content[self.index] == ch
803 }
804
805 fn is_next_char(&self, ch: char) -> bool {
806 self.index + 1 < self.content_len && self.content[self.index + 1] == ch
807 }
808
809 fn is_current_char_func(&self, func: fn(char) -> bool) -> bool {
810 self.index < self.content_len && func(self.content[self.index])
811 }
812
813 fn has_next(&self) -> bool {
814 self.index < self.content_len
815 }
816
817 fn is_last(&self) -> bool {
818 self.index == self.content_len - 1
819 }
820}