gitql_parser/
tokenizer.rs

1use crate::diagnostic::Diagnostic;
2use crate::token::SourceLocation;
3use crate::token::Token;
4use crate::token::TokenKind;
5
/// Cursor-based lexer state: the input characters plus the position and
/// line/column bookkeeping used to build a `SourceLocation` for each token.
pub struct Tokenizer {
    /// Input text, stored as individual characters so it can be indexed by position.
    pub(crate) content: Vec<char>,
    /// Number of characters in `content`, captured once in `new`.
    pub(crate) content_len: usize,
    /// Index into `content` of the next character to be read.
    pub(crate) index: usize,

    /// Line on which the token currently being scanned starts (initialised to 1).
    pub(crate) line_start: u32,
    /// Line the cursor is currently on (initialised to 1).
    pub(crate) line_end: u32,
    /// Column at which the token currently being scanned starts (initialised to 0).
    pub(crate) column_start: u32,
    /// Column of the cursor; incremented by `advance`/`advance_n`.
    pub(crate) column_end: u32,
}
16
17impl Tokenizer {
18    pub(crate) fn new(chars: Vec<char>) -> Tokenizer {
19        let content_len = chars.len();
20        Tokenizer {
21            content: chars,
22            content_len,
23            index: 0,
24
25            line_start: 1,
26            line_end: 1,
27            column_start: 0,
28            column_end: 0,
29        }
30    }
31
32    pub fn tokenize(content: String) -> Result<Vec<Token>, Box<Diagnostic>> {
33        let mut tokenizer = Tokenizer::new(content.chars().collect());
34        tokenizer.tokenize_characters()
35    }
36
37    fn current_source_location(&self) -> SourceLocation {
38        SourceLocation {
39            line_start: self.line_start,
40            line_end: self.line_end,
41            column_start: self.column_start,
42            column_end: self.column_end,
43        }
44    }
45
46    fn tokenize_characters(&mut self) -> Result<Vec<Token>, Box<Diagnostic>> {
47        let mut tokens: Vec<Token> = Vec::new();
48        let len = self.content_len;
49
50        while self.has_next() {
51            self.column_start = self.column_end;
52            self.line_start = self.line_end;
53
54            let char = self.content[self.index];
55
56            // Symbol
57            if char.is_alphabetic() {
58                tokens.push(self.consume_identifier());
59                continue;
60            }
61
62            // @> or Global Variable Symbol
63            if char == '@' {
64                // @>
65                if self.index + 1 < len && self.content[self.index + 1] == '>' {
66                    self.index += 2;
67                    let location = self.current_source_location();
68                    tokens.push(Token::new(TokenKind::AtRightArrow, location));
69                    continue;
70                }
71
72                tokens.push(self.consume_global_variable_name()?);
73                continue;
74            }
75
76            // Number
77            if char.is_numeric() {
78                if char == '0' && self.index + 1 < len {
79                    if self.content[self.index + 1] == 'x' {
80                        self.index += 2;
81                        self.column_start += 2;
82                        tokens.push(self.consume_hex_number()?);
83                        continue;
84                    }
85
86                    if self.content[self.index + 1] == 'b' {
87                        self.index += 2;
88                        self.column_start += 2;
89                        tokens.push(self.consume_binary_number()?);
90                        continue;
91                    }
92
93                    if self.content[self.index + 1] == 'o' {
94                        self.index += 2;
95                        self.column_start += 2;
96                        tokens.push(self.consume_octal_number()?);
97                        continue;
98                    }
99                }
100
101                tokens.push(self.consume_number()?);
102                continue;
103            }
104
105            // String literal between single quotes '...'
106            if char == '\'' {
107                tokens.push(self.consume_string_in_single_quotes()?);
108                continue;
109            }
110
111            // String literal between double quotes "..."
112            if char == '"' {
113                tokens.push(self.consume_string_in_double_quotes()?);
114                continue;
115            }
116
117            // All chars between two backticks should be consumed as identifier
118            if char == '`' {
119                tokens.push(self.consume_backticks_identifier()?);
120                continue;
121            }
122
123            // Plus
124            if char == '+' {
125                let location = self.current_source_location();
126                tokens.push(Token::new(TokenKind::Plus, location));
127                self.advance();
128                continue;
129            }
130
131            // Minus
132            if char == '-' {
133                // Ignore single line comment which from -- until the end of the current line
134                if self.index + 1 < self.content_len && self.content[self.index + 1] == '-' {
135                    self.ignore_single_line_comment();
136                    continue;
137                }
138
139                let location = self.current_source_location();
140                tokens.push(Token::new(TokenKind::Minus, location));
141                self.advance();
142                continue;
143            }
144
145            // Star
146            if char == '*' {
147                let location = self.current_source_location();
148                tokens.push(Token::new(TokenKind::Star, location));
149                self.advance();
150                continue;
151            }
152
153            // Slash
154            if char == '/' {
155                // Ignore C style comment which from /* comment */
156                if self.index + 1 < self.content_len && self.content[self.index + 1] == '*' {
157                    self.ignore_c_style_comment()?;
158                    continue;
159                }
160
161                let location = self.current_source_location();
162                tokens.push(Token::new(TokenKind::Slash, location));
163                self.advance();
164                continue;
165            }
166
167            // Percentage
168            if char == '%' {
169                let location = self.current_source_location();
170                tokens.push(Token::new(TokenKind::Percentage, location));
171                self.advance();
172                continue;
173            }
174
175            // Caret
176            if char == '^' {
177                let location = self.current_source_location();
178                tokens.push(Token::new(TokenKind::Caret, location));
179                self.advance();
180                continue;
181            }
182
183            // Bitwise NOT
184            if char == '~' {
185                let location = self.current_source_location();
186                tokens.push(Token::new(TokenKind::BitwiseNot, location));
187                self.advance();
188                continue;
189            }
190
191            // Or
192            if char == '|' {
193                let location = self.current_source_location();
194
195                self.advance();
196                let kind = if self.index < len && self.content[self.index] == '|' {
197                    self.advance();
198                    TokenKind::OrOr
199                } else {
200                    TokenKind::BitwiseOr
201                };
202
203                tokens.push(Token::new(kind, location));
204                continue;
205            }
206
207            // And
208            if char == '&' {
209                let location = self.current_source_location();
210
211                self.advance();
212                let kind = if self.index < len && self.content[self.index] == '&' {
213                    self.advance();
214                    TokenKind::AndAnd
215                } else {
216                    TokenKind::BitwiseAnd
217                };
218
219                tokens.push(Token::new(kind, location));
220                continue;
221            }
222
223            // xor
224            if char == '#' {
225                let location = self.current_source_location();
226                tokens.push(Token::new(TokenKind::BitwiseXor, location));
227                self.advance();
228                continue;
229            }
230
231            // Comma
232            if char == ',' {
233                let location = self.current_source_location();
234                tokens.push(Token::new(TokenKind::Comma, location));
235                self.advance();
236                continue;
237            }
238
239            // Dot
240            if char == '.' {
241                let location = self.current_source_location();
242                tokens.push(Token::new(TokenKind::Dot, location));
243                self.advance();
244                continue;
245            }
246
247            // Greater or GreaterEqual
248            if char == '>' {
249                let location = self.current_source_location();
250
251                self.advance();
252                let kind = if self.index < len && self.content[self.index] == '=' {
253                    self.advance();
254                    TokenKind::GreaterEqual
255                } else if self.index < len && self.content[self.index] == '>' {
256                    self.advance();
257                    TokenKind::BitwiseRightShift
258                } else {
259                    TokenKind::Greater
260                };
261
262                tokens.push(Token::new(kind, location));
263                continue;
264            }
265
266            // Less, LessEqual or NULL-safe equal
267            if char == '<' {
268                let location = self.current_source_location();
269
270                self.advance();
271                let kind = if self.index < len && self.content[self.index] == '=' {
272                    self.advance();
273                    if self.index < len && self.content[self.index] == '>' {
274                        self.advance();
275                        TokenKind::NullSafeEqual
276                    } else {
277                        TokenKind::LessEqual
278                    }
279                } else if self.index < len && self.content[self.index] == '<' {
280                    self.advance();
281                    TokenKind::BitwiseLeftShift
282                } else if self.index < len && self.content[self.index] == '>' {
283                    self.advance();
284                    TokenKind::BangEqual
285                } else if self.index < len && self.content[self.index] == '@' {
286                    self.advance();
287                    TokenKind::ArrowRightAt
288                } else {
289                    TokenKind::Less
290                };
291
292                tokens.push(Token::new(kind, location));
293                continue;
294            }
295
296            // Equal
297            if char == '=' {
298                let location = self.current_source_location();
299                tokens.push(Token::new(TokenKind::Equal, location));
300                self.advance();
301                continue;
302            }
303
304            // Colon , ColonColon or Colon Equal
305            if char == ':' {
306                let location = self.current_source_location();
307
308                // :=
309                if self.index + 1 < len && self.content[self.index + 1] == '=' {
310                    tokens.push(Token::new(TokenKind::ColonEqual, location));
311                    // Advance `:=`
312                    self.advance_n(2);
313                    continue;
314                }
315
316                // ::
317                if self.index + 1 < len && self.content[self.index + 1] == ':' {
318                    tokens.push(Token::new(TokenKind::ColonColon, location));
319                    // Advance `::`
320                    self.advance_n(2);
321                    continue;
322                }
323
324                tokens.push(Token::new(TokenKind::Colon, location));
325                self.advance();
326                continue;
327            }
328
329            // Bang or Bang Equal
330            if char == '!' {
331                let location = self.current_source_location();
332
333                // Consume `!`
334                self.advance();
335                let kind = if self.index < len && self.content[self.index] == '=' {
336                    // Consume `=`
337                    self.advance();
338                    TokenKind::BangEqual
339                } else {
340                    TokenKind::Bang
341                };
342
343                tokens.push(Token::new(kind, location));
344                continue;
345            }
346
347            // Left Paren
348            if char == '(' {
349                let location = self.current_source_location();
350                tokens.push(Token::new(TokenKind::LeftParen, location));
351                self.advance();
352                continue;
353            }
354
355            // Right Paren
356            if char == ')' {
357                let location = self.current_source_location();
358                tokens.push(Token::new(TokenKind::RightParen, location));
359                self.advance();
360                continue;
361            }
362
363            // Left Bracket
364            if char == '[' {
365                let location = self.current_source_location();
366                tokens.push(Token::new(TokenKind::LeftBracket, location));
367                self.advance();
368                continue;
369            }
370
371            // Right Bracket
372            if char == ']' {
373                let location = self.current_source_location();
374                tokens.push(Token::new(TokenKind::RightBracket, location));
375                self.advance();
376                continue;
377            }
378
379            // Semicolon
380            if char == ';' {
381                let location = self.current_source_location();
382                tokens.push(Token::new(TokenKind::Semicolon, location));
383                self.advance();
384                continue;
385            }
386
387            // Characters to ignoring
388            if char == ' ' || char == '\t' {
389                self.advance();
390                continue;
391            }
392
393            if char == '\n' {
394                self.advance();
395                self.column_end = 0;
396                self.line_end += 1;
397                continue;
398            }
399
400            return Err(Diagnostic::error("Unexpected character")
401                .with_location(self.current_source_location())
402                .as_boxed());
403        }
404
405        Ok(tokens)
406    }
407
408    fn consume_global_variable_name(&mut self) -> Result<Token, Box<Diagnostic>> {
409        let start_index = self.index;
410
411        // Advance `@`
412        self.advance();
413
414        // Make sure first character is  alphabetic
415        if self.has_next() && !self.content[self.index].is_alphabetic() {
416            return Err(Diagnostic::error(
417                "Global variable name must start with alphabetic character",
418            )
419            .add_help("Add at least one alphabetic character after @")
420            .with_location(self.current_source_location())
421            .as_boxed());
422        }
423
424        while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
425            self.advance();
426        }
427
428        // Identifier is being case-insensitive by default, convert to lowercase to be easy to compare and lookup
429        let literal = &self.content[start_index..self.index];
430        let mut string: String = literal.iter().collect();
431        string = string.to_lowercase();
432
433        let location = self.current_source_location();
434        Ok(Token::new(TokenKind::GlobalVariable(string), location))
435    }
436
437    fn consume_identifier(&mut self) -> Token {
438        let start_index = self.index;
439
440        while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
441            self.advance();
442        }
443
444        // Identifier is being case-insensitive by default, convert to lowercase to be easy to compare and lookup
445        let literal = &self.content[start_index..self.index];
446        let mut string: String = literal.iter().collect();
447        string = string.to_lowercase();
448
449        let location = self.current_source_location();
450        Token::new_symbol(string, location)
451    }
452
453    fn consume_backticks_identifier(&mut self) -> Result<Token, Box<Diagnostic>> {
454        let start_index = self.index;
455
456        // Advance '`'
457        self.advance();
458
459        while self.has_next() && !self.is_current_char('`') {
460            self.advance();
461        }
462
463        if self.index >= self.content_len {
464            return Err(Diagnostic::error("Unterminated backticks")
465                .add_help("Add ` at the end of the identifier")
466                .with_location(self.current_source_location())
467                .as_boxed());
468        }
469
470        // Advance '`'
471        self.advance();
472
473        let literal = &self.content[start_index + 1..self.index - 1];
474        let identifier: String = literal.iter().collect();
475        let location = self.current_source_location();
476        Ok(Token::new(TokenKind::Symbol(identifier), location))
477    }
478
479    fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
480        let start_index = self.index;
481
482        while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
483            self.advance();
484        }
485
486        let mut is_float_value = false;
487        if self.has_next() && self.is_current_char('.') {
488            self.advance();
489
490            is_float_value = true;
491            while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
492                self.advance();
493            }
494        }
495
496        let literal = &self.content[start_index..self.index];
497        let string: String = literal.iter().collect();
498        let literal_num = string.replace('_', "");
499        let location = self.current_source_location();
500
501        if is_float_value {
502            return match literal_num.parse::<f64>() {
503                Ok(float) => Ok(Token::new(TokenKind::Float(float), location)),
504                Err(parse_float_error) => Err(Diagnostic::error(&parse_float_error.to_string())
505                    .add_note(&format!(
506                        "Value must be between {} and {}",
507                        f64::MIN,
508                        f64::MAX
509                    ))
510                    .with_location(self.current_source_location())
511                    .as_boxed()),
512            };
513        }
514
515        match literal_num.parse::<i64>() {
516            Ok(integer) => Ok(Token::new(TokenKind::Integer(integer), location)),
517            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
518                .add_note(&format!(
519                    "Value must be between {} and {}",
520                    i64::MIN,
521                    i64::MAX
522                ))
523                .with_location(self.current_source_location())
524                .as_boxed()),
525        }
526    }
527
528    fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
529        let start_index = self.index;
530        let mut has_digit = false;
531
532        while self.has_next() && self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
533            self.advance();
534            has_digit = true;
535        }
536
537        if !has_digit {
538            return Err(
539                Diagnostic::error("Missing digits after the integer base prefix")
540                    .add_help("Expect at least one binary digits after the prefix 0b")
541                    .add_help("Binary digit mean 0 or 1")
542                    .with_location(self.current_source_location())
543                    .as_boxed(),
544            );
545        }
546
547        let literal = &self.content[start_index..self.index];
548        let string: String = literal.iter().collect();
549        let literal_num = string.replace('_', "");
550
551        const BINARY_RADIX: u32 = 2;
552        match i64::from_str_radix(&literal_num, BINARY_RADIX) {
553            Ok(integer) => {
554                let location = self.current_source_location();
555                Ok(Token::new(TokenKind::Integer(integer), location))
556            }
557            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
558                .add_note(&format!(
559                    "Value must be between {} and {}",
560                    i64::MIN,
561                    i64::MAX
562                ))
563                .with_location(self.current_source_location())
564                .as_boxed()),
565        }
566    }
567
568    fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
569        let start_index = self.index;
570        let mut has_digit = false;
571
572        while self.has_next() && self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c))
573        {
574            self.advance();
575            has_digit = true;
576        }
577
578        if !has_digit {
579            return Err(
580                Diagnostic::error("Missing digits after the integer base prefix")
581                    .add_help("Expect at least one octal digits after the prefix 0o")
582                    .add_help("Octal digit mean 0 to 8 number")
583                    .with_location(self.current_source_location())
584                    .as_boxed(),
585            );
586        }
587
588        let literal = &self.content[start_index..self.index];
589        let string: String = literal.iter().collect();
590        let literal_num = string.replace('_', "");
591
592        const OCTAL_RADIX: u32 = 2;
593        match i64::from_str_radix(&literal_num, OCTAL_RADIX) {
594            Ok(integer) => {
595                let location = self.current_source_location();
596                Ok(Token::new(TokenKind::Integer(integer), location))
597            }
598            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
599                .add_note(&format!(
600                    "Value must be between {} and {}",
601                    i64::MIN,
602                    i64::MAX
603                ))
604                .with_location(self.current_source_location())
605                .as_boxed()),
606        }
607    }
608
609    fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
610        let start_index = self.index;
611        let mut has_digit = false;
612
613        while self.has_next() && self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
614            self.advance();
615            has_digit = true;
616        }
617
618        if !has_digit {
619            return Err(
620                Diagnostic::error("Missing digits after the integer base prefix")
621                    .add_help("Expect at least one hex digits after the prefix 0x")
622                    .add_help("Hex digit mean 0 to 9 and a to f")
623                    .with_location(self.current_source_location())
624                    .as_boxed(),
625            );
626        }
627
628        let literal = &self.content[start_index..self.index];
629        let string: String = literal.iter().collect();
630        let literal_num = string.replace('_', "");
631
632        const HEX_RADIX: u32 = 16;
633        match i64::from_str_radix(&literal_num, HEX_RADIX) {
634            Ok(integer) => {
635                let location = self.current_source_location();
636                Ok(Token::new(TokenKind::Integer(integer), location))
637            }
638            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
639                .add_note(&format!(
640                    "Value must be between {} and {}",
641                    i64::MIN,
642                    i64::MAX
643                ))
644                .with_location(self.current_source_location())
645                .as_boxed()),
646        }
647    }
648
649    fn consume_string_in_single_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
650        let buffer = self.consume_string_with_around('\'')?;
651
652        if self.index >= self.content_len {
653            return Err(Diagnostic::error("Unterminated single quote string")
654                .add_help("Add \' at the end of the String literal")
655                .with_location(self.current_source_location())
656                .as_boxed());
657        }
658
659        // Consume `'`
660        self.advance();
661        let location = self.current_source_location();
662        Ok(Token::new(TokenKind::String(buffer), location))
663    }
664
665    fn consume_string_in_double_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
666        let buffer = self.consume_string_with_around('"')?;
667
668        if self.index >= self.content_len {
669            return Err(Diagnostic::error("Unterminated double quote string")
670                .add_help("Add \" at the end of the String literal")
671                .with_location(self.current_source_location())
672                .as_boxed());
673        }
674
675        // Consume `"`
676        self.advance();
677        let location = self.current_source_location();
678        Ok(Token::new(TokenKind::String(buffer), location))
679    }
680
681    fn consume_string_with_around(&mut self, around: char) -> Result<String, Box<Diagnostic>> {
682        // Consume Around start
683        self.advance();
684
685        let mut buffer = String::new();
686        while self.has_next() && self.content[self.index] != around {
687            if !self.is_current_char('\\') {
688                buffer.push(self.content[self.index]);
689                self.advance();
690                continue;
691            }
692
693            // If '\\' is the last character, we don't need to escape it
694            if self.is_last() {
695                buffer.push(self.content[self.index]);
696                self.advance();
697                continue;
698            }
699
700            // Consume '\\'
701            self.advance();
702
703            // Check possible escape depending on the next character
704            let next_char = self.content[self.index];
705            let character_with_escape_handled = match next_char {
706                // Single quote
707                '\'' => {
708                    self.advance();
709                    '\''
710                }
711                // Double quote
712                '\"' => {
713                    self.advance();
714                    '\"'
715                }
716                // Backslash
717                '\\' => {
718                    self.advance();
719                    '\\'
720                }
721                // New line
722                'n' => {
723                    self.advance();
724                    '\n'
725                }
726                // Carriage return
727                'r' => {
728                    self.advance();
729                    '\r'
730                }
731                // Tab
732                't' => {
733                    self.advance();
734                    '\t'
735                }
736                _ => self.content[self.index - 1],
737            };
738
739            buffer.push(character_with_escape_handled);
740        }
741
742        Ok(buffer)
743    }
744
745    fn ignore_single_line_comment(&mut self) {
746        // Advance `--`
747        self.advance_n(2);
748
749        while self.has_next() && !self.is_current_char('\n') {
750            self.advance();
751        }
752
753        // Advance `\n`
754        self.advance();
755        self.line_end += 1;
756        self.column_end = 0;
757    }
758
759    fn ignore_c_style_comment(&mut self) -> Result<(), Box<Diagnostic>> {
760        // Advance `/*`
761        self.advance_n(2);
762
763        while self.index + 1 < self.content_len
764            && (!self.is_current_char('*') && self.content[self.index + 1] != '/')
765        {
766            // Advance char
767            self.advance();
768        }
769
770        if self.index + 2 > self.content_len {
771            return Err(Diagnostic::error("C Style comment must end with */")
772                .add_help("Add */ at the end of C Style comments")
773                .with_location(self.current_source_location())
774                .as_boxed());
775        }
776
777        // Advance `*/`
778        self.advance_n(2);
779        Ok(())
780    }
781
782    fn advance(&mut self) {
783        self.index += 1;
784        self.column_end += 1;
785    }
786
787    fn advance_n(&mut self, n: usize) {
788        self.index += n;
789        self.column_end += n as u32;
790    }
791
792    fn is_current_char(&self, ch: char) -> bool {
793        self.content[self.index] == ch
794    }
795
796    fn is_current_char_func(&self, func: fn(char) -> bool) -> bool {
797        func(self.content[self.index])
798    }
799
800    fn has_next(&self) -> bool {
801        self.index < self.content_len
802    }
803
804    fn is_last(&self) -> bool {
805        self.index == self.content_len - 1
806    }
807}