gitql_parser/
tokenizer.rs

1use crate::diagnostic::Diagnostic;
2use crate::token::SourceLocation;
3use crate::token::Token;
4use crate::token::TokenKind;
5use crate::token::GITQL_RESERVED_KEYWORDS;
6
/// Cursor-based lexer state over a query that has been split into characters.
///
/// The `line_*` / `column_*` fields are the values copied into the
/// `SourceLocation` attached to tokens and diagnostics.
pub struct Tokenizer<'a> {
    /// Characters of the query being scanned.
    content: &'a [char],
    /// Length of `content`, cached at construction and used for bounds checks.
    content_len: usize,
    /// Position in `content` of the character currently being examined.
    index: usize,

    /// Line recorded at the start of the current scan step.
    line_start: u32,
    /// Line of the cursor; incremented each time a line break is consumed.
    line_end: u32,
    /// Column recorded at the start of the current scan step.
    column_start: u32,
    /// Column of the cursor; incremented by every advance and reset on a new line.
    column_end: u32,
}
17
18impl<'a> Tokenizer<'a> {
19    pub(crate) fn new(chars: &'a [char]) -> Tokenizer<'a> {
20        Tokenizer {
21            content: chars,
22            content_len: chars.len(),
23            index: 0,
24
25            line_start: 1,
26            line_end: 1,
27            column_start: 0,
28            column_end: 0,
29        }
30    }
31
32    pub fn tokenize(chars: &'a str) -> Result<Vec<Token>, Box<Diagnostic>> {
33        let chars: Vec<char> = chars.chars().collect();
34        Tokenizer::new(&chars).tokenize_characters()
35    }
36
37    fn current_source_location(&self) -> SourceLocation {
38        SourceLocation {
39            line_start: self.line_start,
40            line_end: self.line_end,
41            column_start: self.column_start,
42            column_end: self.column_end,
43        }
44    }
45
46    fn tokenize_characters(&mut self) -> Result<Vec<Token>, Box<Diagnostic>> {
47        let mut tokens: Vec<Token> = Vec::new();
48        let len = self.content_len;
49
50        while self.has_next() {
51            self.column_start = self.column_end;
52            self.line_start = self.line_end;
53
54            let char = self.content[self.index];
55
56            // Symbol
57            if char.is_alphabetic() {
58                tokens.push(self.consume_identifier());
59                continue;
60            }
61
62            // @> or Global Variable Symbol
63            if char == '@' {
64                // @>
65                if self.is_next_char('>') {
66                    self.index += 2;
67                    let location = self.current_source_location();
68                    tokens.push(Token::new(TokenKind::AtRightArrow, location));
69                    continue;
70                }
71
72                tokens.push(self.consume_global_variable_name()?);
73                continue;
74            }
75
76            // Number
77            if char.is_numeric() {
78                if char == '0' && self.index + 1 < len {
79                    match self.content[self.index + 1] {
80                        // bindigits
81                        'b' | 'B' => {
82                            self.index += 2;
83                            self.column_start += 2;
84                            tokens.push(self.consume_binary_number()?);
85                            continue;
86                        }
87                        // hexdigits
88                        'x' | 'X' => {
89                            self.index += 2;
90                            self.column_start += 2;
91                            tokens.push(self.consume_hex_number()?);
92                            continue;
93                        }
94                        // octdigits
95                        'o' | 'O' => {
96                            self.index += 2;
97                            self.column_start += 2;
98                            tokens.push(self.consume_octal_number()?);
99                            continue;
100                        }
101                        _ => {
102                            tokens.push(self.consume_number()?);
103                            continue;
104                        }
105                    }
106                }
107
108                tokens.push(self.consume_number()?);
109                continue;
110            }
111
112            // String literal between single quotes '...'
113            if char == '\'' {
114                tokens.push(self.consume_string_in_single_quotes()?);
115                continue;
116            }
117
118            // String literal between double quotes "..."
119            if char == '"' {
120                tokens.push(self.consume_string_in_double_quotes()?);
121                continue;
122            }
123
124            // All chars between two backticks should be consumed as identifier
125            if char == '`' {
126                tokens.push(self.consume_backticks_identifier()?);
127                continue;
128            }
129
130            // Plus
131            if char == '+' {
132                let location = self.current_source_location();
133                tokens.push(Token::new(TokenKind::Plus, location));
134                self.advance();
135                continue;
136            }
137
138            // Minus
139            if char == '-' {
140                // Ignore single line comment which from -- until the end of the current line
141                if self.is_next_char('-') {
142                    self.consume_single_line_comment();
143                    continue;
144                }
145
146                let location = self.current_source_location();
147                tokens.push(Token::new(TokenKind::Minus, location));
148                self.advance();
149                continue;
150            }
151
152            // Star
153            if char == '*' {
154                let location = self.current_source_location();
155                tokens.push(Token::new(TokenKind::Star, location));
156                self.advance();
157                continue;
158            }
159
160            // Slash
161            if char == '/' {
162                // Ignore C style comment which from /* comment */
163                if self.is_next_char('*') {
164                    self.consume_c_style_block_comment()?;
165                    continue;
166                }
167
168                let location = self.current_source_location();
169                tokens.push(Token::new(TokenKind::Slash, location));
170                self.advance();
171                continue;
172            }
173
174            // Percentage
175            if char == '%' {
176                let location = self.current_source_location();
177                tokens.push(Token::new(TokenKind::Percentage, location));
178                self.advance();
179                continue;
180            }
181
182            // Caret
183            if char == '^' {
184                let location = self.current_source_location();
185                tokens.push(Token::new(TokenKind::Caret, location));
186                self.advance();
187                continue;
188            }
189
190            // Bitwise NOT
191            if char == '~' {
192                let location = self.current_source_location();
193                tokens.push(Token::new(TokenKind::BitwiseNot, location));
194                self.advance();
195                continue;
196            }
197
198            // Or
199            if char == '|' {
200                let location = self.current_source_location();
201
202                self.advance();
203                let kind = if self.is_current_char('|') {
204                    self.advance();
205                    TokenKind::OrOr
206                } else {
207                    TokenKind::BitwiseOr
208                };
209
210                tokens.push(Token::new(kind, location));
211                continue;
212            }
213
214            // And
215            if char == '&' {
216                let location = self.current_source_location();
217
218                self.advance();
219                let kind = if self.is_current_char('&') {
220                    self.advance();
221                    TokenKind::AndAnd
222                } else {
223                    TokenKind::BitwiseAnd
224                };
225
226                tokens.push(Token::new(kind, location));
227                continue;
228            }
229
230            // xor
231            if char == '#' {
232                let location = self.current_source_location();
233                tokens.push(Token::new(TokenKind::BitwiseXor, location));
234                self.advance();
235                continue;
236            }
237
238            // Comma
239            if char == ',' {
240                let location = self.current_source_location();
241                tokens.push(Token::new(TokenKind::Comma, location));
242                self.advance();
243                continue;
244            }
245
246            // Dot
247            if char == '.' {
248                let location = self.current_source_location();
249                tokens.push(Token::new(TokenKind::Dot, location));
250                self.advance();
251                continue;
252            }
253
254            // Greater or GreaterEqual
255            if char == '>' {
256                let location = self.current_source_location();
257
258                self.advance();
259                let kind = if self.is_current_char('=') {
260                    self.advance();
261                    TokenKind::GreaterEqual
262                } else if self.is_current_char('>') {
263                    self.advance();
264                    TokenKind::BitwiseRightShift
265                } else {
266                    TokenKind::Greater
267                };
268
269                tokens.push(Token::new(kind, location));
270                continue;
271            }
272
273            // Less, LessEqual or NULL-safe equal
274            if char == '<' {
275                let location = self.current_source_location();
276
277                self.advance();
278                let kind = if self.is_current_char('=') {
279                    self.advance();
280                    if self.is_current_char('>') {
281                        self.advance();
282                        TokenKind::NullSafeEqual
283                    } else {
284                        TokenKind::LessEqual
285                    }
286                } else if self.is_current_char('<') {
287                    self.advance();
288                    TokenKind::BitwiseLeftShift
289                } else if self.is_current_char('>') {
290                    self.advance();
291                    TokenKind::BangEqual
292                } else if self.is_current_char('@') {
293                    self.advance();
294                    TokenKind::ArrowRightAt
295                } else {
296                    TokenKind::Less
297                };
298
299                tokens.push(Token::new(kind, location));
300                continue;
301            }
302
303            // Equal
304            if char == '=' {
305                let location = self.current_source_location();
306                tokens.push(Token::new(TokenKind::Equal, location));
307                self.advance();
308                continue;
309            }
310
311            // Colon , ColonColon or Colon Equal
312            if char == ':' {
313                let location = self.current_source_location();
314
315                // :=
316                if self.is_next_char('=') {
317                    tokens.push(Token::new(TokenKind::ColonEqual, location));
318                    // Advance `:=`
319                    self.advance_n(2);
320                    continue;
321                }
322
323                // ::
324                if self.is_next_char(':') {
325                    tokens.push(Token::new(TokenKind::ColonColon, location));
326                    // Advance `::`
327                    self.advance_n(2);
328                    continue;
329                }
330
331                tokens.push(Token::new(TokenKind::Colon, location));
332                self.advance();
333                continue;
334            }
335
336            // Bang or Bang Equal
337            if char == '!' {
338                let location = self.current_source_location();
339
340                // Consume `!`
341                self.advance();
342                let kind = if self.is_current_char('=') {
343                    // Consume `=`
344                    self.advance();
345                    TokenKind::BangEqual
346                } else {
347                    TokenKind::Bang
348                };
349
350                tokens.push(Token::new(kind, location));
351                continue;
352            }
353
354            // Left Paren
355            if char == '(' {
356                let location = self.current_source_location();
357                tokens.push(Token::new(TokenKind::LeftParen, location));
358                self.advance();
359                continue;
360            }
361
362            // Right Paren
363            if char == ')' {
364                let location = self.current_source_location();
365                tokens.push(Token::new(TokenKind::RightParen, location));
366                self.advance();
367                continue;
368            }
369
370            // Left Bracket
371            if char == '[' {
372                let location = self.current_source_location();
373                tokens.push(Token::new(TokenKind::LeftBracket, location));
374                self.advance();
375                continue;
376            }
377
378            // Right Bracket
379            if char == ']' {
380                let location = self.current_source_location();
381                tokens.push(Token::new(TokenKind::RightBracket, location));
382                self.advance();
383                continue;
384            }
385
386            // Semicolon
387            if char == ';' {
388                let location = self.current_source_location();
389                tokens.push(Token::new(TokenKind::Semicolon, location));
390                self.advance();
391                continue;
392            }
393
394            // Characters to ignoring
395            if char == ' ' || char == '\t' {
396                self.advance();
397                continue;
398            }
399
400            if char == '\n' {
401                self.advance();
402                self.column_end = 0;
403                self.line_end += 1;
404                continue;
405            }
406
407            return Err(Diagnostic::error("Unexpected character")
408                .with_location(self.current_source_location())
409                .as_boxed());
410        }
411
412        Ok(tokens)
413    }
414
415    fn consume_global_variable_name(&mut self) -> Result<Token, Box<Diagnostic>> {
416        let start_index = self.index;
417
418        // Advance `@`
419        self.advance();
420
421        // Make sure first character is  alphabetic
422        if !self.is_current_char_func(|c| c.is_alphanumeric()) {
423            return Err(Diagnostic::error(
424                "Global variable name must start with alphabetic character",
425            )
426            .add_help("Add at least one alphabetic character after @")
427            .with_location(self.current_source_location())
428            .as_boxed());
429        }
430
431        while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
432            self.advance();
433        }
434
435        // Identifier is being case-insensitive by default, convert to lowercase to be easy to compare and lookup
436        let literal = &self.content[start_index..self.index];
437        let mut string: String = literal.iter().collect();
438        string = string.to_lowercase();
439
440        let location = self.current_source_location();
441        Ok(Token::new(TokenKind::GlobalVariable(string), location))
442    }
443
444    fn consume_identifier(&mut self) -> Token {
445        let start_index = self.index;
446        while self.is_current_char_func(|c| c == '_' || c.is_alphanumeric()) {
447            self.advance();
448        }
449
450        // Identifier is being case-insensitive by default, convert to lowercase to be easy to compare and lookup
451        let literal = &self.content[start_index..self.index];
452        let mut string: String = literal.iter().collect();
453        string = string.to_lowercase();
454
455        let kind = GITQL_RESERVED_KEYWORDS
456            .get(string.as_str())
457            .cloned()
458            .unwrap_or(TokenKind::Symbol(string));
459        Token::new(kind, self.current_source_location())
460    }
461
462    fn consume_backticks_identifier(&mut self) -> Result<Token, Box<Diagnostic>> {
463        let start_index = self.index;
464
465        // Advance '`'
466        self.advance();
467
468        while !self.is_current_char('`') {
469            self.advance();
470        }
471
472        if self.index >= self.content_len {
473            return Err(Diagnostic::error("Unterminated backticks")
474                .add_help("Add ` at the end of the identifier")
475                .with_location(self.current_source_location())
476                .as_boxed());
477        }
478
479        // Advance '`'
480        self.advance();
481
482        let literal = &self.content[start_index + 1..self.index - 1];
483        let identifier: String = literal.iter().collect();
484        let location = self.current_source_location();
485        Ok(Token::new(TokenKind::Symbol(identifier), location))
486    }
487
488    fn consume_number(&mut self) -> Result<Token, Box<Diagnostic>> {
489        let start_index = self.index;
490
491        while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
492            self.advance();
493        }
494
495        let mut is_float_value = false;
496        if self.is_current_char('.') {
497            self.advance();
498
499            is_float_value = true;
500            while self.is_current_char_func(|c| c == '_' || c.is_numeric()) {
501                self.advance();
502            }
503        }
504
505        let literal = &self.content[start_index..self.index];
506        let string: String = literal.iter().collect();
507        let literal_num = string.replace('_', "");
508        let location = self.current_source_location();
509
510        if is_float_value {
511            return match literal_num.parse::<f64>() {
512                Ok(float) => Ok(Token::new(TokenKind::Float(float), location)),
513                Err(parse_float_error) => Err(Diagnostic::error(&parse_float_error.to_string())
514                    .add_note(&format!(
515                        "Value must be between {} and {}",
516                        f64::MIN,
517                        f64::MAX
518                    ))
519                    .with_location(self.current_source_location())
520                    .as_boxed()),
521            };
522        }
523
524        match literal_num.parse::<i64>() {
525            Ok(integer) => Ok(Token::new(TokenKind::Integer(integer), location)),
526            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
527                .add_note(&format!(
528                    "Value must be between {} and {}",
529                    i64::MIN,
530                    i64::MAX
531                ))
532                .with_location(self.current_source_location())
533                .as_boxed()),
534        }
535    }
536
537    fn consume_binary_number(&mut self) -> Result<Token, Box<Diagnostic>> {
538        let start_index = self.index;
539        while self.is_current_char_func(|c| c == '_' || c == '0' || c >= '1') {
540            self.advance();
541        }
542
543        if start_index == self.index {
544            return Err(
545                Diagnostic::error("Missing digits after the integer base prefix")
546                    .add_help("Expect at least one binary digits after the prefix 0b")
547                    .add_help("Binary digit mean 0 or 1")
548                    .with_location(self.current_source_location())
549                    .as_boxed(),
550            );
551        }
552
553        let literal = &self.content[start_index..self.index];
554        let string: String = literal.iter().collect();
555        let literal_num = string.replace('_', "");
556
557        const BINARY_RADIX: u32 = 2;
558        match i64::from_str_radix(&literal_num, BINARY_RADIX) {
559            Ok(integer) => {
560                let location = self.current_source_location();
561                Ok(Token::new(TokenKind::Integer(integer), location))
562            }
563            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
564                .add_note(&format!(
565                    "Value must be between {} and {}",
566                    i64::MIN,
567                    i64::MAX
568                ))
569                .with_location(self.current_source_location())
570                .as_boxed()),
571        }
572    }
573
574    fn consume_octal_number(&mut self) -> Result<Token, Box<Diagnostic>> {
575        let start_index = self.index;
576        while self.is_current_char_func(|c| c == '_' || ('0'..='8').contains(&c)) {
577            self.advance();
578        }
579
580        if start_index == self.index {
581            return Err(
582                Diagnostic::error("Missing digits after the integer base prefix")
583                    .add_help("Expect at least one octal digits after the prefix 0o")
584                    .add_help("Octal digit mean 0 to 8 number")
585                    .with_location(self.current_source_location())
586                    .as_boxed(),
587            );
588        }
589
590        let literal = &self.content[start_index..self.index];
591        let string: String = literal.iter().collect();
592        let literal_num = string.replace('_', "");
593
594        const OCTAL_RADIX: u32 = 8;
595        match i64::from_str_radix(&literal_num, OCTAL_RADIX) {
596            Ok(integer) => {
597                let location = self.current_source_location();
598                Ok(Token::new(TokenKind::Integer(integer), location))
599            }
600            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
601                .add_note(&format!(
602                    "Value must be between {} and {}",
603                    i64::MIN,
604                    i64::MAX
605                ))
606                .with_location(self.current_source_location())
607                .as_boxed()),
608        }
609    }
610
611    fn consume_hex_number(&mut self) -> Result<Token, Box<Diagnostic>> {
612        let start_index = self.index;
613        while self.is_current_char_func(|c| c == '_' || c.is_ascii_hexdigit()) {
614            self.advance();
615        }
616
617        if start_index == self.index {
618            return Err(
619                Diagnostic::error("Missing digits after the integer base prefix")
620                    .add_help("Expect at least one hex digits after the prefix 0x")
621                    .add_help("Hex digit mean 0 to 9 and a to f")
622                    .with_location(self.current_source_location())
623                    .as_boxed(),
624            );
625        }
626
627        let literal = &self.content[start_index..self.index];
628        let string: String = literal.iter().collect();
629        let literal_num = string.replace('_', "");
630
631        const HEX_RADIX: u32 = 16;
632        match i64::from_str_radix(&literal_num, HEX_RADIX) {
633            Ok(integer) => {
634                let location = self.current_source_location();
635                Ok(Token::new(TokenKind::Integer(integer), location))
636            }
637            Err(parse_int_error) => Err(Diagnostic::error(&parse_int_error.to_string())
638                .add_note(&format!(
639                    "Value must be between {} and {}",
640                    i64::MIN,
641                    i64::MAX
642                ))
643                .with_location(self.current_source_location())
644                .as_boxed()),
645        }
646    }
647
648    fn consume_string_in_single_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
649        let buffer = self.consume_string_with_around('\'')?;
650
651        if self.index >= self.content_len {
652            return Err(Diagnostic::error("Unterminated single quote string")
653                .add_help("Add \' at the end of the String literal")
654                .with_location(self.current_source_location())
655                .as_boxed());
656        }
657
658        // Consume `'`
659        self.advance();
660        let location = self.current_source_location();
661        Ok(Token::new(TokenKind::String(buffer), location))
662    }
663
664    fn consume_string_in_double_quotes(&mut self) -> Result<Token, Box<Diagnostic>> {
665        let buffer = self.consume_string_with_around('"')?;
666
667        if self.index >= self.content_len {
668            return Err(Diagnostic::error("Unterminated double quote string")
669                .add_help("Add \" at the end of the String literal")
670                .with_location(self.current_source_location())
671                .as_boxed());
672        }
673
674        // Consume `"`
675        self.advance();
676        let location = self.current_source_location();
677        Ok(Token::new(TokenKind::String(buffer), location))
678    }
679
680    fn consume_string_with_around(&mut self, around: char) -> Result<String, Box<Diagnostic>> {
681        // Consume Around start
682        self.advance();
683
684        let mut buffer = String::new();
685        while !self.is_current_char(around) {
686            if !self.is_current_char('\\') {
687                buffer.push(self.content[self.index]);
688                self.advance();
689                continue;
690            }
691
692            // If '\\' is the last character, we don't need to escape it
693            if self.is_last() {
694                buffer.push(self.content[self.index]);
695                self.advance();
696                continue;
697            }
698
699            // Consume '\\'
700            self.advance();
701
702            // Check possible escape depending on the next character
703            let next_char = self.content[self.index];
704            let character_with_escape_handled = match next_char {
705                // Single quote
706                '\'' => {
707                    self.advance();
708                    '\''
709                }
710                // Double quote
711                '\"' => {
712                    self.advance();
713                    '\"'
714                }
715                // Backslash
716                '\\' => {
717                    self.advance();
718                    '\\'
719                }
720                // New line
721                'n' => {
722                    self.advance();
723                    '\n'
724                }
725                // Carriage return
726                'r' => {
727                    self.advance();
728                    '\r'
729                }
730                // Tab
731                't' => {
732                    self.advance();
733                    '\t'
734                }
735                _ => self.content[self.index - 1],
736            };
737
738            buffer.push(character_with_escape_handled);
739        }
740
741        Ok(buffer)
742    }
743
744    fn consume_single_line_comment(&mut self) {
745        // Advance `--`
746        self.advance_n(2);
747
748        while !self.is_current_char('\n') {
749            self.advance();
750        }
751
752        // Advance `\n`
753        self.advance();
754        self.line_end += 1;
755        self.column_end = 0;
756    }
757
758    fn consume_c_style_block_comment(&mut self) -> Result<(), Box<Diagnostic>> {
759        // Advance `/*`
760        self.advance_n(2);
761
762        let mut number_nested_block_start = 0;
763        loop {
764            if self.is_current_char('/') && self.is_next_char('*') {
765                number_nested_block_start += 1;
766            }
767
768            // Advance char
769            self.advance();
770
771            if self.is_current_char('*') && self.is_next_char('/') {
772                number_nested_block_start -= 1;
773                if number_nested_block_start < 0 {
774                    break;
775                }
776            }
777        }
778
779        if self.index + 2 > self.content_len {
780            return Err(Diagnostic::error("C Style comment must end with */")
781                .add_help("Add */ at the end of C Style comments")
782                .with_location(self.current_source_location())
783                .as_boxed());
784        }
785
786        // Advance `*/`
787        self.advance_n(2);
788        Ok(())
789    }
790
791    fn advance(&mut self) {
792        self.index += 1;
793        self.column_end += 1;
794    }
795
796    fn advance_n(&mut self, n: usize) {
797        self.index += n;
798        self.column_end += n as u32;
799    }
800
801    fn is_current_char(&self, ch: char) -> bool {
802        self.index < self.content_len && self.content[self.index] == ch
803    }
804
805    fn is_next_char(&self, ch: char) -> bool {
806        self.index + 1 < self.content_len && self.content[self.index + 1] == ch
807    }
808
809    fn is_current_char_func(&self, func: fn(char) -> bool) -> bool {
810        self.index < self.content_len && func(self.content[self.index])
811    }
812
813    fn has_next(&self) -> bool {
814        self.index < self.content_len
815    }
816
817    fn is_last(&self) -> bool {
818        self.index == self.content_len - 1
819    }
820}