1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Reports whether `c` may begin an unquoted identifier.
///
/// Accepted characters are the underscore and anything Unicode classifies as
/// alphabetic. Digits and `$` are rejected here.
#[inline]
fn is_identifier_start(c: char) -> bool {
    c.is_alphabetic() || matches!(c, '_')
}
11
/// Reports whether `c` may appear after the first character of an unquoted
/// identifier.
///
/// `_`, `$`, and every alphanumeric character are accepted. Any other ASCII
/// character, whitespace, or control character is rejected. The remaining
/// non-ASCII characters are accepted except for the guillemets (`«`, `»`) and
/// the curly single/double quotation marks.
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        _ if c.is_alphanumeric() => true,
        _ if c.is_ascii() || c.is_whitespace() || c.is_control() => false,
        // Typographic quote characters are never part of an identifier.
        '\u{00AB}' | '\u{00BB}' | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' => false,
        _ => true,
    }
}
36
/// Character-based SQL tokenizer that turns an input string into a list of
/// [`Token`]s.
pub struct Tokenizer {
    /// The input text, split into `char`s so it can be indexed by character
    /// position.
    input: Vec<char>,
    /// Index into `input` of the next character to read.
    pos: usize,
    /// Line number of the next character; starts at 1 and increases after
    /// each `'\n'` consumed.
    line: usize,
    /// Column number of the next character; starts at 1 and resets to 1 after
    /// each `'\n'` consumed.
    col: usize,
    /// When `true`, `tokenize` keeps line and block comment tokens in its
    /// output; when `false`, they are dropped.
    pub preserve_comments: bool,
    /// Type of the most recent non-comment token produced by `tokenize`.
    /// Consulted when deciding whether `[` opens a subscript or a
    /// bracket-quoted identifier.
    prev_token_type: Option<TokenType>,
}
57
58impl Tokenizer {
59 #[must_use]
61 pub fn new(input: &str) -> Self {
62 Self {
63 input: input.chars().collect(),
64 pos: 0,
65 line: 1,
66 col: 1,
67 preserve_comments: false,
68 prev_token_type: None,
69 }
70 }
71
72 #[must_use]
74 pub fn with_comments(input: &str) -> Self {
75 Self {
76 input: input.chars().collect(),
77 pos: 0,
78 line: 1,
79 col: 1,
80 preserve_comments: true,
81 prev_token_type: None,
82 }
83 }
84
85 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
89 let mut tokens = Vec::new();
90 loop {
91 let token = self.next_token()?;
92 match token.token_type {
93 TokenType::Eof => {
94 tokens.push(token);
95 break;
96 }
97 TokenType::Whitespace => continue,
98 TokenType::LineComment | TokenType::BlockComment => {
99 if self.preserve_comments {
100 tokens.push(token);
101 }
102 }
103 _ => {
104 self.prev_token_type = Some(token.token_type.clone());
105 tokens.push(token);
106 }
107 }
108 }
109 Ok(tokens)
110 }
111
112 fn peek(&self) -> Option<char> {
113 self.input.get(self.pos).copied()
114 }
115
116 fn peek_at(&self, offset: usize) -> Option<char> {
117 self.input.get(self.pos + offset).copied()
118 }
119
120 fn advance(&mut self) -> Option<char> {
121 let ch = self.input.get(self.pos).copied();
122 if let Some(c) = ch {
123 self.pos += 1;
124 if c == '\n' {
125 self.line += 1;
126 self.col = 1;
127 } else {
128 self.col += 1;
129 }
130 }
131 ch
132 }
133
134 fn make_token(
135 &self,
136 token_type: TokenType,
137 value: impl Into<String>,
138 start: usize,
139 start_line: usize,
140 start_col: usize,
141 ) -> Token {
142 Token::with_location(token_type, value, start, start_line, start_col)
143 }
144
145 fn next_token(&mut self) -> Result<Token> {
146 while self.peek().is_some_and(|c| c.is_whitespace()) {
148 self.advance();
149 }
150
151 let start = self.pos;
152 let start_line = self.line;
153 let start_col = self.col;
154
155 let Some(ch) = self.advance() else {
156 return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
157 };
158
159 match ch {
160 '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
162 ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
163 '[' => {
164 let prev_is_subscriptable = matches!(
176 self.prev_token_type,
177 Some(
178 TokenType::Identifier
179 | TokenType::RParen
180 | TokenType::RBracket
181 | TokenType::String
182 | TokenType::Number
183 | TokenType::Int
185 | TokenType::Integer
186 | TokenType::BigInt
187 | TokenType::SmallInt
188 | TokenType::TinyInt
189 | TokenType::Float
190 | TokenType::Double
191 | TokenType::Decimal
192 | TokenType::Numeric
193 | TokenType::Real
194 | TokenType::Varchar
195 | TokenType::Char
196 | TokenType::Text
197 | TokenType::Boolean
198 | TokenType::Bool
199 | TokenType::Date
200 | TokenType::Timestamp
201 | TokenType::TimestampTz
202 | TokenType::Time
203 | TokenType::Interval
204 | TokenType::Blob
205 | TokenType::Bytea
206 | TokenType::Json
207 | TokenType::Jsonb
208 | TokenType::Uuid
209 | TokenType::Array
210 | TokenType::Map
211 | TokenType::Struct
212 )
213 );
214
215 let mut looks_like_ident = false;
216 let mut has_space_inside = false;
220 let mut has_operator_inside = false;
221 if prev_is_subscriptable {
222 let mut scan = self.pos;
223 while scan < self.input.len() {
224 let c = self.input[scan];
225 if c == ']' {
226 break;
227 }
228 if c == '\n' || c == '[' || c == ',' {
229 break;
230 }
231 if c == ' ' || c == '\t' {
232 has_space_inside = true;
233 }
234 if matches!(c, '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' | '^') {
235 has_operator_inside = true;
236 }
237 scan += 1;
238 }
239 }
240 if !prev_is_subscriptable || (has_space_inside && !has_operator_inside) {
241 let mut scan = self.pos;
242 let mut saw_quote = false;
243 while scan < self.input.len() {
244 let c = self.input[scan];
245 if c == ']' {
246 looks_like_ident = scan > self.pos
251 && (!prev_is_subscriptable || !saw_quote);
252 break;
253 }
254 if c == '\n' || c == '[' || c == ',' {
256 break;
257 }
258 if c == '\'' {
259 saw_quote = true;
260 }
261 scan += 1;
262 }
263 }
264 if looks_like_ident {
265 self.read_quoted_identifier(start, start_line, start_col, '[')
266 } else {
267 Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
268 }
269 }
270 ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
271 '{' => {
272 if self
278 .peek()
279 .is_some_and(is_identifier_start)
280 {
281 let mut i = 1usize;
282 while self
283 .peek_at(i)
284 .is_some_and(|c| is_identifier_continue(c))
285 {
286 i += 1;
287 }
288 if self.peek_at(i) == Some(':') {
289 let mut value = String::from('{');
290 let mut depth = 0usize;
291 loop {
292 match self.peek() {
293 None => break,
294 Some('{') => {
295 depth += 1;
296 value.push('{');
297 self.advance();
298 }
299 Some('}') => {
300 if depth == 0 {
301 value.push('}');
302 self.advance();
303 return Ok(self.make_token(
304 TokenType::Parameter,
305 value,
306 start,
307 start_line,
308 start_col,
309 ));
310 }
311 depth -= 1;
312 value.push('}');
313 self.advance();
314 }
315 Some(c) => {
316 value.push(c);
317 self.advance();
318 }
319 }
320 }
321 return Err(SqlglotError::TokenizerError {
322 message: "Unterminated parameter placeholder".into(),
323 position: start,
324 });
325 }
326 }
327 Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col))
328 }
329 '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
330 ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
331 ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
332 '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
333 '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
334 '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
335 '@' => {
336 if self.peek() == Some('>') {
337 self.advance();
338 Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
339 } else {
340 Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
341 }
342 }
343 '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
344 '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
345 '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
346 '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
347
348 ':' => {
350 if self.peek() == Some(':') {
351 self.advance();
352 Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
353 } else {
354 Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
355 }
356 }
357
358 '-' => {
360 if self.peek() == Some('-') {
361 self.advance();
362 let mut value = String::from("--");
363 while self.peek().is_some_and(|c| c != '\n') {
364 value.push(self.advance().unwrap());
365 }
366 Ok(
367 self.make_token(
368 TokenType::LineComment,
369 value,
370 start,
371 start_line,
372 start_col,
373 ),
374 )
375 } else if self.peek() == Some('>') {
376 self.advance();
377 if self.peek() == Some('>') {
378 self.advance();
379 Ok(self.make_token(
380 TokenType::DoubleArrow,
381 "->>",
382 start,
383 start_line,
384 start_col,
385 ))
386 } else {
387 Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
388 }
389 } else {
390 Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
391 }
392 }
393
394 '/' => {
396 if self.peek() == Some('*') {
397 self.advance();
398 let mut value = String::from("/*");
399 let mut depth = 1;
400 while depth > 0 {
401 match self.advance() {
402 Some('*') if self.peek() == Some('/') => {
403 self.advance();
404 depth -= 1;
405 value.push_str("*/");
406 }
407 Some('/') if self.peek() == Some('*') => {
408 self.advance();
409 depth += 1;
410 value.push_str("/*");
411 }
412 Some(c) => value.push(c),
413 None => {
414 return Err(SqlglotError::TokenizerError {
415 message: "Unterminated block comment".into(),
416 position: start,
417 });
418 }
419 }
420 }
421 Ok(self.make_token(
422 TokenType::BlockComment,
423 value,
424 start,
425 start_line,
426 start_col,
427 ))
428 } else {
429 Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
430 }
431 }
432
433 '<' => {
435 if self.peek() == Some('=') {
436 self.advance();
437 Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
438 } else if self.peek() == Some('>') {
439 self.advance();
440 Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
441 } else if self.peek() == Some('<') {
442 self.advance();
443 Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col)) } else if self.peek() == Some('@') {
444 self.advance();
445 Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col)) } else {
446 Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
447 }
448 }
449
450 '>' => {
452 if self.peek() == Some('=') {
453 self.advance();
454 Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
455 } else if self.peek() == Some('>') {
456 self.advance();
457 Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
458 } else {
459 Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
460 }
461 }
462
463 '!' => {
465 if self.peek() == Some('=') {
466 self.advance();
467 Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
468 } else {
469 Err(SqlglotError::TokenizerError {
470 message: format!("Unexpected character: {ch}"),
471 position: start,
472 })
473 }
474 }
475
476 '|' => {
478 if self.peek() == Some('|') {
479 self.advance();
480 Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
481 } else {
482 Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
483 }
484 }
485
486 '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
488
489 '#' => {
491 if self.peek() == Some('>') {
492 self.advance();
493 if self.peek() == Some('>') {
494 self.advance();
495 Ok(self.make_token(
496 TokenType::HashDoubleArrow,
497 "#>>",
498 start,
499 start_line,
500 start_col,
501 ))
502 } else {
503 Ok(self.make_token(
504 TokenType::HashArrow,
505 "#>",
506 start,
507 start_line,
508 start_col,
509 ))
510 }
511 } else if self.peek() == Some('#') {
512 let save_pos = self.pos;
518 let save_line = self.line;
519 let save_col = self.col;
520 self.advance(); let inner_start = self.pos;
522 let mut found_close = false;
523 while let Some(c) = self.peek() {
524 if c == '\n' {
525 break;
526 }
527 if c == '#' && self.peek_at(1) == Some('#') {
528 found_close = true;
529 break;
530 }
531 self.advance();
532 }
533 if found_close {
534 let value: String = self.input[inner_start..self.pos].iter().collect();
535 self.advance(); self.advance(); return Ok(Token::with_quote(
538 TokenType::Identifier,
539 value,
540 start,
541 start_line,
542 start_col,
543 '#',
544 ));
545 }
546 self.pos = save_pos;
548 self.line = save_line;
549 self.col = save_col;
550 let mut value = String::from("#");
551 while self.peek().is_some_and(|c| c != '\n') {
552 value.push(self.advance().unwrap());
553 }
554 Ok(
555 self.make_token(
556 TokenType::LineComment,
557 value,
558 start,
559 start_line,
560 start_col,
561 ),
562 )
563 } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
564 let mut value = String::from("#");
567 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
568 value.push(self.advance().unwrap());
569 }
570 Ok(self.make_token(
571 TokenType::Parameter,
572 value,
573 start,
574 start_line,
575 start_col,
576 ))
577 } else {
578 let mut value = String::from("#");
579 while self.peek().is_some_and(|c| c != '\n') {
580 value.push(self.advance().unwrap());
581 }
582 Ok(
583 self.make_token(
584 TokenType::LineComment,
585 value,
586 start,
587 start_line,
588 start_col,
589 ),
590 )
591 }
592 }
593
594 '\'' => self.read_string(start, start_line, start_col),
596
597 c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
599
600 c if is_identifier_start(c) => {
602 self.read_identifier(start, start_line, start_col, c)
603 }
604
605 '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
607
608 '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
610
611 '$' => {
613 if self.peek() == Some('$') {
618 self.advance(); let mut value = String::new();
620 while let Some(c) = self.peek() {
621 if c == '$' && self.peek_at(1) == Some('$') {
622 self.advance();
623 self.advance();
624 return Ok(self.make_token(
625 TokenType::String,
626 value,
627 start,
628 start_line,
629 start_col,
630 ));
631 }
632 value.push(self.advance().unwrap());
633 }
634 return Ok(self.make_token(
636 TokenType::String,
637 value,
638 start,
639 start_line,
640 start_col,
641 ));
642 }
643 if self.peek().is_some_and(is_identifier_start) {
648 let save_pos = self.pos;
649 let save_line = self.line;
650 let save_col = self.col;
651 let mut tag = String::new();
652 while self.peek().is_some_and(is_identifier_continue) {
653 tag.push(self.advance().unwrap());
654 }
655 if self.peek() == Some('$') {
656 self.advance();
657 let mut value = String::new();
659 let mut closed = false;
660 while let Some(c) = self.peek() {
661 if c == '$' {
662 let mut matched = true;
664 for (i, ch) in tag.chars().enumerate() {
665 if self.peek_at(i + 1) != Some(ch) {
666 matched = false;
667 break;
668 }
669 }
670 if matched && self.peek_at(tag.len() + 1) == Some('$') {
671 for _ in 0..(tag.len() + 2) {
673 self.advance();
674 }
675 closed = true;
676 break;
677 }
678 }
679 value.push(self.advance().unwrap());
680 }
681 if closed {
682 return Ok(self.make_token(
683 TokenType::String,
684 value,
685 start,
686 start_line,
687 start_col,
688 ));
689 }
690 }
691 self.pos = save_pos;
694 self.line = save_line;
695 self.col = save_col;
696 }
697 if self.peek() == Some('{') {
698 let mut value = String::from("$");
701 value.push(self.advance().unwrap()); while let Some(c) = self.peek() {
703 value.push(self.advance().unwrap());
704 if c == '}' {
705 break;
706 }
707 }
708 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
709 } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
710 let mut value = String::from("$");
711 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
712 value.push(self.advance().unwrap());
713 }
714 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
715 } else if self.peek().is_some_and(is_identifier_start) {
716 let mut value = String::from("$");
723 while self.peek().is_some_and(is_identifier_continue) {
724 value.push(self.advance().unwrap());
725 }
726 Ok(self.make_token(TokenType::Identifier, value, start, start_line, start_col))
727 } else {
728 Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
729 }
730 }
731
732 '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
733
734 _ => Err(SqlglotError::TokenizerError {
735 message: format!("Unexpected character: {ch}"),
736 position: start,
737 }),
738 }
739 }
740
741 fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
742 let mut value = String::new();
743 loop {
744 match self.advance() {
745 Some('\'') => {
746 if self.peek() == Some('\'') {
747 self.advance();
748 value.push('\'');
749 } else {
750 return Ok(self.make_token(
751 TokenType::String,
752 value,
753 start,
754 start_line,
755 start_col,
756 ));
757 }
758 }
759 Some('\\') => match self.peek() {
760 Some('\\') => {
761 self.advance();
762 value.push('\\');
763 }
764 Some('n') => {
765 self.advance();
766 value.push('\n');
767 }
768 Some('t') => {
769 self.advance();
770 value.push('\t');
771 }
772 Some('r') => {
773 self.advance();
774 value.push('\r');
775 }
776 Some('\'') => {
777 self.advance();
778 value.push('\'');
779 }
780 Some('"') => {
781 self.advance();
782 value.push('"');
783 }
784 Some('0') => {
785 self.advance();
786 value.push('\0');
787 }
788 Some('b') => {
789 self.advance();
790 value.push('\u{0008}');
791 }
792 Some('f') => {
793 self.advance();
794 value.push('\u{000C}');
795 }
796 Some('v') => {
797 self.advance();
798 value.push('\u{000B}');
799 }
800 Some('a') => {
801 self.advance();
802 value.push('\u{0007}');
803 }
804 Some(c) if c.is_ascii_alphanumeric() || c == '?' => {
805 self.advance();
809 value.push('\\');
810 value.push(c);
811 }
812 _ => {
813 value.push('\\');
814 }
815 },
816 Some(c) => value.push(c),
817 None => {
818 return Err(SqlglotError::TokenizerError {
819 message: "Unterminated string literal".into(),
820 position: start,
821 });
822 }
823 }
824 }
825 }
826
827 fn read_number(
828 &mut self,
829 start: usize,
830 start_line: usize,
831 start_col: usize,
832 first: char,
833 ) -> Result<Token> {
834 let mut value = String::new();
835 value.push(first);
836
837 if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
838 value.push(self.advance().unwrap());
839 while self
840 .peek()
841 .is_some_and(|c| c.is_ascii_hexdigit() || c == '_')
842 {
843 value.push(self.advance().unwrap());
844 }
845 if self.peek().is_some_and(|c| c == 'p' || c == 'P') {
848 value.push(self.advance().unwrap());
849 if self.peek().is_some_and(|c| c == '+' || c == '-') {
850 value.push(self.advance().unwrap());
851 }
852 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
853 value.push(self.advance().unwrap());
854 }
855 }
856 return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
857 }
858
859 while self
860 .peek()
861 .is_some_and(|c| c.is_ascii_digit() || c == '_')
862 {
863 value.push(self.advance().unwrap());
864 }
865
866 if self.peek() == Some('.')
867 && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
868 || !self.peek_at(1).is_some_and(is_identifier_start))
869 {
870 value.push(self.advance().unwrap());
871 while self
872 .peek()
873 .is_some_and(|c| c.is_ascii_digit() || c == '_')
874 {
875 value.push(self.advance().unwrap());
876 }
877 }
878
879 if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
880 value.push(self.advance().unwrap());
881 if self.peek().is_some_and(|c| c == '+' || c == '-') {
882 value.push(self.advance().unwrap());
883 }
884 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
885 value.push(self.advance().unwrap());
886 }
887 }
888
889 if !value.contains('.')
894 && !value.contains('e')
895 && !value.contains('E')
896 && self.peek().is_some_and(is_identifier_continue)
897 {
898 while self.peek().is_some_and(is_identifier_continue) {
899 value.push(self.advance().unwrap());
900 }
901 let token_type = Self::keyword_type(&value);
902 return Ok(self.make_token(token_type, value, start, start_line, start_col));
903 }
904
905 Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
906 }
907
908 fn read_identifier(
909 &mut self,
910 start: usize,
911 start_line: usize,
912 start_col: usize,
913 first: char,
914 ) -> Result<Token> {
915 let mut value = String::new();
916 value.push(first);
917 while self
918 .peek()
919 .is_some_and(is_identifier_continue)
920 {
921 if self.peek() == Some('$') {
925 let next = self.peek_at(1);
926 if matches!(next, Some('{')) || next.is_some_and(|c| c.is_ascii_digit()) {
927 break;
928 }
929 }
930 value.push(self.advance().unwrap());
931 }
932
933 if value.len() == 1
936 && value
937 .as_bytes()
938 .first()
939 .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
940 && self.peek() == Some('\'')
941 {
942 self.advance(); let mut token = self.read_string(start, start_line, start_col)?;
944 token.token_type = TokenType::NationalString;
945 return Ok(token);
946 }
947
948 if value.len() == 1
957 && value
958 .as_bytes()
959 .first()
960 .is_some_and(|b| matches!(b.to_ascii_uppercase(), b'E' | b'B' | b'X'))
961 && self.peek() == Some('\'')
962 {
963 self.advance();
964 return self.read_string(start, start_line, start_col);
965 }
966 if value.len() == 1
968 && value
969 .as_bytes()
970 .first()
971 .is_some_and(|b| b.eq_ignore_ascii_case(&b'u'))
972 && self.peek() == Some('&')
973 && self.peek_at(1) == Some('\'')
974 {
975 self.advance(); self.advance(); return self.read_string(start, start_line, start_col);
978 }
979
980 let token_type = Self::keyword_type(&value);
981 Ok(self.make_token(token_type, value, start, start_line, start_col))
982 }
983
984 fn keyword_type(word: &str) -> TokenType {
986 match word.to_uppercase().as_str() {
987 "SELECT" => TokenType::Select,
988 "FROM" => TokenType::From,
989 "WHERE" => TokenType::Where,
990 "AND" => TokenType::And,
991 "OR" => TokenType::Or,
992 "NOT" => TokenType::Not,
993 "AS" => TokenType::As,
994 "JOIN" => TokenType::Join,
995 "INNER" => TokenType::Inner,
996 "LEFT" => TokenType::Left,
997 "RIGHT" => TokenType::Right,
998 "FULL" => TokenType::Full,
999 "OUTER" => TokenType::Outer,
1000 "CROSS" => TokenType::Cross,
1001 "ON" => TokenType::On,
1002 "INSERT" => TokenType::Insert,
1003 "INTO" => TokenType::Into,
1004 "VALUES" => TokenType::Values,
1005 "UPDATE" => TokenType::Update,
1006 "SET" => TokenType::Set,
1007 "DELETE" => TokenType::Delete,
1008 "CREATE" => TokenType::Create,
1009 "TABLE" => TokenType::Table,
1010 "DROP" => TokenType::Drop,
1011 "ALTER" => TokenType::Alter,
1012 "INDEX" => TokenType::Index,
1013 "IF" => TokenType::If,
1014 "EXISTS" => TokenType::Exists,
1015 "IN" => TokenType::In,
1016 "IS" => TokenType::Is,
1017 "NULL" => TokenType::Null,
1018 "LIKE" => TokenType::Like,
1019 "ILIKE" => TokenType::ILike,
1020 "ESCAPE" => TokenType::Escape,
1021 "BETWEEN" => TokenType::Between,
1022 "CASE" => TokenType::Case,
1023 "WHEN" => TokenType::When,
1024 "THEN" => TokenType::Then,
1025 "ELSE" => TokenType::Else,
1026 "END" => TokenType::End,
1027 "ORDER" => TokenType::Order,
1028 "BY" => TokenType::By,
1029 "ASC" => TokenType::Asc,
1030 "DESC" => TokenType::Desc,
1031 "GROUP" => TokenType::Group,
1032 "HAVING" => TokenType::Having,
1033 "LIMIT" => TokenType::Limit,
1034 "OFFSET" => TokenType::Offset,
1035 "UNION" => TokenType::Union,
1036 "ALL" => TokenType::All,
1037 "DISTINCT" => TokenType::Distinct,
1038 "TRUE" => TokenType::True,
1039 "FALSE" => TokenType::False,
1040 "INTERSECT" => TokenType::Intersect,
1041 "EXCEPT" => TokenType::Except,
1042 "WITH" => TokenType::With,
1043 "RECURSIVE" => TokenType::Recursive,
1044 "ANY" => TokenType::Any,
1045 "SOME" => TokenType::Some,
1046 "CAST" => TokenType::Cast,
1047 "OVER" => TokenType::Over,
1048 "PARTITION" => TokenType::Partition,
1049 "WINDOW" => TokenType::Window,
1050 "ROWS" => TokenType::Rows,
1051 "RANGE" => TokenType::Range,
1052 "UNBOUNDED" => TokenType::Unbounded,
1053 "PRECEDING" => TokenType::Preceding,
1054 "FOLLOWING" => TokenType::Following,
1055 "FILTER" => TokenType::Filter,
1056 "INT" => TokenType::Int,
1057 "INTEGER" => TokenType::Integer,
1058 "BIGINT" => TokenType::BigInt,
1059 "SMALLINT" => TokenType::SmallInt,
1060 "TINYINT" => TokenType::TinyInt,
1061 "FLOAT" => TokenType::Float,
1062 "DOUBLE" => TokenType::Double,
1063 "DECIMAL" => TokenType::Decimal,
1064 "NUMERIC" => TokenType::Numeric,
1065 "REAL" => TokenType::Real,
1066 "VARCHAR" => TokenType::Varchar,
1067 "CHAR" | "CHARACTER" => TokenType::Char,
1068 "TEXT" => TokenType::Text,
1069 "BOOLEAN" | "BOOL" => TokenType::Boolean,
1070 "DATE" => TokenType::Date,
1071 "TIMESTAMP" => TokenType::Timestamp,
1072 "TIMESTAMPTZ" => TokenType::TimestampTz,
1073 "TIME" => TokenType::Time,
1074 "INTERVAL" => TokenType::Interval,
1075 "BLOB" => TokenType::Blob,
1076 "BYTEA" => TokenType::Bytea,
1077 "JSON" => TokenType::Json,
1078 "JSONB" => TokenType::Jsonb,
1079 "UUID" => TokenType::Uuid,
1080 "ARRAY" => TokenType::Array,
1081 "MAP" => TokenType::Map,
1082 "STRUCT" => TokenType::Struct,
1083 "PRIMARY" => TokenType::Primary,
1084 "KEY" => TokenType::Key,
1085 "FOREIGN" => TokenType::Foreign,
1086 "REFERENCES" => TokenType::References,
1087 "UNIQUE" => TokenType::Unique,
1088 "CHECK" => TokenType::Check,
1089 "DEFAULT" => TokenType::Default,
1090 "CONSTRAINT" => TokenType::Constraint,
1091 "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
1092 "CASCADE" => TokenType::Cascade,
1093 "RESTRICT" => TokenType::Restrict,
1094 "RETURNING" => TokenType::Returning,
1095 "CONFLICT" => TokenType::Conflict,
1096 "DO" => TokenType::Do,
1097 "NOTHING" => TokenType::Nothing,
1098 "REPLACE" => TokenType::Replace,
1099 "IGNORE" => TokenType::Ignore,
1100 "MERGE" => TokenType::Merge,
1101 "MATCHED" => TokenType::Matched,
1102 "USING" => TokenType::Using,
1103 "TRUNCATE" => TokenType::Truncate,
1104 "SCHEMA" => TokenType::Schema,
1105 "DATABASE" => TokenType::Database,
1106 "VIEW" => TokenType::View,
1107 "MATERIALIZED" => TokenType::Materialized,
1108 "TEMPORARY" => TokenType::Temporary,
1109 "TEMP" => TokenType::Temp,
1110 "BEGIN" => TokenType::Begin,
1111 "COMMIT" => TokenType::Commit,
1112 "ROLLBACK" => TokenType::Rollback,
1113 "SAVEPOINT" => TokenType::Savepoint,
1114 "TRANSACTION" => TokenType::Transaction,
1115 "EXPLAIN" => TokenType::Explain,
1116 "ANALYZE" => TokenType::Analyze,
1117 "SHOW" => TokenType::Show,
1118 "USE" => TokenType::Use,
1119 "GRANT" => TokenType::Grant,
1120 "REVOKE" => TokenType::Revoke,
1121 "LATERAL" => TokenType::Lateral,
1122 "UNNEST" => TokenType::Unnest,
1123 "PIVOT" => TokenType::Pivot,
1124 "UNPIVOT" => TokenType::Unpivot,
1125 "TABLESAMPLE" => TokenType::Tablesample,
1126 "FETCH" => TokenType::Fetch,
1127 "FIRST" => TokenType::First,
1128 "NEXT" => TokenType::Next,
1129 "ONLY" => TokenType::Only,
1130 "NULLS" => TokenType::Nulls,
1131 "RESPECT" => TokenType::Respect,
1132 "TOP" => TokenType::Top,
1133 "COLLATE" => TokenType::Collate,
1134 "QUALIFY" => TokenType::Qualify,
1135 "CUBE" => TokenType::Cube,
1136 "ROLLUP" => TokenType::Rollup,
1137 "GROUPING" => TokenType::Grouping,
1138 "SETS" => TokenType::Sets,
1139 "XOR" => TokenType::Xor,
1140 "EXTRACT" => TokenType::Extract,
1141 "EPOCH" => TokenType::Epoch,
1142 "YEAR" => TokenType::Year,
1143 "MONTH" => TokenType::Month,
1144 "DAY" => TokenType::Day,
1145 "HOUR" => TokenType::Hour,
1146 "MINUTE" => TokenType::Minute,
1147 "SECOND" => TokenType::Second,
1148 _ => TokenType::Identifier,
1149 }
1150 }
1151
1152 fn read_quoted_identifier(
1153 &mut self,
1154 start: usize,
1155 start_line: usize,
1156 start_col: usize,
1157 quote: char,
1158 ) -> Result<Token> {
1159 let end_char = if quote == '[' { ']' } else { quote };
1160 let mut value = String::new();
1161 loop {
1162 match self.advance() {
1163 Some(c) if c == end_char => {
1164 if self.peek() == Some(end_char) && end_char != ']' {
1165 self.advance();
1166 value.push(end_char);
1167 } else {
1168 return Ok(Token::with_quote(
1169 TokenType::Identifier,
1170 value,
1171 start,
1172 start_line,
1173 start_col,
1174 quote,
1175 ));
1176 }
1177 }
1178 Some(c) => value.push(c),
1179 None => {
1180 return Err(SqlglotError::TokenizerError {
1181 message: format!("Unterminated quoted identifier (expected {end_char})"),
1182 position: start,
1183 });
1184 }
1185 }
1186 }
1187 }
1188}
1189
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `sql` with comments dropped, panicking on tokenizer errors.
    fn lex(sql: &str) -> Vec<Token> {
        Tokenizer::new(sql).tokenize().unwrap()
    }

    /// Tokenizes `sql` with comments preserved, panicking on tokenizer errors.
    fn lex_keeping_comments(sql: &str) -> Vec<Token> {
        Tokenizer::with_comments(sql).tokenize().unwrap()
    }

    /// Asserts that the first token of `sql` is a national string with the
    /// given decoded value.
    fn assert_national_string(sql: &str, expected: &str) {
        let tokens = lex(sql);
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].value, expected);
    }

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex("SELECT a, b FROM t");
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Identifier);
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Identifier);
        assert_eq!(tokens[5].value, "t");
        assert_eq!(tokens[6].token_type, TokenType::Eof);
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex("'hello world'");
        assert_eq!(tokens[0].token_type, TokenType::String);
        assert_eq!(tokens[0].value, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex("a >= 1 AND b != 2");
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex("123.45");
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].value, "123.45");
    }

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex_keeping_comments("SELECT 1 -- comment\nFROM t");
        let has_line_comment = tokens
            .iter()
            .any(|t| t.token_type == TokenType::LineComment);
        assert!(has_line_comment);
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex_keeping_comments("SELECT /* hello */ 1");
        let has_block_comment = tokens
            .iter()
            .any(|t| t.token_type == TokenType::BlockComment);
        assert!(has_block_comment);
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex("WITH cte AS (SELECT 1) SELECT * FROM cte");
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex("x::int");
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex("CAST(x AS INT)");
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    #[test]
    fn test_line_tracking() {
        let tokens = lex("SELECT\n 1");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex("UNION INTERSECT EXCEPT");
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        assert_national_string("N'Hello'", "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        assert_national_string("n'hello'", "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        assert_national_string("N'can''t stop'", "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        assert_national_string("N'テスト'", "テスト");
    }

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        // A bare `N` with no quote after it is an ordinary identifier.
        let tokens = lex("SELECT N FROM t");
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        // Longer words starting with `N` are never string prefixes.
        let tokens = lex("SELECT NAME FROM t");
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "NAME");
    }
}