1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Returns `true` when `c` may begin an unquoted identifier:
/// an underscore or any Unicode alphabetic character.
#[inline]
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
11
/// Returns `true` when `c` may appear after the first character of an
/// unquoted identifier.
///
/// Accepted: `_`, `$`, any Unicode alphanumeric character, and any other
/// non-ASCII character that is neither whitespace, a control character, nor
/// one of the typographic quotation marks listed below. Every remaining ASCII
/// character is rejected.
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        ch if ch.is_alphanumeric() => true,
        // Remaining ASCII is punctuation/operators; whitespace and control
        // characters (ASCII or not) always end an identifier.
        ch if ch.is_ascii() || ch.is_whitespace() || ch.is_control() => false,
        // Guillemets and curly single/double quotes are excluded explicitly.
        '\u{00AB}' | '\u{00BB}' | '\u{2018}' | '\u{2019}' | '\u{201C}' | '\u{201D}' => false,
        _ => true,
    }
}
36
37pub struct Tokenizer {
46 input: Vec<char>,
47 pos: usize,
48 line: usize,
49 col: usize,
50 pub preserve_comments: bool,
52 prev_token_type: Option<TokenType>,
56}
57
58impl Tokenizer {
59 #[must_use]
61 pub fn new(input: &str) -> Self {
62 Self {
63 input: input.chars().collect(),
64 pos: 0,
65 line: 1,
66 col: 1,
67 preserve_comments: false,
68 prev_token_type: None,
69 }
70 }
71
72 #[must_use]
74 pub fn with_comments(input: &str) -> Self {
75 Self {
76 input: input.chars().collect(),
77 pos: 0,
78 line: 1,
79 col: 1,
80 preserve_comments: true,
81 prev_token_type: None,
82 }
83 }
84
85 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
89 let mut tokens = Vec::new();
90 loop {
91 let token = self.next_token()?;
92 match token.token_type {
93 TokenType::Eof => {
94 tokens.push(token);
95 break;
96 }
97 TokenType::Whitespace => continue,
98 TokenType::LineComment | TokenType::BlockComment => {
99 if self.preserve_comments {
100 tokens.push(token);
101 }
102 }
103 _ => {
104 self.prev_token_type = Some(token.token_type.clone());
105 tokens.push(token);
106 }
107 }
108 }
109 Ok(tokens)
110 }
111
112 fn peek(&self) -> Option<char> {
113 self.input.get(self.pos).copied()
114 }
115
116 fn peek_at(&self, offset: usize) -> Option<char> {
117 self.input.get(self.pos + offset).copied()
118 }
119
120 fn advance(&mut self) -> Option<char> {
121 let ch = self.input.get(self.pos).copied();
122 if let Some(c) = ch {
123 self.pos += 1;
124 if c == '\n' {
125 self.line += 1;
126 self.col = 1;
127 } else {
128 self.col += 1;
129 }
130 }
131 ch
132 }
133
134 fn make_token(
135 &self,
136 token_type: TokenType,
137 value: impl Into<String>,
138 start: usize,
139 start_line: usize,
140 start_col: usize,
141 ) -> Token {
142 Token::with_location(token_type, value, start, start_line, start_col)
143 }
144
145 fn next_token(&mut self) -> Result<Token> {
146 while self.peek().is_some_and(|c| c.is_whitespace()) {
148 self.advance();
149 }
150
151 let start = self.pos;
152 let start_line = self.line;
153 let start_col = self.col;
154
155 let Some(ch) = self.advance() else {
156 return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
157 };
158
159 match ch {
160 '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
162 ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
163 '[' => {
164 let prev_is_subscriptable = matches!(
176 self.prev_token_type,
177 Some(
178 TokenType::Identifier
179 | TokenType::RParen
180 | TokenType::RBracket
181 | TokenType::String
182 | TokenType::Number
183 | TokenType::Int
185 | TokenType::Integer
186 | TokenType::BigInt
187 | TokenType::SmallInt
188 | TokenType::TinyInt
189 | TokenType::Float
190 | TokenType::Double
191 | TokenType::Decimal
192 | TokenType::Numeric
193 | TokenType::Real
194 | TokenType::Varchar
195 | TokenType::Char
196 | TokenType::Text
197 | TokenType::Boolean
198 | TokenType::Bool
199 | TokenType::Date
200 | TokenType::Timestamp
201 | TokenType::TimestampTz
202 | TokenType::Time
203 | TokenType::Interval
204 | TokenType::Blob
205 | TokenType::Bytea
206 | TokenType::Json
207 | TokenType::Jsonb
208 | TokenType::Uuid
209 | TokenType::Array
210 | TokenType::Map
211 | TokenType::Struct
212 )
213 );
214
215 let mut looks_like_ident = false;
216 let mut has_space_inside = false;
220 let mut has_operator_inside = false;
221 if prev_is_subscriptable {
222 let mut scan = self.pos;
223 while scan < self.input.len() {
224 let c = self.input[scan];
225 if c == ']' {
226 break;
227 }
228 if c == '\n' || c == '[' || c == ',' {
229 break;
230 }
231 if c == ' ' || c == '\t' {
232 has_space_inside = true;
233 }
234 if matches!(
235 c,
236 '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' | '^'
237 ) {
238 has_operator_inside = true;
239 }
240 scan += 1;
241 }
242 }
243 if !prev_is_subscriptable || (has_space_inside && !has_operator_inside) {
244 let mut scan = self.pos;
245 let mut saw_quote = false;
246 while scan < self.input.len() {
247 let c = self.input[scan];
248 if c == ']' {
249 looks_like_ident =
254 scan > self.pos && (!prev_is_subscriptable || !saw_quote);
255 break;
256 }
257 if c == '\n' || c == '[' || c == ',' {
259 break;
260 }
261 if c == '\'' {
262 saw_quote = true;
263 }
264 scan += 1;
265 }
266 }
267 if looks_like_ident {
268 self.read_quoted_identifier(start, start_line, start_col, '[')
269 } else {
270 Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
271 }
272 }
273 ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
274 '{' => {
275 if self.peek().is_some_and(is_identifier_start) {
281 let mut i = 1usize;
282 while self.peek_at(i).is_some_and(|c| is_identifier_continue(c)) {
283 i += 1;
284 }
285 if self.peek_at(i) == Some(':') {
286 let mut value = String::from('{');
287 let mut depth = 0usize;
288 loop {
289 match self.peek() {
290 None => break,
291 Some('{') => {
292 depth += 1;
293 value.push('{');
294 self.advance();
295 }
296 Some('}') => {
297 if depth == 0 {
298 value.push('}');
299 self.advance();
300 return Ok(self.make_token(
301 TokenType::Parameter,
302 value,
303 start,
304 start_line,
305 start_col,
306 ));
307 }
308 depth -= 1;
309 value.push('}');
310 self.advance();
311 }
312 Some(c) => {
313 value.push(c);
314 self.advance();
315 }
316 }
317 }
318 return Err(SqlglotError::TokenizerError {
319 message: "Unterminated parameter placeholder".into(),
320 position: start,
321 });
322 }
323 }
324 Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col))
325 }
326 '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
327 ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
328 ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
329 '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
330 '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
331 '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
332 '@' => {
333 if self.peek() == Some('>') {
334 self.advance();
335 Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
336 } else {
337 Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
338 }
339 }
340 '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
341 '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
342 '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
343 '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
344
345 ':' => {
347 if self.peek() == Some(':') {
348 self.advance();
349 Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
350 } else {
351 Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
352 }
353 }
354
355 '-' => {
357 if self.peek() == Some('-') {
358 self.advance();
359 let mut value = String::from("--");
360 while self.peek().is_some_and(|c| c != '\n') {
361 value.push(self.advance().unwrap());
362 }
363 Ok(
364 self.make_token(
365 TokenType::LineComment,
366 value,
367 start,
368 start_line,
369 start_col,
370 ),
371 )
372 } else if self.peek() == Some('>') {
373 self.advance();
374 if self.peek() == Some('>') {
375 self.advance();
376 Ok(self.make_token(
377 TokenType::DoubleArrow,
378 "->>",
379 start,
380 start_line,
381 start_col,
382 ))
383 } else {
384 Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
385 }
386 } else {
387 Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
388 }
389 }
390
391 '/' => {
393 if self.peek() == Some('*') {
394 self.advance();
395 let mut value = String::from("/*");
396 let mut depth = 1;
397 while depth > 0 {
398 match self.advance() {
399 Some('*') if self.peek() == Some('/') => {
400 self.advance();
401 depth -= 1;
402 value.push_str("*/");
403 }
404 Some('/') if self.peek() == Some('*') => {
405 self.advance();
406 depth += 1;
407 value.push_str("/*");
408 }
409 Some(c) => value.push(c),
410 None => {
411 return Err(SqlglotError::TokenizerError {
412 message: "Unterminated block comment".into(),
413 position: start,
414 });
415 }
416 }
417 }
418 Ok(self.make_token(
419 TokenType::BlockComment,
420 value,
421 start,
422 start_line,
423 start_col,
424 ))
425 } else {
426 Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
427 }
428 }
429
430 '<' => {
432 if self.peek() == Some('=') {
433 self.advance();
434 Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
435 } else if self.peek() == Some('>') {
436 self.advance();
437 Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
438 } else if self.peek() == Some('<') {
439 self.advance();
440 Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
441 } else if self.peek() == Some('@') {
442 self.advance();
443 Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col))
444 } else {
445 Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
446 }
447 }
448
449 '>' => {
451 if self.peek() == Some('=') {
452 self.advance();
453 Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
454 } else if self.peek() == Some('>') {
455 self.advance();
456 Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
457 } else {
458 Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
459 }
460 }
461
462 '!' => {
464 if self.peek() == Some('=') {
465 self.advance();
466 Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
467 } else {
468 Err(SqlglotError::TokenizerError {
469 message: format!("Unexpected character: {ch}"),
470 position: start,
471 })
472 }
473 }
474
475 '|' => {
477 if self.peek() == Some('|') {
478 self.advance();
479 Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
480 } else {
481 Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
482 }
483 }
484
485 '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
487
488 '#' => {
490 if self.peek() == Some('>') {
491 self.advance();
492 if self.peek() == Some('>') {
493 self.advance();
494 Ok(self.make_token(
495 TokenType::HashDoubleArrow,
496 "#>>",
497 start,
498 start_line,
499 start_col,
500 ))
501 } else {
502 Ok(self.make_token(
503 TokenType::HashArrow,
504 "#>",
505 start,
506 start_line,
507 start_col,
508 ))
509 }
510 } else if self.peek() == Some('#') {
511 let save_pos = self.pos;
517 let save_line = self.line;
518 let save_col = self.col;
519 self.advance(); let inner_start = self.pos;
521 let mut found_close = false;
522 while let Some(c) = self.peek() {
523 if c == '\n' {
524 break;
525 }
526 if c == '#' && self.peek_at(1) == Some('#') {
527 found_close = true;
528 break;
529 }
530 self.advance();
531 }
532 if found_close {
533 let value: String = self.input[inner_start..self.pos].iter().collect();
534 self.advance(); self.advance(); return Ok(Token::with_quote(
537 TokenType::Identifier,
538 value,
539 start,
540 start_line,
541 start_col,
542 '#',
543 ));
544 }
545 self.pos = save_pos;
547 self.line = save_line;
548 self.col = save_col;
549 let mut value = String::from("#");
550 while self.peek().is_some_and(|c| c != '\n') {
551 value.push(self.advance().unwrap());
552 }
553 Ok(
554 self.make_token(
555 TokenType::LineComment,
556 value,
557 start,
558 start_line,
559 start_col,
560 ),
561 )
562 } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
563 let mut value = String::from("#");
566 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
567 value.push(self.advance().unwrap());
568 }
569 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
570 } else {
571 let mut value = String::from("#");
572 while self.peek().is_some_and(|c| c != '\n') {
573 value.push(self.advance().unwrap());
574 }
575 Ok(
576 self.make_token(
577 TokenType::LineComment,
578 value,
579 start,
580 start_line,
581 start_col,
582 ),
583 )
584 }
585 }
586
587 '\'' => self.read_string(start, start_line, start_col),
589
590 c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
592
593 c if is_identifier_start(c) => self.read_identifier(start, start_line, start_col, c),
595
596 '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
598
599 '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
601
602 '$' => {
604 if self.peek() == Some('$') {
609 self.advance(); let mut value = String::new();
611 while let Some(c) = self.peek() {
612 if c == '$' && self.peek_at(1) == Some('$') {
613 self.advance();
614 self.advance();
615 return Ok(self.make_token(
616 TokenType::String,
617 value,
618 start,
619 start_line,
620 start_col,
621 ));
622 }
623 value.push(self.advance().unwrap());
624 }
625 return Ok(self.make_token(
627 TokenType::String,
628 value,
629 start,
630 start_line,
631 start_col,
632 ));
633 }
634 if self.peek().is_some_and(is_identifier_start) {
639 let save_pos = self.pos;
640 let save_line = self.line;
641 let save_col = self.col;
642 let mut tag = String::new();
643 while self.peek().is_some_and(is_identifier_continue) {
644 tag.push(self.advance().unwrap());
645 }
646 if self.peek() == Some('$') {
647 self.advance();
648 let mut value = String::new();
650 let mut closed = false;
651 while let Some(c) = self.peek() {
652 if c == '$' {
653 let mut matched = true;
655 for (i, ch) in tag.chars().enumerate() {
656 if self.peek_at(i + 1) != Some(ch) {
657 matched = false;
658 break;
659 }
660 }
661 if matched && self.peek_at(tag.len() + 1) == Some('$') {
662 for _ in 0..(tag.len() + 2) {
664 self.advance();
665 }
666 closed = true;
667 break;
668 }
669 }
670 value.push(self.advance().unwrap());
671 }
672 if closed {
673 return Ok(self.make_token(
674 TokenType::String,
675 value,
676 start,
677 start_line,
678 start_col,
679 ));
680 }
681 }
682 self.pos = save_pos;
685 self.line = save_line;
686 self.col = save_col;
687 }
688 if self.peek() == Some('{') {
689 let mut value = String::from("$");
692 value.push(self.advance().unwrap()); while let Some(c) = self.peek() {
694 value.push(self.advance().unwrap());
695 if c == '}' {
696 break;
697 }
698 }
699 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
700 } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
701 let mut value = String::from("$");
702 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
703 value.push(self.advance().unwrap());
704 }
705 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
706 } else if self.peek().is_some_and(is_identifier_start) {
707 let mut value = String::from("$");
714 while self.peek().is_some_and(is_identifier_continue) {
715 value.push(self.advance().unwrap());
716 }
717 Ok(self.make_token(TokenType::Identifier, value, start, start_line, start_col))
718 } else {
719 Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
720 }
721 }
722
723 '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
724
725 _ => Err(SqlglotError::TokenizerError {
726 message: format!("Unexpected character: {ch}"),
727 position: start,
728 }),
729 }
730 }
731
732 fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
733 let mut value = String::new();
734 loop {
735 match self.advance() {
736 Some('\'') => {
737 if self.peek() == Some('\'') {
738 self.advance();
739 value.push('\'');
740 } else {
741 return Ok(self.make_token(
742 TokenType::String,
743 value,
744 start,
745 start_line,
746 start_col,
747 ));
748 }
749 }
750 Some('\\') => match self.peek() {
751 Some('\\') => {
752 self.advance();
753 value.push('\\');
754 }
755 Some('n') => {
756 self.advance();
757 value.push('\n');
758 }
759 Some('t') => {
760 self.advance();
761 value.push('\t');
762 }
763 Some('r') => {
764 self.advance();
765 value.push('\r');
766 }
767 Some('\'') => {
768 self.advance();
769 value.push('\'');
770 }
771 Some('"') => {
772 self.advance();
773 value.push('"');
774 }
775 Some('0') => {
776 self.advance();
777 value.push('\0');
778 }
779 Some('b') => {
780 self.advance();
781 value.push('\u{0008}');
782 }
783 Some('f') => {
784 self.advance();
785 value.push('\u{000C}');
786 }
787 Some('v') => {
788 self.advance();
789 value.push('\u{000B}');
790 }
791 Some('a') => {
792 self.advance();
793 value.push('\u{0007}');
794 }
795 Some(c) if c.is_ascii_alphanumeric() || c == '?' => {
796 self.advance();
800 value.push('\\');
801 value.push(c);
802 }
803 _ => {
804 value.push('\\');
805 }
806 },
807 Some(c) => value.push(c),
808 None => {
809 return Err(SqlglotError::TokenizerError {
810 message: "Unterminated string literal".into(),
811 position: start,
812 });
813 }
814 }
815 }
816 }
817
818 fn read_number(
819 &mut self,
820 start: usize,
821 start_line: usize,
822 start_col: usize,
823 first: char,
824 ) -> Result<Token> {
825 let mut value = String::new();
826 value.push(first);
827
828 if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
829 value.push(self.advance().unwrap());
830 while self
831 .peek()
832 .is_some_and(|c| c.is_ascii_hexdigit() || c == '_')
833 {
834 value.push(self.advance().unwrap());
835 }
836 if self.peek().is_some_and(|c| c == 'p' || c == 'P') {
839 value.push(self.advance().unwrap());
840 if self.peek().is_some_and(|c| c == '+' || c == '-') {
841 value.push(self.advance().unwrap());
842 }
843 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
844 value.push(self.advance().unwrap());
845 }
846 }
847 return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
848 }
849
850 while self.peek().is_some_and(|c| c.is_ascii_digit() || c == '_') {
851 value.push(self.advance().unwrap());
852 }
853
854 if self.peek() == Some('.')
855 && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
856 || !self.peek_at(1).is_some_and(is_identifier_start))
857 {
858 value.push(self.advance().unwrap());
859 while self.peek().is_some_and(|c| c.is_ascii_digit() || c == '_') {
860 value.push(self.advance().unwrap());
861 }
862 }
863
864 if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
865 value.push(self.advance().unwrap());
866 if self.peek().is_some_and(|c| c == '+' || c == '-') {
867 value.push(self.advance().unwrap());
868 }
869 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
870 value.push(self.advance().unwrap());
871 }
872 }
873
874 if !value.contains('.')
879 && !value.contains('e')
880 && !value.contains('E')
881 && self.peek().is_some_and(is_identifier_continue)
882 {
883 while self.peek().is_some_and(is_identifier_continue) {
884 value.push(self.advance().unwrap());
885 }
886 let token_type = Self::keyword_type(&value);
887 return Ok(self.make_token(token_type, value, start, start_line, start_col));
888 }
889
890 Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
891 }
892
893 fn read_identifier(
894 &mut self,
895 start: usize,
896 start_line: usize,
897 start_col: usize,
898 first: char,
899 ) -> Result<Token> {
900 let mut value = String::new();
901 value.push(first);
902 while self.peek().is_some_and(is_identifier_continue) {
903 if self.peek() == Some('$') {
907 let next = self.peek_at(1);
908 if matches!(next, Some('{')) || next.is_some_and(|c| c.is_ascii_digit()) {
909 break;
910 }
911 }
912 value.push(self.advance().unwrap());
913 }
914
915 if value.len() == 1
918 && value
919 .as_bytes()
920 .first()
921 .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
922 && self.peek() == Some('\'')
923 {
924 self.advance(); let mut token = self.read_string(start, start_line, start_col)?;
926 token.token_type = TokenType::NationalString;
927 return Ok(token);
928 }
929
930 if value.len() == 1
939 && value
940 .as_bytes()
941 .first()
942 .is_some_and(|b| matches!(b.to_ascii_uppercase(), b'E' | b'B' | b'X'))
943 && self.peek() == Some('\'')
944 {
945 self.advance();
946 return self.read_string(start, start_line, start_col);
947 }
948 if value.len() == 1
950 && value
951 .as_bytes()
952 .first()
953 .is_some_and(|b| b.eq_ignore_ascii_case(&b'u'))
954 && self.peek() == Some('&')
955 && self.peek_at(1) == Some('\'')
956 {
957 self.advance(); self.advance(); return self.read_string(start, start_line, start_col);
960 }
961
962 let token_type = Self::keyword_type(&value);
963 Ok(self.make_token(token_type, value, start, start_line, start_col))
964 }
965
966 fn keyword_type(word: &str) -> TokenType {
968 match word.to_uppercase().as_str() {
969 "SELECT" => TokenType::Select,
970 "FROM" => TokenType::From,
971 "WHERE" => TokenType::Where,
972 "AND" => TokenType::And,
973 "OR" => TokenType::Or,
974 "NOT" => TokenType::Not,
975 "AS" => TokenType::As,
976 "JOIN" => TokenType::Join,
977 "INNER" => TokenType::Inner,
978 "LEFT" => TokenType::Left,
979 "RIGHT" => TokenType::Right,
980 "FULL" => TokenType::Full,
981 "OUTER" => TokenType::Outer,
982 "CROSS" => TokenType::Cross,
983 "ON" => TokenType::On,
984 "INSERT" => TokenType::Insert,
985 "INTO" => TokenType::Into,
986 "VALUES" => TokenType::Values,
987 "UPDATE" => TokenType::Update,
988 "SET" => TokenType::Set,
989 "DELETE" => TokenType::Delete,
990 "CREATE" => TokenType::Create,
991 "TABLE" => TokenType::Table,
992 "DROP" => TokenType::Drop,
993 "ALTER" => TokenType::Alter,
994 "INDEX" => TokenType::Index,
995 "IF" => TokenType::If,
996 "EXISTS" => TokenType::Exists,
997 "IN" => TokenType::In,
998 "IS" => TokenType::Is,
999 "NULL" => TokenType::Null,
1000 "LIKE" => TokenType::Like,
1001 "ILIKE" => TokenType::ILike,
1002 "ESCAPE" => TokenType::Escape,
1003 "BETWEEN" => TokenType::Between,
1004 "CASE" => TokenType::Case,
1005 "WHEN" => TokenType::When,
1006 "THEN" => TokenType::Then,
1007 "ELSE" => TokenType::Else,
1008 "END" => TokenType::End,
1009 "ORDER" => TokenType::Order,
1010 "BY" => TokenType::By,
1011 "ASC" => TokenType::Asc,
1012 "DESC" => TokenType::Desc,
1013 "GROUP" => TokenType::Group,
1014 "HAVING" => TokenType::Having,
1015 "LIMIT" => TokenType::Limit,
1016 "OFFSET" => TokenType::Offset,
1017 "UNION" => TokenType::Union,
1018 "ALL" => TokenType::All,
1019 "DISTINCT" => TokenType::Distinct,
1020 "TRUE" => TokenType::True,
1021 "FALSE" => TokenType::False,
1022 "INTERSECT" => TokenType::Intersect,
1023 "EXCEPT" => TokenType::Except,
1024 "WITH" => TokenType::With,
1025 "RECURSIVE" => TokenType::Recursive,
1026 "ANY" => TokenType::Any,
1027 "SOME" => TokenType::Some,
1028 "CAST" => TokenType::Cast,
1029 "OVER" => TokenType::Over,
1030 "PARTITION" => TokenType::Partition,
1031 "WINDOW" => TokenType::Window,
1032 "ROWS" => TokenType::Rows,
1033 "RANGE" => TokenType::Range,
1034 "UNBOUNDED" => TokenType::Unbounded,
1035 "PRECEDING" => TokenType::Preceding,
1036 "FOLLOWING" => TokenType::Following,
1037 "FILTER" => TokenType::Filter,
1038 "INT" => TokenType::Int,
1039 "INTEGER" => TokenType::Integer,
1040 "BIGINT" => TokenType::BigInt,
1041 "SMALLINT" => TokenType::SmallInt,
1042 "TINYINT" => TokenType::TinyInt,
1043 "FLOAT" => TokenType::Float,
1044 "DOUBLE" => TokenType::Double,
1045 "DECIMAL" => TokenType::Decimal,
1046 "NUMERIC" => TokenType::Numeric,
1047 "REAL" => TokenType::Real,
1048 "VARCHAR" => TokenType::Varchar,
1049 "CHAR" | "CHARACTER" => TokenType::Char,
1050 "TEXT" => TokenType::Text,
1051 "BOOLEAN" | "BOOL" => TokenType::Boolean,
1052 "DATE" => TokenType::Date,
1053 "TIMESTAMP" => TokenType::Timestamp,
1054 "TIMESTAMPTZ" => TokenType::TimestampTz,
1055 "TIME" => TokenType::Time,
1056 "INTERVAL" => TokenType::Interval,
1057 "BLOB" => TokenType::Blob,
1058 "BYTEA" => TokenType::Bytea,
1059 "JSON" => TokenType::Json,
1060 "JSONB" => TokenType::Jsonb,
1061 "UUID" => TokenType::Uuid,
1062 "ARRAY" => TokenType::Array,
1063 "MAP" => TokenType::Map,
1064 "STRUCT" => TokenType::Struct,
1065 "PRIMARY" => TokenType::Primary,
1066 "KEY" => TokenType::Key,
1067 "FOREIGN" => TokenType::Foreign,
1068 "REFERENCES" => TokenType::References,
1069 "UNIQUE" => TokenType::Unique,
1070 "CHECK" => TokenType::Check,
1071 "DEFAULT" => TokenType::Default,
1072 "CONSTRAINT" => TokenType::Constraint,
1073 "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
1074 "CASCADE" => TokenType::Cascade,
1075 "RESTRICT" => TokenType::Restrict,
1076 "RETURNING" => TokenType::Returning,
1077 "CONFLICT" => TokenType::Conflict,
1078 "DO" => TokenType::Do,
1079 "NOTHING" => TokenType::Nothing,
1080 "REPLACE" => TokenType::Replace,
1081 "IGNORE" => TokenType::Ignore,
1082 "MERGE" => TokenType::Merge,
1083 "MATCHED" => TokenType::Matched,
1084 "USING" => TokenType::Using,
1085 "TRUNCATE" => TokenType::Truncate,
1086 "SCHEMA" => TokenType::Schema,
1087 "DATABASE" => TokenType::Database,
1088 "VIEW" => TokenType::View,
1089 "MATERIALIZED" => TokenType::Materialized,
1090 "TEMPORARY" => TokenType::Temporary,
1091 "TEMP" => TokenType::Temp,
1092 "BEGIN" => TokenType::Begin,
1093 "COMMIT" => TokenType::Commit,
1094 "ROLLBACK" => TokenType::Rollback,
1095 "SAVEPOINT" => TokenType::Savepoint,
1096 "TRANSACTION" => TokenType::Transaction,
1097 "EXPLAIN" => TokenType::Explain,
1098 "ANALYZE" => TokenType::Analyze,
1099 "SHOW" => TokenType::Show,
1100 "USE" => TokenType::Use,
1101 "GRANT" => TokenType::Grant,
1102 "REVOKE" => TokenType::Revoke,
1103 "LATERAL" => TokenType::Lateral,
1104 "UNNEST" => TokenType::Unnest,
1105 "PIVOT" => TokenType::Pivot,
1106 "UNPIVOT" => TokenType::Unpivot,
1107 "TABLESAMPLE" => TokenType::Tablesample,
1108 "FETCH" => TokenType::Fetch,
1109 "FIRST" => TokenType::First,
1110 "NEXT" => TokenType::Next,
1111 "ONLY" => TokenType::Only,
1112 "NULLS" => TokenType::Nulls,
1113 "RESPECT" => TokenType::Respect,
1114 "TOP" => TokenType::Top,
1115 "COLLATE" => TokenType::Collate,
1116 "QUALIFY" => TokenType::Qualify,
1117 "CUBE" => TokenType::Cube,
1118 "ROLLUP" => TokenType::Rollup,
1119 "GROUPING" => TokenType::Grouping,
1120 "SETS" => TokenType::Sets,
1121 "XOR" => TokenType::Xor,
1122 "EXTRACT" => TokenType::Extract,
1123 "EPOCH" => TokenType::Epoch,
1124 "YEAR" => TokenType::Year,
1125 "MONTH" => TokenType::Month,
1126 "DAY" => TokenType::Day,
1127 "HOUR" => TokenType::Hour,
1128 "MINUTE" => TokenType::Minute,
1129 "SECOND" => TokenType::Second,
1130 _ => TokenType::Identifier,
1131 }
1132 }
1133
1134 fn read_quoted_identifier(
1135 &mut self,
1136 start: usize,
1137 start_line: usize,
1138 start_col: usize,
1139 quote: char,
1140 ) -> Result<Token> {
1141 let end_char = if quote == '[' { ']' } else { quote };
1142 let mut value = String::new();
1143 loop {
1144 match self.advance() {
1145 Some(c) if c == end_char => {
1146 if self.peek() == Some(end_char) && end_char != ']' {
1147 self.advance();
1148 value.push(end_char);
1149 } else {
1150 return Ok(Token::with_quote(
1151 TokenType::Identifier,
1152 value,
1153 start,
1154 start_line,
1155 start_col,
1156 quote,
1157 ));
1158 }
1159 }
1160 Some(c) => value.push(c),
1161 None => {
1162 return Err(SqlglotError::TokenizerError {
1163 message: format!("Unterminated quoted identifier (expected {end_char})"),
1164 position: start,
1165 });
1166 }
1167 }
1168 }
1169 }
1170}
1171
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `sql` with comments discarded, panicking on tokenizer errors.
    fn lex(sql: &str) -> Vec<Token> {
        Tokenizer::new(sql).tokenize().unwrap()
    }

    /// Tokenizes `sql` with comments preserved, panicking on tokenizer errors.
    fn lex_keeping_comments(sql: &str) -> Vec<Token> {
        Tokenizer::with_comments(sql).tokenize().unwrap()
    }

    /// Asserts that the first token of `sql` is a national string with `expected` as its value.
    fn assert_national_string(sql: &str, expected: &str) {
        let tokens = lex(sql);
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].value, expected);
    }

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex("SELECT a, b FROM t");
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Identifier);
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Identifier);
        assert_eq!(tokens[5].value, "t");
        assert_eq!(tokens[6].token_type, TokenType::Eof);
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex("'hello world'");
        assert_eq!(tokens[0].token_type, TokenType::String);
        assert_eq!(tokens[0].value, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex("a >= 1 AND b != 2");
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex("123.45");
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].value, "123.45");
    }

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex_keeping_comments("SELECT 1 -- comment\nFROM t");
        let has_line_comment = tokens
            .iter()
            .any(|t| t.token_type == TokenType::LineComment);
        assert!(has_line_comment);
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex_keeping_comments("SELECT /* hello */ 1");
        let has_block_comment = tokens
            .iter()
            .any(|t| t.token_type == TokenType::BlockComment);
        assert!(has_block_comment);
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex("WITH cte AS (SELECT 1) SELECT * FROM cte");
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex("x::int");
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex("CAST(x AS INT)");
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    #[test]
    fn test_line_tracking() {
        let tokens = lex("SELECT\n 1");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex("UNION INTERSECT EXCEPT");
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        assert_national_string("N'Hello'", "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        assert_national_string("n'hello'", "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        assert_national_string("N'can''t stop'", "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        assert_national_string("N'テスト'", "テスト");
    }

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        let tokens = lex("SELECT N FROM t");
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        let tokens = lex("SELECT NAME FROM t");
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "NAME");
    }
}