/// Lexical tokens produced by the SQL lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Keywords ---
    Select,
    From,
    Where,
    With,
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,
    DateTime,
    Case,
    When,
    Then,
    Else,
    End,
    Distinct,
    Over,
    Partition,
    By,
    Rows,
    Range,
    Unbounded,
    Preceding,
    Following,
    Current,
    Row,
    Union,
    Intersect,
    Except,
    Web,
    Unnest,
    Join,
    Inner,
    Left,
    Right,
    Full,
    Outer,
    On,
    Cross,
    // --- Tokens that carry source text ---
    Identifier(String),
    QuotedIdentifier(String),
    StringLiteral(String),
    JsonBlock(String),
    NumberLiteral(String),
    // --- Punctuation and operators ---
    Star,
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,
    Plus,
    Minus,
    Divide,
    Modulo,
    Concat,
    // --- Comments (emitted only by the comment-preserving tokenizer) ---
    LineComment(String),
    BlockComment(String),
    Eof,
}
108
109impl Token {
110 pub fn from_keyword(s: &str) -> Option<Token> {
112 match s.to_uppercase().as_str() {
113 "SELECT" => Some(Token::Select),
114 "FROM" => Some(Token::From),
115 "WHERE" => Some(Token::Where),
116 "WITH" => Some(Token::With),
117 "AND" => Some(Token::And),
118 "OR" => Some(Token::Or),
119 "IN" => Some(Token::In),
120 "NOT" => Some(Token::Not),
121 "BETWEEN" => Some(Token::Between),
122 "LIKE" => Some(Token::Like),
123 "IS" => Some(Token::Is),
124 "NULL" => Some(Token::Null),
125 "ORDER" => Some(Token::OrderBy),
126 "GROUP" => Some(Token::GroupBy),
127 "HAVING" => Some(Token::Having),
128 "AS" => Some(Token::As),
129 "ASC" => Some(Token::Asc),
130 "DESC" => Some(Token::Desc),
131 "LIMIT" => Some(Token::Limit),
132 "OFFSET" => Some(Token::Offset),
133 "INTO" => Some(Token::Into),
134 "DISTINCT" => Some(Token::Distinct),
135 "CASE" => Some(Token::Case),
136 "WHEN" => Some(Token::When),
137 "THEN" => Some(Token::Then),
138 "ELSE" => Some(Token::Else),
139 "END" => Some(Token::End),
140 "OVER" => Some(Token::Over),
141 "PARTITION" => Some(Token::Partition),
142 "BY" => Some(Token::By),
143 "ROWS" => Some(Token::Rows),
144 "RANGE" => Some(Token::Range),
145 "UNBOUNDED" => Some(Token::Unbounded),
146 "PRECEDING" => Some(Token::Preceding),
147 "FOLLOWING" => Some(Token::Following),
148 "CURRENT" => Some(Token::Current),
149 "ROW" => Some(Token::Row),
150 "UNION" => Some(Token::Union),
151 "INTERSECT" => Some(Token::Intersect),
152 "EXCEPT" => Some(Token::Except),
153 "WEB" => Some(Token::Web),
154 "UNNEST" => Some(Token::Unnest),
155 "JOIN" => Some(Token::Join),
156 "INNER" => Some(Token::Inner),
157 "LEFT" => Some(Token::Left),
158 "RIGHT" => Some(Token::Right),
159 "FULL" => Some(Token::Full),
160 "OUTER" => Some(Token::Outer),
161 "ON" => Some(Token::On),
162 "CROSS" => Some(Token::Cross),
163 _ => None,
164 }
165 }
166
167 pub fn is_logical_operator(&self) -> bool {
169 matches!(self, Token::And | Token::Or)
170 }
171
172 pub fn is_join_type(&self) -> bool {
174 matches!(
175 self,
176 Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
177 )
178 }
179
180 pub fn is_clause_terminator(&self) -> bool {
182 matches!(
183 self,
184 Token::OrderBy
185 | Token::GroupBy
186 | Token::Having
187 | Token::Limit
188 | Token::Offset
189 | Token::Union
190 | Token::Intersect
191 | Token::Except
192 )
193 }
194
195 pub fn as_keyword_str(&self) -> Option<&'static str> {
197 match self {
198 Token::Select => Some("SELECT"),
199 Token::From => Some("FROM"),
200 Token::Where => Some("WHERE"),
201 Token::With => Some("WITH"),
202 Token::And => Some("AND"),
203 Token::Or => Some("OR"),
204 Token::OrderBy => Some("ORDER BY"),
205 Token::GroupBy => Some("GROUP BY"),
206 Token::Having => Some("HAVING"),
207 _ => None,
209 }
210 }
211}
212
/// Character-based lexer over a SQL string.
///
/// All positions are character indices into `input`, not byte offsets.
#[derive(Debug, Clone)]
pub struct Lexer {
    input: Vec<char>,           // source text, materialized as chars
    position: usize,            // index of `current_char` within `input`
    current_char: Option<char>, // char at `position`; `None` once past the end
}
219
220impl Lexer {
221 #[must_use]
222 pub fn new(input: &str) -> Self {
223 let chars: Vec<char> = input.chars().collect();
224 let current = chars.first().copied();
225 Self {
226 input: chars,
227 position: 0,
228 current_char: current,
229 }
230 }
231
232 fn advance(&mut self) {
233 self.position += 1;
234 self.current_char = self.input.get(self.position).copied();
235 }
236
237 fn peek(&self, offset: usize) -> Option<char> {
238 self.input.get(self.position + offset).copied()
239 }
240
241 fn peek_string(&self, n: usize) -> String {
243 let mut result = String::new();
244 for i in 0..n {
245 if let Some(ch) = self.input.get(self.position + i) {
246 result.push(*ch);
247 } else {
248 break;
249 }
250 }
251 result
252 }
253
254 fn read_json_block(&mut self) -> String {
257 let mut result = String::new();
258
259 for _ in 0..6 {
261 self.advance();
262 }
263
264 while let Some(ch) = self.current_char {
266 if ch == '$' && self.peek_string(6) == "$JSON$" {
268 for _ in 0..6 {
270 self.advance();
271 }
272 break;
273 }
274 result.push(ch);
275 self.advance();
276 }
277
278 result
279 }
280
281 fn skip_whitespace(&mut self) {
282 while let Some(ch) = self.current_char {
283 if ch.is_whitespace() {
284 self.advance();
285 } else {
286 break;
287 }
288 }
289 }
290
291 fn read_line_comment(&mut self) -> String {
293 let mut result = String::new();
294
295 self.advance();
297 self.advance();
298
299 while let Some(ch) = self.current_char {
301 if ch == '\n' {
302 self.advance(); break;
304 }
305 result.push(ch);
306 self.advance();
307 }
308
309 result
310 }
311
312 fn read_block_comment(&mut self) -> String {
314 let mut result = String::new();
315
316 self.advance();
318 self.advance();
319
320 while let Some(ch) = self.current_char {
322 if ch == '*' && self.peek(1) == Some('/') {
323 self.advance(); self.advance(); break;
326 }
327 result.push(ch);
328 self.advance();
329 }
330
331 result
332 }
333
334 fn skip_whitespace_and_comments(&mut self) {
337 loop {
338 while let Some(ch) = self.current_char {
340 if ch.is_whitespace() {
341 self.advance();
342 } else {
343 break;
344 }
345 }
346
347 match self.current_char {
349 Some('-') if self.peek(1) == Some('-') => {
350 self.advance(); self.advance(); while let Some(ch) = self.current_char {
354 self.advance();
355 if ch == '\n' {
356 break;
357 }
358 }
359 }
360 Some('/') if self.peek(1) == Some('*') => {
361 self.advance(); self.advance(); while let Some(ch) = self.current_char {
365 if ch == '*' && self.peek(1) == Some('/') {
366 self.advance(); self.advance(); break;
369 }
370 self.advance();
371 }
372 }
373 _ => {
374 break;
376 }
377 }
378 }
379 }
380
381 fn read_identifier(&mut self) -> String {
382 let mut result = String::new();
383 while let Some(ch) = self.current_char {
384 if ch.is_alphanumeric() || ch == '_' {
385 result.push(ch);
386 self.advance();
387 } else {
388 break;
389 }
390 }
391 result
392 }
393
394 fn read_string(&mut self) -> String {
395 let mut result = String::new();
396 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
400 if ch == quote_char {
401 self.advance(); break;
403 }
404 result.push(ch);
405 self.advance();
406 }
407 result
408 }
409
410 fn read_number(&mut self) -> String {
411 let mut result = String::new();
412 let has_e = false;
413
414 while let Some(ch) = self.current_char {
416 if !has_e && (ch.is_numeric() || ch == '.') {
417 result.push(ch);
418 self.advance();
419 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
420 result.push(ch);
422 self.advance();
423 let _ = has_e; if let Some(sign) = self.current_char {
427 if sign == '+' || sign == '-' {
428 result.push(sign);
429 self.advance();
430 }
431 }
432
433 while let Some(digit) = self.current_char {
435 if digit.is_numeric() {
436 result.push(digit);
437 self.advance();
438 } else {
439 break;
440 }
441 }
442 break; } else {
444 break;
445 }
446 }
447 result
448 }
449
    /// Returns the next token, surfacing comments as `LineComment` /
    /// `BlockComment` tokens instead of discarding them.
    ///
    /// Intended for tooling (formatters, highlighters) that must round-trip
    /// comment text; `next_token` is the parser-facing variant that skips them.
    /// Returns `Token::Eof` at end of input.
    pub fn next_token_with_comments(&mut self) -> Token {
        // Only whitespace is skipped here — comments become tokens below.
        self.skip_whitespace();

        // NOTE: arm order matters, especially for '-' (comment vs negative
        // number vs minus) and '/' (comment vs divide).
        match self.current_char {
            None => Token::Eof,
            // "--" line comment: text up to (not including) the newline.
            Some('-') if self.peek(1) == Some('-') => {
                let comment_text = self.read_line_comment();
                Token::LineComment(comment_text)
            }
            // "/* ... */" block comment: inner text only.
            Some('/') if self.peek(1) == Some('*') => {
                let comment_text = self.read_block_comment();
                Token::BlockComment(comment_text)
            }
            Some('*') => {
                self.advance();
                Token::Star
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<', '<=', or the SQL not-equal spelling '<>'.
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            // '>' or '>='.
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            // "||" string concatenation.
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            // Double quotes delimit identifiers, not string literals.
            Some('"') => {
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                // "$JSON$ ... $JSON$" dollar-quoted JSON payload; any other
                // '$'-led run is treated as an identifier.
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            // A digit directly after '-' makes a negative number literal.
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                self.advance();
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            // '#'-prefixed (temp-table style) names keep the '#' in the
            // identifier; a lone '#' is passed through as-is.
            Some('#') => {
                self.advance();
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    Token::Identifier("#".to_string())
                } else {
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            // Words: keyword via the shared table, otherwise an identifier.
            // Note from_keyword maps ORDER/GROUP straight to OrderBy/GroupBy
            // here, without looking for a following BY.
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident))
            }
            // Unknown characters pass through as one-char identifiers rather
            // than producing an error.
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
588
589 pub fn next_token(&mut self) -> Token {
592 self.skip_whitespace_and_comments();
593
594 match self.current_char {
595 None => Token::Eof,
596 Some('*') => {
597 self.advance();
598 Token::Star }
602 Some('+') => {
603 self.advance();
604 Token::Plus
605 }
606 Some('/') => {
607 if self.peek(1) == Some('*') {
609 self.skip_whitespace_and_comments();
612 return self.next_token();
613 }
614 self.advance();
615 Token::Divide
616 }
617 Some('%') => {
618 self.advance();
619 Token::Modulo
620 }
621 Some('.') => {
622 self.advance();
623 Token::Dot
624 }
625 Some(',') => {
626 self.advance();
627 Token::Comma
628 }
629 Some(':') => {
630 self.advance();
631 Token::Colon
632 }
633 Some('(') => {
634 self.advance();
635 Token::LeftParen
636 }
637 Some(')') => {
638 self.advance();
639 Token::RightParen
640 }
641 Some('=') => {
642 self.advance();
643 Token::Equal
644 }
645 Some('<') => {
646 self.advance();
647 if self.current_char == Some('=') {
648 self.advance();
649 Token::LessThanOrEqual
650 } else if self.current_char == Some('>') {
651 self.advance();
652 Token::NotEqual
653 } else {
654 Token::LessThan
655 }
656 }
657 Some('>') => {
658 self.advance();
659 if self.current_char == Some('=') {
660 self.advance();
661 Token::GreaterThanOrEqual
662 } else {
663 Token::GreaterThan
664 }
665 }
666 Some('!') if self.peek(1) == Some('=') => {
667 self.advance();
668 self.advance();
669 Token::NotEqual
670 }
671 Some('|') if self.peek(1) == Some('|') => {
672 self.advance();
673 self.advance();
674 Token::Concat
675 }
676 Some('"') => {
677 let ident_val = self.read_string();
679 Token::QuotedIdentifier(ident_val)
680 }
681 Some('$') => {
682 if self.peek_string(6) == "$JSON$" {
684 let json_content = self.read_json_block();
685 Token::JsonBlock(json_content)
686 } else {
687 let ident = self.read_identifier();
690 Token::Identifier(ident)
691 }
692 }
693 Some('\'') => {
694 let string_val = self.read_string();
696 Token::StringLiteral(string_val)
697 }
698 Some('-') if self.peek(1) == Some('-') => {
699 self.skip_whitespace_and_comments();
701 self.next_token()
702 }
703 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
704 self.advance(); let num = self.read_number();
707 Token::NumberLiteral(format!("-{num}"))
708 }
709 Some('-') => {
710 self.advance();
712 Token::Minus
713 }
714 Some(ch) if ch.is_numeric() => {
715 let num = self.read_number();
716 Token::NumberLiteral(num)
717 }
718 Some('#') => {
719 self.advance(); let table_name = self.read_identifier();
722 if table_name.is_empty() {
723 Token::Identifier("#".to_string())
725 } else {
726 Token::Identifier(format!("#{}", table_name))
728 }
729 }
730 Some(ch) if ch.is_alphabetic() || ch == '_' => {
731 let ident = self.read_identifier();
732 match ident.to_uppercase().as_str() {
733 "SELECT" => Token::Select,
734 "FROM" => Token::From,
735 "WHERE" => Token::Where,
736 "WITH" => Token::With,
737 "AND" => Token::And,
738 "OR" => Token::Or,
739 "IN" => Token::In,
740 "NOT" => Token::Not,
741 "BETWEEN" => Token::Between,
742 "LIKE" => Token::Like,
743 "IS" => Token::Is,
744 "NULL" => Token::Null,
745 "ORDER" if self.peek_keyword("BY") => {
746 self.skip_whitespace();
747 self.read_identifier(); Token::OrderBy
749 }
750 "GROUP" if self.peek_keyword("BY") => {
751 self.skip_whitespace();
752 self.read_identifier(); Token::GroupBy
754 }
755 "HAVING" => Token::Having,
756 "AS" => Token::As,
757 "ASC" => Token::Asc,
758 "DESC" => Token::Desc,
759 "LIMIT" => Token::Limit,
760 "OFFSET" => Token::Offset,
761 "INTO" => Token::Into,
762 "DATETIME" => Token::DateTime,
763 "CASE" => Token::Case,
764 "WHEN" => Token::When,
765 "THEN" => Token::Then,
766 "ELSE" => Token::Else,
767 "END" => Token::End,
768 "DISTINCT" => Token::Distinct,
769 "OVER" => Token::Over,
770 "PARTITION" => Token::Partition,
771 "BY" => Token::By,
772 "ROWS" => Token::Rows,
774 "UNBOUNDED" => Token::Unbounded,
777 "PRECEDING" => Token::Preceding,
778 "FOLLOWING" => Token::Following,
779 "CURRENT" => Token::Current,
780 "ROW" => Token::Row,
781 "UNION" => Token::Union,
783 "INTERSECT" => Token::Intersect,
784 "EXCEPT" => Token::Except,
785 "WEB" => Token::Web,
787 "UNNEST" => Token::Unnest,
789 "JOIN" => Token::Join,
791 "INNER" => Token::Inner,
792 "LEFT" => Token::Left,
793 "RIGHT" => Token::Right,
794 "FULL" => Token::Full,
795 "OUTER" => Token::Outer,
796 "ON" => Token::On,
797 "CROSS" => Token::Cross,
798 _ => Token::Identifier(ident),
799 }
800 }
801 Some(ch) => {
802 self.advance();
803 Token::Identifier(ch.to_string())
804 }
805 }
806 }
807
808 fn peek_keyword(&mut self, keyword: &str) -> bool {
809 let saved_pos = self.position;
810 let saved_char = self.current_char;
811
812 self.skip_whitespace_and_comments();
813 let next_word = self.read_identifier();
814 let matches = next_word.to_uppercase() == keyword;
815
816 self.position = saved_pos;
818 self.current_char = saved_char;
819
820 matches
821 }
822
    /// Current cursor position, as a character index into the input
    /// (not a byte offset).
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
827
828 pub fn tokenize_all(&mut self) -> Vec<Token> {
829 let mut tokens = Vec::new();
830 loop {
831 let token = self.next_token();
832 if matches!(token, Token::Eof) {
833 tokens.push(token);
834 break;
835 }
836 tokens.push(token);
837 }
838 tokens
839 }
840
841 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
842 let mut tokens = Vec::new();
843 loop {
844 self.skip_whitespace_and_comments();
845 let start_pos = self.position;
846 let token = self.next_token();
847 let end_pos = self.position;
848
849 if matches!(token, Token::Eof) {
850 break;
851 }
852 tokens.push((start_pos, end_pos, token));
853 }
854 tokens
855 }
856
857 pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
860 let mut tokens = Vec::new();
861 loop {
862 let token = self.next_token_with_comments();
863 if matches!(token, Token::Eof) {
864 tokens.push(token);
865 break;
866 }
867 tokens.push(token);
868 }
869 tokens
870 }
871}
872
#[cfg(test)]
mod tests {
    use super::*;

    // The comment-preserving tokenizer should surface "--" comments as
    // LineComment tokens with the dashes and newline stripped.
    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let comment_token = tokens.iter().find(|t| matches!(t, Token::LineComment(_)));
        assert!(comment_token.is_some(), "Should find line comment token");

        if let Some(Token::LineComment(text)) = comment_token {
            assert_eq!(text.trim(), "this is a comment");
        }
    }

    // "/* ... */" comments should surface as BlockComment tokens holding the
    // inner text only (delimiters stripped).
    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let comment_token = tokens.iter().find(|t| matches!(t, Token::BlockComment(_)));
        assert!(comment_token.is_some(), "Should find block comment token");

        if let Some(Token::BlockComment(text)) = comment_token {
            assert_eq!(text.trim(), "block comment");
        }
    }

    // Mixed input: every comment should be captured, with the correct kind.
    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let line_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .collect();
        let block_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .collect();

        assert_eq!(line_comments.len(), 2, "Should find 2 line comments");
        assert_eq!(block_comments.len(), 1, "Should find 1 block comment");
    }

    // The parser-facing tokenize_all() must keep skipping comments entirely,
    // while still producing the surrounding keyword tokens.
    #[test]
    fn test_backwards_compatibility() {
        let sql = "SELECT -- comment\ncol1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all();

        let has_comments = tokens
            .iter()
            .any(|t| matches!(t, Token::LineComment(_) | Token::BlockComment(_)));
        assert!(
            !has_comments,
            "next_token() should skip comments for backwards compatibility"
        );

        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }
}