/// A single lexical token of the SQL dialect handled by this file.
///
/// Keyword variants carry no payload; the variants that wrap a `String`
/// hold the token's text as it was read from the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Core query keywords ----
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH`
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY`, produced as one combined token.
    OrderBy,
    /// `GROUP BY`, produced as one combined token.
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `INTO`
    Into,
    /// `DATETIME`
    DateTime,

    // ---- CASE expressions ----
    /// `CASE`
    Case,
    /// `WHEN`
    When,
    /// `THEN`
    Then,
    /// `ELSE`
    Else,
    /// `END`
    End,
    /// `DISTINCT`
    Distinct,

    // ---- Window / frame keywords ----
    /// `OVER`
    Over,
    /// `PARTITION`
    Partition,
    /// `BY` when it appears on its own.
    By,
    /// `ROWS`
    Rows,
    /// `RANGE`
    Range,
    /// `UNBOUNDED`
    Unbounded,
    /// `PRECEDING`
    Preceding,
    /// `FOLLOWING`
    Following,
    /// `CURRENT`
    Current,
    /// `ROW`
    Row,

    // ---- Set operations ----
    /// `UNION`
    Union,
    /// `INTERSECT`
    Intersect,
    /// `EXCEPT`
    Except,

    // ---- Sources and joins ----
    /// `WEB`
    Web,
    /// `UNNEST`
    Unnest,
    /// `JOIN`
    Join,
    /// `INNER`
    Inner,
    /// `LEFT`
    Left,
    /// `RIGHT`
    Right,
    /// `FULL`
    Full,
    /// `OUTER`
    Outer,
    /// `ON`
    On,
    /// `CROSS`
    Cross,

    // ---- Tokens that carry text ----
    /// Bare identifier text. The lexer also uses this variant for
    /// `#`-prefixed names and for any otherwise unrecognised character.
    Identifier(String),
    /// Text found between double quotes (quotes not included).
    QuotedIdentifier(String),
    /// Text found between single quotes (quotes not included).
    StringLiteral(String),
    /// Raw text found between a pair of `$JSON$` delimiters.
    JsonBlock(String),
    /// Numeric literal kept as source text, including a leading `-` when
    /// the lexer folds the sign into the number.
    NumberLiteral(String),

    // ---- Punctuation ----
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ---- Comparison operators ----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ---- Arithmetic operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // ---- Miscellaneous ----
    /// `||`
    Concat,
    /// End of input.
    Eof,
}
104
105impl Token {
106 pub fn from_keyword(s: &str) -> Option<Token> {
108 match s.to_uppercase().as_str() {
109 "SELECT" => Some(Token::Select),
110 "FROM" => Some(Token::From),
111 "WHERE" => Some(Token::Where),
112 "WITH" => Some(Token::With),
113 "AND" => Some(Token::And),
114 "OR" => Some(Token::Or),
115 "IN" => Some(Token::In),
116 "NOT" => Some(Token::Not),
117 "BETWEEN" => Some(Token::Between),
118 "LIKE" => Some(Token::Like),
119 "IS" => Some(Token::Is),
120 "NULL" => Some(Token::Null),
121 "ORDER" => Some(Token::OrderBy),
122 "GROUP" => Some(Token::GroupBy),
123 "HAVING" => Some(Token::Having),
124 "AS" => Some(Token::As),
125 "ASC" => Some(Token::Asc),
126 "DESC" => Some(Token::Desc),
127 "LIMIT" => Some(Token::Limit),
128 "OFFSET" => Some(Token::Offset),
129 "INTO" => Some(Token::Into),
130 "DISTINCT" => Some(Token::Distinct),
131 "CASE" => Some(Token::Case),
132 "WHEN" => Some(Token::When),
133 "THEN" => Some(Token::Then),
134 "ELSE" => Some(Token::Else),
135 "END" => Some(Token::End),
136 "OVER" => Some(Token::Over),
137 "PARTITION" => Some(Token::Partition),
138 "BY" => Some(Token::By),
139 "ROWS" => Some(Token::Rows),
140 "RANGE" => Some(Token::Range),
141 "UNBOUNDED" => Some(Token::Unbounded),
142 "PRECEDING" => Some(Token::Preceding),
143 "FOLLOWING" => Some(Token::Following),
144 "CURRENT" => Some(Token::Current),
145 "ROW" => Some(Token::Row),
146 "UNION" => Some(Token::Union),
147 "INTERSECT" => Some(Token::Intersect),
148 "EXCEPT" => Some(Token::Except),
149 "WEB" => Some(Token::Web),
150 "UNNEST" => Some(Token::Unnest),
151 "JOIN" => Some(Token::Join),
152 "INNER" => Some(Token::Inner),
153 "LEFT" => Some(Token::Left),
154 "RIGHT" => Some(Token::Right),
155 "FULL" => Some(Token::Full),
156 "OUTER" => Some(Token::Outer),
157 "ON" => Some(Token::On),
158 "CROSS" => Some(Token::Cross),
159 _ => None,
160 }
161 }
162
163 pub fn is_logical_operator(&self) -> bool {
165 matches!(self, Token::And | Token::Or)
166 }
167
168 pub fn is_join_type(&self) -> bool {
170 matches!(
171 self,
172 Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
173 )
174 }
175
176 pub fn is_clause_terminator(&self) -> bool {
178 matches!(
179 self,
180 Token::OrderBy
181 | Token::GroupBy
182 | Token::Having
183 | Token::Limit
184 | Token::Offset
185 | Token::Union
186 | Token::Intersect
187 | Token::Except
188 )
189 }
190
191 pub fn as_keyword_str(&self) -> Option<&'static str> {
193 match self {
194 Token::Select => Some("SELECT"),
195 Token::From => Some("FROM"),
196 Token::Where => Some("WHERE"),
197 Token::With => Some("WITH"),
198 Token::And => Some("AND"),
199 Token::Or => Some("OR"),
200 Token::OrderBy => Some("ORDER BY"),
201 Token::GroupBy => Some("GROUP BY"),
202 Token::Having => Some("HAVING"),
203 _ => None,
205 }
206 }
207}
208
/// Character-at-a-time scanner over an input string.
///
/// The input is stored as a vector of `char`s, so `position` counts
/// characters rather than bytes.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, one element per `char`.
    input: Vec<char>,
    /// Index into `input` of the character currently being examined.
    position: usize,
    /// The character at `position`, or `None` once the input is exhausted.
    current_char: Option<char>,
}
215
216impl Lexer {
217 #[must_use]
218 pub fn new(input: &str) -> Self {
219 let chars: Vec<char> = input.chars().collect();
220 let current = chars.first().copied();
221 Self {
222 input: chars,
223 position: 0,
224 current_char: current,
225 }
226 }
227
228 fn advance(&mut self) {
229 self.position += 1;
230 self.current_char = self.input.get(self.position).copied();
231 }
232
233 fn peek(&self, offset: usize) -> Option<char> {
234 self.input.get(self.position + offset).copied()
235 }
236
237 fn peek_string(&self, n: usize) -> String {
239 let mut result = String::new();
240 for i in 0..n {
241 if let Some(ch) = self.input.get(self.position + i) {
242 result.push(*ch);
243 } else {
244 break;
245 }
246 }
247 result
248 }
249
250 fn read_json_block(&mut self) -> String {
253 let mut result = String::new();
254
255 for _ in 0..6 {
257 self.advance();
258 }
259
260 while let Some(ch) = self.current_char {
262 if ch == '$' && self.peek_string(6) == "$JSON$" {
264 for _ in 0..6 {
266 self.advance();
267 }
268 break;
269 }
270 result.push(ch);
271 self.advance();
272 }
273
274 result
275 }
276
277 fn skip_whitespace(&mut self) {
278 while let Some(ch) = self.current_char {
279 if ch.is_whitespace() {
280 self.advance();
281 } else {
282 break;
283 }
284 }
285 }
286
287 fn skip_whitespace_and_comments(&mut self) {
288 loop {
289 while let Some(ch) = self.current_char {
291 if ch.is_whitespace() {
292 self.advance();
293 } else {
294 break;
295 }
296 }
297
298 match self.current_char {
300 Some('-') if self.peek(1) == Some('-') => {
301 self.advance(); self.advance(); while let Some(ch) = self.current_char {
305 self.advance();
306 if ch == '\n' {
307 break;
308 }
309 }
310 }
311 Some('/') if self.peek(1) == Some('*') => {
312 self.advance(); self.advance(); while let Some(ch) = self.current_char {
316 if ch == '*' && self.peek(1) == Some('/') {
317 self.advance(); self.advance(); break;
320 }
321 self.advance();
322 }
323 }
324 _ => {
325 break;
327 }
328 }
329 }
330 }
331
332 fn read_identifier(&mut self) -> String {
333 let mut result = String::new();
334 while let Some(ch) = self.current_char {
335 if ch.is_alphanumeric() || ch == '_' {
336 result.push(ch);
337 self.advance();
338 } else {
339 break;
340 }
341 }
342 result
343 }
344
345 fn read_string(&mut self) -> String {
346 let mut result = String::new();
347 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
351 if ch == quote_char {
352 self.advance(); break;
354 }
355 result.push(ch);
356 self.advance();
357 }
358 result
359 }
360
361 fn read_number(&mut self) -> String {
362 let mut result = String::new();
363 let has_e = false;
364
365 while let Some(ch) = self.current_char {
367 if !has_e && (ch.is_numeric() || ch == '.') {
368 result.push(ch);
369 self.advance();
370 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
371 result.push(ch);
373 self.advance();
374 let _ = has_e; if let Some(sign) = self.current_char {
378 if sign == '+' || sign == '-' {
379 result.push(sign);
380 self.advance();
381 }
382 }
383
384 while let Some(digit) = self.current_char {
386 if digit.is_numeric() {
387 result.push(digit);
388 self.advance();
389 } else {
390 break;
391 }
392 }
393 break; } else {
395 break;
396 }
397 }
398 result
399 }
400
401 pub fn next_token(&mut self) -> Token {
402 self.skip_whitespace_and_comments();
403
404 match self.current_char {
405 None => Token::Eof,
406 Some('*') => {
407 self.advance();
408 Token::Star }
412 Some('+') => {
413 self.advance();
414 Token::Plus
415 }
416 Some('/') => {
417 if self.peek(1) == Some('*') {
419 self.skip_whitespace_and_comments();
422 return self.next_token();
423 }
424 self.advance();
425 Token::Divide
426 }
427 Some('%') => {
428 self.advance();
429 Token::Modulo
430 }
431 Some('.') => {
432 self.advance();
433 Token::Dot
434 }
435 Some(',') => {
436 self.advance();
437 Token::Comma
438 }
439 Some(':') => {
440 self.advance();
441 Token::Colon
442 }
443 Some('(') => {
444 self.advance();
445 Token::LeftParen
446 }
447 Some(')') => {
448 self.advance();
449 Token::RightParen
450 }
451 Some('=') => {
452 self.advance();
453 Token::Equal
454 }
455 Some('<') => {
456 self.advance();
457 if self.current_char == Some('=') {
458 self.advance();
459 Token::LessThanOrEqual
460 } else if self.current_char == Some('>') {
461 self.advance();
462 Token::NotEqual
463 } else {
464 Token::LessThan
465 }
466 }
467 Some('>') => {
468 self.advance();
469 if self.current_char == Some('=') {
470 self.advance();
471 Token::GreaterThanOrEqual
472 } else {
473 Token::GreaterThan
474 }
475 }
476 Some('!') if self.peek(1) == Some('=') => {
477 self.advance();
478 self.advance();
479 Token::NotEqual
480 }
481 Some('|') if self.peek(1) == Some('|') => {
482 self.advance();
483 self.advance();
484 Token::Concat
485 }
486 Some('"') => {
487 let ident_val = self.read_string();
489 Token::QuotedIdentifier(ident_val)
490 }
491 Some('$') => {
492 if self.peek_string(6) == "$JSON$" {
494 let json_content = self.read_json_block();
495 Token::JsonBlock(json_content)
496 } else {
497 let ident = self.read_identifier();
500 Token::Identifier(ident)
501 }
502 }
503 Some('\'') => {
504 let string_val = self.read_string();
506 Token::StringLiteral(string_val)
507 }
508 Some('-') if self.peek(1) == Some('-') => {
509 self.skip_whitespace_and_comments();
511 self.next_token()
512 }
513 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
514 self.advance(); let num = self.read_number();
517 Token::NumberLiteral(format!("-{num}"))
518 }
519 Some('-') => {
520 self.advance();
522 Token::Minus
523 }
524 Some(ch) if ch.is_numeric() => {
525 let num = self.read_number();
526 Token::NumberLiteral(num)
527 }
528 Some('#') => {
529 self.advance(); let table_name = self.read_identifier();
532 if table_name.is_empty() {
533 Token::Identifier("#".to_string())
535 } else {
536 Token::Identifier(format!("#{}", table_name))
538 }
539 }
540 Some(ch) if ch.is_alphabetic() || ch == '_' => {
541 let ident = self.read_identifier();
542 match ident.to_uppercase().as_str() {
543 "SELECT" => Token::Select,
544 "FROM" => Token::From,
545 "WHERE" => Token::Where,
546 "WITH" => Token::With,
547 "AND" => Token::And,
548 "OR" => Token::Or,
549 "IN" => Token::In,
550 "NOT" => Token::Not,
551 "BETWEEN" => Token::Between,
552 "LIKE" => Token::Like,
553 "IS" => Token::Is,
554 "NULL" => Token::Null,
555 "ORDER" if self.peek_keyword("BY") => {
556 self.skip_whitespace();
557 self.read_identifier(); Token::OrderBy
559 }
560 "GROUP" if self.peek_keyword("BY") => {
561 self.skip_whitespace();
562 self.read_identifier(); Token::GroupBy
564 }
565 "HAVING" => Token::Having,
566 "AS" => Token::As,
567 "ASC" => Token::Asc,
568 "DESC" => Token::Desc,
569 "LIMIT" => Token::Limit,
570 "OFFSET" => Token::Offset,
571 "INTO" => Token::Into,
572 "DATETIME" => Token::DateTime,
573 "CASE" => Token::Case,
574 "WHEN" => Token::When,
575 "THEN" => Token::Then,
576 "ELSE" => Token::Else,
577 "END" => Token::End,
578 "DISTINCT" => Token::Distinct,
579 "OVER" => Token::Over,
580 "PARTITION" => Token::Partition,
581 "BY" => Token::By,
582 "ROWS" => Token::Rows,
584 "UNBOUNDED" => Token::Unbounded,
587 "PRECEDING" => Token::Preceding,
588 "FOLLOWING" => Token::Following,
589 "CURRENT" => Token::Current,
590 "ROW" => Token::Row,
591 "UNION" => Token::Union,
593 "INTERSECT" => Token::Intersect,
594 "EXCEPT" => Token::Except,
595 "WEB" => Token::Web,
597 "UNNEST" => Token::Unnest,
599 "JOIN" => Token::Join,
601 "INNER" => Token::Inner,
602 "LEFT" => Token::Left,
603 "RIGHT" => Token::Right,
604 "FULL" => Token::Full,
605 "OUTER" => Token::Outer,
606 "ON" => Token::On,
607 "CROSS" => Token::Cross,
608 _ => Token::Identifier(ident),
609 }
610 }
611 Some(ch) => {
612 self.advance();
613 Token::Identifier(ch.to_string())
614 }
615 }
616 }
617
618 fn peek_keyword(&mut self, keyword: &str) -> bool {
619 let saved_pos = self.position;
620 let saved_char = self.current_char;
621
622 self.skip_whitespace_and_comments();
623 let next_word = self.read_identifier();
624 let matches = next_word.to_uppercase() == keyword;
625
626 self.position = saved_pos;
628 self.current_char = saved_char;
629
630 matches
631 }
632
633 #[must_use]
634 pub fn get_position(&self) -> usize {
635 self.position
636 }
637
638 pub fn tokenize_all(&mut self) -> Vec<Token> {
639 let mut tokens = Vec::new();
640 loop {
641 let token = self.next_token();
642 if matches!(token, Token::Eof) {
643 tokens.push(token);
644 break;
645 }
646 tokens.push(token);
647 }
648 tokens
649 }
650
651 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
652 let mut tokens = Vec::new();
653 loop {
654 self.skip_whitespace_and_comments();
655 let start_pos = self.position;
656 let token = self.next_token();
657 let end_pos = self.position;
658
659 if matches!(token, Token::Eof) {
660 break;
661 }
662 tokens.push((start_pos, end_pos, token));
663 }
664 tokens
665 }
666}