/// A lexical token of the SQL dialect scanned by the lexer in this file.
///
/// Keyword variants carry no data. Identifier and literal variants carry the
/// text that was read from the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ----- Core query keywords -----
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH`
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY` (a single token for the keyword pair).
    OrderBy,
    /// `GROUP BY` (a single token for the keyword pair).
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME`
    DateTime,

    // ----- CASE expressions -----
    /// `CASE`
    Case,
    /// `WHEN`
    When,
    /// `THEN`
    Then,
    /// `ELSE`
    Else,
    /// `END`
    End,

    /// `DISTINCT`
    Distinct,

    // ----- Window / frame keywords -----
    /// `OVER`
    Over,
    /// `PARTITION`
    Partition,
    /// `BY` when it appears on its own.
    By,
    /// `ROWS`
    Rows,
    /// `RANGE`
    Range,
    /// `UNBOUNDED`
    Unbounded,
    /// `PRECEDING`
    Preceding,
    /// `FOLLOWING`
    Following,
    /// `CURRENT`
    Current,
    /// `ROW`
    Row,

    // ----- Set operations -----
    /// `UNION`
    Union,
    /// `INTERSECT`
    Intersect,
    /// `EXCEPT`
    Except,

    /// `WEB`
    Web,

    // ----- Joins -----
    /// `JOIN`
    Join,
    /// `INNER`
    Inner,
    /// `LEFT`
    Left,
    /// `RIGHT`
    Right,
    /// `FULL`
    Full,
    /// `OUTER`
    Outer,
    /// `ON`
    On,
    /// `CROSS`
    Cross,

    // ----- Identifiers and literals -----
    /// Unquoted identifier, with its original spelling.
    Identifier(String),
    /// Identifier written between double quotes; the quotes are not included.
    QuotedIdentifier(String),
    /// String written between single quotes; the quotes are not included.
    StringLiteral(String),
    /// Raw text found between a pair of `$JSON$` delimiters.
    JsonBlock(String),
    /// Numeric literal kept as its source text.
    NumberLiteral(String),

    // ----- Punctuation and operators -----
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,
    /// `||`
    Concat,

    /// End of input.
    Eof,
}

impl Token {
    /// Looks up the keyword token for `s`, ignoring ASCII case.
    ///
    /// Returns `None` when `s` is not one of the recognised keywords.
    ///
    /// Matching is deliberately ASCII-only: Unicode upper-casing would turn
    /// look-alikes such as `"croß"` into `"CROSS"` and misreport ordinary
    /// identifiers as keywords.
    ///
    /// Note that `"ORDER"` and `"GROUP"` on their own map to
    /// [`Token::OrderBy`] / [`Token::GroupBy`], and that `"DATETIME"` is not
    /// mapped here (NOTE(review): confirm whether that omission is intended).
    pub fn from_keyword(s: &str) -> Option<Token> {
        let token = match s.to_ascii_uppercase().as_str() {
            "SELECT" => Token::Select,
            "FROM" => Token::From,
            "WHERE" => Token::Where,
            "WITH" => Token::With,
            "AND" => Token::And,
            "OR" => Token::Or,
            "IN" => Token::In,
            "NOT" => Token::Not,
            "BETWEEN" => Token::Between,
            "LIKE" => Token::Like,
            "IS" => Token::Is,
            "NULL" => Token::Null,
            "ORDER" => Token::OrderBy,
            "GROUP" => Token::GroupBy,
            "HAVING" => Token::Having,
            "AS" => Token::As,
            "ASC" => Token::Asc,
            "DESC" => Token::Desc,
            "LIMIT" => Token::Limit,
            "OFFSET" => Token::Offset,
            "DISTINCT" => Token::Distinct,
            "CASE" => Token::Case,
            "WHEN" => Token::When,
            "THEN" => Token::Then,
            "ELSE" => Token::Else,
            "END" => Token::End,
            "OVER" => Token::Over,
            "PARTITION" => Token::Partition,
            "BY" => Token::By,
            "ROWS" => Token::Rows,
            "RANGE" => Token::Range,
            "UNBOUNDED" => Token::Unbounded,
            "PRECEDING" => Token::Preceding,
            "FOLLOWING" => Token::Following,
            "CURRENT" => Token::Current,
            "ROW" => Token::Row,
            "UNION" => Token::Union,
            "INTERSECT" => Token::Intersect,
            "EXCEPT" => Token::Except,
            "WEB" => Token::Web,
            "JOIN" => Token::Join,
            "INNER" => Token::Inner,
            "LEFT" => Token::Left,
            "RIGHT" => Token::Right,
            "FULL" => Token::Full,
            "OUTER" => Token::Outer,
            "ON" => Token::On,
            "CROSS" => Token::Cross,
            _ => return None,
        };
        Some(token)
    }

    /// Returns `true` for `AND` and `OR`.
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Returns `true` for the keywords that qualify a join:
    /// `INNER`, `LEFT`, `RIGHT`, `FULL` and `CROSS`.
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Returns `true` for tokens that end the clause currently being read:
    /// `ORDER BY`, `GROUP BY`, `HAVING`, `LIMIT`, `OFFSET`, `UNION`,
    /// `INTERSECT` and `EXCEPT`.
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Returns the canonical upper-case spelling for a small set of keyword
    /// tokens, or `None` for every other token.
    ///
    /// Only `SELECT`, `FROM`, `WHERE`, `WITH`, `AND`, `OR`, `ORDER BY`,
    /// `GROUP BY` and `HAVING` are covered.
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        let text = match self {
            Token::Select => "SELECT",
            Token::From => "FROM",
            Token::Where => "WHERE",
            Token::With => "WITH",
            Token::And => "AND",
            Token::Or => "OR",
            Token::OrderBy => "ORDER BY",
            Token::GroupBy => "GROUP BY",
            Token::Having => "HAVING",
            _ => return None,
        };
        Some(text)
    }
}
202
203#[derive(Debug, Clone)]
204pub struct Lexer {
205 input: Vec<char>,
206 position: usize,
207 current_char: Option<char>,
208}
209
210impl Lexer {
211 #[must_use]
212 pub fn new(input: &str) -> Self {
213 let chars: Vec<char> = input.chars().collect();
214 let current = chars.first().copied();
215 Self {
216 input: chars,
217 position: 0,
218 current_char: current,
219 }
220 }
221
222 fn advance(&mut self) {
223 self.position += 1;
224 self.current_char = self.input.get(self.position).copied();
225 }
226
227 fn peek(&self, offset: usize) -> Option<char> {
228 self.input.get(self.position + offset).copied()
229 }
230
231 fn peek_string(&self, n: usize) -> String {
233 let mut result = String::new();
234 for i in 0..n {
235 if let Some(ch) = self.input.get(self.position + i) {
236 result.push(*ch);
237 } else {
238 break;
239 }
240 }
241 result
242 }
243
244 fn read_json_block(&mut self) -> String {
247 let mut result = String::new();
248
249 for _ in 0..6 {
251 self.advance();
252 }
253
254 while let Some(ch) = self.current_char {
256 if ch == '$' && self.peek_string(6) == "$JSON$" {
258 for _ in 0..6 {
260 self.advance();
261 }
262 break;
263 }
264 result.push(ch);
265 self.advance();
266 }
267
268 result
269 }
270
271 fn skip_whitespace(&mut self) {
272 while let Some(ch) = self.current_char {
273 if ch.is_whitespace() {
274 self.advance();
275 } else {
276 break;
277 }
278 }
279 }
280
281 fn skip_whitespace_and_comments(&mut self) {
282 loop {
283 while let Some(ch) = self.current_char {
285 if ch.is_whitespace() {
286 self.advance();
287 } else {
288 break;
289 }
290 }
291
292 match self.current_char {
294 Some('-') if self.peek(1) == Some('-') => {
295 self.advance(); self.advance(); while let Some(ch) = self.current_char {
299 self.advance();
300 if ch == '\n' {
301 break;
302 }
303 }
304 }
305 Some('/') if self.peek(1) == Some('*') => {
306 self.advance(); self.advance(); while let Some(ch) = self.current_char {
310 if ch == '*' && self.peek(1) == Some('/') {
311 self.advance(); self.advance(); break;
314 }
315 self.advance();
316 }
317 }
318 _ => {
319 break;
321 }
322 }
323 }
324 }
325
326 fn read_identifier(&mut self) -> String {
327 let mut result = String::new();
328 while let Some(ch) = self.current_char {
329 if ch.is_alphanumeric() || ch == '_' {
330 result.push(ch);
331 self.advance();
332 } else {
333 break;
334 }
335 }
336 result
337 }
338
339 fn read_string(&mut self) -> String {
340 let mut result = String::new();
341 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
345 if ch == quote_char {
346 self.advance(); break;
348 }
349 result.push(ch);
350 self.advance();
351 }
352 result
353 }
354
355 fn read_number(&mut self) -> String {
356 let mut result = String::new();
357 let has_e = false;
358
359 while let Some(ch) = self.current_char {
361 if !has_e && (ch.is_numeric() || ch == '.') {
362 result.push(ch);
363 self.advance();
364 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
365 result.push(ch);
367 self.advance();
368 let _ = has_e; if let Some(sign) = self.current_char {
372 if sign == '+' || sign == '-' {
373 result.push(sign);
374 self.advance();
375 }
376 }
377
378 while let Some(digit) = self.current_char {
380 if digit.is_numeric() {
381 result.push(digit);
382 self.advance();
383 } else {
384 break;
385 }
386 }
387 break; } else {
389 break;
390 }
391 }
392 result
393 }
394
395 pub fn next_token(&mut self) -> Token {
396 self.skip_whitespace_and_comments();
397
398 match self.current_char {
399 None => Token::Eof,
400 Some('*') => {
401 self.advance();
402 Token::Star }
406 Some('+') => {
407 self.advance();
408 Token::Plus
409 }
410 Some('/') => {
411 if self.peek(1) == Some('*') {
413 self.skip_whitespace_and_comments();
416 return self.next_token();
417 }
418 self.advance();
419 Token::Divide
420 }
421 Some('%') => {
422 self.advance();
423 Token::Modulo
424 }
425 Some('.') => {
426 self.advance();
427 Token::Dot
428 }
429 Some(',') => {
430 self.advance();
431 Token::Comma
432 }
433 Some(':') => {
434 self.advance();
435 Token::Colon
436 }
437 Some('(') => {
438 self.advance();
439 Token::LeftParen
440 }
441 Some(')') => {
442 self.advance();
443 Token::RightParen
444 }
445 Some('=') => {
446 self.advance();
447 Token::Equal
448 }
449 Some('<') => {
450 self.advance();
451 if self.current_char == Some('=') {
452 self.advance();
453 Token::LessThanOrEqual
454 } else if self.current_char == Some('>') {
455 self.advance();
456 Token::NotEqual
457 } else {
458 Token::LessThan
459 }
460 }
461 Some('>') => {
462 self.advance();
463 if self.current_char == Some('=') {
464 self.advance();
465 Token::GreaterThanOrEqual
466 } else {
467 Token::GreaterThan
468 }
469 }
470 Some('!') if self.peek(1) == Some('=') => {
471 self.advance();
472 self.advance();
473 Token::NotEqual
474 }
475 Some('|') if self.peek(1) == Some('|') => {
476 self.advance();
477 self.advance();
478 Token::Concat
479 }
480 Some('"') => {
481 let ident_val = self.read_string();
483 Token::QuotedIdentifier(ident_val)
484 }
485 Some('$') => {
486 if self.peek_string(6) == "$JSON$" {
488 let json_content = self.read_json_block();
489 Token::JsonBlock(json_content)
490 } else {
491 let ident = self.read_identifier();
494 Token::Identifier(ident)
495 }
496 }
497 Some('\'') => {
498 let string_val = self.read_string();
500 Token::StringLiteral(string_val)
501 }
502 Some('-') if self.peek(1) == Some('-') => {
503 self.skip_whitespace_and_comments();
505 self.next_token()
506 }
507 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
508 self.advance(); let num = self.read_number();
511 Token::NumberLiteral(format!("-{num}"))
512 }
513 Some('-') => {
514 self.advance();
516 Token::Minus
517 }
518 Some(ch) if ch.is_numeric() => {
519 let num = self.read_number();
520 Token::NumberLiteral(num)
521 }
522 Some(ch) if ch.is_alphabetic() || ch == '_' => {
523 let ident = self.read_identifier();
524 match ident.to_uppercase().as_str() {
525 "SELECT" => Token::Select,
526 "FROM" => Token::From,
527 "WHERE" => Token::Where,
528 "WITH" => Token::With,
529 "AND" => Token::And,
530 "OR" => Token::Or,
531 "IN" => Token::In,
532 "NOT" => Token::Not,
533 "BETWEEN" => Token::Between,
534 "LIKE" => Token::Like,
535 "IS" => Token::Is,
536 "NULL" => Token::Null,
537 "ORDER" if self.peek_keyword("BY") => {
538 self.skip_whitespace();
539 self.read_identifier(); Token::OrderBy
541 }
542 "GROUP" if self.peek_keyword("BY") => {
543 self.skip_whitespace();
544 self.read_identifier(); Token::GroupBy
546 }
547 "HAVING" => Token::Having,
548 "AS" => Token::As,
549 "ASC" => Token::Asc,
550 "DESC" => Token::Desc,
551 "LIMIT" => Token::Limit,
552 "OFFSET" => Token::Offset,
553 "DATETIME" => Token::DateTime,
554 "CASE" => Token::Case,
555 "WHEN" => Token::When,
556 "THEN" => Token::Then,
557 "ELSE" => Token::Else,
558 "END" => Token::End,
559 "DISTINCT" => Token::Distinct,
560 "OVER" => Token::Over,
561 "PARTITION" => Token::Partition,
562 "BY" => Token::By,
563 "ROWS" => Token::Rows,
565 "UNBOUNDED" => Token::Unbounded,
568 "PRECEDING" => Token::Preceding,
569 "FOLLOWING" => Token::Following,
570 "CURRENT" => Token::Current,
571 "ROW" => Token::Row,
572 "UNION" => Token::Union,
574 "INTERSECT" => Token::Intersect,
575 "EXCEPT" => Token::Except,
576 "WEB" => Token::Web,
578 "JOIN" => Token::Join,
580 "INNER" => Token::Inner,
581 "LEFT" => Token::Left,
582 "RIGHT" => Token::Right,
583 "FULL" => Token::Full,
584 "OUTER" => Token::Outer,
585 "ON" => Token::On,
586 "CROSS" => Token::Cross,
587 _ => Token::Identifier(ident),
588 }
589 }
590 Some(ch) => {
591 self.advance();
592 Token::Identifier(ch.to_string())
593 }
594 }
595 }
596
597 fn peek_keyword(&mut self, keyword: &str) -> bool {
598 let saved_pos = self.position;
599 let saved_char = self.current_char;
600
601 self.skip_whitespace_and_comments();
602 let next_word = self.read_identifier();
603 let matches = next_word.to_uppercase() == keyword;
604
605 self.position = saved_pos;
607 self.current_char = saved_char;
608
609 matches
610 }
611
612 #[must_use]
613 pub fn get_position(&self) -> usize {
614 self.position
615 }
616
617 pub fn tokenize_all(&mut self) -> Vec<Token> {
618 let mut tokens = Vec::new();
619 loop {
620 let token = self.next_token();
621 if matches!(token, Token::Eof) {
622 tokens.push(token);
623 break;
624 }
625 tokens.push(token);
626 }
627 tokens
628 }
629
630 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
631 let mut tokens = Vec::new();
632 loop {
633 self.skip_whitespace_and_comments();
634 let start_pos = self.position;
635 let token = self.next_token();
636 let end_pos = self.position;
637
638 if matches!(token, Token::Eof) {
639 break;
640 }
641 tokens.push((start_pos, end_pos, token));
642 }
643 tokens
644 }
645}