/// A single lexical token of the SQL dialect handled by this file.
///
/// Keyword and punctuation variants carry no data. The five payload variants
/// (`Identifier`, `QuotedIdentifier`, `StringLiteral`, `JsonBlock`,
/// `NumberLiteral`) keep the matched source text as a `String`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Core query keywords -------------------------------------------
    Select,
    From,
    Where,
    With,

    // ---- Predicate keywords --------------------------------------------
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,

    // ---- Grouping, ordering and paging ---------------------------------
    /// The two-word keyword `ORDER BY`, represented as one token.
    OrderBy,
    /// The two-word keyword `GROUP BY`, represented as one token.
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,

    // ---- Expression keywords -------------------------------------------
    /// The `DATETIME` keyword.
    DateTime,
    Case,
    When,
    Then,
    Else,
    End,
    Distinct,

    // ---- Window specification keywords ---------------------------------
    Over,
    Partition,
    /// A standalone `BY` (one that was not folded into `OrderBy` / `GroupBy`).
    By,
    Rows,
    Range,
    Unbounded,
    Preceding,
    Following,
    Current,
    Row,

    // ---- Set operations ------------------------------------------------
    Union,
    Intersect,
    Except,

    // ---- Source keywords -----------------------------------------------
    /// The `WEB` keyword.
    Web,
    /// The `UNNEST` keyword.
    Unnest,

    // ---- Join keywords -------------------------------------------------
    Join,
    Inner,
    Left,
    Right,
    Full,
    Outer,
    On,
    Cross,

    // ---- Tokens that carry source text ---------------------------------
    /// A bare word that is not a keyword.
    Identifier(String),
    /// Text that was enclosed in double quotes, without the quotes.
    QuotedIdentifier(String),
    /// Text that was enclosed in single quotes, without the quotes.
    StringLiteral(String),
    /// Raw text found between two `$JSON$` delimiters, without the delimiters.
    JsonBlock(String),
    /// A numeric literal, kept as text rather than parsed into a number.
    NumberLiteral(String),

    // ---- Punctuation ---------------------------------------------------
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ---- Comparison operators ------------------------------------------
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ---- Arithmetic operators ------------------------------------------
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // ---- Miscellaneous -------------------------------------------------
    /// `||`
    Concat,
    /// End of input.
    Eof,
}
103
104impl Token {
105 pub fn from_keyword(s: &str) -> Option<Token> {
107 match s.to_uppercase().as_str() {
108 "SELECT" => Some(Token::Select),
109 "FROM" => Some(Token::From),
110 "WHERE" => Some(Token::Where),
111 "WITH" => Some(Token::With),
112 "AND" => Some(Token::And),
113 "OR" => Some(Token::Or),
114 "IN" => Some(Token::In),
115 "NOT" => Some(Token::Not),
116 "BETWEEN" => Some(Token::Between),
117 "LIKE" => Some(Token::Like),
118 "IS" => Some(Token::Is),
119 "NULL" => Some(Token::Null),
120 "ORDER" => Some(Token::OrderBy),
121 "GROUP" => Some(Token::GroupBy),
122 "HAVING" => Some(Token::Having),
123 "AS" => Some(Token::As),
124 "ASC" => Some(Token::Asc),
125 "DESC" => Some(Token::Desc),
126 "LIMIT" => Some(Token::Limit),
127 "OFFSET" => Some(Token::Offset),
128 "DISTINCT" => Some(Token::Distinct),
129 "CASE" => Some(Token::Case),
130 "WHEN" => Some(Token::When),
131 "THEN" => Some(Token::Then),
132 "ELSE" => Some(Token::Else),
133 "END" => Some(Token::End),
134 "OVER" => Some(Token::Over),
135 "PARTITION" => Some(Token::Partition),
136 "BY" => Some(Token::By),
137 "ROWS" => Some(Token::Rows),
138 "RANGE" => Some(Token::Range),
139 "UNBOUNDED" => Some(Token::Unbounded),
140 "PRECEDING" => Some(Token::Preceding),
141 "FOLLOWING" => Some(Token::Following),
142 "CURRENT" => Some(Token::Current),
143 "ROW" => Some(Token::Row),
144 "UNION" => Some(Token::Union),
145 "INTERSECT" => Some(Token::Intersect),
146 "EXCEPT" => Some(Token::Except),
147 "WEB" => Some(Token::Web),
148 "UNNEST" => Some(Token::Unnest),
149 "JOIN" => Some(Token::Join),
150 "INNER" => Some(Token::Inner),
151 "LEFT" => Some(Token::Left),
152 "RIGHT" => Some(Token::Right),
153 "FULL" => Some(Token::Full),
154 "OUTER" => Some(Token::Outer),
155 "ON" => Some(Token::On),
156 "CROSS" => Some(Token::Cross),
157 _ => None,
158 }
159 }
160
161 pub fn is_logical_operator(&self) -> bool {
163 matches!(self, Token::And | Token::Or)
164 }
165
166 pub fn is_join_type(&self) -> bool {
168 matches!(
169 self,
170 Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
171 )
172 }
173
174 pub fn is_clause_terminator(&self) -> bool {
176 matches!(
177 self,
178 Token::OrderBy
179 | Token::GroupBy
180 | Token::Having
181 | Token::Limit
182 | Token::Offset
183 | Token::Union
184 | Token::Intersect
185 | Token::Except
186 )
187 }
188
189 pub fn as_keyword_str(&self) -> Option<&'static str> {
191 match self {
192 Token::Select => Some("SELECT"),
193 Token::From => Some("FROM"),
194 Token::Where => Some("WHERE"),
195 Token::With => Some("WITH"),
196 Token::And => Some("AND"),
197 Token::Or => Some("OR"),
198 Token::OrderBy => Some("ORDER BY"),
199 Token::GroupBy => Some("GROUP BY"),
200 Token::Having => Some("HAVING"),
201 _ => None,
203 }
204 }
205}
206
/// Character-level scanner state: the input text plus a cursor into it.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The whole input, stored as `char`s so it can be indexed by character.
    input: Vec<char>,
    /// Index into `input` of the character under the cursor (a character
    /// index, not a byte offset).
    position: usize,
    /// Copy of `input[position]`, or `None` when the cursor is past the end.
    current_char: Option<char>,
}
213
214impl Lexer {
215 #[must_use]
216 pub fn new(input: &str) -> Self {
217 let chars: Vec<char> = input.chars().collect();
218 let current = chars.first().copied();
219 Self {
220 input: chars,
221 position: 0,
222 current_char: current,
223 }
224 }
225
226 fn advance(&mut self) {
227 self.position += 1;
228 self.current_char = self.input.get(self.position).copied();
229 }
230
231 fn peek(&self, offset: usize) -> Option<char> {
232 self.input.get(self.position + offset).copied()
233 }
234
235 fn peek_string(&self, n: usize) -> String {
237 let mut result = String::new();
238 for i in 0..n {
239 if let Some(ch) = self.input.get(self.position + i) {
240 result.push(*ch);
241 } else {
242 break;
243 }
244 }
245 result
246 }
247
248 fn read_json_block(&mut self) -> String {
251 let mut result = String::new();
252
253 for _ in 0..6 {
255 self.advance();
256 }
257
258 while let Some(ch) = self.current_char {
260 if ch == '$' && self.peek_string(6) == "$JSON$" {
262 for _ in 0..6 {
264 self.advance();
265 }
266 break;
267 }
268 result.push(ch);
269 self.advance();
270 }
271
272 result
273 }
274
275 fn skip_whitespace(&mut self) {
276 while let Some(ch) = self.current_char {
277 if ch.is_whitespace() {
278 self.advance();
279 } else {
280 break;
281 }
282 }
283 }
284
285 fn skip_whitespace_and_comments(&mut self) {
286 loop {
287 while let Some(ch) = self.current_char {
289 if ch.is_whitespace() {
290 self.advance();
291 } else {
292 break;
293 }
294 }
295
296 match self.current_char {
298 Some('-') if self.peek(1) == Some('-') => {
299 self.advance(); self.advance(); while let Some(ch) = self.current_char {
303 self.advance();
304 if ch == '\n' {
305 break;
306 }
307 }
308 }
309 Some('/') if self.peek(1) == Some('*') => {
310 self.advance(); self.advance(); while let Some(ch) = self.current_char {
314 if ch == '*' && self.peek(1) == Some('/') {
315 self.advance(); self.advance(); break;
318 }
319 self.advance();
320 }
321 }
322 _ => {
323 break;
325 }
326 }
327 }
328 }
329
330 fn read_identifier(&mut self) -> String {
331 let mut result = String::new();
332 while let Some(ch) = self.current_char {
333 if ch.is_alphanumeric() || ch == '_' {
334 result.push(ch);
335 self.advance();
336 } else {
337 break;
338 }
339 }
340 result
341 }
342
343 fn read_string(&mut self) -> String {
344 let mut result = String::new();
345 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
349 if ch == quote_char {
350 self.advance(); break;
352 }
353 result.push(ch);
354 self.advance();
355 }
356 result
357 }
358
359 fn read_number(&mut self) -> String {
360 let mut result = String::new();
361 let has_e = false;
362
363 while let Some(ch) = self.current_char {
365 if !has_e && (ch.is_numeric() || ch == '.') {
366 result.push(ch);
367 self.advance();
368 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
369 result.push(ch);
371 self.advance();
372 let _ = has_e; if let Some(sign) = self.current_char {
376 if sign == '+' || sign == '-' {
377 result.push(sign);
378 self.advance();
379 }
380 }
381
382 while let Some(digit) = self.current_char {
384 if digit.is_numeric() {
385 result.push(digit);
386 self.advance();
387 } else {
388 break;
389 }
390 }
391 break; } else {
393 break;
394 }
395 }
396 result
397 }
398
399 pub fn next_token(&mut self) -> Token {
400 self.skip_whitespace_and_comments();
401
402 match self.current_char {
403 None => Token::Eof,
404 Some('*') => {
405 self.advance();
406 Token::Star }
410 Some('+') => {
411 self.advance();
412 Token::Plus
413 }
414 Some('/') => {
415 if self.peek(1) == Some('*') {
417 self.skip_whitespace_and_comments();
420 return self.next_token();
421 }
422 self.advance();
423 Token::Divide
424 }
425 Some('%') => {
426 self.advance();
427 Token::Modulo
428 }
429 Some('.') => {
430 self.advance();
431 Token::Dot
432 }
433 Some(',') => {
434 self.advance();
435 Token::Comma
436 }
437 Some(':') => {
438 self.advance();
439 Token::Colon
440 }
441 Some('(') => {
442 self.advance();
443 Token::LeftParen
444 }
445 Some(')') => {
446 self.advance();
447 Token::RightParen
448 }
449 Some('=') => {
450 self.advance();
451 Token::Equal
452 }
453 Some('<') => {
454 self.advance();
455 if self.current_char == Some('=') {
456 self.advance();
457 Token::LessThanOrEqual
458 } else if self.current_char == Some('>') {
459 self.advance();
460 Token::NotEqual
461 } else {
462 Token::LessThan
463 }
464 }
465 Some('>') => {
466 self.advance();
467 if self.current_char == Some('=') {
468 self.advance();
469 Token::GreaterThanOrEqual
470 } else {
471 Token::GreaterThan
472 }
473 }
474 Some('!') if self.peek(1) == Some('=') => {
475 self.advance();
476 self.advance();
477 Token::NotEqual
478 }
479 Some('|') if self.peek(1) == Some('|') => {
480 self.advance();
481 self.advance();
482 Token::Concat
483 }
484 Some('"') => {
485 let ident_val = self.read_string();
487 Token::QuotedIdentifier(ident_val)
488 }
489 Some('$') => {
490 if self.peek_string(6) == "$JSON$" {
492 let json_content = self.read_json_block();
493 Token::JsonBlock(json_content)
494 } else {
495 let ident = self.read_identifier();
498 Token::Identifier(ident)
499 }
500 }
501 Some('\'') => {
502 let string_val = self.read_string();
504 Token::StringLiteral(string_val)
505 }
506 Some('-') if self.peek(1) == Some('-') => {
507 self.skip_whitespace_and_comments();
509 self.next_token()
510 }
511 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
512 self.advance(); let num = self.read_number();
515 Token::NumberLiteral(format!("-{num}"))
516 }
517 Some('-') => {
518 self.advance();
520 Token::Minus
521 }
522 Some(ch) if ch.is_numeric() => {
523 let num = self.read_number();
524 Token::NumberLiteral(num)
525 }
526 Some(ch) if ch.is_alphabetic() || ch == '_' => {
527 let ident = self.read_identifier();
528 match ident.to_uppercase().as_str() {
529 "SELECT" => Token::Select,
530 "FROM" => Token::From,
531 "WHERE" => Token::Where,
532 "WITH" => Token::With,
533 "AND" => Token::And,
534 "OR" => Token::Or,
535 "IN" => Token::In,
536 "NOT" => Token::Not,
537 "BETWEEN" => Token::Between,
538 "LIKE" => Token::Like,
539 "IS" => Token::Is,
540 "NULL" => Token::Null,
541 "ORDER" if self.peek_keyword("BY") => {
542 self.skip_whitespace();
543 self.read_identifier(); Token::OrderBy
545 }
546 "GROUP" if self.peek_keyword("BY") => {
547 self.skip_whitespace();
548 self.read_identifier(); Token::GroupBy
550 }
551 "HAVING" => Token::Having,
552 "AS" => Token::As,
553 "ASC" => Token::Asc,
554 "DESC" => Token::Desc,
555 "LIMIT" => Token::Limit,
556 "OFFSET" => Token::Offset,
557 "DATETIME" => Token::DateTime,
558 "CASE" => Token::Case,
559 "WHEN" => Token::When,
560 "THEN" => Token::Then,
561 "ELSE" => Token::Else,
562 "END" => Token::End,
563 "DISTINCT" => Token::Distinct,
564 "OVER" => Token::Over,
565 "PARTITION" => Token::Partition,
566 "BY" => Token::By,
567 "ROWS" => Token::Rows,
569 "UNBOUNDED" => Token::Unbounded,
572 "PRECEDING" => Token::Preceding,
573 "FOLLOWING" => Token::Following,
574 "CURRENT" => Token::Current,
575 "ROW" => Token::Row,
576 "UNION" => Token::Union,
578 "INTERSECT" => Token::Intersect,
579 "EXCEPT" => Token::Except,
580 "WEB" => Token::Web,
582 "UNNEST" => Token::Unnest,
584 "JOIN" => Token::Join,
586 "INNER" => Token::Inner,
587 "LEFT" => Token::Left,
588 "RIGHT" => Token::Right,
589 "FULL" => Token::Full,
590 "OUTER" => Token::Outer,
591 "ON" => Token::On,
592 "CROSS" => Token::Cross,
593 _ => Token::Identifier(ident),
594 }
595 }
596 Some(ch) => {
597 self.advance();
598 Token::Identifier(ch.to_string())
599 }
600 }
601 }
602
603 fn peek_keyword(&mut self, keyword: &str) -> bool {
604 let saved_pos = self.position;
605 let saved_char = self.current_char;
606
607 self.skip_whitespace_and_comments();
608 let next_word = self.read_identifier();
609 let matches = next_word.to_uppercase() == keyword;
610
611 self.position = saved_pos;
613 self.current_char = saved_char;
614
615 matches
616 }
617
618 #[must_use]
619 pub fn get_position(&self) -> usize {
620 self.position
621 }
622
623 pub fn tokenize_all(&mut self) -> Vec<Token> {
624 let mut tokens = Vec::new();
625 loop {
626 let token = self.next_token();
627 if matches!(token, Token::Eof) {
628 tokens.push(token);
629 break;
630 }
631 tokens.push(token);
632 }
633 tokens
634 }
635
636 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
637 let mut tokens = Vec::new();
638 loop {
639 self.skip_whitespace_and_comments();
640 let start_pos = self.position;
641 let token = self.next_token();
642 let end_pos = self.position;
643
644 if matches!(token, Token::Eof) {
645 break;
646 }
647 tokens.push((start_pos, end_pos, token));
648 }
649 tokens
650 }
651}