/// A single lexical token of the SQL dialect recognised by the lexer.
///
/// Keyword variants carry no payload; keywords are matched
/// case-insensitively. Variants with a `String` payload hold the text of the
/// token as it appeared in the input (without surrounding quotes for quoted
/// forms).
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    // ---- Core query keywords -------------------------------------------
    Select,
    From,
    Where,
    With,
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    /// The two-word keyword `ORDER BY`, emitted as one token.
    OrderBy,
    /// The two-word keyword `GROUP BY`, emitted as one token.
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    /// The `DATETIME` keyword.
    DateTime,

    // ---- CASE expressions ----------------------------------------------
    Case,
    When,
    Then,
    Else,
    End,

    // ---- DISTINCT and window specifications ----------------------------
    Distinct,
    Over,
    Partition,
    By,
    Rows,
    Range,
    Unbounded,
    Preceding,
    Following,
    Current,
    Row,

    // ---- Set operations ------------------------------------------------
    Union,
    Intersect,
    Except,

    /// The `WEB` keyword.
    Web,

    // ---- Joins ---------------------------------------------------------
    Join,
    Inner,
    Left,
    Right,
    Full,
    Outer,
    On,
    Cross,

    // ---- Names and literals --------------------------------------------
    /// A bare word that is not a keyword (also used for otherwise
    /// unrecognised single characters).
    Identifier(String),
    /// Text that was enclosed in double quotes.
    QuotedIdentifier(String),
    /// Text that was enclosed in single quotes.
    StringLiteral(String),
    /// The textual form of a numeric literal, including any leading `-`.
    NumberLiteral(String),

    // ---- Punctuation ---------------------------------------------------
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ---- Comparison operators ------------------------------------------
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ---- Arithmetic operators ------------------------------------------
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    /// `||`
    Concat,

    /// End of input.
    Eof,
}
99
/// Character-level cursor over an input string.
///
/// The input is stored as a vector of `char`s so that positions are
/// character indices rather than byte offsets.
#[derive(Clone, Debug)]
pub struct Lexer {
    /// The complete input, one element per `char`.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
106
107impl Lexer {
108 #[must_use]
109 pub fn new(input: &str) -> Self {
110 let chars: Vec<char> = input.chars().collect();
111 let current = chars.first().copied();
112 Self {
113 input: chars,
114 position: 0,
115 current_char: current,
116 }
117 }
118
119 fn advance(&mut self) {
120 self.position += 1;
121 self.current_char = self.input.get(self.position).copied();
122 }
123
124 fn peek(&self, offset: usize) -> Option<char> {
125 self.input.get(self.position + offset).copied()
126 }
127
128 fn skip_whitespace(&mut self) {
129 while let Some(ch) = self.current_char {
130 if ch.is_whitespace() {
131 self.advance();
132 } else {
133 break;
134 }
135 }
136 }
137
138 fn skip_whitespace_and_comments(&mut self) {
139 loop {
140 while let Some(ch) = self.current_char {
142 if ch.is_whitespace() {
143 self.advance();
144 } else {
145 break;
146 }
147 }
148
149 match self.current_char {
151 Some('-') if self.peek(1) == Some('-') => {
152 self.advance(); self.advance(); while let Some(ch) = self.current_char {
156 self.advance();
157 if ch == '\n' {
158 break;
159 }
160 }
161 }
162 Some('/') if self.peek(1) == Some('*') => {
163 self.advance(); self.advance(); while let Some(ch) = self.current_char {
167 if ch == '*' && self.peek(1) == Some('/') {
168 self.advance(); self.advance(); break;
171 }
172 self.advance();
173 }
174 }
175 _ => {
176 break;
178 }
179 }
180 }
181 }
182
183 fn read_identifier(&mut self) -> String {
184 let mut result = String::new();
185 while let Some(ch) = self.current_char {
186 if ch.is_alphanumeric() || ch == '_' {
187 result.push(ch);
188 self.advance();
189 } else {
190 break;
191 }
192 }
193 result
194 }
195
196 fn read_string(&mut self) -> String {
197 let mut result = String::new();
198 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
202 if ch == quote_char {
203 self.advance(); break;
205 }
206 result.push(ch);
207 self.advance();
208 }
209 result
210 }
211
212 fn read_number(&mut self) -> String {
213 let mut result = String::new();
214 let mut has_e = false;
215
216 while let Some(ch) = self.current_char {
218 if !has_e && (ch.is_numeric() || ch == '.') {
219 result.push(ch);
220 self.advance();
221 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
222 result.push(ch);
224 self.advance();
225 has_e = true;
226
227 if let Some(sign) = self.current_char {
229 if sign == '+' || sign == '-' {
230 result.push(sign);
231 self.advance();
232 }
233 }
234
235 while let Some(digit) = self.current_char {
237 if digit.is_numeric() {
238 result.push(digit);
239 self.advance();
240 } else {
241 break;
242 }
243 }
244 break; } else {
246 break;
247 }
248 }
249 result
250 }
251
252 pub fn next_token(&mut self) -> Token {
253 self.skip_whitespace_and_comments();
254
255 match self.current_char {
256 None => Token::Eof,
257 Some('*') => {
258 self.advance();
259 Token::Star }
263 Some('+') => {
264 self.advance();
265 Token::Plus
266 }
267 Some('/') => {
268 if self.peek(1) == Some('*') {
270 self.skip_whitespace_and_comments();
273 return self.next_token();
274 }
275 self.advance();
276 Token::Divide
277 }
278 Some('%') => {
279 self.advance();
280 Token::Modulo
281 }
282 Some('.') => {
283 self.advance();
284 Token::Dot
285 }
286 Some(',') => {
287 self.advance();
288 Token::Comma
289 }
290 Some(':') => {
291 self.advance();
292 Token::Colon
293 }
294 Some('(') => {
295 self.advance();
296 Token::LeftParen
297 }
298 Some(')') => {
299 self.advance();
300 Token::RightParen
301 }
302 Some('=') => {
303 self.advance();
304 Token::Equal
305 }
306 Some('<') => {
307 self.advance();
308 if self.current_char == Some('=') {
309 self.advance();
310 Token::LessThanOrEqual
311 } else if self.current_char == Some('>') {
312 self.advance();
313 Token::NotEqual
314 } else {
315 Token::LessThan
316 }
317 }
318 Some('>') => {
319 self.advance();
320 if self.current_char == Some('=') {
321 self.advance();
322 Token::GreaterThanOrEqual
323 } else {
324 Token::GreaterThan
325 }
326 }
327 Some('!') if self.peek(1) == Some('=') => {
328 self.advance();
329 self.advance();
330 Token::NotEqual
331 }
332 Some('|') if self.peek(1) == Some('|') => {
333 self.advance();
334 self.advance();
335 Token::Concat
336 }
337 Some('"') => {
338 let ident_val = self.read_string();
340 Token::QuotedIdentifier(ident_val)
341 }
342 Some('\'') => {
343 let string_val = self.read_string();
345 Token::StringLiteral(string_val)
346 }
347 Some('-') if self.peek(1) == Some('-') => {
348 self.skip_whitespace_and_comments();
350 self.next_token()
351 }
352 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
353 self.advance(); let num = self.read_number();
356 Token::NumberLiteral(format!("-{num}"))
357 }
358 Some('-') => {
359 self.advance();
361 Token::Minus
362 }
363 Some(ch) if ch.is_numeric() => {
364 let num = self.read_number();
365 Token::NumberLiteral(num)
366 }
367 Some(ch) if ch.is_alphabetic() || ch == '_' => {
368 let ident = self.read_identifier();
369 match ident.to_uppercase().as_str() {
370 "SELECT" => Token::Select,
371 "FROM" => Token::From,
372 "WHERE" => Token::Where,
373 "WITH" => Token::With,
374 "AND" => Token::And,
375 "OR" => Token::Or,
376 "IN" => Token::In,
377 "NOT" => Token::Not,
378 "BETWEEN" => Token::Between,
379 "LIKE" => Token::Like,
380 "IS" => Token::Is,
381 "NULL" => Token::Null,
382 "ORDER" if self.peek_keyword("BY") => {
383 self.skip_whitespace();
384 self.read_identifier(); Token::OrderBy
386 }
387 "GROUP" if self.peek_keyword("BY") => {
388 self.skip_whitespace();
389 self.read_identifier(); Token::GroupBy
391 }
392 "HAVING" => Token::Having,
393 "AS" => Token::As,
394 "ASC" => Token::Asc,
395 "DESC" => Token::Desc,
396 "LIMIT" => Token::Limit,
397 "OFFSET" => Token::Offset,
398 "DATETIME" => Token::DateTime,
399 "CASE" => Token::Case,
400 "WHEN" => Token::When,
401 "THEN" => Token::Then,
402 "ELSE" => Token::Else,
403 "END" => Token::End,
404 "DISTINCT" => Token::Distinct,
405 "OVER" => Token::Over,
406 "PARTITION" => Token::Partition,
407 "BY" => Token::By,
408 "ROWS" => Token::Rows,
410 "UNBOUNDED" => Token::Unbounded,
413 "PRECEDING" => Token::Preceding,
414 "FOLLOWING" => Token::Following,
415 "CURRENT" => Token::Current,
416 "ROW" => Token::Row,
417 "UNION" => Token::Union,
419 "INTERSECT" => Token::Intersect,
420 "EXCEPT" => Token::Except,
421 "WEB" => Token::Web,
423 "JOIN" => Token::Join,
425 "INNER" => Token::Inner,
426 "LEFT" => Token::Left,
427 "RIGHT" => Token::Right,
428 "FULL" => Token::Full,
429 "OUTER" => Token::Outer,
430 "ON" => Token::On,
431 "CROSS" => Token::Cross,
432 _ => Token::Identifier(ident),
433 }
434 }
435 Some(ch) => {
436 self.advance();
437 Token::Identifier(ch.to_string())
438 }
439 }
440 }
441
442 fn peek_keyword(&mut self, keyword: &str) -> bool {
443 let saved_pos = self.position;
444 let saved_char = self.current_char;
445
446 self.skip_whitespace_and_comments();
447 let next_word = self.read_identifier();
448 let matches = next_word.to_uppercase() == keyword;
449
450 self.position = saved_pos;
452 self.current_char = saved_char;
453
454 matches
455 }
456
457 #[must_use]
458 pub fn get_position(&self) -> usize {
459 self.position
460 }
461
462 pub fn tokenize_all(&mut self) -> Vec<Token> {
463 let mut tokens = Vec::new();
464 loop {
465 let token = self.next_token();
466 if matches!(token, Token::Eof) {
467 tokens.push(token);
468 break;
469 }
470 tokens.push(token);
471 }
472 tokens
473 }
474
475 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
476 let mut tokens = Vec::new();
477 loop {
478 self.skip_whitespace_and_comments();
479 let start_pos = self.position;
480 let token = self.next_token();
481 let end_pos = self.position;
482
483 if matches!(token, Token::Eof) {
484 break;
485 }
486 tokens.push((start_pos, end_pos, token));
487 }
488 tokens
489 }
490}