/// A lexical token produced by [`Lexer`].
///
/// Keyword variants are matched case-insensitively by the lexer; payload-carrying
/// variants keep the text exactly as it appeared in the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // ----- Keywords -----
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH`
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER` immediately followed by `BY`, fused into a single token.
    OrderBy,
    /// `GROUP` immediately followed by `BY`, fused into a single token.
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME`
    DateTime,
    /// `CASE`
    Case,
    /// `WHEN`
    When,
    /// `THEN`
    Then,
    /// `ELSE`
    Else,
    /// `END`
    End,
    /// `DISTINCT`
    Distinct,
    /// `OVER`
    Over,
    /// `PARTITION`
    Partition,
    /// A standalone `BY` (one not fused into [`Token::OrderBy`] / [`Token::GroupBy`]).
    By,
    /// `JOIN`
    Join,
    /// `INNER`
    Inner,
    /// `LEFT`
    Left,
    /// `RIGHT`
    Right,
    /// `FULL`
    Full,
    /// `OUTER`
    Outer,
    /// `ON`
    On,
    /// `CROSS`
    Cross,

    // ----- Identifiers and literals -----
    /// A bare word that is not a keyword. The lexer also uses this variant for
    /// any single character it does not otherwise recognise.
    Identifier(String),
    /// The contents of a `"double-quoted"` span, without the quotes.
    QuotedIdentifier(String),
    /// The contents of a `'single-quoted'` span, without the quotes.
    StringLiteral(String),
    /// A numeric literal kept as source text (it may carry a leading `-`).
    NumberLiteral(String),

    // ----- Punctuation -----
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ----- Comparison operators -----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ----- Arithmetic operators -----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // ----- Miscellaneous -----
    /// `||`
    Concat,
    /// End of input.
    Eof,
}
82
/// Character-level cursor over an input string that is turned into tokens.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The full input, decoded into `char`s so it can be indexed by position.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
89
90impl Lexer {
91 #[must_use]
92 pub fn new(input: &str) -> Self {
93 let chars: Vec<char> = input.chars().collect();
94 let current = chars.first().copied();
95 Self {
96 input: chars,
97 position: 0,
98 current_char: current,
99 }
100 }
101
102 fn advance(&mut self) {
103 self.position += 1;
104 self.current_char = self.input.get(self.position).copied();
105 }
106
107 fn peek(&self, offset: usize) -> Option<char> {
108 self.input.get(self.position + offset).copied()
109 }
110
111 fn skip_whitespace(&mut self) {
112 while let Some(ch) = self.current_char {
113 if ch.is_whitespace() {
114 self.advance();
115 } else {
116 break;
117 }
118 }
119 }
120
121 fn skip_whitespace_and_comments(&mut self) {
122 loop {
123 while let Some(ch) = self.current_char {
125 if ch.is_whitespace() {
126 self.advance();
127 } else {
128 break;
129 }
130 }
131
132 match self.current_char {
134 Some('-') if self.peek(1) == Some('-') => {
135 self.advance(); self.advance(); while let Some(ch) = self.current_char {
139 self.advance();
140 if ch == '\n' {
141 break;
142 }
143 }
144 }
145 Some('/') if self.peek(1) == Some('*') => {
146 self.advance(); self.advance(); while let Some(ch) = self.current_char {
150 if ch == '*' && self.peek(1) == Some('/') {
151 self.advance(); self.advance(); break;
154 }
155 self.advance();
156 }
157 }
158 _ => {
159 break;
161 }
162 }
163 }
164 }
165
166 fn read_identifier(&mut self) -> String {
167 let mut result = String::new();
168 while let Some(ch) = self.current_char {
169 if ch.is_alphanumeric() || ch == '_' {
170 result.push(ch);
171 self.advance();
172 } else {
173 break;
174 }
175 }
176 result
177 }
178
179 fn read_string(&mut self) -> String {
180 let mut result = String::new();
181 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
185 if ch == quote_char {
186 self.advance(); break;
188 }
189 result.push(ch);
190 self.advance();
191 }
192 result
193 }
194
195 fn read_number(&mut self) -> String {
196 let mut result = String::new();
197 let mut has_e = false;
198
199 while let Some(ch) = self.current_char {
201 if !has_e && (ch.is_numeric() || ch == '.') {
202 result.push(ch);
203 self.advance();
204 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
205 result.push(ch);
207 self.advance();
208 has_e = true;
209
210 if let Some(sign) = self.current_char {
212 if sign == '+' || sign == '-' {
213 result.push(sign);
214 self.advance();
215 }
216 }
217
218 while let Some(digit) = self.current_char {
220 if digit.is_numeric() {
221 result.push(digit);
222 self.advance();
223 } else {
224 break;
225 }
226 }
227 break; } else {
229 break;
230 }
231 }
232 result
233 }
234
235 pub fn next_token(&mut self) -> Token {
236 self.skip_whitespace_and_comments();
237
238 match self.current_char {
239 None => Token::Eof,
240 Some('*') => {
241 self.advance();
242 Token::Star }
246 Some('+') => {
247 self.advance();
248 Token::Plus
249 }
250 Some('/') => {
251 if self.peek(1) == Some('*') {
253 self.skip_whitespace_and_comments();
256 return self.next_token();
257 }
258 self.advance();
259 Token::Divide
260 }
261 Some('%') => {
262 self.advance();
263 Token::Modulo
264 }
265 Some('.') => {
266 self.advance();
267 Token::Dot
268 }
269 Some(',') => {
270 self.advance();
271 Token::Comma
272 }
273 Some(':') => {
274 self.advance();
275 Token::Colon
276 }
277 Some('(') => {
278 self.advance();
279 Token::LeftParen
280 }
281 Some(')') => {
282 self.advance();
283 Token::RightParen
284 }
285 Some('=') => {
286 self.advance();
287 Token::Equal
288 }
289 Some('<') => {
290 self.advance();
291 if self.current_char == Some('=') {
292 self.advance();
293 Token::LessThanOrEqual
294 } else if self.current_char == Some('>') {
295 self.advance();
296 Token::NotEqual
297 } else {
298 Token::LessThan
299 }
300 }
301 Some('>') => {
302 self.advance();
303 if self.current_char == Some('=') {
304 self.advance();
305 Token::GreaterThanOrEqual
306 } else {
307 Token::GreaterThan
308 }
309 }
310 Some('!') if self.peek(1) == Some('=') => {
311 self.advance();
312 self.advance();
313 Token::NotEqual
314 }
315 Some('|') if self.peek(1) == Some('|') => {
316 self.advance();
317 self.advance();
318 Token::Concat
319 }
320 Some('"') => {
321 let ident_val = self.read_string();
323 Token::QuotedIdentifier(ident_val)
324 }
325 Some('\'') => {
326 let string_val = self.read_string();
328 Token::StringLiteral(string_val)
329 }
330 Some('-') if self.peek(1) == Some('-') => {
331 self.skip_whitespace_and_comments();
333 self.next_token()
334 }
335 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
336 self.advance(); let num = self.read_number();
339 Token::NumberLiteral(format!("-{num}"))
340 }
341 Some('-') => {
342 self.advance();
344 Token::Minus
345 }
346 Some(ch) if ch.is_numeric() => {
347 let num = self.read_number();
348 Token::NumberLiteral(num)
349 }
350 Some(ch) if ch.is_alphabetic() || ch == '_' => {
351 let ident = self.read_identifier();
352 match ident.to_uppercase().as_str() {
353 "SELECT" => Token::Select,
354 "FROM" => Token::From,
355 "WHERE" => Token::Where,
356 "WITH" => Token::With,
357 "AND" => Token::And,
358 "OR" => Token::Or,
359 "IN" => Token::In,
360 "NOT" => Token::Not,
361 "BETWEEN" => Token::Between,
362 "LIKE" => Token::Like,
363 "IS" => Token::Is,
364 "NULL" => Token::Null,
365 "ORDER" if self.peek_keyword("BY") => {
366 self.skip_whitespace();
367 self.read_identifier(); Token::OrderBy
369 }
370 "GROUP" if self.peek_keyword("BY") => {
371 self.skip_whitespace();
372 self.read_identifier(); Token::GroupBy
374 }
375 "HAVING" => Token::Having,
376 "AS" => Token::As,
377 "ASC" => Token::Asc,
378 "DESC" => Token::Desc,
379 "LIMIT" => Token::Limit,
380 "OFFSET" => Token::Offset,
381 "DATETIME" => Token::DateTime,
382 "CASE" => Token::Case,
383 "WHEN" => Token::When,
384 "THEN" => Token::Then,
385 "ELSE" => Token::Else,
386 "END" => Token::End,
387 "DISTINCT" => Token::Distinct,
388 "OVER" => Token::Over,
389 "PARTITION" => Token::Partition,
390 "BY" => Token::By,
391 "JOIN" => Token::Join,
393 "INNER" => Token::Inner,
394 "LEFT" => Token::Left,
395 "RIGHT" => Token::Right,
396 "FULL" => Token::Full,
397 "OUTER" => Token::Outer,
398 "ON" => Token::On,
399 "CROSS" => Token::Cross,
400 _ => Token::Identifier(ident),
401 }
402 }
403 Some(ch) => {
404 self.advance();
405 Token::Identifier(ch.to_string())
406 }
407 }
408 }
409
410 fn peek_keyword(&mut self, keyword: &str) -> bool {
411 let saved_pos = self.position;
412 let saved_char = self.current_char;
413
414 self.skip_whitespace_and_comments();
415 let next_word = self.read_identifier();
416 let matches = next_word.to_uppercase() == keyword;
417
418 self.position = saved_pos;
420 self.current_char = saved_char;
421
422 matches
423 }
424
425 #[must_use]
426 pub fn get_position(&self) -> usize {
427 self.position
428 }
429
430 pub fn tokenize_all(&mut self) -> Vec<Token> {
431 let mut tokens = Vec::new();
432 loop {
433 let token = self.next_token();
434 if matches!(token, Token::Eof) {
435 tokens.push(token);
436 break;
437 }
438 tokens.push(token);
439 }
440 tokens
441 }
442
443 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
444 let mut tokens = Vec::new();
445 loop {
446 self.skip_whitespace_and_comments();
447 let start_pos = self.position;
448 let token = self.next_token();
449 let end_pos = self.position;
450
451 if matches!(token, Token::Eof) {
452 break;
453 }
454 tokens.push((start_pos, end_pos, token));
455 }
456 tokens
457 }
458}