/// A single lexical token produced by [`Lexer`].
///
/// Keyword variants are matched case-insensitively by the lexer; the
/// payload-carrying variants hold the token text as it was read.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Query keywords ----
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH`
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY`, emitted as one token when `ORDER` is followed by `BY`.
    OrderBy,
    /// `GROUP BY`, emitted as one token when `GROUP` is followed by `BY`.
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME`
    DateTime,

    // ---- CASE expression keywords ----
    /// `CASE`
    Case,
    /// `WHEN`
    When,
    /// `THEN`
    Then,
    /// `ELSE`
    Else,
    /// `END`
    End,

    // ---- Modifier / window keywords ----
    /// `DISTINCT`
    Distinct,
    /// `OVER`
    Over,
    /// `PARTITION`
    Partition,
    /// A standalone `BY` (one not folded into `OrderBy` / `GroupBy`).
    By,

    // ---- Join keywords ----
    /// `JOIN`
    Join,
    /// `INNER`
    Inner,
    /// `LEFT`
    Left,
    /// `RIGHT`
    Right,
    /// `FULL`
    Full,
    /// `OUTER`
    Outer,
    /// `ON`
    On,
    /// `CROSS`
    Cross,

    // ---- Names and literals ----
    /// An unquoted word that is not a keyword. The lexer also uses this
    /// variant for any single character it does not otherwise recognise.
    Identifier(String),
    /// Text enclosed in double quotes, without the quotes.
    QuotedIdentifier(String),
    /// Text enclosed in single quotes, without the quotes.
    StringLiteral(String),
    /// The source text of a numeric literal (kept as a string, not parsed).
    NumberLiteral(String),

    // ---- Punctuation ----
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ---- Comparison operators ----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ---- Arithmetic operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    /// End of input.
    Eof,
}
79
/// Character-based cursor over a piece of source text.
///
/// The text is held as a vector of `char`s, so `position` is a character
/// index rather than a byte offset.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete source text, one element per `char`.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
86
87impl Lexer {
88 #[must_use]
89 pub fn new(input: &str) -> Self {
90 let chars: Vec<char> = input.chars().collect();
91 let current = chars.first().copied();
92 Self {
93 input: chars,
94 position: 0,
95 current_char: current,
96 }
97 }
98
99 fn advance(&mut self) {
100 self.position += 1;
101 self.current_char = self.input.get(self.position).copied();
102 }
103
104 fn peek(&self, offset: usize) -> Option<char> {
105 self.input.get(self.position + offset).copied()
106 }
107
108 fn skip_whitespace(&mut self) {
109 while let Some(ch) = self.current_char {
110 if ch.is_whitespace() {
111 self.advance();
112 } else {
113 break;
114 }
115 }
116 }
117
118 fn skip_whitespace_and_comments(&mut self) {
119 loop {
120 while let Some(ch) = self.current_char {
122 if ch.is_whitespace() {
123 self.advance();
124 } else {
125 break;
126 }
127 }
128
129 match self.current_char {
131 Some('-') if self.peek(1) == Some('-') => {
132 self.advance(); self.advance(); while let Some(ch) = self.current_char {
136 self.advance();
137 if ch == '\n' {
138 break;
139 }
140 }
141 }
142 Some('/') if self.peek(1) == Some('*') => {
143 self.advance(); self.advance(); while let Some(ch) = self.current_char {
147 if ch == '*' && self.peek(1) == Some('/') {
148 self.advance(); self.advance(); break;
151 }
152 self.advance();
153 }
154 }
155 _ => {
156 break;
158 }
159 }
160 }
161 }
162
163 fn read_identifier(&mut self) -> String {
164 let mut result = String::new();
165 while let Some(ch) = self.current_char {
166 if ch.is_alphanumeric() || ch == '_' {
167 result.push(ch);
168 self.advance();
169 } else {
170 break;
171 }
172 }
173 result
174 }
175
176 fn read_string(&mut self) -> String {
177 let mut result = String::new();
178 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
182 if ch == quote_char {
183 self.advance(); break;
185 }
186 result.push(ch);
187 self.advance();
188 }
189 result
190 }
191
192 fn read_number(&mut self) -> String {
193 let mut result = String::new();
194 let mut has_e = false;
195
196 while let Some(ch) = self.current_char {
198 if !has_e && (ch.is_numeric() || ch == '.') {
199 result.push(ch);
200 self.advance();
201 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
202 result.push(ch);
204 self.advance();
205 has_e = true;
206
207 if let Some(sign) = self.current_char {
209 if sign == '+' || sign == '-' {
210 result.push(sign);
211 self.advance();
212 }
213 }
214
215 while let Some(digit) = self.current_char {
217 if digit.is_numeric() {
218 result.push(digit);
219 self.advance();
220 } else {
221 break;
222 }
223 }
224 break; } else {
226 break;
227 }
228 }
229 result
230 }
231
232 pub fn next_token(&mut self) -> Token {
233 self.skip_whitespace_and_comments();
234
235 match self.current_char {
236 None => Token::Eof,
237 Some('*') => {
238 self.advance();
239 Token::Star }
243 Some('+') => {
244 self.advance();
245 Token::Plus
246 }
247 Some('/') => {
248 if self.peek(1) == Some('*') {
250 self.skip_whitespace_and_comments();
253 return self.next_token();
254 }
255 self.advance();
256 Token::Divide
257 }
258 Some('%') => {
259 self.advance();
260 Token::Modulo
261 }
262 Some('.') => {
263 self.advance();
264 Token::Dot
265 }
266 Some(',') => {
267 self.advance();
268 Token::Comma
269 }
270 Some(':') => {
271 self.advance();
272 Token::Colon
273 }
274 Some('(') => {
275 self.advance();
276 Token::LeftParen
277 }
278 Some(')') => {
279 self.advance();
280 Token::RightParen
281 }
282 Some('=') => {
283 self.advance();
284 Token::Equal
285 }
286 Some('<') => {
287 self.advance();
288 if self.current_char == Some('=') {
289 self.advance();
290 Token::LessThanOrEqual
291 } else if self.current_char == Some('>') {
292 self.advance();
293 Token::NotEqual
294 } else {
295 Token::LessThan
296 }
297 }
298 Some('>') => {
299 self.advance();
300 if self.current_char == Some('=') {
301 self.advance();
302 Token::GreaterThanOrEqual
303 } else {
304 Token::GreaterThan
305 }
306 }
307 Some('!') if self.peek(1) == Some('=') => {
308 self.advance();
309 self.advance();
310 Token::NotEqual
311 }
312 Some('"') => {
313 let ident_val = self.read_string();
315 Token::QuotedIdentifier(ident_val)
316 }
317 Some('\'') => {
318 let string_val = self.read_string();
320 Token::StringLiteral(string_val)
321 }
322 Some('-') if self.peek(1) == Some('-') => {
323 self.skip_whitespace_and_comments();
325 self.next_token()
326 }
327 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
328 self.advance(); let num = self.read_number();
331 Token::NumberLiteral(format!("-{num}"))
332 }
333 Some('-') => {
334 self.advance();
336 Token::Minus
337 }
338 Some(ch) if ch.is_numeric() => {
339 let num = self.read_number();
340 Token::NumberLiteral(num)
341 }
342 Some(ch) if ch.is_alphabetic() || ch == '_' => {
343 let ident = self.read_identifier();
344 match ident.to_uppercase().as_str() {
345 "SELECT" => Token::Select,
346 "FROM" => Token::From,
347 "WHERE" => Token::Where,
348 "WITH" => Token::With,
349 "AND" => Token::And,
350 "OR" => Token::Or,
351 "IN" => Token::In,
352 "NOT" => Token::Not,
353 "BETWEEN" => Token::Between,
354 "LIKE" => Token::Like,
355 "IS" => Token::Is,
356 "NULL" => Token::Null,
357 "ORDER" if self.peek_keyword("BY") => {
358 self.skip_whitespace();
359 self.read_identifier(); Token::OrderBy
361 }
362 "GROUP" if self.peek_keyword("BY") => {
363 self.skip_whitespace();
364 self.read_identifier(); Token::GroupBy
366 }
367 "HAVING" => Token::Having,
368 "AS" => Token::As,
369 "ASC" => Token::Asc,
370 "DESC" => Token::Desc,
371 "LIMIT" => Token::Limit,
372 "OFFSET" => Token::Offset,
373 "DATETIME" => Token::DateTime,
374 "CASE" => Token::Case,
375 "WHEN" => Token::When,
376 "THEN" => Token::Then,
377 "ELSE" => Token::Else,
378 "END" => Token::End,
379 "DISTINCT" => Token::Distinct,
380 "OVER" => Token::Over,
381 "PARTITION" => Token::Partition,
382 "BY" => Token::By,
383 "JOIN" => Token::Join,
385 "INNER" => Token::Inner,
386 "LEFT" => Token::Left,
387 "RIGHT" => Token::Right,
388 "FULL" => Token::Full,
389 "OUTER" => Token::Outer,
390 "ON" => Token::On,
391 "CROSS" => Token::Cross,
392 _ => Token::Identifier(ident),
393 }
394 }
395 Some(ch) => {
396 self.advance();
397 Token::Identifier(ch.to_string())
398 }
399 }
400 }
401
402 fn peek_keyword(&mut self, keyword: &str) -> bool {
403 let saved_pos = self.position;
404 let saved_char = self.current_char;
405
406 self.skip_whitespace_and_comments();
407 let next_word = self.read_identifier();
408 let matches = next_word.to_uppercase() == keyword;
409
410 self.position = saved_pos;
412 self.current_char = saved_char;
413
414 matches
415 }
416
417 #[must_use]
418 pub fn get_position(&self) -> usize {
419 self.position
420 }
421
422 pub fn tokenize_all(&mut self) -> Vec<Token> {
423 let mut tokens = Vec::new();
424 loop {
425 let token = self.next_token();
426 if matches!(token, Token::Eof) {
427 tokens.push(token);
428 break;
429 }
430 tokens.push(token);
431 }
432 tokens
433 }
434
435 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
436 let mut tokens = Vec::new();
437 loop {
438 self.skip_whitespace_and_comments();
439 let start_pos = self.position;
440 let token = self.next_token();
441 let end_pos = self.position;
442
443 if matches!(token, Token::Eof) {
444 break;
445 }
446 tokens.push((start_pos, end_pos, token));
447 }
448 tokens
449 }
450}