/// A single lexical token of the SQL-like query language.
///
/// Keyword variants carry no payload; identifier and literal variants carry
/// the relevant source text as an owned `String`. Because every payload is a
/// `String`, the type is `Eq + Hash` and can be used in sets and as a map key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    // ----- Keywords -----
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH`
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// The two-word keyword `ORDER BY`, emitted as a single token.
    OrderBy,
    /// The two-word keyword `GROUP BY`, emitted as a single token.
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME`
    DateTime,
    /// `CASE`
    Case,
    /// `WHEN`
    When,
    /// `THEN`
    Then,
    /// `ELSE`
    Else,
    /// `END`
    End,
    /// `DISTINCT`
    Distinct,
    /// `OVER`
    Over,
    /// `PARTITION`
    Partition,
    /// A standalone `BY` (one that is not part of `ORDER BY` / `GROUP BY`).
    By,
    /// `JOIN`
    Join,
    /// `INNER`
    Inner,
    /// `LEFT`
    Left,
    /// `RIGHT`
    Right,
    /// `FULL`
    Full,
    /// `OUTER`
    Outer,
    /// `ON`
    On,
    /// `CROSS`
    Cross,

    // ----- Identifiers and literals -----
    /// A bare (unquoted) identifier; the payload is the text as written.
    Identifier(String),
    /// An identifier written between double quotes; the payload excludes the
    /// surrounding quotes.
    QuotedIdentifier(String),
    /// A string written between single quotes; the payload excludes the
    /// surrounding quotes.
    StringLiteral(String),
    /// A numeric literal kept as source text (it is not parsed into a number
    /// here) and may carry a leading `-`.
    NumberLiteral(String),

    // ----- Punctuation -----
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,

    // ----- Comparison operators -----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // ----- Arithmetic operators -----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    /// End of input.
    Eof,
}
78
79#[derive(Debug, Clone)]
80pub struct Lexer {
81 input: Vec<char>,
82 position: usize,
83 current_char: Option<char>,
84}
85
86impl Lexer {
87 #[must_use]
88 pub fn new(input: &str) -> Self {
89 let chars: Vec<char> = input.chars().collect();
90 let current = chars.first().copied();
91 Self {
92 input: chars,
93 position: 0,
94 current_char: current,
95 }
96 }
97
98 fn advance(&mut self) {
99 self.position += 1;
100 self.current_char = self.input.get(self.position).copied();
101 }
102
103 fn peek(&self, offset: usize) -> Option<char> {
104 self.input.get(self.position + offset).copied()
105 }
106
107 fn skip_whitespace(&mut self) {
108 while let Some(ch) = self.current_char {
109 if ch.is_whitespace() {
110 self.advance();
111 } else {
112 break;
113 }
114 }
115 }
116
117 fn skip_whitespace_and_comments(&mut self) {
118 loop {
119 while let Some(ch) = self.current_char {
121 if ch.is_whitespace() {
122 self.advance();
123 } else {
124 break;
125 }
126 }
127
128 match self.current_char {
130 Some('-') if self.peek(1) == Some('-') => {
131 self.advance(); self.advance(); while let Some(ch) = self.current_char {
135 self.advance();
136 if ch == '\n' {
137 break;
138 }
139 }
140 }
141 Some('/') if self.peek(1) == Some('*') => {
142 self.advance(); self.advance(); while let Some(ch) = self.current_char {
146 if ch == '*' && self.peek(1) == Some('/') {
147 self.advance(); self.advance(); break;
150 }
151 self.advance();
152 }
153 }
154 _ => {
155 break;
157 }
158 }
159 }
160 }
161
162 fn read_identifier(&mut self) -> String {
163 let mut result = String::new();
164 while let Some(ch) = self.current_char {
165 if ch.is_alphanumeric() || ch == '_' {
166 result.push(ch);
167 self.advance();
168 } else {
169 break;
170 }
171 }
172 result
173 }
174
175 fn read_string(&mut self) -> String {
176 let mut result = String::new();
177 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
181 if ch == quote_char {
182 self.advance(); break;
184 }
185 result.push(ch);
186 self.advance();
187 }
188 result
189 }
190
191 fn read_number(&mut self) -> String {
192 let mut result = String::new();
193 let mut has_e = false;
194
195 while let Some(ch) = self.current_char {
197 if !has_e && (ch.is_numeric() || ch == '.') {
198 result.push(ch);
199 self.advance();
200 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
201 result.push(ch);
203 self.advance();
204 has_e = true;
205
206 if let Some(sign) = self.current_char {
208 if sign == '+' || sign == '-' {
209 result.push(sign);
210 self.advance();
211 }
212 }
213
214 while let Some(digit) = self.current_char {
216 if digit.is_numeric() {
217 result.push(digit);
218 self.advance();
219 } else {
220 break;
221 }
222 }
223 break; } else {
225 break;
226 }
227 }
228 result
229 }
230
231 pub fn next_token(&mut self) -> Token {
232 self.skip_whitespace_and_comments();
233
234 match self.current_char {
235 None => Token::Eof,
236 Some('*') => {
237 self.advance();
238 Token::Star }
242 Some('+') => {
243 self.advance();
244 Token::Plus
245 }
246 Some('/') => {
247 if self.peek(1) == Some('*') {
249 self.skip_whitespace_and_comments();
252 return self.next_token();
253 }
254 self.advance();
255 Token::Divide
256 }
257 Some('%') => {
258 self.advance();
259 Token::Modulo
260 }
261 Some('.') => {
262 self.advance();
263 Token::Dot
264 }
265 Some(',') => {
266 self.advance();
267 Token::Comma
268 }
269 Some('(') => {
270 self.advance();
271 Token::LeftParen
272 }
273 Some(')') => {
274 self.advance();
275 Token::RightParen
276 }
277 Some('=') => {
278 self.advance();
279 Token::Equal
280 }
281 Some('<') => {
282 self.advance();
283 if self.current_char == Some('=') {
284 self.advance();
285 Token::LessThanOrEqual
286 } else if self.current_char == Some('>') {
287 self.advance();
288 Token::NotEqual
289 } else {
290 Token::LessThan
291 }
292 }
293 Some('>') => {
294 self.advance();
295 if self.current_char == Some('=') {
296 self.advance();
297 Token::GreaterThanOrEqual
298 } else {
299 Token::GreaterThan
300 }
301 }
302 Some('!') if self.peek(1) == Some('=') => {
303 self.advance();
304 self.advance();
305 Token::NotEqual
306 }
307 Some('"') => {
308 let ident_val = self.read_string();
310 Token::QuotedIdentifier(ident_val)
311 }
312 Some('\'') => {
313 let string_val = self.read_string();
315 Token::StringLiteral(string_val)
316 }
317 Some('-') if self.peek(1) == Some('-') => {
318 self.skip_whitespace_and_comments();
320 self.next_token()
321 }
322 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
323 self.advance(); let num = self.read_number();
326 Token::NumberLiteral(format!("-{num}"))
327 }
328 Some('-') => {
329 self.advance();
331 Token::Minus
332 }
333 Some(ch) if ch.is_numeric() => {
334 let num = self.read_number();
335 Token::NumberLiteral(num)
336 }
337 Some(ch) if ch.is_alphabetic() || ch == '_' => {
338 let ident = self.read_identifier();
339 match ident.to_uppercase().as_str() {
340 "SELECT" => Token::Select,
341 "FROM" => Token::From,
342 "WHERE" => Token::Where,
343 "WITH" => Token::With,
344 "AND" => Token::And,
345 "OR" => Token::Or,
346 "IN" => Token::In,
347 "NOT" => Token::Not,
348 "BETWEEN" => Token::Between,
349 "LIKE" => Token::Like,
350 "IS" => Token::Is,
351 "NULL" => Token::Null,
352 "ORDER" if self.peek_keyword("BY") => {
353 self.skip_whitespace();
354 self.read_identifier(); Token::OrderBy
356 }
357 "GROUP" if self.peek_keyword("BY") => {
358 self.skip_whitespace();
359 self.read_identifier(); Token::GroupBy
361 }
362 "HAVING" => Token::Having,
363 "AS" => Token::As,
364 "ASC" => Token::Asc,
365 "DESC" => Token::Desc,
366 "LIMIT" => Token::Limit,
367 "OFFSET" => Token::Offset,
368 "DATETIME" => Token::DateTime,
369 "CASE" => Token::Case,
370 "WHEN" => Token::When,
371 "THEN" => Token::Then,
372 "ELSE" => Token::Else,
373 "END" => Token::End,
374 "DISTINCT" => Token::Distinct,
375 "OVER" => Token::Over,
376 "PARTITION" => Token::Partition,
377 "BY" => Token::By,
378 "JOIN" => Token::Join,
380 "INNER" => Token::Inner,
381 "LEFT" => Token::Left,
382 "RIGHT" => Token::Right,
383 "FULL" => Token::Full,
384 "OUTER" => Token::Outer,
385 "ON" => Token::On,
386 "CROSS" => Token::Cross,
387 _ => Token::Identifier(ident),
388 }
389 }
390 Some(ch) => {
391 self.advance();
392 Token::Identifier(ch.to_string())
393 }
394 }
395 }
396
397 fn peek_keyword(&mut self, keyword: &str) -> bool {
398 let saved_pos = self.position;
399 let saved_char = self.current_char;
400
401 self.skip_whitespace_and_comments();
402 let next_word = self.read_identifier();
403 let matches = next_word.to_uppercase() == keyword;
404
405 self.position = saved_pos;
407 self.current_char = saved_char;
408
409 matches
410 }
411
412 #[must_use]
413 pub fn get_position(&self) -> usize {
414 self.position
415 }
416
417 pub fn tokenize_all(&mut self) -> Vec<Token> {
418 let mut tokens = Vec::new();
419 loop {
420 let token = self.next_token();
421 if matches!(token, Token::Eof) {
422 tokens.push(token);
423 break;
424 }
425 tokens.push(token);
426 }
427 tokens
428 }
429
430 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
431 let mut tokens = Vec::new();
432 loop {
433 self.skip_whitespace_and_comments();
434 let start_pos = self.position;
435 let token = self.next_token();
436 let end_pos = self.position;
437
438 if matches!(token, Token::Eof) {
439 break;
440 }
441 tokens.push((start_pos, end_pos, token));
442 }
443 tokens
444 }
445}