1use std::fmt;
11
/// A single lexical token.
///
/// Keywords and operators are unit variants; identifiers and literals carry
/// their value. `Eof` marks the end of the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Keywords ---
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
    /// The `NOT` keyword.
    Not,
    /// The `BETWEEN` keyword.
    Between,
    /// The `LIKE` keyword.
    Like,
    /// The `ESCAPE` keyword.
    Escape,
    /// The `IN` keyword.
    In,
    /// The `IS` keyword.
    Is,
    /// The `TRUE` keyword.
    True,
    /// The `FALSE` keyword.
    False,
    /// The `NULL` keyword.
    Null,

    // --- Comparison operators ---
    /// `=`
    Equal,
    /// `<>`
    NotEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterOrEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessOrEqual,

    // --- Arithmetic operators ---
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,

    // --- Punctuation ---
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,

    // --- Value-carrying tokens ---
    /// An identifier, with its text.
    Identifier(String),
    /// A string literal, with its contents.
    StringLiteral(String),
    /// An integer literal, with its value.
    IntegerLiteral(i64),
    /// A floating-point literal, with its value.
    FloatLiteral(f64),

    /// End of input.
    Eof,
}
54
55impl fmt::Display for Token {
56 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
57 match self {
58 Token::And => write!(f, "AND"),
59 Token::Or => write!(f, "OR"),
60 Token::Not => write!(f, "NOT"),
61 Token::Between => write!(f, "BETWEEN"),
62 Token::Like => write!(f, "LIKE"),
63 Token::Escape => write!(f, "ESCAPE"),
64 Token::In => write!(f, "IN"),
65 Token::Is => write!(f, "IS"),
66 Token::True => write!(f, "TRUE"),
67 Token::False => write!(f, "FALSE"),
68 Token::Null => write!(f, "NULL"),
69 Token::Equal => write!(f, "="),
70 Token::NotEqual => write!(f, "<>"),
71 Token::GreaterThan => write!(f, ">"),
72 Token::GreaterOrEqual => write!(f, ">="),
73 Token::LessThan => write!(f, "<"),
74 Token::LessOrEqual => write!(f, "<="),
75 Token::Plus => write!(f, "+"),
76 Token::Minus => write!(f, "-"),
77 Token::Star => write!(f, "*"),
78 Token::Slash => write!(f, "/"),
79 Token::Percent => write!(f, "%"),
80 Token::LeftParen => write!(f, "("),
81 Token::RightParen => write!(f, ")"),
82 Token::Comma => write!(f, ","),
83 Token::Identifier(s) => write!(f, "identifier '{}'", s),
84 Token::StringLiteral(s) => write!(f, "string '{}'", s),
85 Token::IntegerLiteral(n) => write!(f, "integer {}", n),
86 Token::FloatLiteral(n) => write!(f, "float {}", n),
87 Token::Eof => write!(f, "end of input"),
88 }
89 }
90}
91
/// Character-level cursor over an input string that produces tokens.
///
/// The input is stored as a vector of `char`s, so `position` is a character
/// index rather than a byte offset.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, one element per character.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
97
98impl Lexer {
99 pub fn new(input: &str) -> Self {
100 let chars: Vec<char> = input.chars().collect();
101 let current_char = chars.first().copied();
102 Lexer {
103 input: chars,
104 position: 0,
105 current_char,
106 }
107 }
108
109 fn format_error(&self, message: &str) -> String {
111 format!("{} near position {} in:\n {}",
112 message,
113 self.position,
114 String::from_iter(&self.input))
115 }
116
117 fn advance(&mut self) {
119 self.position += 1;
120 self.current_char = self.input.get(self.position).copied();
121 }
122
123 fn peek(&self) -> Option<char> {
125 self.input.get(self.position + 1).copied()
126 }
127
128 fn skip_whitespace(&mut self) {
130 while let Some(ch) = self.current_char {
131 if ch.is_whitespace() {
132 self.advance();
133 } else {
134 break;
135 }
136 }
137 }
138
139 fn skip_line_comment(&mut self) {
141 self.advance();
143 self.advance();
144
145 while let Some(ch) = self.current_char {
147 if ch == '\n' {
148 self.advance();
149 break;
150 }
151 self.advance();
152 }
153 }
154
155 fn skip_block_comment(&mut self) -> Result<(), String> {
157 self.advance();
159 self.advance();
160
161 while let Some(ch) = self.current_char {
163 if ch == '*' && self.peek() == Some('/') {
164 self.advance(); self.advance(); return Ok(());
167 }
168 self.advance();
169 }
170
171 Err(self.format_error("Unterminated block comment"))
172 }
173
174 fn read_identifier(&mut self) -> String {
176 let mut result = String::new();
177
178 while let Some(ch) = self.current_char {
179 if ch.is_alphanumeric() || ch == '_' || ch == '$' {
180 result.push(ch);
181 self.advance();
182 } else {
183 break;
184 }
185 }
186
187 result
188 }
189
190 fn keyword_or_identifier(&self, s: &str) -> Token {
192 match s.to_uppercase().as_str() {
193 "AND" => Token::And,
194 "OR" => Token::Or,
195 "NOT" => Token::Not,
196 "BETWEEN" => Token::Between,
197 "LIKE" => Token::Like,
198 "ESCAPE" => Token::Escape,
199 "IN" => Token::In,
200 "IS" => Token::Is,
201 "TRUE" => Token::True,
202 "FALSE" => Token::False,
203 "NULL" => Token::Null,
204 _ => Token::Identifier(s.to_string()),
205 }
206 }
207
208 fn read_string_literal(&mut self) -> Result<String, String> {
210 let mut result = String::new();
211
212 self.advance();
214
215 while let Some(ch) = self.current_char {
216 if ch == '\'' {
217 if self.peek() == Some('\'') {
219 result.push('\'');
220 self.advance(); self.advance(); } else {
223 self.advance(); return Ok(result);
226 }
227 } else {
228 result.push(ch);
229 self.advance();
230 }
231 }
232
233 Err(self.format_error("Unterminated string literal"))
234 }
235
236 fn read_number(&mut self) -> Result<Token, String> {
238 if self.current_char == Some('0') && matches!(self.peek(), Some('x') | Some('X')) {
240 return self.read_hex_literal();
241 }
242
243 if self.current_char == Some('0') && self.peek().is_some_and(|c| c.is_ascii_digit()) {
245 return self.read_octal_literal();
246 }
247
248 let mut num_str = String::new();
250 let mut is_float = false;
251
252 while let Some(ch) = self.current_char {
254 if ch.is_ascii_digit() {
255 num_str.push(ch);
256 self.advance();
257 } else {
258 break;
259 }
260 }
261
262 if self.current_char == Some('.') && self.peek().is_some_and(|c| c.is_ascii_digit() || c == 'e' || c == 'E') {
264 is_float = true;
265 num_str.push('.');
266 self.advance();
267
268 while let Some(ch) = self.current_char {
270 if ch.is_ascii_digit() {
271 num_str.push(ch);
272 self.advance();
273 } else {
274 break;
275 }
276 }
277 }
278
279 if matches!(self.current_char, Some('e') | Some('E')) {
281 is_float = true;
282 num_str.push('e');
283 self.advance();
284
285 if matches!(self.current_char, Some('+') | Some('-')) {
287 num_str.push(self.current_char.unwrap());
288 self.advance();
289 }
290
291 while let Some(ch) = self.current_char {
293 if ch.is_ascii_digit() {
294 num_str.push(ch);
295 self.advance();
296 } else {
297 break;
298 }
299 }
300 }
301
302 if matches!(self.current_char, Some('l') | Some('L')) && !is_float {
304 self.advance();
305 let value = num_str.parse::<i64>()
306 .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
307 return Ok(Token::IntegerLiteral(value));
308 }
309
310 if is_float {
312 let value = num_str.parse::<f64>()
313 .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
314 Ok(Token::FloatLiteral(value))
315 } else {
316 let value = num_str.parse::<i64>()
317 .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
318 Ok(Token::IntegerLiteral(value))
319 }
320 }
321
322 fn read_hex_literal(&mut self) -> Result<Token, String> {
324 self.advance();
326 self.advance();
327
328 let mut hex_str = String::new();
329 while let Some(ch) = self.current_char {
330 if ch.is_ascii_hexdigit() {
331 hex_str.push(ch);
332 self.advance();
333 } else {
334 break;
335 }
336 }
337
338 if hex_str.is_empty() {
339 return Err(self.format_error("Invalid hexadecimal literal: no digits after 0x"));
340 }
341
342 let value = i64::from_str_radix(&hex_str, 16)
343 .map_err(|e| self.format_error(&format!("Invalid hexadecimal literal: {}", e)))?;
344 Ok(Token::IntegerLiteral(value))
345 }
346
347 fn read_octal_literal(&mut self) -> Result<Token, String> {
349 let mut octal_str = String::new();
350
351 while let Some(ch) = self.current_char {
352 if ('0'..='7').contains(&ch) {
353 octal_str.push(ch);
354 self.advance();
355 } else {
356 break;
357 }
358 }
359
360 let value = i64::from_str_radix(&octal_str, 8)
361 .map_err(|e| self.format_error(&format!("Invalid octal literal: {}", e)))?;
362 Ok(Token::IntegerLiteral(value))
363 }
364
365 fn read_float_starting_with_dot(&mut self) -> Result<Token, String> {
367 let mut num_str = String::from("0.");
368
369 self.advance();
371
372 while let Some(ch) = self.current_char {
374 if ch.is_ascii_digit() {
375 num_str.push(ch);
376 self.advance();
377 } else {
378 break;
379 }
380 }
381
382 if matches!(self.current_char, Some('e') | Some('E')) {
384 num_str.push('e');
385 self.advance();
386
387 if matches!(self.current_char, Some('+') | Some('-')) {
389 num_str.push(self.current_char.unwrap());
390 self.advance();
391 }
392
393 while let Some(ch) = self.current_char {
395 if ch.is_ascii_digit() {
396 num_str.push(ch);
397 self.advance();
398 } else {
399 break;
400 }
401 }
402 }
403
404 let value = num_str.parse::<f64>()
405 .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
406 Ok(Token::FloatLiteral(value))
407 }
408
409 pub fn next_token(&mut self) -> Result<Token, String> {
411 loop {
412 self.skip_whitespace();
414
415 let ch = match self.current_char {
416 Some(c) => c,
417 None => return Ok(Token::Eof),
418 };
419
420 if ch == '-' && self.peek() == Some('-') {
422 self.skip_line_comment();
423 continue;
424 }
425
426 if ch == '/' && self.peek() == Some('*') {
427 self.skip_block_comment()?;
428 continue;
429 }
430
431 match ch {
433 '(' => {
434 self.advance();
435 return Ok(Token::LeftParen);
436 }
437 ')' => {
438 self.advance();
439 return Ok(Token::RightParen);
440 }
441 ',' => {
442 self.advance();
443 return Ok(Token::Comma);
444 }
445 '+' => {
446 self.advance();
447 return Ok(Token::Plus);
448 }
449 '-' => {
450 self.advance();
451 return Ok(Token::Minus);
452 }
453 '*' => {
454 self.advance();
455 return Ok(Token::Star);
456 }
457 '/' => {
458 self.advance();
459 return Ok(Token::Slash);
460 }
461 '%' => {
462 self.advance();
463 return Ok(Token::Percent);
464 }
465 '=' => {
466 self.advance();
467 return Ok(Token::Equal);
468 }
469 '!' => {
470 if self.peek() == Some('=') {
471 self.advance();
472 self.advance();
473 return Ok(Token::NotEqual);
474 }
475 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
476 }
477 '<' => {
478 self.advance();
479 if self.current_char == Some('>') {
480 self.advance();
481 return Ok(Token::NotEqual);
482 } else if self.current_char == Some('=') {
483 self.advance();
484 return Ok(Token::LessOrEqual);
485 }
486 return Ok(Token::LessThan);
487 }
488 '>' => {
489 self.advance();
490 if self.current_char == Some('=') {
491 self.advance();
492 return Ok(Token::GreaterOrEqual);
493 }
494 return Ok(Token::GreaterThan);
495 }
496 '\'' => {
497 let s = self.read_string_literal()?;
498 return Ok(Token::StringLiteral(s));
499 }
500 '.' => {
501 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
503 return self.read_float_starting_with_dot();
504 }
505 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
506 }
507 _ => {
508 if ch.is_alphabetic() || ch == '_' || ch == '$' {
510 let ident = self.read_identifier();
511 return Ok(self.keyword_or_identifier(&ident));
512 }
513
514 if ch.is_ascii_digit() {
516 return self.read_number();
517 }
518
519 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
520 }
521 }
522 }
523 }
524
525 pub fn tokenize(&mut self) -> Result<Vec<Token>, String> {
527 let mut tokens = Vec::new();
528 loop {
529 let token = self.next_token()?;
530 if token == Token::Eof {
531 tokens.push(token);
532 break;
533 }
534 tokens.push(token);
535 }
536 Ok(tokens)
537 }
538}
539
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the next token from `lexer`, panicking on a lexing error.
    fn next(lexer: &mut Lexer) -> Token {
        lexer.next_token().unwrap()
    }

    /// Keywords are recognised regardless of letter case.
    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("AND or Not BETWEEN");
        for expected in [Token::And, Token::Or, Token::Not, Token::Between] {
            assert_eq!(next(&mut lexer), expected);
        }
    }

    /// A doubled quote inside a string literal yields a single quote.
    #[test]
    fn test_string_literal() {
        let mut lexer = Lexer::new("'hello' 'it''s me'");
        for expected in ["hello", "it's me"] {
            assert_eq!(next(&mut lexer), Token::StringLiteral(expected.to_string()));
        }
    }

    /// Decimal, hexadecimal, octal, float, exponent and `L`-suffixed forms.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0x1A 077 3.14 1e-5 100L");
        assert_eq!(next(&mut lexer), Token::IntegerLiteral(42));
        assert_eq!(next(&mut lexer), Token::IntegerLiteral(26));
        assert_eq!(next(&mut lexer), Token::IntegerLiteral(63));
        assert_eq!(next(&mut lexer), Token::FloatLiteral(3.14));
        // Only the token kind is checked for the exponent form.
        assert!(matches!(next(&mut lexer), Token::FloatLiteral(_)));
        assert_eq!(next(&mut lexer), Token::IntegerLiteral(100));
    }

    /// Line and block comments are skipped between tokens.
    #[test]
    fn test_comments() {
        let mut lexer = Lexer::new("x -- comment\ny /* block */ z");
        for _ in 0..3 {
            assert!(matches!(next(&mut lexer), Token::Identifier(_)));
        }
    }
}