1use std::fmt;
11
/// A single lexical unit of an expression.
///
/// Variants fall into four groups: keywords, operators/punctuation,
/// value-carrying tokens (identifiers and literals), and the end-of-input
/// marker.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Keywords ---
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `ESCAPE`
    Escape,
    /// `IN`
    In,
    /// `IS`
    Is,
    /// `TRUE`
    True,
    /// `FALSE`
    False,
    /// `NULL`
    Null,

    // --- Comparison operators ---
    /// `=`
    Equal,
    /// `<>`
    NotEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterOrEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessOrEqual,

    // --- Arithmetic operators ---
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,

    // --- Punctuation ---
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,

    // --- Tokens that carry a value ---
    /// A name that is not one of the keywords above.
    Identifier(String),
    /// The contents of a string literal.
    StringLiteral(String),
    /// An integer literal value.
    IntegerLiteral(i64),
    /// A floating-point literal value.
    FloatLiteral(f64),

    /// Marks the end of the input.
    Eof,
}
57
58impl fmt::Display for Token {
59 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60 match self {
61 Token::And => write!(f, "AND"),
62 Token::Or => write!(f, "OR"),
63 Token::Not => write!(f, "NOT"),
64 Token::Between => write!(f, "BETWEEN"),
65 Token::Like => write!(f, "LIKE"),
66 Token::Escape => write!(f, "ESCAPE"),
67 Token::In => write!(f, "IN"),
68 Token::Is => write!(f, "IS"),
69 Token::True => write!(f, "TRUE"),
70 Token::False => write!(f, "FALSE"),
71 Token::Null => write!(f, "NULL"),
72 Token::Equal => write!(f, "="),
73 Token::NotEqual => write!(f, "<>"),
74 Token::GreaterThan => write!(f, ">"),
75 Token::GreaterOrEqual => write!(f, ">="),
76 Token::LessThan => write!(f, "<"),
77 Token::LessOrEqual => write!(f, "<="),
78 Token::Plus => write!(f, "+"),
79 Token::Minus => write!(f, "-"),
80 Token::Star => write!(f, "*"),
81 Token::Slash => write!(f, "/"),
82 Token::Percent => write!(f, "%"),
83 Token::LeftParen => write!(f, "("),
84 Token::RightParen => write!(f, ")"),
85 Token::Comma => write!(f, ","),
86 Token::Identifier(s) => write!(f, "identifier '{}'", s),
87 Token::StringLiteral(s) => write!(f, "string '{}'", s),
88 Token::IntegerLiteral(n) => write!(f, "integer {}", n),
89 Token::FloatLiteral(n) => write!(f, "float {}", n),
90 Token::Eof => write!(f, "end of input"),
91 }
92 }
93}
94
95pub struct Lexer {
97 input: Vec<char>,
98 position: usize,
99 current_char: Option<char>,
100}
101
102impl Lexer {
103 pub fn new(input: &str) -> Self {
104 let chars: Vec<char> = input.chars().collect();
105 let current_char = chars.first().copied();
106 Lexer {
107 input: chars,
108 position: 0,
109 current_char,
110 }
111 }
112
113 fn format_error(&self, message: &str) -> String {
115 format!("{} near position {} in:\n {}",
116 message,
117 self.position,
118 String::from_iter(&self.input))
119 }
120
121 fn advance(&mut self) {
123 self.position += 1;
124 self.current_char = self.input.get(self.position).copied();
125 }
126
127 fn peek(&self) -> Option<char> {
129 self.input.get(self.position + 1).copied()
130 }
131
132 fn skip_whitespace(&mut self) {
134 while let Some(ch) = self.current_char {
135 if ch.is_whitespace() {
136 self.advance();
137 } else {
138 break;
139 }
140 }
141 }
142
143 fn skip_line_comment(&mut self) {
145 self.advance();
147 self.advance();
148
149 while let Some(ch) = self.current_char {
151 if ch == '\n' {
152 self.advance();
153 break;
154 }
155 self.advance();
156 }
157 }
158
159 fn skip_block_comment(&mut self) -> Result<(), String> {
161 self.advance();
163 self.advance();
164
165 while let Some(ch) = self.current_char {
167 if ch == '*' && self.peek() == Some('/') {
168 self.advance(); self.advance(); return Ok(());
171 }
172 self.advance();
173 }
174
175 Err(self.format_error("Unterminated block comment"))
176 }
177
178 fn read_identifier(&mut self) -> String {
180 let mut result = String::new();
181
182 while let Some(ch) = self.current_char {
183 if ch.is_alphanumeric() || ch == '_' || ch == '$' {
184 result.push(ch);
185 self.advance();
186 } else {
187 break;
188 }
189 }
190
191 result
192 }
193
194 fn keyword_or_identifier(&self, s: &str) -> Token {
196 match s.to_uppercase().as_str() {
197 "AND" => Token::And,
198 "OR" => Token::Or,
199 "NOT" => Token::Not,
200 "BETWEEN" => Token::Between,
201 "LIKE" => Token::Like,
202 "ESCAPE" => Token::Escape,
203 "IN" => Token::In,
204 "IS" => Token::Is,
205 "TRUE" => Token::True,
206 "FALSE" => Token::False,
207 "NULL" => Token::Null,
208 _ => Token::Identifier(s.to_string()),
209 }
210 }
211
212 fn read_string_literal(&mut self) -> Result<String, String> {
214 let mut result = String::new();
215
216 self.advance();
218
219 while let Some(ch) = self.current_char {
220 if ch == '\'' {
221 if self.peek() == Some('\'') {
223 result.push('\'');
224 self.advance(); self.advance(); } else {
227 self.advance(); return Ok(result);
230 }
231 } else {
232 result.push(ch);
233 self.advance();
234 }
235 }
236
237 Err(self.format_error("Unterminated string literal"))
238 }
239
240 fn read_number(&mut self) -> Result<Token, String> {
242 if self.current_char == Some('0') && matches!(self.peek(), Some('x') | Some('X')) {
244 return self.read_hex_literal();
245 }
246
247 if self.current_char == Some('0') && self.peek().is_some_and(|c| c.is_ascii_digit()) {
249 return self.read_octal_literal();
250 }
251
252 let mut num_str = String::new();
254 let mut is_float = false;
255
256 while let Some(ch) = self.current_char {
258 if ch.is_ascii_digit() {
259 num_str.push(ch);
260 self.advance();
261 } else {
262 break;
263 }
264 }
265
266 if self.current_char == Some('.') && self.peek().is_some_and(|c| c.is_ascii_digit() || c == 'e' || c == 'E') {
268 is_float = true;
269 num_str.push('.');
270 self.advance();
271
272 while let Some(ch) = self.current_char {
274 if ch.is_ascii_digit() {
275 num_str.push(ch);
276 self.advance();
277 } else {
278 break;
279 }
280 }
281 }
282
283 if matches!(self.current_char, Some('e') | Some('E')) {
285 is_float = true;
286 num_str.push('e');
287 self.advance();
288
289 if matches!(self.current_char, Some('+') | Some('-')) {
291 num_str.push(self.current_char.unwrap());
292 self.advance();
293 }
294
295 while let Some(ch) = self.current_char {
297 if ch.is_ascii_digit() {
298 num_str.push(ch);
299 self.advance();
300 } else {
301 break;
302 }
303 }
304 }
305
306 if matches!(self.current_char, Some('l') | Some('L')) && !is_float {
308 self.advance();
309 let value = num_str.parse::<i64>()
310 .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
311 return Ok(Token::IntegerLiteral(value));
312 }
313
314 if is_float {
316 let value = num_str.parse::<f64>()
317 .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
318 Ok(Token::FloatLiteral(value))
319 } else {
320 let value = num_str.parse::<i64>()
321 .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
322 Ok(Token::IntegerLiteral(value))
323 }
324 }
325
326 fn read_hex_literal(&mut self) -> Result<Token, String> {
328 self.advance();
330 self.advance();
331
332 let mut hex_str = String::new();
333 while let Some(ch) = self.current_char {
334 if ch.is_ascii_hexdigit() {
335 hex_str.push(ch);
336 self.advance();
337 } else {
338 break;
339 }
340 }
341
342 if hex_str.is_empty() {
343 return Err(self.format_error("Invalid hexadecimal literal: no digits after 0x"));
344 }
345
346 let value = i64::from_str_radix(&hex_str, 16)
347 .map_err(|e| self.format_error(&format!("Invalid hexadecimal literal: {}", e)))?;
348 Ok(Token::IntegerLiteral(value))
349 }
350
351 fn read_octal_literal(&mut self) -> Result<Token, String> {
353 let mut octal_str = String::new();
354
355 while let Some(ch) = self.current_char {
356 if ('0'..='7').contains(&ch) {
357 octal_str.push(ch);
358 self.advance();
359 } else {
360 break;
361 }
362 }
363
364 let value = i64::from_str_radix(&octal_str, 8)
365 .map_err(|e| self.format_error(&format!("Invalid octal literal: {}", e)))?;
366 Ok(Token::IntegerLiteral(value))
367 }
368
369 fn read_float_starting_with_dot(&mut self) -> Result<Token, String> {
371 let mut num_str = String::from("0.");
372
373 self.advance();
375
376 while let Some(ch) = self.current_char {
378 if ch.is_ascii_digit() {
379 num_str.push(ch);
380 self.advance();
381 } else {
382 break;
383 }
384 }
385
386 if matches!(self.current_char, Some('e') | Some('E')) {
388 num_str.push('e');
389 self.advance();
390
391 if matches!(self.current_char, Some('+') | Some('-')) {
393 num_str.push(self.current_char.unwrap());
394 self.advance();
395 }
396
397 while let Some(ch) = self.current_char {
399 if ch.is_ascii_digit() {
400 num_str.push(ch);
401 self.advance();
402 } else {
403 break;
404 }
405 }
406 }
407
408 let value = num_str.parse::<f64>()
409 .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
410 Ok(Token::FloatLiteral(value))
411 }
412
413 pub fn next_token(&mut self) -> Result<Token, String> {
415 loop {
416 self.skip_whitespace();
418
419 let ch = match self.current_char {
420 Some(c) => c,
421 None => return Ok(Token::Eof),
422 };
423
424 if ch == '-' && self.peek() == Some('-') {
426 self.skip_line_comment();
427 continue;
428 }
429
430 if ch == '/' && self.peek() == Some('*') {
431 self.skip_block_comment()?;
432 continue;
433 }
434
435 match ch {
437 '(' => {
438 self.advance();
439 return Ok(Token::LeftParen);
440 }
441 ')' => {
442 self.advance();
443 return Ok(Token::RightParen);
444 }
445 ',' => {
446 self.advance();
447 return Ok(Token::Comma);
448 }
449 '+' => {
450 self.advance();
451 return Ok(Token::Plus);
452 }
453 '-' => {
454 self.advance();
455 return Ok(Token::Minus);
456 }
457 '*' => {
458 self.advance();
459 return Ok(Token::Star);
460 }
461 '/' => {
462 self.advance();
463 return Ok(Token::Slash);
464 }
465 '%' => {
466 self.advance();
467 return Ok(Token::Percent);
468 }
469 '=' => {
470 self.advance();
471 return Ok(Token::Equal);
472 }
473 '!' => {
474 if self.peek() == Some('=') {
475 self.advance();
476 self.advance();
477 return Ok(Token::NotEqual);
478 }
479 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
480 }
481 '<' => {
482 self.advance();
483 if self.current_char == Some('>') {
484 self.advance();
485 return Ok(Token::NotEqual);
486 } else if self.current_char == Some('=') {
487 self.advance();
488 return Ok(Token::LessOrEqual);
489 }
490 return Ok(Token::LessThan);
491 }
492 '>' => {
493 self.advance();
494 if self.current_char == Some('=') {
495 self.advance();
496 return Ok(Token::GreaterOrEqual);
497 }
498 return Ok(Token::GreaterThan);
499 }
500 '\'' => {
501 let s = self.read_string_literal()?;
502 return Ok(Token::StringLiteral(s));
503 }
504 '.' => {
505 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
507 return self.read_float_starting_with_dot();
508 }
509 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
510 }
511 _ => {
512 if ch.is_alphabetic() || ch == '_' || ch == '$' {
514 let ident = self.read_identifier();
515 return Ok(self.keyword_or_identifier(&ident));
516 }
517
518 if ch.is_ascii_digit() {
520 return self.read_number();
521 }
522
523 return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
524 }
525 }
526 }
527 }
528
529 pub fn tokenize(&mut self) -> Result<Vec<Token>, String> {
531 let mut tokens = Vec::new();
532 loop {
533 let token = self.next_token()?;
534 if token == Token::Eof {
535 tokens.push(token);
536 break;
537 }
538 tokens.push(token);
539 }
540 Ok(tokens)
541 }
542}
543
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` to completion, panicking if the lexer reports an error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input)
            .tokenize()
            .expect("input should lex without errors")
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.to_string())
    }

    #[test]
    fn test_keywords() {
        // Keywords are matched regardless of letter case.
        assert_eq!(
            lex("AND or Not BETWEEN"),
            vec![Token::And, Token::Or, Token::Not, Token::Between, Token::Eof]
        );
    }

    #[test]
    fn test_string_literal() {
        // A doubled quote inside a literal yields a single quote.
        assert_eq!(
            lex("'hello' 'it''s me'"),
            vec![
                Token::StringLiteral("hello".to_string()),
                Token::StringLiteral("it's me".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_numbers() {
        // 2.5 is used instead of 3.14 so clippy's `approx_constant` lint is
        // not triggered by the test itself.
        assert_eq!(
            lex("42 0x1A 077 2.5 1e-5 .5 100L"),
            vec![
                Token::IntegerLiteral(42),
                Token::IntegerLiteral(26), // hexadecimal
                Token::IntegerLiteral(63), // octal
                Token::FloatLiteral(2.5),
                Token::FloatLiteral(1e-5),
                Token::FloatLiteral(0.5),
                Token::IntegerLiteral(100), // long suffix
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_comments() {
        // Both comment styles are skipped; the identifiers around them are
        // returned with their exact text and in order.
        assert_eq!(
            lex("x -- comment\ny /* block */ z"),
            vec![ident("x"), ident("y"), ident("z"), Token::Eof]
        );
    }

    #[test]
    fn test_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * / % ( ) ,"),
            vec![
                Token::Equal,
                Token::NotEqual,
                Token::NotEqual,
                Token::LessThan,
                Token::LessOrEqual,
                Token::GreaterThan,
                Token::GreaterOrEqual,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Percent,
                Token::LeftParen,
                Token::RightParen,
                Token::Comma,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_errors() {
        // Unterminated string, unterminated block comment, lone '!',
        // unsupported character, and a hex prefix without digits.
        for bad in ["'abc", "/* abc", "!", "#", "0x"].iter() {
            assert!(
                Lexer::new(bad).tokenize().is_err(),
                "expected an error for input {:?}",
                bad
            );
        }
    }
}