1use crate::error::{ParseError, ParseResult};
4use crate::tokens::{Token, TokenType, LexicalState, TokenSource};
5
6pub struct Lexer {
8 input: String,
10 position: usize,
12 state: LexicalState,
14 tokens: Vec<Token>,
16 current_line: usize,
18 current_column: usize,
20 line_starts: Vec<usize>,
22}
23
24impl Lexer {
25 pub fn new(input: String) -> Self {
27 let mut line_starts = vec![0];
28 for (i, ch) in input.char_indices() {
29 if ch == '\n' {
30 line_starts.push(i + 1);
31 }
32 }
33
34 Lexer {
35 input,
36 position: 0,
37 state: LexicalState::DEFAULT,
38 tokens: Vec::new(),
39 current_line: 1,
40 current_column: 1,
41 line_starts,
42 }
43 }
44
45 pub fn next_token(&mut self) -> ParseResult<Token> {
47 self.skip_ignored()?;
49
50 if self.position >= self.input.len() {
51 return Ok(Token::new(
53 TokenType::EOF,
54 String::new(),
55 self.position,
56 self.position,
57 ));
58 }
59
60 let start_pos = self.position;
61 let start_line = self.current_line;
62 let start_column = self.current_column;
63
64 if let Some(token) = self.try_match_token(start_pos)? {
66 return Ok(token);
67 }
68
69 Err(ParseError::at_location(
71 format!("Unexpected character: '{}'", self.current_char()),
72 start_line,
73 start_column,
74 ))
75 }
76
77 fn try_match_token(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
79 let ch = self.current_char();
80
81 if self.matches_string("!=") {
82 return Ok(Some(self.consume_literal(TokenType::NE, "!=", start_pos)));
83 }
84 if self.matches_string("<>") {
85 return Ok(Some(self.consume_literal(TokenType::NE, "<>", start_pos)));
86 }
87 if self.matches_string(">=") {
88 return Ok(Some(self.consume_literal(TokenType::GE, ">=", start_pos)));
89 }
90 if self.matches_string("<=") {
91 return Ok(Some(self.consume_literal(TokenType::LE, "<=", start_pos)));
92 }
93 match ch {
94 ' ' => {
95 self.advance();
96 return Ok(Some(Token::new(
97 TokenType::SPACE,
98 " ".to_string(),
99 start_pos,
100 self.position,
101 )));
102 }
103 '\t' => {
104 self.advance();
105 return Ok(Some(Token::new(
106 TokenType::TAB,
107 "\t".to_string(),
108 start_pos,
109 self.position,
110 )));
111 }
112 '\n' => {
113 self.advance();
114 return Ok(Some(Token::new(
115 TokenType::NEWLINE,
116 "\n".to_string(),
117 start_pos,
118 self.position,
119 )));
120 }
121 '\r' => {
122 self.advance();
123 return Ok(Some(Token::new(
124 TokenType::CR,
125 "\r".to_string(),
126 start_pos,
127 self.position,
128 )));
129 }
130 '\x0c' => {
131 self.advance();
132 return Ok(Some(Token::new(
133 TokenType::FORM_FEED,
134 "\x0c".to_string(),
135 start_pos,
136 self.position,
137 )));
138 }
139 '=' => {
140 self.advance();
141 return Ok(Some(Token::new(
142 TokenType::EQ,
143 "=".to_string(),
144 start_pos,
145 self.position,
146 )));
147 }
148 '>' => {
149 self.advance();
150 return Ok(Some(Token::new(
151 TokenType::GT,
152 ">".to_string(),
153 start_pos,
154 self.position,
155 )));
156 }
157 '<' => {
158 self.advance();
159 return Ok(Some(Token::new(
160 TokenType::LT,
161 "<".to_string(),
162 start_pos,
163 self.position,
164 )));
165 }
166 '(' => {
167 self.advance();
168 return Ok(Some(Token::new(
169 TokenType::LPAREN,
170 "(".to_string(),
171 start_pos,
172 self.position,
173 )));
174 }
175 ',' => {
176 self.advance();
177 return Ok(Some(Token::new(
178 TokenType::COMMA,
179 ",".to_string(),
180 start_pos,
181 self.position,
182 )));
183 }
184 ')' => {
185 self.advance();
186 return Ok(Some(Token::new(
187 TokenType::RPAREN,
188 ")".to_string(),
189 start_pos,
190 self.position,
191 )));
192 }
193 '+' => {
194 self.advance();
195 return Ok(Some(Token::new(
196 TokenType::PLUS,
197 "+".to_string(),
198 start_pos,
199 self.position,
200 )));
201 }
202 '-' => {
203 self.advance();
204 return Ok(Some(Token::new(
205 TokenType::MINUS,
206 "-".to_string(),
207 start_pos,
208 self.position,
209 )));
210 }
211 '*' => {
212 self.advance();
213 return Ok(Some(Token::new(
214 TokenType::STAR,
215 "*".to_string(),
216 start_pos,
217 self.position,
218 )));
219 }
220 '/' => {
221 self.advance();
222 return Ok(Some(Token::new(
223 TokenType::SLASH,
224 "/".to_string(),
225 start_pos,
226 self.position,
227 )));
228 }
229 '%' => {
230 self.advance();
231 return Ok(Some(Token::new(
232 TokenType::PERCENT,
233 "%".to_string(),
234 start_pos,
235 self.position,
236 )));
237 }
238 _ => {}
239 }
240
241 if ch == '\'' {
243 return self.match_string_literal(start_pos);
244 }
245
246 if ch.is_ascii_digit()
248 || (ch == '.' && self.peek(1).is_some_and(|c| c.is_ascii_digit()))
249 {
250 return self.match_number(start_pos);
251 }
252
253 if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
255 return self.match_identifier_or_keyword(start_pos);
256 }
257
258 Ok(None)
260 }
261
262 fn consume_literal(&mut self, token_type: TokenType, literal: &str, start_pos: usize) -> Token {
264 for _ in 0..literal.len() {
265 self.advance();
266 }
267 Token::new(token_type, literal.to_string(), start_pos, self.position)
268 }
269
270 fn match_string_literal(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
272 self.advance();
274 while self.position < self.input.len() {
275 let ch = self.current_char();
276 if ch == '\'' {
277 if self.peek(1) == Some('\'') {
279 self.advance(); self.advance(); continue;
282 }
283 self.advance(); let image = self.input[start_pos..self.position].to_string();
285 return Ok(Some(Token::new(
286 TokenType::STRING_LITERAL,
287 image,
288 start_pos,
289 self.position,
290 )));
291 }
292 self.advance();
293 }
294 Err(ParseError::at_position(
296 "Unterminated string literal".to_string(),
297 start_pos,
298 ))
299 }
300
301 fn match_number(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
303 if self.current_char() == '0' {
305 if self.peek(1).is_some_and(|ch| ch == 'x' || ch == 'X') {
306 self.advance(); self.advance(); if self.position >= self.input.len() || !self.current_char().is_ascii_hexdigit() {
310 return Err(ParseError::at_position(
311 "Expected hex digit after 0x".to_string(),
312 start_pos,
313 ));
314 }
315 while self.position < self.input.len() && self.current_char().is_ascii_hexdigit() {
316 self.advance();
317 }
318 if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
320 self.advance();
321 }
322 let image = self.input[start_pos..self.position].to_string();
323 return Ok(Some(Token::new(
324 TokenType::HEX_LITERAL,
325 image,
326 start_pos,
327 self.position,
328 )));
329 }
330 if self.peek(1).is_some_and(|ch| ('0'..='7').contains(&ch)) {
331 self.advance(); while self.position < self.input.len() && ('0'..='7').contains(&self.current_char()) {
334 self.advance();
335 }
336 if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
338 self.advance();
339 }
340 let image = self.input[start_pos..self.position].to_string();
341 return Ok(Some(Token::new(
342 TokenType::OCTAL_LITERAL,
343 image,
344 start_pos,
345 self.position,
346 )));
347 }
348 }
349
350 let mut is_float = false;
352 while self.position < self.input.len() && self.current_char().is_ascii_digit() {
353 self.advance();
354 }
355 if self.position < self.input.len() && self.current_char() == '.'
357 && self.peek(1).is_some_and(|ch| ch.is_ascii_digit())
358 {
359 is_float = true;
360 self.advance(); while self.position < self.input.len() && self.current_char().is_ascii_digit() {
362 self.advance();
363 }
364 }
365 if self.position < self.input.len() && matches!(self.current_char(), 'e' | 'E') {
367 is_float = true;
368 self.advance(); if self.position < self.input.len() && matches!(self.current_char(), '+' | '-') {
370 self.advance(); }
372 if self.position >= self.input.len() || !self.current_char().is_ascii_digit() {
373 return Err(ParseError::at_position(
374 "Expected digit in exponent".to_string(),
375 start_pos,
376 ));
377 }
378 while self.position < self.input.len() && self.current_char().is_ascii_digit() {
379 self.advance();
380 }
381 }
382 if is_float {
383 let image = self.input[start_pos..self.position].to_string();
384 return Ok(Some(Token::new(
385 TokenType::FLOATING_POINT_LITERAL,
386 image,
387 start_pos,
388 self.position,
389 )));
390 }
391 if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
393 self.advance();
394 }
395 let image = self.input[start_pos..self.position].to_string();
396 Ok(Some(Token::new(
397 TokenType::DECIMAL_LITERAL,
398 image,
399 start_pos,
400 self.position,
401 )))
402 }
403
404 fn match_identifier_or_keyword(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
406 while self.position < self.input.len() {
408 let ch = self.current_char();
409 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
410 self.advance();
411 } else {
412 break;
413 }
414 }
415 let image = self.input[start_pos..self.position].to_string();
416 let upper = image.to_ascii_uppercase();
417
418 let token_type = match upper.as_str() {
420 "NOT" => TokenType::NOT,
421 "AND" => TokenType::AND,
422 "OR" => TokenType::OR,
423 "BETWEEN" => TokenType::BETWEEN,
424 "LIKE" => TokenType::LIKE,
425 "ESCAPE" => TokenType::ESCAPE,
426 "IN" => TokenType::IN,
427 "IS" => TokenType::IS,
428 "TRUE" => TokenType::TRUE,
429 "FALSE" => TokenType::FALSE,
430 "NULL" => TokenType::NULL,
431 _ => TokenType::ID,
432 };
433
434 Ok(Some(Token::new(token_type, image, start_pos, self.position)))
435 }
436
437 fn skip_ignored(&mut self) -> ParseResult<()> {
439 while self.position < self.input.len() {
440 let ch = self.current_char();
441
442 if ch.is_whitespace() {
444 self.advance();
445 continue;
446 }
447
448 if ch == '-' && self.peek(1) == Some('-') {
450 self.advance(); self.advance(); while self.position < self.input.len() && self.current_char() != '\n' {
453 self.advance();
454 }
455 continue;
456 }
457
458 if ch == '/' && self.peek(1) == Some('*') {
460 let start_pos = self.position;
461 self.advance(); self.advance(); loop {
464 if self.position >= self.input.len() {
465 return Err(ParseError::at_position(
466 "Unterminated block comment".to_string(),
467 start_pos,
468 ));
469 }
470 if self.current_char() == '*' && self.peek(1) == Some('/') {
471 self.advance(); self.advance(); break;
474 }
475 self.advance();
476 }
477 continue;
478 }
479
480 break;
481 }
482 Ok(())
483 }
484
485 fn current_char(&self) -> char {
487 self.input[self.position..].chars().next().unwrap_or('\0')
488 }
489
490 fn advance(&mut self) {
492 if self.position < self.input.len() {
493 let ch = self.current_char();
494 self.position += ch.len_utf8();
495
496 if ch == '\n' {
497 self.current_line += 1;
498 self.current_column = 1;
499 } else {
500 self.current_column += 1;
501 }
502 }
503 }
504
505 fn peek(&self, n: usize) -> Option<char> {
507 self.input[self.position..].chars().nth(n)
508 }
509
510 fn matches_string(&self, s: &str) -> bool {
512 self.input[self.position..].starts_with(s)
513 }
514}
515
516impl TokenSource for Lexer {
517 fn get_line_from_offset(&self, offset: usize) -> usize {
518 match self.line_starts.binary_search(&offset) {
520 Ok(line) => line + 1,
521 Err(line) => line,
522 }
523 }
524
525 fn get_column_from_offset(&self, offset: usize) -> usize {
526 let line_num = self.get_line_from_offset(offset);
527 if line_num == 0 || line_num > self.line_starts.len() {
528 return 1;
529 }
530
531 let line_start = self.line_starts[line_num - 1];
532 offset.saturating_sub(line_start) + 1
533 }
534}