1use super::token::{Span, Token, TokenKind};
24use std::borrow::Cow;
25
26#[derive(Debug, Clone, PartialEq)]
28pub struct LexError {
29 pub message: String,
30 pub span: Span,
31}
32
33impl LexError {
34 pub fn new(message: impl Into<String>, span: Span) -> Self {
35 Self {
36 message: message.into(),
37 span,
38 }
39 }
40}
41
42impl std::fmt::Display for LexError {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 write!(
45 f,
46 "Lexer error at line {}, column {}: {}",
47 self.span.line, self.span.column, self.message
48 )
49 }
50}
51
52impl std::error::Error for LexError {}
53
54pub struct Lexer<'a> {
56 input: &'a str,
57 bytes: &'a [u8],
58 pos: usize,
59 line: usize,
60 column: usize,
61 tokens: Vec<Token<'a>>,
62 errors: Vec<LexError>,
63 placeholder_counter: u32,
65}
66
67impl<'a> Lexer<'a> {
68 pub fn new(input: &'a str) -> Self {
70 Self {
71 input,
72 bytes: input.as_bytes(),
73 pos: 0,
74 line: 1,
75 column: 1,
76 tokens: Vec::with_capacity(input.len() / 4),
77 errors: Vec::new(),
78 placeholder_counter: 0,
79 }
80 }
81
82 pub fn tokenize(mut self) -> Result<Vec<Token<'a>>, Vec<LexError>> {
84 while !self.is_at_end() {
85 self.scan_token();
86 }
87
88 self.tokens.push(Token::new(
90 TokenKind::Eof,
91 Span::new(self.pos, self.pos, self.line, self.column),
92 "",
93 ));
94
95 if self.errors.is_empty() {
96 Ok(self.tokens)
97 } else {
98 Err(self.errors)
99 }
100 }
101
102 fn is_at_end(&self) -> bool {
103 self.pos >= self.bytes.len()
104 }
105
106 fn advance(&mut self) -> Option<char> {
107 if self.pos >= self.bytes.len() {
108 return None;
109 }
110 let b = self.bytes[self.pos];
111 if b < 0x80 {
112 self.pos += 1;
114 if b == b'\n' {
115 self.line += 1;
116 self.column = 1;
117 } else {
118 self.column += 1;
119 }
120 Some(b as char)
121 } else {
122 let c = self.input[self.pos..].chars().next().unwrap();
124 self.pos += c.len_utf8();
125 self.column += 1;
126 Some(c)
127 }
128 }
129
130 fn peek(&self) -> Option<char> {
131 if self.pos >= self.bytes.len() {
132 return None;
133 }
134 let b = self.bytes[self.pos];
135 if b < 0x80 {
136 Some(b as char)
137 } else {
138 self.input[self.pos..].chars().next()
139 }
140 }
141
142 fn peek_next(&self) -> Option<char> {
143 if self.pos >= self.bytes.len() {
144 return None;
145 }
146 let first_len = if self.bytes[self.pos] < 0x80 {
147 1
148 } else {
149 self.input[self.pos..]
150 .chars()
151 .next()
152 .map_or(1, |c| c.len_utf8())
153 };
154 let next = self.pos + first_len;
155 if next >= self.bytes.len() {
156 return None;
157 }
158 let b = self.bytes[next];
159 if b < 0x80 {
160 Some(b as char)
161 } else {
162 self.input[next..].chars().next()
163 }
164 }
165
166 fn make_span(&self, start: usize, start_line: usize, start_col: usize) -> Span {
167 Span::new(start, self.pos, start_line, start_col)
168 }
169
170 fn scan_token(&mut self) {
171 let start = self.pos;
172 let start_line = self.line;
173 let start_col = self.column;
174
175 let c = match self.advance() {
176 Some(c) => c,
177 None => return,
178 };
179
180 match c {
181 ' ' | '\t' | '\r' | '\n' => {
183 }
185
186 '(' => self.add_token(TokenKind::LParen, start, start_line, start_col),
188 ')' => self.add_token(TokenKind::RParen, start, start_line, start_col),
189 '[' => self.add_token(TokenKind::LBracket, start, start_line, start_col),
190 ']' => self.add_token(TokenKind::RBracket, start, start_line, start_col),
191 ',' => self.add_token(TokenKind::Comma, start, start_line, start_col),
192 ';' => self.add_token(TokenKind::Semicolon, start, start_line, start_col),
193 '+' => self.add_token(TokenKind::Plus, start, start_line, start_col),
194 '*' => self.add_token(TokenKind::Star, start, start_line, start_col),
195 '/' => {
196 if self.peek() == Some('/') || self.peek() == Some('*') {
197 self.scan_comment(start, start_line, start_col);
198 } else {
199 self.add_token(TokenKind::Slash, start, start_line, start_col);
200 }
201 }
202 '%' => self.add_token(TokenKind::Percent, start, start_line, start_col),
203 '&' => self.add_token(TokenKind::BitAnd, start, start_line, start_col),
204 '~' => self.add_token(TokenKind::BitNot, start, start_line, start_col),
205 '?' => {
206 self.placeholder_counter += 1;
208 let span = self.make_span(start, start_line, start_col);
209 self.tokens.push(Token::new(
210 TokenKind::Placeholder(self.placeholder_counter),
211 span,
212 "?",
213 ));
214 }
215 '@' => self.add_token(TokenKind::At, start, start_line, start_col),
216
217 '-' => {
219 if self.peek() == Some('-') {
220 self.scan_line_comment(start, start_line, start_col);
222 } else if self.peek() == Some('>') {
223 self.advance();
224 if self.peek() == Some('>') {
225 self.advance();
226 self.add_token(TokenKind::DoubleArrow, start, start_line, start_col);
227 } else {
228 self.add_token(TokenKind::Arrow, start, start_line, start_col);
229 }
230 } else {
231 self.add_token(TokenKind::Minus, start, start_line, start_col);
232 }
233 }
234
235 '=' => self.add_token(TokenKind::Eq, start, start_line, start_col),
236
237 '!' => {
238 if self.peek() == Some('=') {
239 self.advance();
240 self.add_token(TokenKind::Ne, start, start_line, start_col);
241 } else {
242 self.add_error("Unexpected character '!'", start, start_line, start_col);
243 }
244 }
245
246 '<' => {
247 if self.peek() == Some('=') {
248 self.advance();
249 self.add_token(TokenKind::Le, start, start_line, start_col);
250 } else if self.peek() == Some('-') {
251 self.advance(); if self.peek() == Some('>') {
253 self.advance(); self.add_token(TokenKind::BiArrow, start, start_line, start_col);
255 } else {
256 self.add_token(TokenKind::LeftArrow, start, start_line, start_col);
257 }
258 } else if self.peek() == Some('>') {
259 self.advance();
260 self.add_token(TokenKind::Ne, start, start_line, start_col);
261 } else if self.peek() == Some('<') {
262 self.advance();
263 self.add_token(TokenKind::LeftShift, start, start_line, start_col);
264 } else {
265 self.add_token(TokenKind::Lt, start, start_line, start_col);
266 }
267 }
268
269 '>' => {
270 if self.peek() == Some('=') {
271 self.advance();
272 self.add_token(TokenKind::Ge, start, start_line, start_col);
273 } else if self.peek() == Some('>') {
274 self.advance();
275 self.add_token(TokenKind::RightShift, start, start_line, start_col);
276 } else {
277 self.add_token(TokenKind::Gt, start, start_line, start_col);
278 }
279 }
280
281 '|' => {
282 if self.peek() == Some('|') {
283 self.advance();
284 self.add_token(TokenKind::Concat, start, start_line, start_col);
285 } else {
286 self.add_token(TokenKind::BitOr, start, start_line, start_col);
287 }
288 }
289
290 ':' => {
291 if self.peek() == Some(':') {
292 self.advance();
293 self.add_token(TokenKind::DoubleColon, start, start_line, start_col);
294 } else {
295 self.add_token(TokenKind::Colon, start, start_line, start_col);
296 }
297 }
298
299 '.' => {
300 if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
301 self.scan_number(start, start_line, start_col, true);
302 } else {
303 self.add_token(TokenKind::Dot, start, start_line, start_col);
304 }
305 }
306
307 '\'' => self.scan_string(start, start_line, start_col, '\''),
309 '"' => self.scan_quoted_identifier(start, start_line, start_col, '"'),
310 '`' => self.scan_quoted_identifier(start, start_line, start_col, '`'),
311
312 'X' | 'x' if self.peek() == Some('\'') => {
314 self.advance();
315 self.scan_blob(start, start_line, start_col);
316 }
317
318 '0'..='9' => self.scan_number(start, start_line, start_col, false),
320
321 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier(start, start_line, start_col),
323
324 '$' => self.scan_placeholder(start, start_line, start_col),
326
327 _ => {
328 self.add_error(
329 format!("Unexpected character '{}'", c),
330 start,
331 start_line,
332 start_col,
333 );
334 }
335 }
336 }
337
338 fn scan_string(&mut self, start: usize, start_line: usize, start_col: usize, quote: char) {
339 let mut value = String::new();
340
341 while let Some(c) = self.peek() {
342 if c == quote {
343 self.advance();
344 if self.peek() == Some(quote) {
346 self.advance();
347 value.push(quote);
348 } else {
349 let span = self.make_span(start, start_line, start_col);
351 let literal = &self.input[start..self.pos];
352 self.tokens.push(Token::new(
353 TokenKind::String(Cow::Owned(value)),
354 span,
355 literal,
356 ));
357 return;
358 }
359 } else if c == '\\' {
360 self.advance();
361 if let Some(escaped) = self.advance() {
363 match escaped {
364 'n' => value.push('\n'),
365 'r' => value.push('\r'),
366 't' => value.push('\t'),
367 '\\' => value.push('\\'),
368 '\'' => value.push('\''),
369 '"' => value.push('"'),
370 '0' => value.push('\0'),
371 _ => {
372 value.push('\\');
373 value.push(escaped);
374 }
375 }
376 }
377 } else {
378 self.advance();
379 value.push(c);
380 }
381 }
382
383 self.add_error("Unterminated string literal", start, start_line, start_col);
384 }
385
386 fn scan_quoted_identifier(
387 &mut self,
388 start: usize,
389 start_line: usize,
390 start_col: usize,
391 quote: char,
392 ) {
393 let mut value = String::new();
394
395 while let Some(c) = self.peek() {
396 if c == quote {
397 self.advance();
398 if self.peek() == Some(quote) {
400 self.advance();
401 value.push(quote);
402 } else {
403 let span = self.make_span(start, start_line, start_col);
404 let literal = &self.input[start..self.pos];
405 self.tokens.push(Token::new(
406 TokenKind::QuotedIdentifier(Cow::Owned(value)),
407 span,
408 literal,
409 ));
410 return;
411 }
412 } else {
413 self.advance();
414 value.push(c);
415 }
416 }
417
418 self.add_error(
419 "Unterminated quoted identifier",
420 start,
421 start_line,
422 start_col,
423 );
424 }
425
426 fn scan_number(
427 &mut self,
428 start: usize,
429 start_line: usize,
430 start_col: usize,
431 started_with_dot: bool,
432 ) {
433 let num_start = start;
434 let mut has_dot = started_with_dot;
435 let mut has_exp = false;
436
437 while let Some(c) = self.peek() {
439 if c.is_ascii_digit() {
440 self.advance();
441 } else if c == '.' && !has_dot && !has_exp {
442 if self.peek_next() == Some('.') {
444 break;
445 }
446 has_dot = true;
447 self.advance();
448 } else if (c == 'e' || c == 'E') && !has_exp {
449 has_exp = true;
450 self.advance();
451 if self.peek() == Some('+') || self.peek() == Some('-') {
453 self.advance();
454 }
455 } else {
456 break;
457 }
458 }
459
460 let literal = &self.input[num_start..self.pos];
461 let span = self.make_span(start, start_line, start_col);
462
463 if has_dot || has_exp {
464 match literal.parse::<f64>() {
465 Ok(n) => self
466 .tokens
467 .push(Token::new(TokenKind::Float(n), span, literal)),
468 Err(_) => self.add_error("Invalid float literal", start, start_line, start_col),
469 }
470 } else {
471 match literal.parse::<i64>() {
472 Ok(n) => self
473 .tokens
474 .push(Token::new(TokenKind::Integer(n), span, literal)),
475 Err(_) => self.add_error("Invalid integer literal", start, start_line, start_col),
476 }
477 }
478 }
479
480 fn scan_identifier(&mut self, start: usize, start_line: usize, start_col: usize) {
481 while let Some(c) = self.peek() {
482 if c.is_ascii_alphanumeric() || c == '_' {
483 self.advance();
484 } else {
485 break;
486 }
487 }
488
489 let literal = &self.input[start..self.pos];
490 let span = self.make_span(start, start_line, start_col);
491
492 let kind = TokenKind::from_keyword(literal).unwrap_or(TokenKind::Identifier(literal));
494
495 self.tokens.push(Token::new(kind, span, literal));
496 }
497
498 fn scan_placeholder(&mut self, start: usize, start_line: usize, start_col: usize) {
499 let mut num = String::new();
500
501 while let Some(c) = self.peek() {
502 if c.is_ascii_digit() {
503 self.advance();
504 num.push(c);
505 } else {
506 break;
507 }
508 }
509
510 let span = self.make_span(start, start_line, start_col);
511
512 if num.is_empty() {
513 self.add_error("Expected number after $", start, start_line, start_col);
514 } else if let Ok(n) = num.parse::<u32>() {
515 self.tokens.push(Token::new(
516 TokenKind::Placeholder(n),
517 span,
518 &self.input[start..self.pos],
519 ));
520 } else {
521 self.add_error("Invalid placeholder number", start, start_line, start_col);
522 }
523 }
524
525 fn scan_comment(&mut self, start: usize, start_line: usize, start_col: usize) {
526 self.advance(); if self.peek() == Some('*') || self.input[start..self.pos].ends_with('*') {
529 let mut depth = 1;
531
532 while depth > 0 && !self.is_at_end() {
533 let c = self.peek();
534 let next = self.peek_next();
535
536 if c == Some('*') && next == Some('/') {
537 self.advance();
538 self.advance();
539 depth -= 1;
540 } else if c == Some('/') && next == Some('*') {
541 self.advance();
542 self.advance();
543 depth += 1;
544 } else {
545 self.advance();
546 }
547 }
548
549 if depth > 0 {
550 self.add_error("Unterminated block comment", start, start_line, start_col);
551 }
552 } else {
553 while let Some(c) = self.peek() {
555 if c == '\n' {
556 break;
557 }
558 self.advance();
559 }
560 }
561 }
563
564 fn scan_line_comment(&mut self, _start: usize, _start_line: usize, _start_col: usize) {
565 self.advance(); while let Some(c) = self.peek() {
568 if c == '\n' {
569 break;
570 }
571 self.advance();
572 }
573 }
575
576 fn scan_blob(&mut self, start: usize, start_line: usize, start_col: usize) {
577 let mut hex = String::new();
578
579 while let Some(c) = self.peek() {
580 if c == '\'' {
581 self.advance();
582 break;
583 } else if c.is_ascii_hexdigit() {
584 self.advance();
585 hex.push(c);
586 } else if c.is_whitespace() {
587 self.advance(); } else {
589 self.add_error(
590 "Invalid hex digit in blob literal",
591 start,
592 start_line,
593 start_col,
594 );
595 return;
596 }
597 }
598
599 if !hex.len().is_multiple_of(2) {
600 self.add_error(
601 "Blob literal must have even number of hex digits",
602 start,
603 start_line,
604 start_col,
605 );
606 return;
607 }
608
609 let bytes: Result<Vec<u8>, _> = (0..hex.len())
610 .step_by(2)
611 .map(|i| u8::from_str_radix(&hex[i..i + 2], 16))
612 .collect();
613
614 match bytes {
615 Ok(data) => {
616 let span = self.make_span(start, start_line, start_col);
617 let literal = &self.input[start..self.pos];
618 self.tokens
619 .push(Token::new(TokenKind::Blob(data), span, literal));
620 }
621 Err(_) => {
622 self.add_error("Invalid blob literal", start, start_line, start_col);
623 }
624 }
625 }
626
627 fn add_token(
628 &mut self,
629 kind: TokenKind<'a>,
630 start: usize,
631 start_line: usize,
632 start_col: usize,
633 ) {
634 let span = self.make_span(start, start_line, start_col);
635 let literal = &self.input[start..self.pos];
636 self.tokens.push(Token::new(kind, span, literal));
637 }
638
639 fn add_error(
640 &mut self,
641 message: impl Into<String>,
642 start: usize,
643 start_line: usize,
644 start_col: usize,
645 ) {
646 let span = self.make_span(start, start_line, start_col);
647 self.errors.push(LexError::new(message, span));
648 }
649}
650
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src`, panicking if the lexer reports any error.
    fn lex(src: &str) -> Vec<Token<'_>> {
        Lexer::new(src).tokenize().unwrap()
    }

    /// Asserts that the leading tokens have exactly the kinds in `expected`, in order.
    fn assert_kinds<'a>(tokens: &[Token<'a>], expected: &[TokenKind<'a>]) {
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(&tokens[index].kind, want);
        }
    }

    #[test]
    fn test_simple_select() {
        let tokens = lex("SELECT * FROM users");
        // Four tokens plus the trailing end-of-input token.
        assert_eq!(tokens.len(), 5);
        assert_kinds(
            &tokens,
            &[TokenKind::Select, TokenKind::Star, TokenKind::From],
        );
        assert!(matches!(tokens[3].kind, TokenKind::Identifier(_)));
    }

    #[test]
    fn test_string_literal() {
        let tokens = lex("SELECT 'hello''world'");
        assert!(matches!(&tokens[1].kind, TokenKind::String(s) if s == "hello'world"));
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_numbers() {
        let tokens = lex("42 3.14 1e10 .5");
        assert!(matches!(tokens[0].kind, TokenKind::Integer(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Float(f) if (f - 3.14).abs() < 0.001));
        assert!(matches!(tokens[2].kind, TokenKind::Float(_)));
        assert!(matches!(tokens[3].kind, TokenKind::Float(f) if (f - 0.5).abs() < 0.001));
    }

    #[test]
    fn test_operators() {
        let tokens = lex("= != <> < <= > >= || ->");
        assert_kinds(
            &tokens,
            &[
                TokenKind::Eq,
                TokenKind::Ne,
                TokenKind::Ne,
                TokenKind::Lt,
                TokenKind::Le,
                TokenKind::Gt,
                TokenKind::Ge,
                TokenKind::Concat,
                TokenKind::Arrow,
            ],
        );
    }

    #[test]
    fn test_keywords() {
        let tokens = lex("SELECT INSERT UPDATE DELETE FROM WHERE");
        assert_kinds(
            &tokens,
            &[
                TokenKind::Select,
                TokenKind::Insert,
                TokenKind::Update,
                TokenKind::Delete,
                TokenKind::From,
                TokenKind::Where,
            ],
        );
    }

    #[test]
    fn test_placeholder() {
        let tokens = lex("$1 $2 $10");
        assert!(matches!(tokens[0].kind, TokenKind::Placeholder(1)));
        assert!(matches!(tokens[1].kind, TokenKind::Placeholder(2)));
        assert!(matches!(tokens[2].kind, TokenKind::Placeholder(10)));
    }

    #[test]
    fn test_line_comment() {
        let tokens = lex("SELECT -- comment\n* FROM users");
        // The comment yields no token: four tokens plus end-of-input.
        assert_eq!(tokens.len(), 5);
        assert_kinds(&tokens, &[TokenKind::Select, TokenKind::Star]);
    }

    #[test]
    fn test_blob_literal() {
        let tokens = lex("X'48454C4C4F'");
        assert!(matches!(&tokens[0].kind, TokenKind::Blob(b) if b == b"HELLO"));
    }

    #[test]
    fn test_left_arrow() {
        let tokens = lex("<-");
        assert_eq!(tokens[0].kind, TokenKind::LeftArrow);
    }

    #[test]
    fn test_biarrow() {
        let tokens = lex("<->");
        assert_eq!(tokens[0].kind, TokenKind::BiArrow);
    }

    #[test]
    fn test_arrow_tokens_in_context() {
        let tokens = lex("a -> b <- c <-> d");
        assert!(matches!(tokens[0].kind, TokenKind::Identifier("a")));
        assert_eq!(tokens[1].kind, TokenKind::Arrow);
        assert!(matches!(tokens[2].kind, TokenKind::Identifier("b")));
        assert_eq!(tokens[3].kind, TokenKind::LeftArrow);
        assert!(matches!(tokens[4].kind, TokenKind::Identifier("c")));
        assert_eq!(tokens[5].kind, TokenKind::BiArrow);
        assert!(matches!(tokens[6].kind, TokenKind::Identifier("d")));
    }

    #[test]
    fn test_relate_keyword() {
        let tokens = lex("RELATE LIVE CONTENT EVENT DIFF");
        assert_kinds(
            &tokens,
            &[
                TokenKind::Relate,
                TokenKind::Live,
                TokenKind::Content,
                TokenKind::Event,
                TokenKind::Diff,
            ],
        );
    }
}