1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// Accepted characters are ASCII letters, `_`, and every code point at or
/// above U+0080 (i.e. anything that is not ASCII).
const fn is_ident_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic() || !c.is_ascii()
}
11
/// Returns `true` if `c` may appear after the first character of an unquoted
/// identifier.
///
/// Accepted characters are ASCII letters and digits, `_`, `$`, and every code
/// point at or above U+0080 (i.e. anything that is not ASCII).
const fn is_ident_cont(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '$' || !c.is_ascii()
}
16
/// Returns `true` if `c` is one of the six characters the lexer treats as
/// whitespace: space, tab, newline, vertical tab, form feed, carriage return.
const fn is_whitespace(c: char) -> bool {
    // U+0009 (tab) through U+000D (carriage return) is a contiguous run:
    // \t, \n, vertical tab, form feed, \r.
    c == ' ' || matches!(c, '\u{0009}'..='\u{000D}')
}
31
32impl Cursor<'_> {
33 pub(crate) fn advance_token(&mut self) -> Token {
35 let Some(first_char) = self.bump() else {
36 return Token::new(TokenKind::Eof, 0);
37 };
38 let token_kind = match first_char {
39 '/' => match self.first() {
41 '*' => self.block_comment(),
42 _ => TokenKind::Slash,
43 },
44 '-' => match self.first() {
45 '-' => self.line_comment(),
46 _ => TokenKind::Minus,
47 },
48
49 c if is_whitespace(c) => self.whitespace(),
51
52 'u' | 'U' => {
54 if self.first() == '&' && matches!(self.second(), '\'' | '"') {
55 self.bump();
56 self.prefixed_string(
57 |terminated| LiteralKind::UnicodeEscStr { terminated },
58 true,
59 )
60 } else {
61 self.ident_or_unknown_prefix()
62 }
63 }
64 'e' | 'E' => {
66 self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67 }
68
69 'b' | 'B' => {
71 self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72 }
73
74 'x' | 'X' => {
76 self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77 }
78
79 c if is_ident_start(c) => self.ident(),
82
83 c @ '0'..='9' => {
86 let literal_kind = self.number(c);
87 TokenKind::Literal { kind: literal_kind }
88 }
89 '.' => match self.first() {
90 '0'..='9' => {
91 let literal_kind = self.number('.');
92 TokenKind::Literal { kind: literal_kind }
93 }
94 _ => TokenKind::Dot,
95 },
96 ';' => TokenKind::Semi,
98 ',' => TokenKind::Comma,
99 '(' => TokenKind::OpenParen,
100 ')' => TokenKind::CloseParen,
101 '[' => TokenKind::OpenBracket,
102 ']' => TokenKind::CloseBracket,
103 '{' => TokenKind::OpenCurly,
104 '}' => TokenKind::CloseCurly,
105 '@' => TokenKind::At,
106 '#' => TokenKind::Pound,
107 '~' => TokenKind::Tilde,
108 '?' => TokenKind::Question,
109 ':' => TokenKind::Colon,
110 '$' => {
111 if is_ident_start(self.first()) || self.first() == '$' {
113 self.dollar_quoted_string()
114 } else {
115 while self.first().is_ascii_digit() {
117 self.bump();
118 }
119 let trailing_junk_start = self.pos_within_token();
120 self.eat_identifier();
121 TokenKind::PositionalParam {
122 trailing_junk_start,
123 }
124 }
125 }
126 '`' => TokenKind::Backtick,
127 '=' => TokenKind::Eq,
128 '!' => TokenKind::Bang,
129 '<' => TokenKind::Lt,
130 '>' => TokenKind::Gt,
131 '&' => TokenKind::And,
132 '|' => TokenKind::Or,
133 '+' => TokenKind::Plus,
134 '*' => TokenKind::Star,
135 '^' => TokenKind::Caret,
136 '%' => TokenKind::Percent,
137
138 '\'' => {
140 let terminated = self.single_quoted_string();
141 let kind = LiteralKind::Str { terminated };
142 TokenKind::Literal { kind }
143 }
144
145 '"' => {
147 let terminated = self.double_quoted_string();
148 TokenKind::QuotedIdent { terminated }
149 }
150 _ => TokenKind::Unknown,
151 };
152 let res = Token::new(token_kind, self.pos_within_token());
153 self.reset_pos_within_token();
154 res
155 }
156 pub(crate) fn ident(&mut self) -> TokenKind {
157 self.eat_while(is_ident_cont);
158 TokenKind::Ident
159 }
160
161 pub(crate) fn whitespace(&mut self) -> TokenKind {
162 self.eat_while(is_whitespace);
163 TokenKind::Whitespace
164 }
165
166 fn ident_or_unknown_prefix(&mut self) -> TokenKind {
167 self.eat_while(is_ident_cont);
169 match self.first() {
172 '"' | '\'' => TokenKind::UnknownPrefix,
173 _ => TokenKind::Ident,
174 }
175 }
176
177 pub(crate) fn line_comment(&mut self) -> TokenKind {
180 self.bump();
181
182 self.eat_while(|c| c != '\n');
183 TokenKind::LineComment
184 }
185
186 pub(crate) fn block_comment(&mut self) -> TokenKind {
188 self.bump();
189
190 let mut depth = 1usize;
191 while let Some(c) = self.bump() {
192 match c {
193 '/' if self.first() == '*' => {
194 self.bump();
195 depth += 1;
196 }
197 '*' if self.first() == '/' => {
198 self.bump();
199 depth -= 1;
200 if depth == 0 {
201 break;
205 }
206 }
207 _ => (),
208 }
209 }
210
211 TokenKind::BlockComment {
212 terminated: depth == 0,
213 }
214 }
215
216 fn prefixed_string(
217 &mut self,
218 mk_kind: fn(bool) -> LiteralKind,
219 allows_double: bool,
220 ) -> TokenKind {
221 match self.first() {
222 '\'' => {
223 self.bump();
224 let terminated = self.single_quoted_string();
225 let kind = mk_kind(terminated);
226 TokenKind::Literal { kind }
227 }
228 '"' if allows_double => {
229 self.bump();
230 let terminated = self.double_quoted_string();
231 TokenKind::QuotedIdent { terminated }
232 }
233 _ => self.ident_or_unknown_prefix(),
234 }
235 }
236
237 fn number(&mut self, first_digit: char) -> LiteralKind {
238 let mut base = Base::Decimal;
239 if first_digit == '.' {
240 return self.eat_fractional(base);
241 }
242 if first_digit == '0' {
243 match self.first() {
245 'b' | 'B' => {
247 base = Base::Binary;
248 self.bump();
249 let has_digits = self.eat_decimal_digits();
250 return self.finish_base_prefixed_int(base, has_digits);
251 }
252 'o' | 'O' => {
254 base = Base::Octal;
255 self.bump();
256 let has_digits = self.eat_decimal_digits();
257 return self.finish_base_prefixed_int(base, has_digits);
258 }
259 'x' | 'X' => {
261 base = Base::Hexadecimal;
262 self.bump();
263 let has_digits = self.eat_hexadecimal_digits();
264 return self.finish_base_prefixed_int(base, has_digits);
265 }
266 '0'..='9' | '_' => {
268 self.eat_decimal_digits();
269 }
270
271 '.' | 'e' | 'E' => {}
273
274 _ => {
276 let trailing_junk_start = self.pos_within_token();
277 self.eat_identifier();
278 return LiteralKind::Int {
279 base,
280 empty_int: false,
281 trailing_junk_start,
282 };
283 }
284 }
285 } else {
286 self.eat_decimal_digits();
288 };
289
290 match self.first() {
291 '.' => self.eat_fractional(base),
292 'e' | 'E' => {
293 let exponent_start = self.pos_within_token();
294 self.bump();
295 let empty_exponent_start = (!self.eat_numeric_exponent()).then_some(exponent_start);
296 let trailing_junk_start = self.pos_within_token();
297 self.eat_identifier();
298 LiteralKind::Numeric {
299 base,
300 empty_exponent_start,
301 trailing_junk_start,
302 }
303 }
304 _ => {
305 let trailing_junk_start = self.pos_within_token();
306 self.eat_identifier();
307 LiteralKind::Int {
308 base,
309 empty_int: false,
310 trailing_junk_start,
311 }
312 }
313 }
314 }
315
316 fn single_quoted_string(&mut self) -> bool {
317 loop {
319 match self.first() {
320 '\'' => {
322 self.bump();
323
324 match self.first() {
325 '\'' => {
327 self.bump();
328 }
329 _ => return true,
331 }
332 }
333 EOF_CHAR if self.is_eof() => break,
335 _ => {
337 self.bump();
338 }
339 }
340 }
341 false
343 }
344
345 fn double_quoted_string(&mut self) -> bool {
348 while let Some(c) = self.bump() {
349 match c {
350 '"' if self.first() == '"' => {
351 self.bump();
353 }
354 '"' => {
355 return true;
356 }
357 _ => (),
358 }
359 }
360 false
362 }
363
364 fn dollar_quoted_string(&mut self) -> TokenKind {
366 let mut start = vec![];
369 while let Some(c) = self.bump() {
370 match c {
371 '$' => {
372 break;
373 }
374 _ => {
375 start.push(c);
376 }
377 }
378 }
379
380 if start.is_empty() {
382 loop {
383 self.eat_while(|c| c != '$');
384 if self.is_eof() {
385 return TokenKind::Literal {
386 kind: LiteralKind::DollarQuotedString { terminated: false },
387 };
388 }
389 self.bump();
391 if self.first() == '$' {
392 self.bump();
393 return TokenKind::Literal {
394 kind: LiteralKind::DollarQuotedString { terminated: true },
395 };
396 }
397 }
398 } else {
399 loop {
400 self.eat_while(|c| c != start[0]);
401 if self.is_eof() {
402 return TokenKind::Literal {
403 kind: LiteralKind::DollarQuotedString { terminated: false },
404 };
405 }
406
407 let mut match_count = 0;
409 for start_char in &start {
410 if self.first() == *start_char {
411 self.bump();
412 match_count += 1;
413 } else {
414 self.bump();
415 break;
416 }
417 }
418
419 let terminated = match_count == start.len();
421 if self.first() == '$' && terminated {
422 self.bump();
423 return TokenKind::Literal {
424 kind: LiteralKind::DollarQuotedString { terminated },
425 };
426 }
427 }
428 }
429 }
430
431 fn eat_decimal_digits(&mut self) -> bool {
432 let mut has_digits = false;
433 loop {
434 match self.first() {
435 '_' if self.second().is_ascii_digit() => {
436 self.bump();
437 }
438 '0'..='9' => {
439 has_digits = true;
440 self.bump();
441 }
442 _ => break,
443 }
444 }
445 has_digits
446 }
447
448 fn finish_base_prefixed_int(&mut self, base: Base, has_digits: bool) -> LiteralKind {
449 let trailing_junk_start = self.pos_within_token();
450 self.eat_while(is_ident_cont);
451 let has_trailing_junk = self.pos_within_token() > trailing_junk_start;
452 LiteralKind::Int {
453 base,
454 empty_int: !has_digits && !has_trailing_junk,
455 trailing_junk_start,
456 }
457 }
458
459 fn eat_hexadecimal_digits(&mut self) -> bool {
460 let mut has_digits = false;
461 loop {
462 match self.first() {
463 '_' if self.second().is_ascii_hexdigit() => {
464 self.bump();
465 }
466 '0'..='9' | 'a'..='f' | 'A'..='F' => {
467 has_digits = true;
468 self.bump();
469 }
470 _ => break,
471 }
472 }
473 has_digits
474 }
475
476 fn eat_numeric_exponent(&mut self) -> bool {
479 if self.first() == '_' {
480 return false;
481 }
482 if self.first() == '-' || self.first() == '+' {
483 self.bump();
484 }
485 self.eat_decimal_digits()
486 }
487
488 fn eat_identifier(&mut self) {
489 if is_ident_start(self.first()) {
490 self.eat_while(is_ident_cont);
491 }
492 }
493
494 pub(crate) fn eat_fractional(&mut self, base: Base) -> crate::LiteralKind {
495 self.bump();
498 let mut empty_exponent_start = None;
499 if self.first().is_ascii_digit() {
500 self.eat_decimal_digits();
501 match self.first() {
502 'e' | 'E' => {
503 let exponent_start = self.pos_within_token();
504 self.bump();
505 if !self.eat_numeric_exponent() {
506 empty_exponent_start = Some(exponent_start);
507 }
508 }
509 _ => (),
510 }
511 } else {
512 match self.first() {
513 'e' | 'E' => {
514 let exponent_start = self.pos_within_token();
515 self.bump();
516 if !self.eat_numeric_exponent() {
517 empty_exponent_start = Some(exponent_start);
518 }
519 }
520 _ => (),
521 }
522 }
523 let trailing_junk_start = self.pos_within_token();
524 self.eat_identifier();
525 LiteralKind::Numeric {
526 base,
527 empty_exponent_start,
528 trailing_junk_start,
529 }
530 }
531}
532
533pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
535 let mut cursor = Cursor::new(input);
536 std::iter::from_fn(move || {
537 let token = cursor.advance_token();
538 if token.kind != TokenKind::Eof {
539 Some(token)
540 } else {
541 None
542 }
543 })
544}
545
// Snapshot tests for the lexer: each test lexes an input string and records
// the resulting tokens with insta's `assert_debug_snapshot!`.
546#[cfg(test)]
547mod tests {
548 use std::fmt;
549
550 use super::*;
551 use insta::assert_debug_snapshot;
552
 // Pairs a token with the slice of input text it covers, so snapshots show
 // the text next to the token kind.
553 struct TokenDebug<'a> {
554 content: &'a str,
555 token: Token,
556 }
 // Renders as `"<text>" @ <kind>`.
557 impl fmt::Debug for TokenDebug<'_> {
558 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
559 write!(f, "{:?} @ {:?}", self.content, self.token.kind)
560 }
561 }
562
563 impl<'a> TokenDebug<'a> {
 // `start` is the offset of the token within `input`; the covered text is
 // `input[start .. start + token.len]`.
564 fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
565 TokenDebug {
566 token,
567 content: &input[start as usize..(start + token.len) as usize],
568 }
569 }
570 }
571
 // Tokenizes `input` and attaches the source text to every token, keeping a
 // running offset by summing token lengths.
572 fn lex(input: &str) -> Vec<TokenDebug<'_>> {
573 let mut tokens = vec![];
574 let mut start = 0;
575
576 for token in tokenize(input) {
577 let length = token.len;
578 tokens.push(TokenDebug::new(token, input, start));
579 start += length;
580 }
581 tokens
582 }
583 #[test]
584 fn lex_statement() {
585 let result = lex("select 1;");
586 assert_debug_snapshot!(result);
587 }
588
589 #[test]
590 fn block_comment() {
591 let result = lex(r#"
592/*
593 * foo
594 * bar
595*/"#);
596 assert_debug_snapshot!(result);
597 }
598
 // The nested `/*` is never closed, so the outer comment stays open.
599 #[test]
600 fn block_comment_unterminated() {
601 let result = lex(r#"
602/*
603 * foo
604 * bar
605 /*
606*/"#);
607 assert_debug_snapshot!(result);
608 }
609
610 #[test]
611 fn line_comment() {
612 let result = lex(r#"
613-- foooooooooooo bar buzz
614"#);
615 assert_debug_snapshot!(result);
616 }
617
618 #[test]
619 fn line_comment_whitespace() {
620 assert_debug_snapshot!(lex(r#"
621select 'Hello' -- This is a comment
622' World';"#))
623 }
624
625 #[test]
626 fn dollar_quoting() {
627 assert_debug_snapshot!(lex(r#"
628$$Dianne's horse$$
629$SomeTag$Dianne's horse$SomeTag$
630
631-- with dollar inside and matching tags
632$foo$hello$world$bar$
633"#))
634 }
635
636 #[test]
637 fn dollar_strings_part2() {
638 assert_debug_snapshot!(lex(r#"
639DO $doblock$
640end
641$doblock$;"#))
642 }
643
644 #[test]
645 fn dollar_quote_mismatch_tags_simple() {
646 assert_debug_snapshot!(lex(r#"
647-- dollar quoting with mismatched tags
648$foo$hello world$bar$
649"#));
650 }
651
652 #[test]
653 fn dollar_quote_mismatch_tags_complex() {
654 assert_debug_snapshot!(lex(r#"
655-- with dollar inside but mismatched tags
656$foo$hello$world$bar$
657"#));
658 }
659
660 #[test]
661 fn numeric() {
662 assert_debug_snapshot!(lex(r#"
66342
6643.5
6654.
666.001
667.123e10
6685e2
6691.925e-3
6701e-10
6711e+10
6721e10
6734664.E+5
674"#))
675 }
676
677 #[test]
678 fn numeric_non_decimal() {
679 assert_debug_snapshot!(lex(r#"
6800b100101
6810B10011001
6820o273
6830O755
6840x42f
6850XFFFF
686"#))
687 }
688
 // NOTE(review): "seperators" is misspelled. The snapshot is presumably
 // stored under this function's name, so check the snapshot files before
 // renaming.
689 #[test]
690 fn numeric_with_seperators() {
691 assert_debug_snapshot!(lex(r#"
6921_500_000_000
6930b10001000_00000000
6940o_1_755
6950xFFFF_FFFF
6961.618_034
697"#))
698 }
699
700 #[test]
701 fn select_with_period() {
702 assert_debug_snapshot!(lex(r#"
703select public.users;
704"#))
705 }
706
 // Covers both the `B`/`b` and `X`/`x` string prefixes.
707 #[test]
708 fn bitstring() {
709 assert_debug_snapshot!(lex(r#"
710B'1001'
711b'1001'
712X'1FF'
713x'1FF'
714"#))
715 }
716
717 #[test]
718 fn string() {
719 assert_debug_snapshot!(lex(r#"
720'Dianne''s horse'
721
722select 'foo ''
723bar';
724
725select 'foooo'
726 'bar';
727
728
729'foo \\ \n \tbar'
730
731'forgot to close the string
732"#))
733 }
734
735 #[test]
736 fn params() {
737 assert_debug_snapshot!(lex(r#"
738select $1 + $2;
739
740select $1123123123123;
741
742select $;
743"#))
744 }
745
746 #[test]
747 fn string_with_escapes() {
748 assert_debug_snapshot!(lex(r#"
751E'foo'
752
753e'bar'
754
755e'\b\f\n\r\t'
756
757e'\0\11\777'
758
759e'\x0\x11\xFF'
760
761e'\uAAAA \UFFFFFFFF'
762
763"#))
764 }
765
766 #[test]
767 fn string_unicode_escape() {
768 assert_debug_snapshot!(lex(r#"
771U&"d\0061t\+000061"
772
773U&"\0441\043B\043E\043D"
774
775u&'\0441\043B'
776
777U&"d!0061t!+000061" UESCAPE '!'
778"#))
779 }
780
781 #[test]
782 fn quoted_ident() {
783 assert_debug_snapshot!(lex(r#"
784"hello &1 -world";
785
786
787"hello-world
788"#))
789 }
790
791 #[test]
792 fn quoted_ident_with_escape_quote() {
793 assert_debug_snapshot!(lex(r#"
794"foo "" bar"
795"#))
796 }
797
 // Inline snapshot: an empty dollar-quoted string is a single terminated
 // literal token.
798 #[test]
799 fn dollar_quoted_string() {
800 assert_debug_snapshot!(lex("$$$$"), @r#"
801 [
802 "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
803 ]
804 "#);
805 }
806
 // Inline snapshot: characters above U+00FF are lexed as identifiers.
807 #[test]
808 fn ident_non_ascii_above_latin1() {
809 assert_debug_snapshot!(lex("ẞ Ā 漢字 𐐷"), @r#"
810 [
811 "ẞ" @ Ident,
812 " " @ Whitespace,
813 "Ā" @ Ident,
814 " " @ Whitespace,
815 "漢字" @ Ident,
816 " " @ Whitespace,
817 "𐐷" @ Ident,
818 ]
819 "#);
820 }
821}