1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// Accepts ASCII letters, `_`, and every non-ASCII character. PostgreSQL's
/// scanner classifies identifier characters per byte and accepts every byte in
/// `0x80..=0xFF`; for UTF-8 input that covers all code points above U+007F,
/// not just the Latin-1 block, so identifiers written in any script are
/// recognised.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..=char::MAX)
}
11
/// Returns `true` if `c` may appear after the first character of an unquoted
/// identifier.
///
/// Accepts ASCII letters, digits, `_`, `$`, and every non-ASCII character.
/// PostgreSQL's scanner accepts every byte in `0x80..=0xFF` inside an
/// identifier; for UTF-8 input that covers all code points above U+007F, not
/// just the Latin-1 block.
const fn is_ident_cont(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..=char::MAX
    )
}
16
/// Returns `true` for the characters the lexer folds into a whitespace token:
/// space, horizontal tab, line feed, carriage return, vertical tab and form
/// feed.
const fn is_whitespace(c: char) -> bool {
    match c {
        ' ' | '\t' | '\n' | '\r' => true,
        // Vertical tab and form feed.
        '\u{000B}' | '\u{000C}' => true,
        _ => false,
    }
}
31
32impl Cursor<'_> {
33 pub(crate) fn advance_token(&mut self) -> Token {
35 let Some(first_char) = self.bump() else {
36 return Token::new(TokenKind::Eof, 0);
37 };
38 let token_kind = match first_char {
39 '/' => match self.first() {
41 '*' => self.block_comment(),
42 _ => TokenKind::Slash,
43 },
44 '-' => match self.first() {
45 '-' => self.line_comment(),
46 _ => TokenKind::Minus,
47 },
48
49 c if is_whitespace(c) => self.whitespace(),
51
52 'u' | 'U' => match self.first() {
54 '&' => {
55 self.bump();
56 self.prefixed_string(
57 |terminated| LiteralKind::UnicodeEscStr { terminated },
58 true,
59 )
60 }
61 _ => self.ident_or_unknown_prefix(),
62 },
63
64 'e' | 'E' => {
66 self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67 }
68
69 'b' | 'B' => {
71 self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72 }
73
74 'x' | 'X' => {
76 self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77 }
78
79 c if is_ident_start(c) => self.ident(),
82
83 c @ '0'..='9' => {
86 let literal_kind = self.number(c);
87 TokenKind::Literal { kind: literal_kind }
88 }
89 '.' => match self.first() {
90 '0'..='9' => {
91 let literal_kind = self.number('.');
92 TokenKind::Literal { kind: literal_kind }
93 }
94 _ => TokenKind::Dot,
95 },
96 ';' => TokenKind::Semi,
98 ',' => TokenKind::Comma,
99 '(' => TokenKind::OpenParen,
100 ')' => TokenKind::CloseParen,
101 '[' => TokenKind::OpenBracket,
102 ']' => TokenKind::CloseBracket,
103 '@' => TokenKind::At,
104 '#' => TokenKind::Pound,
105 '~' => TokenKind::Tilde,
106 '?' => TokenKind::Question,
107 ':' => TokenKind::Colon,
108 '$' => {
109 if is_ident_start(self.first()) || self.first() == '$' {
111 self.dollar_quoted_string()
112 } else {
113 while self.first().is_ascii_digit() {
115 self.bump();
116 }
117 TokenKind::PositionalParam
118 }
119 }
120 '`' => TokenKind::Backtick,
121 '=' => TokenKind::Eq,
122 '!' => TokenKind::Bang,
123 '<' => TokenKind::Lt,
124 '>' => TokenKind::Gt,
125 '&' => TokenKind::And,
126 '|' => TokenKind::Or,
127 '+' => TokenKind::Plus,
128 '*' => TokenKind::Star,
129 '^' => TokenKind::Caret,
130 '%' => TokenKind::Percent,
131
132 '\'' => {
134 let terminated = self.single_quoted_string();
135 let kind = LiteralKind::Str { terminated };
136 TokenKind::Literal { kind }
137 }
138
139 '"' => {
141 let terminated = self.double_quoted_string();
142 TokenKind::QuotedIdent { terminated }
143 }
144 _ => TokenKind::Unknown,
145 };
146 let res = Token::new(token_kind, self.pos_within_token());
147 self.reset_pos_within_token();
148 res
149 }
    /// Consumes the remaining identifier characters (as decided by
    /// `is_ident_cont`) and reports the token as `Ident`.
    pub(crate) fn ident(&mut self) -> TokenKind {
        self.eat_while(is_ident_cont);
        TokenKind::Ident
    }
154
    /// Consumes the rest of a whitespace run (as decided by `is_whitespace`)
    /// and reports the token as `Whitespace`.
    pub(crate) fn whitespace(&mut self) -> TokenKind {
        self.eat_while(is_whitespace);
        TokenKind::Whitespace
    }
159
160 fn ident_or_unknown_prefix(&mut self) -> TokenKind {
161 self.eat_while(is_ident_cont);
163 match self.first() {
166 '#' | '"' | '\'' => TokenKind::UnknownPrefix,
167 _ => TokenKind::Ident,
168 }
169 }
170
    /// Lexes a `--` line comment.
    ///
    /// `advance_token` calls this after consuming the first `-` and peeking a
    /// second one, so the first `bump` here eats that second `-`. The comment
    /// then runs up to, but not including, the next `\n`.
    pub(crate) fn line_comment(&mut self) -> TokenKind {
        // Consume the second `-` of the `--` opener.
        self.bump();

        self.eat_while(|c| c != '\n');
        TokenKind::LineComment
    }
179
180 pub(crate) fn block_comment(&mut self) -> TokenKind {
182 self.bump();
183
184 let mut depth = 1usize;
185 while let Some(c) = self.bump() {
186 match c {
187 '/' if self.first() == '*' => {
188 self.bump();
189 depth += 1;
190 }
191 '*' if self.first() == '/' => {
192 self.bump();
193 depth -= 1;
194 if depth == 0 {
195 break;
199 }
200 }
201 _ => (),
202 }
203 }
204
205 TokenKind::BlockComment {
206 terminated: depth == 0,
207 }
208 }
209
210 fn prefixed_string(
211 &mut self,
212 mk_kind: fn(bool) -> LiteralKind,
213 allows_double: bool,
214 ) -> TokenKind {
215 match self.first() {
216 '\'' => {
217 self.bump();
218 let terminated = self.single_quoted_string();
219 let kind = mk_kind(terminated);
220 TokenKind::Literal { kind }
221 }
222 '"' if allows_double => {
223 self.bump();
224 let terminated = self.double_quoted_string();
225 TokenKind::QuotedIdent { terminated }
226 }
227 _ => self.ident_or_unknown_prefix(),
228 }
229 }
230
231 fn number(&mut self, first_digit: char) -> LiteralKind {
232 let mut base = Base::Decimal;
233 if first_digit == '0' {
234 match self.first() {
236 'b' | 'B' => {
238 base = Base::Binary;
239 self.bump();
240 if !self.eat_decimal_digits() {
241 return LiteralKind::Int {
242 base,
243 empty_int: true,
244 };
245 }
246 }
247 'o' | 'O' => {
249 base = Base::Octal;
250 self.bump();
251 if !self.eat_decimal_digits() {
252 return LiteralKind::Int {
253 base,
254 empty_int: true,
255 };
256 }
257 }
258 'x' | 'X' => {
260 base = Base::Hexadecimal;
261 self.bump();
262 if !self.eat_hexadecimal_digits() {
263 return LiteralKind::Int {
264 base,
265 empty_int: true,
266 };
267 }
268 }
269 '0'..='9' | '_' => {
271 self.eat_decimal_digits();
272 }
273
274 '.' | 'e' | 'E' => {}
276
277 _ => {
279 return LiteralKind::Int {
280 base,
281 empty_int: false,
282 };
283 }
284 }
285 } else {
286 self.eat_decimal_digits();
288 };
289
290 match self.first() {
291 '.' => {
292 self.bump();
295 let mut empty_exponent = false;
296 if self.first().is_ascii_digit() {
297 self.eat_decimal_digits();
298 match self.first() {
299 'e' | 'E' => {
300 self.bump();
301 empty_exponent = !self.eat_float_exponent();
302 }
303 _ => (),
304 }
305 } else {
306 match self.first() {
307 'e' | 'E' => {
308 self.bump();
309 empty_exponent = !self.eat_float_exponent();
310 }
311 _ => (),
312 }
313 }
314 LiteralKind::Float {
315 base,
316 empty_exponent,
317 }
318 }
319 'e' | 'E' => {
320 self.bump();
321 let empty_exponent = !self.eat_float_exponent();
322 LiteralKind::Float {
323 base,
324 empty_exponent,
325 }
326 }
327 _ => LiteralKind::Int {
328 base,
329 empty_int: false,
330 },
331 }
332 }
333
334 fn single_quoted_string(&mut self) -> bool {
335 loop {
337 match self.first() {
338 '\'' => {
340 self.bump();
341
342 match self.first() {
343 '\'' => {
345 self.bump();
346 }
347 _ => return true,
349 }
350 }
351 EOF_CHAR if self.is_eof() => break,
353 _ => {
355 self.bump();
356 }
357 }
358 }
359 false
361 }
362
363 fn double_quoted_string(&mut self) -> bool {
366 while let Some(c) = self.bump() {
367 match c {
368 '"' if self.first() == '"' => {
369 self.bump();
371 }
372 '"' => {
373 return true;
374 }
375 _ => (),
376 }
377 }
378 false
380 }
381
382 fn dollar_quoted_string(&mut self) -> TokenKind {
384 let mut start = vec![];
387 while let Some(c) = self.bump() {
388 match c {
389 '$' => {
390 self.bump();
391 break;
392 }
393 _ => {
394 start.push(c);
395 }
396 }
397 }
398
399 if start.is_empty() {
401 loop {
402 self.eat_while(|c| c != '$');
403 if self.is_eof() {
404 return TokenKind::Literal {
405 kind: LiteralKind::DollarQuotedString { terminated: false },
406 };
407 }
408 self.bump();
410 if self.first() == '$' {
411 self.bump();
412 return TokenKind::Literal {
413 kind: LiteralKind::DollarQuotedString { terminated: true },
414 };
415 }
416 }
417 } else {
418 loop {
419 self.eat_while(|c| c != start[0]);
420 if self.is_eof() {
421 return TokenKind::Literal {
422 kind: LiteralKind::DollarQuotedString { terminated: false },
423 };
424 }
425
426 let mut match_count = 0;
428 for start_char in &start {
429 if self.first() == *start_char {
430 self.bump();
431 match_count += 1;
432 } else {
433 self.bump();
434 break;
435 }
436 }
437
438 let terminated = match_count == start.len();
440 if self.first() == '$' && terminated {
441 self.bump();
442 return TokenKind::Literal {
443 kind: LiteralKind::DollarQuotedString { terminated },
444 };
445 }
446 }
447 }
448 }
449
450 fn eat_decimal_digits(&mut self) -> bool {
451 let mut has_digits = false;
452 loop {
453 match self.first() {
454 '_' => {
455 self.bump();
456 }
457 '0'..='9' => {
458 has_digits = true;
459 self.bump();
460 }
461 _ => break,
462 }
463 }
464 has_digits
465 }
466
467 fn eat_hexadecimal_digits(&mut self) -> bool {
468 let mut has_digits = false;
469 loop {
470 match self.first() {
471 '_' => {
472 self.bump();
473 }
474 '0'..='9' | 'a'..='f' | 'A'..='F' => {
475 has_digits = true;
476 self.bump();
477 }
478 _ => break,
479 }
480 }
481 has_digits
482 }
483
484 fn eat_float_exponent(&mut self) -> bool {
487 if self.first() == '-' || self.first() == '+' {
488 self.bump();
489 }
490 self.eat_decimal_digits()
491 }
492}
493
494pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
496 let mut cursor = Cursor::new(input);
497 std::iter::from_fn(move || {
498 let token = cursor.advance_token();
499 if token.kind != TokenKind::Eof {
500 Some(token)
501 } else {
502 None
503 }
504 })
505}
506
507#[cfg(test)]
508mod tests {
509 use std::fmt;
510
511 use super::*;
512 use insta::assert_debug_snapshot;
513
    /// Pairs a token with the slice of input it covers, so snapshots show the
    /// source text next to the token kind.
    struct TokenDebug<'a> {
        /// The input text spanned by `token`.
        content: &'a str,
        /// The lexed token.
        token: Token,
    }
518 impl fmt::Debug for TokenDebug<'_> {
519 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
520 write!(f, "{:?} @ {:?}", self.content, self.token.kind)
521 }
522 }
523
524 impl<'a> TokenDebug<'a> {
525 fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
526 TokenDebug {
527 token,
528 content: &input[start as usize..(start + token.len) as usize],
529 }
530 }
531 }
532
533 fn lex(input: &str) -> Vec<TokenDebug<'_>> {
534 let mut tokens = vec![];
535 let mut start = 0;
536
537 for token in tokenize(input) {
538 let length = token.len;
539 tokens.push(TokenDebug::new(token, input, start));
540 start += length;
541 }
542 tokens
543 }
    // Smoke test: snapshots the token stream of a minimal statement.
    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }
549
    // A multi-line block comment that is closed by a single `*/`.
    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }
559
    // A nested `/*` is opened but only one `*/` follows, so the outer comment
    // is still open when the input ends.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }
570
    // A `--` comment on its own line, followed by a newline.
    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }
578
    // A trailing `--` comment sitting between two string literals on
    // consecutive lines.
    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }
585
    // Dollar-quoted strings: the untagged `$$…$$` form, a tagged form with
    // matching tags, and a `$foo$…$bar$` input whose body contains `$`.
    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }
596
    // A tagged dollar-quoted body spanning several lines; the body contains a
    // character (`d` in `end`) that also starts the tag.
    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }
604
    // Opening tag `foo` and closing tag `bar` differ, so the opening delimiter
    // never finds its match.
    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }
612
    // Same mismatch as the simple case, with additional `$` characters inside
    // the body.
    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }
620
    // Decimal numeric literals: integers, fractions with and without digits on
    // either side of the dot, and exponents with and without a sign.
    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }
637
    // Binary, octal and hexadecimal literals with lower- and upper-case base
    // prefixes.
    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }
649
    // Numeric literals containing `_` digit separators in each supported base
    // and in a fraction.
    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }
660
    // A dot between two identifiers (not followed by a digit) is its own token.
    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }
667
    // `B'…'` and `X'…'` prefixed strings in both letter cases.
    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }
677
    // Single-quoted strings: doubled-quote escapes, a string spanning lines,
    // adjacent strings on separate lines, backslashes in a plain string, and a
    // string that is never closed.
    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'
 'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }
695
    // Positional parameters: short, very long, and a bare `$` with no digits.
    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }
706
    // `E'…'` escape strings in both letter cases, with assorted backslash
    // sequences in the body.
    #[test]
    fn string_with_escapes() {
        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }
726
    // `U&` prefixed input: double-quoted identifiers, a single-quoted string,
    // and a form followed by a `UESCAPE` clause.
    #[test]
    fn string_unicode_escape() {
        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }
741
    // Double-quoted identifiers: one closed, one left open until end of input.
    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }
751
    // A doubled `""` inside a quoted identifier is an escaped quote and must
    // not end the identifier.
    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }
758}