1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// Accepted: ASCII letters, `_`, and every non-ASCII character. PostgreSQL's
/// lexer admits any byte with the high bit set inside identifiers, which for
/// UTF-8 input means every code point from U+0080 upwards (letters with
/// diacritics, non-Latin scripts, ...), not only the Latin-1 range.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{10FFFF}')
}

/// Returns `true` if `c` may appear after the first character of an unquoted
/// identifier.
///
/// This is everything accepted by the identifier-start rule (ASCII letters,
/// `_`, any non-ASCII character) plus ASCII digits and `$`.
const fn is_ident_cont(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{10FFFF}'
    )
}
16
/// Returns `true` if `c` is one of the ASCII whitespace characters the lexer
/// skips: space, tab, line feed, vertical tab, form feed or carriage return.
const fn is_whitespace(c: char) -> bool {
    // `'\t'..='\r'` is the contiguous run U+0009..=U+000D:
    // tab, line feed, vertical tab, form feed, carriage return.
    matches!(c, ' ' | '\t'..='\r')
}
31
32impl Cursor<'_> {
33 pub(crate) fn advance_token(&mut self) -> Token {
35 let Some(first_char) = self.bump() else {
36 return Token::new(TokenKind::Eof, 0);
37 };
38 let token_kind = match first_char {
39 '/' => match self.first() {
41 '*' => self.block_comment(),
42 _ => TokenKind::Slash,
43 },
44 '-' => match self.first() {
45 '-' => self.line_comment(),
46 _ => TokenKind::Minus,
47 },
48
49 c if is_whitespace(c) => self.whitespace(),
51
52 'u' | 'U' => match self.first() {
54 '&' => {
55 self.bump();
56 self.prefixed_string(
57 |terminated| LiteralKind::UnicodeEscStr { terminated },
58 true,
59 )
60 }
61 _ => self.ident_or_unknown_prefix(),
62 },
63
64 'e' | 'E' => {
66 self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67 }
68
69 'b' | 'B' => {
71 self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72 }
73
74 'x' | 'X' => {
76 self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77 }
78
79 c if is_ident_start(c) => self.ident(),
82
83 c @ '0'..='9' => {
86 let literal_kind = self.number(c);
87 TokenKind::Literal { kind: literal_kind }
88 }
89 '.' => match self.first() {
90 '0'..='9' => {
91 let literal_kind = self.number('.');
92 TokenKind::Literal { kind: literal_kind }
93 }
94 _ => TokenKind::Dot,
95 },
96 ';' => TokenKind::Semi,
98 ',' => TokenKind::Comma,
99 '(' => TokenKind::OpenParen,
100 ')' => TokenKind::CloseParen,
101 '[' => TokenKind::OpenBracket,
102 ']' => TokenKind::CloseBracket,
103 '@' => TokenKind::At,
104 '#' => TokenKind::Pound,
105 '~' => TokenKind::Tilde,
106 '?' => TokenKind::Question,
107 ':' => TokenKind::Colon,
108 '$' => {
109 if is_ident_start(self.first()) || self.first() == '$' {
111 self.dollar_quoted_string()
112 } else {
113 while self.first().is_ascii_digit() {
115 self.bump();
116 }
117 TokenKind::PositionalParam
118 }
119 }
120 '`' => TokenKind::Backtick,
121 '=' => TokenKind::Eq,
122 '!' => TokenKind::Bang,
123 '<' => TokenKind::Lt,
124 '>' => TokenKind::Gt,
125 '&' => TokenKind::And,
126 '|' => TokenKind::Or,
127 '+' => TokenKind::Plus,
128 '*' => TokenKind::Star,
129 '^' => TokenKind::Caret,
130 '%' => TokenKind::Percent,
131
132 '\'' => {
134 let terminated = self.single_quoted_string();
135 let kind = LiteralKind::Str { terminated };
136 TokenKind::Literal { kind }
137 }
138
139 '"' => {
141 let terminated = self.double_quoted_string();
142 TokenKind::QuotedIdent { terminated }
143 }
144 _ => TokenKind::Unknown,
145 };
146 let res = Token::new(token_kind, self.pos_within_token());
147 self.reset_pos_within_token();
148 res
149 }
150 pub(crate) fn ident(&mut self) -> TokenKind {
151 self.eat_while(is_ident_cont);
152 TokenKind::Ident
153 }
154
155 pub(crate) fn whitespace(&mut self) -> TokenKind {
156 self.eat_while(is_whitespace);
157 TokenKind::Whitespace
158 }
159
160 fn ident_or_unknown_prefix(&mut self) -> TokenKind {
161 self.eat_while(is_ident_cont);
163 match self.first() {
166 '#' | '"' | '\'' => TokenKind::UnknownPrefix,
167 _ => TokenKind::Ident,
168 }
169 }
170
171 pub(crate) fn line_comment(&mut self) -> TokenKind {
174 self.bump();
175
176 self.eat_while(|c| c != '\n');
177 TokenKind::LineComment
178 }
179
180 pub(crate) fn block_comment(&mut self) -> TokenKind {
182 self.bump();
183
184 let mut depth = 1usize;
185 while let Some(c) = self.bump() {
186 match c {
187 '/' if self.first() == '*' => {
188 self.bump();
189 depth += 1;
190 }
191 '*' if self.first() == '/' => {
192 self.bump();
193 depth -= 1;
194 if depth == 0 {
195 break;
199 }
200 }
201 _ => (),
202 }
203 }
204
205 TokenKind::BlockComment {
206 terminated: depth == 0,
207 }
208 }
209
210 fn prefixed_string(
211 &mut self,
212 mk_kind: fn(bool) -> LiteralKind,
213 allows_double: bool,
214 ) -> TokenKind {
215 match self.first() {
216 '\'' => {
217 self.bump();
218 let terminated = self.single_quoted_string();
219 let kind = mk_kind(terminated);
220 TokenKind::Literal { kind }
221 }
222 '"' if allows_double => {
223 self.bump();
224 let terminated = self.double_quoted_string();
225 TokenKind::QuotedIdent { terminated }
226 }
227 _ => self.ident_or_unknown_prefix(),
228 }
229 }
230
231 fn number(&mut self, first_digit: char) -> LiteralKind {
232 let mut base = Base::Decimal;
233 if first_digit == '0' {
234 match self.first() {
236 'b' | 'B' => {
238 base = Base::Binary;
239 self.bump();
240 if !self.eat_decimal_digits() {
241 return LiteralKind::Int {
242 base,
243 empty_int: true,
244 };
245 }
246 }
247 'o' | 'O' => {
249 base = Base::Octal;
250 self.bump();
251 if !self.eat_decimal_digits() {
252 return LiteralKind::Int {
253 base,
254 empty_int: true,
255 };
256 }
257 }
258 'x' | 'X' => {
260 base = Base::Hexadecimal;
261 self.bump();
262 if !self.eat_hexadecimal_digits() {
263 return LiteralKind::Int {
264 base,
265 empty_int: true,
266 };
267 }
268 }
269 '0'..='9' | '_' => {
271 self.eat_decimal_digits();
272 }
273
274 '.' | 'e' | 'E' => {}
276
277 _ => {
279 return LiteralKind::Int {
280 base,
281 empty_int: false,
282 };
283 }
284 }
285 } else {
286 self.eat_decimal_digits();
288 };
289
290 match self.first() {
291 '.' => {
292 self.bump();
295 let mut empty_exponent = false;
296 if self.first().is_ascii_digit() {
297 self.eat_decimal_digits();
298 match self.first() {
299 'e' | 'E' => {
300 self.bump();
301 empty_exponent = !self.eat_float_exponent();
302 }
303 _ => (),
304 }
305 } else {
306 match self.first() {
307 'e' | 'E' => {
308 self.bump();
309 empty_exponent = !self.eat_float_exponent();
310 }
311 _ => (),
312 }
313 }
314 LiteralKind::Float {
315 base,
316 empty_exponent,
317 }
318 }
319 'e' | 'E' => {
320 self.bump();
321 let empty_exponent = !self.eat_float_exponent();
322 LiteralKind::Float {
323 base,
324 empty_exponent,
325 }
326 }
327 _ => LiteralKind::Int {
328 base,
329 empty_int: false,
330 },
331 }
332 }
333
334 fn single_quoted_string(&mut self) -> bool {
335 loop {
337 match self.first() {
338 '\'' => {
340 self.bump();
341
342 match self.first() {
343 '\'' => {
345 self.bump();
346 }
347 _ => return true,
349 }
350 }
351 EOF_CHAR if self.is_eof() => break,
353 _ => {
355 self.bump();
356 }
357 }
358 }
359 false
361 }
362
363 fn double_quoted_string(&mut self) -> bool {
366 while let Some(c) = self.bump() {
367 match c {
368 '"' if self.first() == '"' => {
369 self.bump();
371 }
372 '"' => {
373 return true;
374 }
375 _ => (),
376 }
377 }
378 false
380 }
381
382 fn dollar_quoted_string(&mut self) -> TokenKind {
384 let mut start = vec![];
387 while let Some(c) = self.bump() {
388 match c {
389 '$' => {
390 break;
391 }
392 _ => {
393 start.push(c);
394 }
395 }
396 }
397
398 if start.is_empty() {
400 loop {
401 self.eat_while(|c| c != '$');
402 if self.is_eof() {
403 return TokenKind::Literal {
404 kind: LiteralKind::DollarQuotedString { terminated: false },
405 };
406 }
407 self.bump();
409 if self.first() == '$' {
410 self.bump();
411 return TokenKind::Literal {
412 kind: LiteralKind::DollarQuotedString { terminated: true },
413 };
414 }
415 }
416 } else {
417 loop {
418 self.eat_while(|c| c != start[0]);
419 if self.is_eof() {
420 return TokenKind::Literal {
421 kind: LiteralKind::DollarQuotedString { terminated: false },
422 };
423 }
424
425 let mut match_count = 0;
427 for start_char in &start {
428 if self.first() == *start_char {
429 self.bump();
430 match_count += 1;
431 } else {
432 self.bump();
433 break;
434 }
435 }
436
437 let terminated = match_count == start.len();
439 if self.first() == '$' && terminated {
440 self.bump();
441 return TokenKind::Literal {
442 kind: LiteralKind::DollarQuotedString { terminated },
443 };
444 }
445 }
446 }
447 }
448
449 fn eat_decimal_digits(&mut self) -> bool {
450 let mut has_digits = false;
451 loop {
452 match self.first() {
453 '_' => {
454 self.bump();
455 }
456 '0'..='9' => {
457 has_digits = true;
458 self.bump();
459 }
460 _ => break,
461 }
462 }
463 has_digits
464 }
465
466 fn eat_hexadecimal_digits(&mut self) -> bool {
467 let mut has_digits = false;
468 loop {
469 match self.first() {
470 '_' => {
471 self.bump();
472 }
473 '0'..='9' | 'a'..='f' | 'A'..='F' => {
474 has_digits = true;
475 self.bump();
476 }
477 _ => break,
478 }
479 }
480 has_digits
481 }
482
483 fn eat_float_exponent(&mut self) -> bool {
486 if self.first() == '-' || self.first() == '+' {
487 self.bump();
488 }
489 self.eat_decimal_digits()
490 }
491}
492
493pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
495 let mut cursor = Cursor::new(input);
496 std::iter::from_fn(move || {
497 let token = cursor.advance_token();
498 if token.kind != TokenKind::Eof {
499 Some(token)
500 } else {
501 None
502 }
503 })
504}
505
#[cfg(test)]
mod tests {
    use std::fmt;

    use super::*;
    use insta::assert_debug_snapshot;

    /// A token paired with the slice of input text it covers, so snapshots
    /// show the lexed text next to the token kind.
    struct TokenDebug<'a> {
        content: &'a str,
        token: Token,
    }
    // Renders as `"<text>" @ <TokenKind>`.
    impl fmt::Debug for TokenDebug<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
        }
    }

    impl<'a> TokenDebug<'a> {
        /// Builds the debug view for `token`, which covers `token.len` units
        /// of `input` starting at offset `start`.
        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
            TokenDebug {
                token,
                content: &input[start as usize..(start + token.len) as usize],
            }
        }
    }

    /// Tokenizes `input` and pairs every token with its source text, tracking
    /// the running start offset from the token lengths.
    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
        let mut tokens = vec![];
        let mut start = 0;

        for token in tokenize(input) {
            let length = token.len;
            tokens.push(TokenDebug::new(token, input, start));
            start += length;
        }
        tokens
    }
    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }

    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }

    // The inner `/*` is never closed, so the outer comment runs to the end of input.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }

    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }

    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }

    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }

    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }

    // Integers, floats with and without fractional digits, leading-dot
    // literals and exponents with optional signs.
    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }

    // Binary, octal and hexadecimal prefixes in both letter cases.
    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }

    // `_` digit separators in each numeric form.
    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }

    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }

    // `B'...'` and `X'...'` prefixed strings in both letter cases.
    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }

    // Escaped quotes (`''`), strings spanning lines, and an unterminated string.
    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'
 'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }

    // Positional parameters, including a bare `$` with no digits.
    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }

    #[test]
    fn string_with_escapes() {
        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }

    #[test]
    fn string_unicode_escape() {
        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }

    // A terminated quoted identifier followed by an unterminated one.
    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }

    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }

    // An empty dollar-quoted string: opening `$$` immediately followed by closing `$$`.
    #[test]
    fn dollar_quoted_string() {
        assert_debug_snapshot!(lex("$$$$"), @r#"
 [
 "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
 ]
 "#);
    }
}