1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// PostgreSQL's scanner accepts ASCII letters, `_`, and every byte with the high bit set as
/// an identifier start. Because any non-ASCII character is encoded in UTF-8 purely from such
/// bytes, every non-ASCII `char` qualifies, not just U+0080..=U+00FF.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || !c.is_ascii()
}
11
/// Returns `true` if `c` may continue an unquoted identifier.
///
/// In addition to ASCII letters and `_`, digits and `$` are allowed after the first
/// character. PostgreSQL's scanner also accepts every byte with the high bit set, and any
/// non-ASCII character is encoded in UTF-8 purely from such bytes, so every non-ASCII `char`
/// qualifies, not just U+0080..=U+00FF.
const fn is_ident_cont(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$') || !c.is_ascii()
}
16
/// Returns `true` if `c` is whitespace for the lexer: space, or one of the contiguous ASCII
/// control characters tab, line feed, vertical tab, form feed and carriage return.
const fn is_whitespace(c: char) -> bool {
    // '\t' (U+0009) through '\r' (U+000D) covers \t, \n, \u{000B}, \u{000C} and \r.
    matches!(c, '\t'..='\r' | ' ')
}
31
32impl Cursor<'_> {
33 pub(crate) fn advance_token(&mut self) -> Token {
35 let Some(first_char) = self.bump() else {
36 return Token::new(TokenKind::Eof, 0);
37 };
38 let token_kind = match first_char {
39 '/' => match self.first() {
41 '*' => self.block_comment(),
42 _ => TokenKind::Slash,
43 },
44 '-' => match self.first() {
45 '-' => self.line_comment(),
46 _ => TokenKind::Minus,
47 },
48
49 c if is_whitespace(c) => self.whitespace(),
51
52 'u' | 'U' => match self.first() {
54 '&' => {
55 self.bump();
56 self.prefixed_string(
57 |terminated| LiteralKind::UnicodeEscStr { terminated },
58 true,
59 )
60 }
61 _ => self.ident_or_unknown_prefix(),
62 },
63
64 'e' | 'E' => {
66 self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67 }
68
69 'b' | 'B' => {
71 self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72 }
73
74 'x' | 'X' => {
76 self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77 }
78
79 c if is_ident_start(c) => self.ident(),
82
83 c @ '0'..='9' => {
86 let literal_kind = self.number(c);
87 TokenKind::Literal { kind: literal_kind }
88 }
89 '.' => match self.first() {
90 '0'..='9' => {
91 let literal_kind = self.number('.');
92 TokenKind::Literal { kind: literal_kind }
93 }
94 _ => TokenKind::Dot,
95 },
96 ';' => TokenKind::Semi,
98 ',' => TokenKind::Comma,
99 '(' => TokenKind::OpenParen,
100 ')' => TokenKind::CloseParen,
101 '[' => TokenKind::OpenBracket,
102 ']' => TokenKind::CloseBracket,
103 '{' => TokenKind::OpenCurly,
104 '}' => TokenKind::CloseCurly,
105 '@' => TokenKind::At,
106 '#' => TokenKind::Pound,
107 '~' => TokenKind::Tilde,
108 '?' => TokenKind::Question,
109 ':' => TokenKind::Colon,
110 '$' => {
111 if is_ident_start(self.first()) || self.first() == '$' {
113 self.dollar_quoted_string()
114 } else {
115 while self.first().is_ascii_digit() {
117 self.bump();
118 }
119 TokenKind::PositionalParam
120 }
121 }
122 '`' => TokenKind::Backtick,
123 '=' => TokenKind::Eq,
124 '!' => TokenKind::Bang,
125 '<' => TokenKind::Lt,
126 '>' => TokenKind::Gt,
127 '&' => TokenKind::And,
128 '|' => TokenKind::Or,
129 '+' => TokenKind::Plus,
130 '*' => TokenKind::Star,
131 '^' => TokenKind::Caret,
132 '%' => TokenKind::Percent,
133
134 '\'' => {
136 let terminated = self.single_quoted_string();
137 let kind = LiteralKind::Str { terminated };
138 TokenKind::Literal { kind }
139 }
140
141 '"' => {
143 let terminated = self.double_quoted_string();
144 TokenKind::QuotedIdent { terminated }
145 }
146 _ => TokenKind::Unknown,
147 };
148 let res = Token::new(token_kind, self.pos_within_token());
149 self.reset_pos_within_token();
150 res
151 }
152 pub(crate) fn ident(&mut self) -> TokenKind {
153 self.eat_while(is_ident_cont);
154 TokenKind::Ident
155 }
156
157 pub(crate) fn whitespace(&mut self) -> TokenKind {
158 self.eat_while(is_whitespace);
159 TokenKind::Whitespace
160 }
161
162 fn ident_or_unknown_prefix(&mut self) -> TokenKind {
163 self.eat_while(is_ident_cont);
165 match self.first() {
168 '#' | '"' | '\'' => TokenKind::UnknownPrefix,
169 _ => TokenKind::Ident,
170 }
171 }
172
173 pub(crate) fn line_comment(&mut self) -> TokenKind {
176 self.bump();
177
178 self.eat_while(|c| c != '\n');
179 TokenKind::LineComment
180 }
181
182 pub(crate) fn block_comment(&mut self) -> TokenKind {
184 self.bump();
185
186 let mut depth = 1usize;
187 while let Some(c) = self.bump() {
188 match c {
189 '/' if self.first() == '*' => {
190 self.bump();
191 depth += 1;
192 }
193 '*' if self.first() == '/' => {
194 self.bump();
195 depth -= 1;
196 if depth == 0 {
197 break;
201 }
202 }
203 _ => (),
204 }
205 }
206
207 TokenKind::BlockComment {
208 terminated: depth == 0,
209 }
210 }
211
212 fn prefixed_string(
213 &mut self,
214 mk_kind: fn(bool) -> LiteralKind,
215 allows_double: bool,
216 ) -> TokenKind {
217 match self.first() {
218 '\'' => {
219 self.bump();
220 let terminated = self.single_quoted_string();
221 let kind = mk_kind(terminated);
222 TokenKind::Literal { kind }
223 }
224 '"' if allows_double => {
225 self.bump();
226 let terminated = self.double_quoted_string();
227 TokenKind::QuotedIdent { terminated }
228 }
229 _ => self.ident_or_unknown_prefix(),
230 }
231 }
232
233 fn number(&mut self, first_digit: char) -> LiteralKind {
234 let mut base = Base::Decimal;
235 if first_digit == '0' {
236 match self.first() {
238 'b' | 'B' => {
240 base = Base::Binary;
241 self.bump();
242 if !self.eat_decimal_digits() {
243 return LiteralKind::Int {
244 base,
245 empty_int: true,
246 };
247 }
248 }
249 'o' | 'O' => {
251 base = Base::Octal;
252 self.bump();
253 if !self.eat_decimal_digits() {
254 return LiteralKind::Int {
255 base,
256 empty_int: true,
257 };
258 }
259 }
260 'x' | 'X' => {
262 base = Base::Hexadecimal;
263 self.bump();
264 if !self.eat_hexadecimal_digits() {
265 return LiteralKind::Int {
266 base,
267 empty_int: true,
268 };
269 }
270 }
271 '0'..='9' | '_' => {
273 self.eat_decimal_digits();
274 }
275
276 '.' | 'e' | 'E' => {}
278
279 _ => {
281 return LiteralKind::Int {
282 base,
283 empty_int: false,
284 };
285 }
286 }
287 } else {
288 self.eat_decimal_digits();
290 };
291
292 match self.first() {
293 '.' => {
294 self.bump();
297 let mut empty_exponent = false;
298 if self.first().is_ascii_digit() {
299 self.eat_decimal_digits();
300 match self.first() {
301 'e' | 'E' => {
302 self.bump();
303 empty_exponent = !self.eat_float_exponent();
304 }
305 _ => (),
306 }
307 } else {
308 match self.first() {
309 'e' | 'E' => {
310 self.bump();
311 empty_exponent = !self.eat_float_exponent();
312 }
313 _ => (),
314 }
315 }
316 LiteralKind::Float {
317 base,
318 empty_exponent,
319 }
320 }
321 'e' | 'E' => {
322 self.bump();
323 let empty_exponent = !self.eat_float_exponent();
324 LiteralKind::Float {
325 base,
326 empty_exponent,
327 }
328 }
329 _ => LiteralKind::Int {
330 base,
331 empty_int: false,
332 },
333 }
334 }
335
336 fn single_quoted_string(&mut self) -> bool {
337 loop {
339 match self.first() {
340 '\'' => {
342 self.bump();
343
344 match self.first() {
345 '\'' => {
347 self.bump();
348 }
349 _ => return true,
351 }
352 }
353 EOF_CHAR if self.is_eof() => break,
355 _ => {
357 self.bump();
358 }
359 }
360 }
361 false
363 }
364
365 fn double_quoted_string(&mut self) -> bool {
368 while let Some(c) = self.bump() {
369 match c {
370 '"' if self.first() == '"' => {
371 self.bump();
373 }
374 '"' => {
375 return true;
376 }
377 _ => (),
378 }
379 }
380 false
382 }
383
384 fn dollar_quoted_string(&mut self) -> TokenKind {
386 let mut start = vec![];
389 while let Some(c) = self.bump() {
390 match c {
391 '$' => {
392 break;
393 }
394 _ => {
395 start.push(c);
396 }
397 }
398 }
399
400 if start.is_empty() {
402 loop {
403 self.eat_while(|c| c != '$');
404 if self.is_eof() {
405 return TokenKind::Literal {
406 kind: LiteralKind::DollarQuotedString { terminated: false },
407 };
408 }
409 self.bump();
411 if self.first() == '$' {
412 self.bump();
413 return TokenKind::Literal {
414 kind: LiteralKind::DollarQuotedString { terminated: true },
415 };
416 }
417 }
418 } else {
419 loop {
420 self.eat_while(|c| c != start[0]);
421 if self.is_eof() {
422 return TokenKind::Literal {
423 kind: LiteralKind::DollarQuotedString { terminated: false },
424 };
425 }
426
427 let mut match_count = 0;
429 for start_char in &start {
430 if self.first() == *start_char {
431 self.bump();
432 match_count += 1;
433 } else {
434 self.bump();
435 break;
436 }
437 }
438
439 let terminated = match_count == start.len();
441 if self.first() == '$' && terminated {
442 self.bump();
443 return TokenKind::Literal {
444 kind: LiteralKind::DollarQuotedString { terminated },
445 };
446 }
447 }
448 }
449 }
450
451 fn eat_decimal_digits(&mut self) -> bool {
452 let mut has_digits = false;
453 loop {
454 match self.first() {
455 '_' => {
456 self.bump();
457 }
458 '0'..='9' => {
459 has_digits = true;
460 self.bump();
461 }
462 _ => break,
463 }
464 }
465 has_digits
466 }
467
468 fn eat_hexadecimal_digits(&mut self) -> bool {
469 let mut has_digits = false;
470 loop {
471 match self.first() {
472 '_' => {
473 self.bump();
474 }
475 '0'..='9' | 'a'..='f' | 'A'..='F' => {
476 has_digits = true;
477 self.bump();
478 }
479 _ => break,
480 }
481 }
482 has_digits
483 }
484
485 fn eat_float_exponent(&mut self) -> bool {
488 if self.first() == '-' || self.first() == '+' {
489 self.bump();
490 }
491 self.eat_decimal_digits()
492 }
493}
494
495pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
497 let mut cursor = Cursor::new(input);
498 std::iter::from_fn(move || {
499 let token = cursor.advance_token();
500 if token.kind != TokenKind::Eof {
501 Some(token)
502 } else {
503 None
504 }
505 })
506}
507
// Snapshot tests for the lexer: each test lexes a SQL fragment and snapshots the resulting
// (text, token kind) pairs with `insta`.
508#[cfg(test)]
509mod tests {
510 use std::fmt;
511
512 use super::*;
513 use insta::assert_debug_snapshot;
514
    // Pairs a token with the slice of the input it covers so that snapshots show both.
515 struct TokenDebug<'a> {
516 content: &'a str,
517 token: Token,
518 }
    // Renders as `"<text>" @ <kind>`.
519 impl fmt::Debug for TokenDebug<'_> {
520 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
521 write!(f, "{:?} @ {:?}", self.content, self.token.kind)
522 }
523 }
524
525 impl<'a> TokenDebug<'a> {
    // `start` is the offset into `input` at which `token` begins; the slice runs for
    // `token.len` from there.
526 fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
527 TokenDebug {
528 token,
529 content: &input[start as usize..(start + token.len) as usize],
530 }
531 }
532 }
533
    // Tokenizes `input` and attaches the source text to every token by accumulating the
    // token lengths into a running start offset.
534 fn lex(input: &str) -> Vec<TokenDebug<'_>> {
535 let mut tokens = vec![];
536 let mut start = 0;
537
538 for token in tokenize(input) {
539 let length = token.len;
540 tokens.push(TokenDebug::new(token, input, start));
541 start += length;
542 }
543 tokens
544 }
    // A minimal statement.
545 #[test]
546 fn lex_statement() {
547 let result = lex("select 1;");
548 assert_debug_snapshot!(result);
549 }
550
    // A terminated multi-line block comment.
551 #[test]
552 fn block_comment() {
553 let result = lex(r#"
554/*
555 * foo
556 * bar
557*/"#);
558 assert_debug_snapshot!(result);
559 }
560
    // A nested block comment is opened but only one level is closed.
561 #[test]
562 fn block_comment_unterminated() {
563 let result = lex(r#"
564/*
565 * foo
566 * bar
567 /*
568*/"#);
569 assert_debug_snapshot!(result);
570 }
571
    // A `--` comment followed by a newline.
572 #[test]
573 fn line_comment() {
574 let result = lex(r#"
575-- foooooooooooo bar buzz
576"#);
577 assert_debug_snapshot!(result);
578 }
579
    // A line comment sitting between two string literals.
580 #[test]
581 fn line_comment_whitespace() {
582 assert_debug_snapshot!(lex(r#"
583select 'Hello' -- This is a comment
584' World';"#))
585 }
586
    // Dollar-quoted strings with an empty tag, a named tag, and `$` inside the body.
587 #[test]
588 fn dollar_quoting() {
589 assert_debug_snapshot!(lex(r#"
590$$Dianne's horse$$
591$SomeTag$Dianne's horse$SomeTag$
592
593-- with dollar inside and matching tags
594$foo$hello$world$bar$
595"#))
596 }
597
    // A tagged dollar-quoted body spanning several lines.
598 #[test]
599 fn dollar_strings_part2() {
600 assert_debug_snapshot!(lex(r#"
601DO $doblock$
602end
603$doblock$;"#))
604 }
605
    // Opening and closing tags differ.
606 #[test]
607 fn dollar_quote_mismatch_tags_simple() {
608 assert_debug_snapshot!(lex(r#"
609-- dollar quoting with mismatched tags
610$foo$hello world$bar$
611"#));
612 }
613
    // Opening and closing tags differ and the body contains `$`.
614 #[test]
615 fn dollar_quote_mismatch_tags_complex() {
616 assert_debug_snapshot!(lex(r#"
617-- with dollar inside but mismatched tags
618$foo$hello$world$bar$
619"#));
620 }
621
    // Decimal integers and floats, with and without fraction and exponent parts.
622 #[test]
623 fn numeric() {
624 assert_debug_snapshot!(lex(r#"
62542
6263.5
6274.
628.001
629.123e10
6305e2
6311.925e-3
6321e-10
6331e+10
6341e10
6354664.E+5
636"#))
637 }
638
    // Binary, octal and hexadecimal prefixes in both letter cases.
639 #[test]
640 fn numeric_non_decimal() {
641 assert_debug_snapshot!(lex(r#"
6420b100101
6430B10011001
6440o273
6450O755
6460x42f
6470XFFFF
648"#))
649 }
650
    // `_` digit separators in each base and in a fraction.
651 #[test]
652 fn numeric_with_seperators() {
653 assert_debug_snapshot!(lex(r#"
6541_500_000_000
6550b10001000_00000000
6560o_1_755
6570xFFFF_FFFF
6581.618_034
659"#))
660 }
661
    // A `.` between identifiers is a separate token, not part of a number.
662 #[test]
663 fn select_with_period() {
664 assert_debug_snapshot!(lex(r#"
665select public.users;
666"#))
667 }
668
    // `B'..'` and `X'..'` prefixed strings in both letter cases.
669 #[test]
670 fn bitstring() {
671 assert_debug_snapshot!(lex(r#"
672B'1001'
673b'1001'
674X'1FF'
675x'1FF'
676"#))
677 }
678
    // Single-quoted strings: doubled quotes, embedded newlines, backslashes, and a final
    // string that is never closed.
679 #[test]
680 fn string() {
681 assert_debug_snapshot!(lex(r#"
682'Dianne''s horse'
683
684select 'foo ''
685bar';
686
687select 'foooo'
688 'bar';
689
690
691'foo \\ \n \tbar'
692
693'forgot to close the string
694"#))
695 }
696
    // Positional parameters, including a bare `$` with no digits.
697 #[test]
698 fn params() {
699 assert_debug_snapshot!(lex(r#"
700select $1 + $2;
701
702select $1123123123123;
703
704select $;
705"#))
706 }
707
    // `E'..'` prefixed strings containing backslash sequences.
708 #[test]
709 fn string_with_escapes() {
710 assert_debug_snapshot!(lex(r#"
713E'foo'
714
715e'bar'
716
717e'\b\f\n\r\t'
718
719e'\0\11\777'
720
721e'\x0\x11\xFF'
722
723e'\uAAAA \UFFFFFFFF'
724
725"#))
726 }
727
    // `U&` prefixed quoted identifiers and strings.
728 #[test]
729 fn string_unicode_escape() {
730 assert_debug_snapshot!(lex(r#"
733U&"d\0061t\+000061"
734
735U&"\0441\043B\043E\043D"
736
737u&'\0441\043B'
738
739U&"d!0061t!+000061" UESCAPE '!'
740"#))
741 }
742
    // Double-quoted identifiers, the second of which is never closed.
743 #[test]
744 fn quoted_ident() {
745 assert_debug_snapshot!(lex(r#"
746"hello &1 -world";
747
748
749"hello-world
750"#))
751 }
752
    // A doubled `"` inside a quoted identifier.
753 #[test]
754 fn quoted_ident_with_escape_quote() {
755 assert_debug_snapshot!(lex(r#"
756"foo "" bar"
757"#))
758 }
759
    // An empty dollar-quoted string with an empty tag (inline snapshot).
760 #[test]
761 fn dollar_quoted_string() {
762 assert_debug_snapshot!(lex("$$$$"), @r#"
763 [
764 "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
765 ]
766 "#);
767 }
768}