Skip to main content

squawk_lexer/
lib.rs

1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
// Returns `true` when `c` may begin an unquoted identifier: an ASCII letter,
// an underscore, or any character at or above U+0080.
//
// Mirrors the Postgres scanner rule:
//   ident_start		[A-Za-z\200-\377_]
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
const fn is_ident_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic() || !c.is_ascii()
}
11
// Returns `true` when `c` may appear after the first character of an unquoted
// identifier: everything `ident_start` allows plus ASCII digits and `$`.
//
// Mirrors the Postgres scanner rule:
//   ident_cont		[A-Za-z\200-\377_0-9\$]
const fn is_ident_cont(c: char) -> bool {
    c == '_' || c == '$' || c.is_ascii_alphanumeric() || !c.is_ascii()
}
16
// Returns `true` for the characters the Postgres scanner treats as whitespace:
// space, tab, newline, vertical tab, form feed and carriage return.
//
// see:
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
const fn is_whitespace(c: char) -> bool {
    // U+0009..=U+000D is exactly: tab, newline, vertical tab, form feed,
    // carriage return.
    matches!(c, ' ' | '\u{0009}'..='\u{000D}')
}
31
32impl Cursor<'_> {
33    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
34    pub(crate) fn advance_token(&mut self) -> Token {
35        let Some(first_char) = self.bump() else {
36            return Token::new(TokenKind::Eof, 0);
37        };
38        let token_kind = match first_char {
39            // Slash, comment or block comment.
40            '/' => match self.first() {
41                '*' => self.block_comment(),
42                _ => TokenKind::Slash,
43            },
44            '-' => match self.first() {
45                '-' => self.line_comment(),
46                _ => TokenKind::Minus,
47            },
48
49            // // Whitespace sequence.
50            c if is_whitespace(c) => self.whitespace(),
51
52            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
53            'u' | 'U' => {
54                if self.first() == '&' && matches!(self.second(), '\'' | '"') {
55                    self.bump();
56                    self.prefixed_string(
57                        |terminated| LiteralKind::UnicodeEscStr { terminated },
58                        true,
59                    )
60                } else {
61                    self.ident_or_unknown_prefix()
62                }
63            }
64            // escaped strings
65            'e' | 'E' => {
66                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67            }
68
69            // bit string
70            'b' | 'B' => {
71                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72            }
73
74            // hexadecimal byte string
75            'x' | 'X' => {
76                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77            }
78
79            // Identifier (this should be checked after other variant that can
80            // start as identifier).
81            c if is_ident_start(c) => self.ident(),
82
83            // Numeric literal.
84            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
85            c @ '0'..='9' => {
86                let literal_kind = self.number(c);
87                TokenKind::Literal { kind: literal_kind }
88            }
89            '.' => match self.first() {
90                '0'..='9' => {
91                    let literal_kind = self.number('.');
92                    TokenKind::Literal { kind: literal_kind }
93                }
94                _ => TokenKind::Dot,
95            },
96            // One-symbol tokens.
97            ';' => TokenKind::Semi,
98            ',' => TokenKind::Comma,
99            '(' => TokenKind::OpenParen,
100            ')' => TokenKind::CloseParen,
101            '[' => TokenKind::OpenBracket,
102            ']' => TokenKind::CloseBracket,
103            '{' => TokenKind::OpenCurly,
104            '}' => TokenKind::CloseCurly,
105            '@' => TokenKind::At,
106            '#' => TokenKind::Pound,
107            '~' => TokenKind::Tilde,
108            '?' => TokenKind::Question,
109            ':' => TokenKind::Colon,
110            '$' => {
111                // Dollar quoted strings
112                if is_ident_start(self.first()) || self.first() == '$' {
113                    self.dollar_quoted_string()
114                } else {
115                    // Parameters
116                    while self.first().is_ascii_digit() {
117                        self.bump();
118                    }
119                    let trailing_junk_start = self.pos_within_token();
120                    self.eat_identifier();
121                    TokenKind::PositionalParam {
122                        trailing_junk_start,
123                    }
124                }
125            }
126            '`' => TokenKind::Backtick,
127            '=' => TokenKind::Eq,
128            '!' => TokenKind::Bang,
129            '<' => TokenKind::Lt,
130            '>' => TokenKind::Gt,
131            '&' => TokenKind::And,
132            '|' => TokenKind::Or,
133            '+' => TokenKind::Plus,
134            '*' => TokenKind::Star,
135            '^' => TokenKind::Caret,
136            '%' => TokenKind::Percent,
137
138            // String literal
139            '\'' => {
140                let terminated = self.single_quoted_string();
141                let kind = LiteralKind::Str { terminated };
142                TokenKind::Literal { kind }
143            }
144
145            // Quoted indentifiers
146            '"' => {
147                let terminated = self.double_quoted_string();
148                TokenKind::QuotedIdent { terminated }
149            }
150            _ => TokenKind::Unknown,
151        };
152        let res = Token::new(token_kind, self.pos_within_token());
153        self.reset_pos_within_token();
154        res
155    }
156    pub(crate) fn ident(&mut self) -> TokenKind {
157        self.eat_while(is_ident_cont);
158        TokenKind::Ident
159    }
160
161    pub(crate) fn whitespace(&mut self) -> TokenKind {
162        self.eat_while(is_whitespace);
163        TokenKind::Whitespace
164    }
165
166    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
167        // Start is already eaten, eat the rest of identifier.
168        self.eat_while(is_ident_cont);
169        // Known prefixes must have been handled earlier. So if
170        // we see a prefix here, it is definitely an unknown prefix.
171        match self.first() {
172            '"' | '\'' => TokenKind::UnknownPrefix,
173            _ => TokenKind::Ident,
174        }
175    }
176
177    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
178    // comment			("--"{non_newline}*)
179    pub(crate) fn line_comment(&mut self) -> TokenKind {
180        self.bump();
181
182        self.eat_while(|c| c != '\n');
183        TokenKind::LineComment
184    }
185
186    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
187    pub(crate) fn block_comment(&mut self) -> TokenKind {
188        self.bump();
189
190        let mut depth = 1usize;
191        while let Some(c) = self.bump() {
192            match c {
193                '/' if self.first() == '*' => {
194                    self.bump();
195                    depth += 1;
196                }
197                '*' if self.first() == '/' => {
198                    self.bump();
199                    depth -= 1;
200                    if depth == 0 {
201                        // This block comment is closed, so for a construction like "/* */ */"
202                        // there will be a successfully parsed block comment "/* */"
203                        // and " */" will be processed separately.
204                        break;
205                    }
206                }
207                _ => (),
208            }
209        }
210
211        TokenKind::BlockComment {
212            terminated: depth == 0,
213        }
214    }
215
216    fn prefixed_string(
217        &mut self,
218        mk_kind: fn(bool) -> LiteralKind,
219        allows_double: bool,
220    ) -> TokenKind {
221        match self.first() {
222            '\'' => {
223                self.bump();
224                let terminated = self.single_quoted_string();
225                let kind = mk_kind(terminated);
226                TokenKind::Literal { kind }
227            }
228            '"' if allows_double => {
229                self.bump();
230                let terminated = self.double_quoted_string();
231                TokenKind::QuotedIdent { terminated }
232            }
233            _ => self.ident_or_unknown_prefix(),
234        }
235    }
236
237    fn number(&mut self, first_digit: char) -> LiteralKind {
238        let mut base = Base::Decimal;
239        if first_digit == '.' {
240            return self.eat_fractional(base);
241        }
242        if first_digit == '0' {
243            // Attempt to parse encoding base.
244            match self.first() {
245                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
246                'b' | 'B' => {
247                    base = Base::Binary;
248                    self.bump();
249                    let has_digits = self.eat_decimal_digits();
250                    return self.finish_base_prefixed_int(base, has_digits);
251                }
252                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
253                'o' | 'O' => {
254                    base = Base::Octal;
255                    self.bump();
256                    let has_digits = self.eat_decimal_digits();
257                    return self.finish_base_prefixed_int(base, has_digits);
258                }
259                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
260                'x' | 'X' => {
261                    base = Base::Hexadecimal;
262                    self.bump();
263                    let has_digits = self.eat_hexadecimal_digits();
264                    return self.finish_base_prefixed_int(base, has_digits);
265                }
266                // Not a base prefix; consume additional digits.
267                '0'..='9' | '_' => {
268                    self.eat_decimal_digits();
269                }
270
271                // Also not a base prefix; nothing more to do here.
272                '.' | 'e' | 'E' => {}
273
274                // Just a 0.
275                _ => {
276                    let trailing_junk_start = self.pos_within_token();
277                    self.eat_identifier();
278                    return LiteralKind::Int {
279                        base,
280                        empty_int: false,
281                        trailing_junk_start,
282                    };
283                }
284            }
285        } else {
286            // No base prefix, parse number in the usual way.
287            self.eat_decimal_digits();
288        };
289
290        match self.first() {
291            '.' => self.eat_fractional(base),
292            'e' | 'E' => {
293                let exponent_start = self.pos_within_token();
294                self.bump();
295                let empty_exponent_start = (!self.eat_numeric_exponent()).then_some(exponent_start);
296                let trailing_junk_start = self.pos_within_token();
297                self.eat_identifier();
298                LiteralKind::Numeric {
299                    base,
300                    empty_exponent_start,
301                    trailing_junk_start,
302                }
303            }
304            _ => {
305                let trailing_junk_start = self.pos_within_token();
306                self.eat_identifier();
307                LiteralKind::Int {
308                    base,
309                    empty_int: false,
310                    trailing_junk_start,
311                }
312            }
313        }
314    }
315
316    fn single_quoted_string(&mut self) -> bool {
317        // Parse until either quotes are terminated or error is detected.
318        loop {
319            match self.first() {
320                // Quotes might be terminated.
321                '\'' => {
322                    self.bump();
323
324                    match self.first() {
325                        // encountered an escaped quote ''
326                        '\'' => {
327                            self.bump();
328                        }
329                        // encountered terminating quote
330                        _ => return true,
331                    }
332                }
333                // End of file, stop parsing.
334                EOF_CHAR if self.is_eof() => break,
335                // Skip the character.
336                _ => {
337                    self.bump();
338                }
339            }
340        }
341        // String was not terminated.
342        false
343    }
344
345    /// Eats double-quoted string and returns true
346    /// if string is terminated.
347    fn double_quoted_string(&mut self) -> bool {
348        while let Some(c) = self.bump() {
349            match c {
350                '"' if self.first() == '"' => {
351                    // Bump again to skip escaped character.
352                    self.bump();
353                }
354                '"' => {
355                    return true;
356                }
357                _ => (),
358            }
359        }
360        // End of file reached.
361        false
362    }
363
364    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
365    fn dollar_quoted_string(&mut self) -> TokenKind {
366        // Get the start sequence of the dollar quote, i.e., 'foo' in
367        // $foo$hello$foo$
368        let mut start = vec![];
369        while let Some(c) = self.bump() {
370            match c {
371                '$' => {
372                    break;
373                }
374                _ => {
375                    start.push(c);
376                }
377            }
378        }
379
380        // we have a dollar quoted string deliminated with `$$`
381        if start.is_empty() {
382            loop {
383                self.eat_while(|c| c != '$');
384                if self.is_eof() {
385                    return TokenKind::Literal {
386                        kind: LiteralKind::DollarQuotedString { terminated: false },
387                    };
388                }
389                // eat $
390                self.bump();
391                if self.first() == '$' {
392                    self.bump();
393                    return TokenKind::Literal {
394                        kind: LiteralKind::DollarQuotedString { terminated: true },
395                    };
396                }
397            }
398        } else {
399            loop {
400                self.eat_while(|c| c != start[0]);
401                if self.is_eof() {
402                    return TokenKind::Literal {
403                        kind: LiteralKind::DollarQuotedString { terminated: false },
404                    };
405                }
406
407                // might be the start of our start/end sequence
408                let mut match_count = 0;
409                for start_char in &start {
410                    if self.first() == *start_char {
411                        self.bump();
412                        match_count += 1;
413                    } else {
414                        self.bump();
415                        break;
416                    }
417                }
418
419                // closing '$'
420                let terminated = match_count == start.len();
421                if self.first() == '$' && terminated {
422                    self.bump();
423                    return TokenKind::Literal {
424                        kind: LiteralKind::DollarQuotedString { terminated },
425                    };
426                }
427            }
428        }
429    }
430
431    fn eat_decimal_digits(&mut self) -> bool {
432        let mut has_digits = false;
433        loop {
434            match self.first() {
435                '_' if self.second().is_ascii_digit() => {
436                    self.bump();
437                }
438                '0'..='9' => {
439                    has_digits = true;
440                    self.bump();
441                }
442                _ => break,
443            }
444        }
445        has_digits
446    }
447
448    fn finish_base_prefixed_int(&mut self, base: Base, has_digits: bool) -> LiteralKind {
449        let trailing_junk_start = self.pos_within_token();
450        self.eat_while(is_ident_cont);
451        let has_trailing_junk = self.pos_within_token() > trailing_junk_start;
452        LiteralKind::Int {
453            base,
454            empty_int: !has_digits && !has_trailing_junk,
455            trailing_junk_start,
456        }
457    }
458
459    fn eat_hexadecimal_digits(&mut self) -> bool {
460        let mut has_digits = false;
461        loop {
462            match self.first() {
463                '_' if self.second().is_ascii_hexdigit() => {
464                    self.bump();
465                }
466                '0'..='9' | 'a'..='f' | 'A'..='F' => {
467                    has_digits = true;
468                    self.bump();
469                }
470                _ => break,
471            }
472        }
473        has_digits
474    }
475
476    /// Eats the numeric exponent. Returns true if at least one digit was met,
477    /// and returns false otherwise.
478    fn eat_numeric_exponent(&mut self) -> bool {
479        if self.first() == '_' {
480            return false;
481        }
482        if self.first() == '-' || self.first() == '+' {
483            self.bump();
484        }
485        self.eat_decimal_digits()
486    }
487
488    fn eat_identifier(&mut self) {
489        if is_ident_start(self.first()) {
490            self.eat_while(is_ident_cont);
491        }
492    }
493
494    pub(crate) fn eat_fractional(&mut self, base: Base) -> crate::LiteralKind {
495        // might have stuff after the ., and if it does, it needs to start
496        // with a number
497        self.bump();
498        let mut empty_exponent_start = None;
499        if self.first().is_ascii_digit() {
500            self.eat_decimal_digits();
501            match self.first() {
502                'e' | 'E' => {
503                    let exponent_start = self.pos_within_token();
504                    self.bump();
505                    if !self.eat_numeric_exponent() {
506                        empty_exponent_start = Some(exponent_start);
507                    }
508                }
509                _ => (),
510            }
511        } else {
512            match self.first() {
513                'e' | 'E' => {
514                    let exponent_start = self.pos_within_token();
515                    self.bump();
516                    if !self.eat_numeric_exponent() {
517                        empty_exponent_start = Some(exponent_start);
518                    }
519                }
520                _ => (),
521            }
522        }
523        let trailing_junk_start = self.pos_within_token();
524        self.eat_identifier();
525        LiteralKind::Numeric {
526            base,
527            empty_exponent_start,
528            trailing_junk_start,
529        }
530    }
531}
532
533/// Creates an iterator that produces tokens from the input string.
534pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
535    let mut cursor = Cursor::new(input);
536    std::iter::from_fn(move || {
537        let token = cursor.advance_token();
538        if token.kind != TokenKind::Eof {
539            Some(token)
540        } else {
541            None
542        }
543    })
544}
545
// Snapshot tests for the lexer. Most tests compare against snapshots managed
// by `insta`; the last two carry their expected output inline.
#[cfg(test)]
mod tests {
    use std::fmt;

    use super::*;
    use insta::assert_debug_snapshot;

    /// Pairs a token with the slice of the input it covers so that snapshots
    /// show both the source text and the token kind.
    struct TokenDebug<'a> {
        content: &'a str,
        token: Token,
    }
    impl fmt::Debug for TokenDebug<'_> {
        // Renders as `"<text>" @ <kind>`.
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
        }
    }

    impl<'a> TokenDebug<'a> {
        /// Builds the debug wrapper for `token`, which begins at byte offset
        /// `start` of `input` and spans `token.len` bytes.
        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
            TokenDebug {
                token,
                content: &input[start as usize..(start + token.len) as usize],
            }
        }
    }

    /// Tokenizes `input` and pairs every token with its source text, tracking
    /// the running start offset by summing token lengths.
    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
        let mut tokens = vec![];
        let mut start = 0;

        for token in tokenize(input) {
            let length = token.len;
            tokens.push(TokenDebug::new(token, input, start));
            start += length;
        }
        tokens
    }
    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }

    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }

    // The nested `/*` is never closed, so the outer comment stays open.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }

    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }

    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }

    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }

    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }

    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }

    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }

    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }

    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }

    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }

    // NOTE: the line `select 'foooo'   ` below ends with three spaces that
    // are part of the test input; do not strip trailing whitespace here.
    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'   
   'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }

    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }

    #[test]
    fn string_with_escapes() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE

        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }

    #[test]
    fn string_unicode_escape() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE

        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }

    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }

    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }

    // An empty dollar-quoted string: opening `$$` directly followed by the
    // closing `$$`.
    #[test]
    fn dollar_quoted_string() {
        assert_debug_snapshot!(lex("$$$$"), @r#"
        [
            "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
        ]
        "#);
    }

    // Every character at or above U+0080 is accepted in identifiers, not just
    // the Latin-1 range.
    #[test]
    fn ident_non_ascii_above_latin1() {
        assert_debug_snapshot!(lex("ẞ Ā 漢字 𐐷"), @r#"
        [
            "ẞ" @ Ident,
            " " @ Whitespace,
            "Ā" @ Ident,
            " " @ Whitespace,
            "漢字" @ Ident,
            " " @ Whitespace,
            "𐐷" @ Ident,
        ]
        "#);
    }
}