squawk_lexer/
lib.rs

1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
// ident_start		[A-Za-z\200-\377_]
/// Returns `true` if `c` may begin an unquoted identifier.
///
/// Postgres defines the class in terms of bytes: `\200-\377` is every byte
/// with the high bit set. In UTF-8 those bytes only occur inside the encoding
/// of non-ASCII code points, so on `char`s the class is "ASCII letter,
/// underscore, or any non-ASCII character" (not just U+0080..=U+00FF).
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_') || !c.is_ascii()
}

// ident_cont		[A-Za-z\200-\377_0-9\$]
/// Returns `true` if `c` may continue an unquoted identifier.
///
/// Same as the identifier-start class (ASCII letters, `_`, and every
/// non-ASCII character) plus ASCII digits and `$`.
const fn is_ident_cont(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$') || !c.is_ascii()
}
16
// see:
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
/// Returns `true` for the six characters the lexer treats as whitespace:
/// space, tab, newline, vertical tab, form feed and carriage return.
const fn is_whitespace(c: char) -> bool {
    // Tab (U+0009), newline (U+000A), vertical tab (U+000B), form feed
    // (U+000C) and carriage return (U+000D) are contiguous code points.
    matches!(c, ' ' | '\t'..='\r')
}
31
32impl Cursor<'_> {
33    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
34    pub(crate) fn advance_token(&mut self) -> Token {
35        let Some(first_char) = self.bump() else {
36            return Token::new(TokenKind::Eof, 0);
37        };
38        let token_kind = match first_char {
39            // Slash, comment or block comment.
40            '/' => match self.first() {
41                '*' => self.block_comment(),
42                _ => TokenKind::Slash,
43            },
44            '-' => match self.first() {
45                '-' => self.line_comment(),
46                _ => TokenKind::Minus,
47            },
48
49            // // Whitespace sequence.
50            c if is_whitespace(c) => self.whitespace(),
51
52            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
53            'u' | 'U' => match self.first() {
54                '&' => {
55                    self.bump();
56                    self.prefixed_string(
57                        |terminated| LiteralKind::UnicodeEscStr { terminated },
58                        true,
59                    )
60                }
61                _ => self.ident_or_unknown_prefix(),
62            },
63
64            // escaped strings
65            'e' | 'E' => {
66                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
67            }
68
69            // bit string
70            'b' | 'B' => {
71                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
72            }
73
74            // hexadecimal byte string
75            'x' | 'X' => {
76                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
77            }
78
79            // Identifier (this should be checked after other variant that can
80            // start as identifier).
81            c if is_ident_start(c) => self.ident(),
82
83            // Numeric literal.
84            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
85            c @ '0'..='9' => {
86                let literal_kind = self.number(c);
87                TokenKind::Literal { kind: literal_kind }
88            }
89            '.' => match self.first() {
90                '0'..='9' => {
91                    let literal_kind = self.number('.');
92                    TokenKind::Literal { kind: literal_kind }
93                }
94                _ => TokenKind::Dot,
95            },
96            // One-symbol tokens.
97            ';' => TokenKind::Semi,
98            ',' => TokenKind::Comma,
99            '(' => TokenKind::OpenParen,
100            ')' => TokenKind::CloseParen,
101            '[' => TokenKind::OpenBracket,
102            ']' => TokenKind::CloseBracket,
103            '@' => TokenKind::At,
104            '#' => TokenKind::Pound,
105            '~' => TokenKind::Tilde,
106            '?' => TokenKind::Question,
107            ':' => TokenKind::Colon,
108            '$' => {
109                // Dollar quoted strings
110                if is_ident_start(self.first()) || self.first() == '$' {
111                    self.dollar_quoted_string()
112                } else {
113                    // Parameters
114                    while self.first().is_ascii_digit() {
115                        self.bump();
116                    }
117                    TokenKind::PositionalParam
118                }
119            }
120            '`' => TokenKind::Backtick,
121            '=' => TokenKind::Eq,
122            '!' => TokenKind::Bang,
123            '<' => TokenKind::Lt,
124            '>' => TokenKind::Gt,
125            '&' => TokenKind::And,
126            '|' => TokenKind::Or,
127            '+' => TokenKind::Plus,
128            '*' => TokenKind::Star,
129            '^' => TokenKind::Caret,
130            '%' => TokenKind::Percent,
131
132            // String literal
133            '\'' => {
134                let terminated = self.single_quoted_string();
135                let kind = LiteralKind::Str { terminated };
136                TokenKind::Literal { kind }
137            }
138
139            // Quoted indentifiers
140            '"' => {
141                let terminated = self.double_quoted_string();
142                TokenKind::QuotedIdent { terminated }
143            }
144            _ => TokenKind::Unknown,
145        };
146        let res = Token::new(token_kind, self.pos_within_token());
147        self.reset_pos_within_token();
148        res
149    }
150    pub(crate) fn ident(&mut self) -> TokenKind {
151        self.eat_while(is_ident_cont);
152        TokenKind::Ident
153    }
154
155    pub(crate) fn whitespace(&mut self) -> TokenKind {
156        self.eat_while(is_whitespace);
157        TokenKind::Whitespace
158    }
159
160    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
161        // Start is already eaten, eat the rest of identifier.
162        self.eat_while(is_ident_cont);
163        // Known prefixes must have been handled earlier. So if
164        // we see a prefix here, it is definitely an unknown prefix.
165        match self.first() {
166            '#' | '"' | '\'' => TokenKind::UnknownPrefix,
167            _ => TokenKind::Ident,
168        }
169    }
170
171    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
172    // comment			("--"{non_newline}*)
173    pub(crate) fn line_comment(&mut self) -> TokenKind {
174        self.bump();
175
176        self.eat_while(|c| c != '\n');
177        TokenKind::LineComment
178    }
179
180    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
181    pub(crate) fn block_comment(&mut self) -> TokenKind {
182        self.bump();
183
184        let mut depth = 1usize;
185        while let Some(c) = self.bump() {
186            match c {
187                '/' if self.first() == '*' => {
188                    self.bump();
189                    depth += 1;
190                }
191                '*' if self.first() == '/' => {
192                    self.bump();
193                    depth -= 1;
194                    if depth == 0 {
195                        // This block comment is closed, so for a construction like "/* */ */"
196                        // there will be a successfully parsed block comment "/* */"
197                        // and " */" will be processed separately.
198                        break;
199                    }
200                }
201                _ => (),
202            }
203        }
204
205        TokenKind::BlockComment {
206            terminated: depth == 0,
207        }
208    }
209
210    fn prefixed_string(
211        &mut self,
212        mk_kind: fn(bool) -> LiteralKind,
213        allows_double: bool,
214    ) -> TokenKind {
215        match self.first() {
216            '\'' => {
217                self.bump();
218                let terminated = self.single_quoted_string();
219                let kind = mk_kind(terminated);
220                TokenKind::Literal { kind }
221            }
222            '"' if allows_double => {
223                self.bump();
224                let terminated = self.double_quoted_string();
225                TokenKind::QuotedIdent { terminated }
226            }
227            _ => self.ident_or_unknown_prefix(),
228        }
229    }
230
231    fn number(&mut self, first_digit: char) -> LiteralKind {
232        let mut base = Base::Decimal;
233        if first_digit == '0' {
234            // Attempt to parse encoding base.
235            match self.first() {
236                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
237                'b' | 'B' => {
238                    base = Base::Binary;
239                    self.bump();
240                    if !self.eat_decimal_digits() {
241                        return LiteralKind::Int {
242                            base,
243                            empty_int: true,
244                        };
245                    }
246                }
247                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
248                'o' | 'O' => {
249                    base = Base::Octal;
250                    self.bump();
251                    if !self.eat_decimal_digits() {
252                        return LiteralKind::Int {
253                            base,
254                            empty_int: true,
255                        };
256                    }
257                }
258                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
259                'x' | 'X' => {
260                    base = Base::Hexadecimal;
261                    self.bump();
262                    if !self.eat_hexadecimal_digits() {
263                        return LiteralKind::Int {
264                            base,
265                            empty_int: true,
266                        };
267                    }
268                }
269                // Not a base prefix; consume additional digits.
270                '0'..='9' | '_' => {
271                    self.eat_decimal_digits();
272                }
273
274                // Also not a base prefix; nothing more to do here.
275                '.' | 'e' | 'E' => {}
276
277                // Just a 0.
278                _ => {
279                    return LiteralKind::Int {
280                        base,
281                        empty_int: false,
282                    };
283                }
284            }
285        } else {
286            // No base prefix, parse number in the usual way.
287            self.eat_decimal_digits();
288        };
289
290        match self.first() {
291            '.' => {
292                // might have stuff after the ., and if it does, it needs to start
293                // with a number
294                self.bump();
295                let mut empty_exponent = false;
296                if self.first().is_ascii_digit() {
297                    self.eat_decimal_digits();
298                    match self.first() {
299                        'e' | 'E' => {
300                            self.bump();
301                            empty_exponent = !self.eat_float_exponent();
302                        }
303                        _ => (),
304                    }
305                } else {
306                    match self.first() {
307                        'e' | 'E' => {
308                            self.bump();
309                            empty_exponent = !self.eat_float_exponent();
310                        }
311                        _ => (),
312                    }
313                }
314                LiteralKind::Float {
315                    base,
316                    empty_exponent,
317                }
318            }
319            'e' | 'E' => {
320                self.bump();
321                let empty_exponent = !self.eat_float_exponent();
322                LiteralKind::Float {
323                    base,
324                    empty_exponent,
325                }
326            }
327            _ => LiteralKind::Int {
328                base,
329                empty_int: false,
330            },
331        }
332    }
333
334    fn single_quoted_string(&mut self) -> bool {
335        // Parse until either quotes are terminated or error is detected.
336        loop {
337            match self.first() {
338                // Quotes might be terminated.
339                '\'' => {
340                    self.bump();
341
342                    match self.first() {
343                        // encountered an escaped quote ''
344                        '\'' => {
345                            self.bump();
346                        }
347                        // encountered terminating quote
348                        _ => return true,
349                    }
350                }
351                // End of file, stop parsing.
352                EOF_CHAR if self.is_eof() => break,
353                // Skip the character.
354                _ => {
355                    self.bump();
356                }
357            }
358        }
359        // String was not terminated.
360        false
361    }
362
363    /// Eats double-quoted string and returns true
364    /// if string is terminated.
365    fn double_quoted_string(&mut self) -> bool {
366        while let Some(c) = self.bump() {
367            match c {
368                '"' if self.first() == '"' => {
369                    // Bump again to skip escaped character.
370                    self.bump();
371                }
372                '"' => {
373                    return true;
374                }
375                _ => (),
376            }
377        }
378        // End of file reached.
379        false
380    }
381
382    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
383    fn dollar_quoted_string(&mut self) -> TokenKind {
384        // Get the start sequence of the dollar quote, i.e., 'foo' in
385        // $foo$hello$foo$
386        let mut start = vec![];
387        while let Some(c) = self.bump() {
388            match c {
389                '$' => {
390                    self.bump();
391                    break;
392                }
393                _ => {
394                    start.push(c);
395                }
396            }
397        }
398
399        // we have a dollar quoted string deliminated with `$$`
400        if start.is_empty() {
401            loop {
402                self.eat_while(|c| c != '$');
403                if self.is_eof() {
404                    return TokenKind::Literal {
405                        kind: LiteralKind::DollarQuotedString { terminated: false },
406                    };
407                }
408                // eat $
409                self.bump();
410                if self.first() == '$' {
411                    self.bump();
412                    return TokenKind::Literal {
413                        kind: LiteralKind::DollarQuotedString { terminated: true },
414                    };
415                }
416            }
417        } else {
418            loop {
419                self.eat_while(|c| c != start[0]);
420                if self.is_eof() {
421                    return TokenKind::Literal {
422                        kind: LiteralKind::DollarQuotedString { terminated: false },
423                    };
424                }
425
426                // might be the start of our start/end sequence
427                let mut match_count = 0;
428                for start_char in &start {
429                    if self.first() == *start_char {
430                        self.bump();
431                        match_count += 1;
432                    } else {
433                        self.bump();
434                        break;
435                    }
436                }
437
438                // closing '$'
439                let terminated = match_count == start.len();
440                if self.first() == '$' && terminated {
441                    self.bump();
442                    return TokenKind::Literal {
443                        kind: LiteralKind::DollarQuotedString { terminated },
444                    };
445                }
446            }
447        }
448    }
449
450    fn eat_decimal_digits(&mut self) -> bool {
451        let mut has_digits = false;
452        loop {
453            match self.first() {
454                '_' => {
455                    self.bump();
456                }
457                '0'..='9' => {
458                    has_digits = true;
459                    self.bump();
460                }
461                _ => break,
462            }
463        }
464        has_digits
465    }
466
467    fn eat_hexadecimal_digits(&mut self) -> bool {
468        let mut has_digits = false;
469        loop {
470            match self.first() {
471                '_' => {
472                    self.bump();
473                }
474                '0'..='9' | 'a'..='f' | 'A'..='F' => {
475                    has_digits = true;
476                    self.bump();
477                }
478                _ => break,
479            }
480        }
481        has_digits
482    }
483
484    /// Eats the float exponent. Returns true if at least one digit was met,
485    /// and returns false otherwise.
486    fn eat_float_exponent(&mut self) -> bool {
487        if self.first() == '-' || self.first() == '+' {
488            self.bump();
489        }
490        self.eat_decimal_digits()
491    }
492}
493
494/// Creates an iterator that produces tokens from the input string.
495pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
496    let mut cursor = Cursor::new(input);
497    std::iter::from_fn(move || {
498        let token = cursor.advance_token();
499        if token.kind != TokenKind::Eof {
500            Some(token)
501        } else {
502            None
503        }
504    })
505}
506
#[cfg(test)]
mod tests {
    use std::fmt;

    use super::*;
    use insta::assert_debug_snapshot;

    /// A token paired with the slice of the input it covers, so that
    /// snapshots show the lexed text next to the token kind.
    struct TokenDebug<'a> {
        content: &'a str,
        token: Token,
    }

    impl fmt::Debug for TokenDebug<'_> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
        }
    }

    impl<'a> TokenDebug<'a> {
        /// Pairs `token` with the text at `start..start + token.len` of
        /// `input`.
        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
            let begin = start as usize;
            let end = (start + token.len) as usize;
            TokenDebug {
                content: &input[begin..end],
                token,
            }
        }
    }

    /// Tokenizes `input` and attaches to every token the text it spans.
    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
        // Offset of the next token; tokens are contiguous, so it advances by
        // each token's length.
        let mut offset = 0;
        tokenize(input)
            .map(|token| {
                let start = offset;
                offset += token.len;
                TokenDebug::new(token, input, start)
            })
            .collect()
    }

    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }

    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }

    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }

    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }

    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }

    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }

    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }

    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }

    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }

    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }

    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }

    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }

    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'   
   'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }

    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }

    #[test]
    fn string_with_escapes() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE

        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }

    #[test]
    fn string_unicode_escape() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE

        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }

    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }

    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }
}