Skip to main content

squawk_lexer/
lib.rs

1mod cursor;
2mod token;
3use cursor::{Cursor, EOF_CHAR};
4pub use token::{Base, LiteralKind, Token, TokenKind};
5
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
// ident_start		[A-Za-z\200-\377_]
/// Returns `true` when `c` may open an unquoted identifier: an ASCII letter,
/// an underscore, or any character at or above U+0080.
const fn is_ident_start(c: char) -> bool {
    // Every non-ASCII `char` is >= U+0080, so `!is_ascii()` covers the high range.
    c == '_' || c.is_ascii_alphabetic() || !c.is_ascii()
}
11
// ident_cont		[A-Za-z\200-\377_0-9\$]
/// Returns `true` when `c` may appear after the first character of an
/// unquoted identifier: an ASCII letter or digit, `_`, `$`, or any character
/// at or above U+0080.
const fn is_ident_cont(c: char) -> bool {
    // Every non-ASCII `char` is >= U+0080, so `!is_ascii()` covers the high range.
    c == '_' || c == '$' || c.is_ascii_alphanumeric() || !c.is_ascii()
}
16
// see:
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
/// Returns `true` for the six characters the lexer treats as whitespace:
/// space, tab, newline, vertical tab, form feed and carriage return.
const fn is_whitespace(c: char) -> bool {
    // U+0009..=U+000D is exactly: tab, newline, vertical tab, form feed,
    // carriage return.
    matches!(c, ' ' | '\t'..='\r')
}
31
32impl Cursor<'_> {
33    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
34    pub(crate) fn advance_token(&mut self) -> Token {
35        let Some(first_char) = self.bump() else {
36            return Token::new(TokenKind::Eof, 0);
37        };
38        let token_kind = match first_char {
39            // Slash, comment or block comment.
40            '/' => match self.first() {
41                '*' => self.block_comment(),
42                _ => TokenKind::Slash,
43            },
44            '-' => match self.first() {
45                '-' => self.line_comment(),
46                _ => TokenKind::Minus,
47            },
48
49            // // Whitespace sequence.
50            c if is_whitespace(c) => self.whitespace(),
51
52            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
53            'u' | 'U' => {
54                if self.first() == '&' && matches!(self.second(), '\'' | '"') {
55                    self.bump();
56                    self.prefixed_string(
57                        |terminated| LiteralKind::UnicodeEscStr { terminated },
58                        true,
59                        false,
60                    )
61                } else {
62                    self.ident()
63                }
64            }
65            // escaped strings
66            'e' | 'E' => {
67                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false, true)
68            }
69
70            // bit string
71            'b' | 'B' => self.prefixed_string(
72                |terminated| LiteralKind::BitStr { terminated },
73                false,
74                false,
75            ),
76
77            // hexadecimal byte string
78            'x' | 'X' => self.prefixed_string(
79                |terminated| LiteralKind::ByteStr { terminated },
80                false,
81                false,
82            ),
83
84            // national character string
85            'n' | 'N' => match self.first() {
86                '\'' => {
87                    self.bump();
88                    let terminated = self.single_quoted_string(false);
89                    TokenKind::Literal {
90                        kind: LiteralKind::NationalStr { terminated },
91                    }
92                }
93                _ => self.ident(),
94            },
95
96            // Identifier (this should be checked after other variant that can
97            // start as identifier).
98            c if is_ident_start(c) => self.ident(),
99
100            // Numeric literal.
101            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
102            c @ '0'..='9' => {
103                let literal_kind = self.number(c);
104                TokenKind::Literal { kind: literal_kind }
105            }
106            '.' => match self.first() {
107                '0'..='9' => {
108                    let literal_kind = self.number('.');
109                    TokenKind::Literal { kind: literal_kind }
110                }
111                _ => TokenKind::Dot,
112            },
113            // One-symbol tokens.
114            ';' => TokenKind::Semi,
115            ',' => TokenKind::Comma,
116            '(' => TokenKind::OpenParen,
117            ')' => TokenKind::CloseParen,
118            '[' => TokenKind::OpenBracket,
119            ']' => TokenKind::CloseBracket,
120            '{' => TokenKind::OpenCurly,
121            '}' => TokenKind::CloseCurly,
122            '@' => TokenKind::At,
123            '#' => TokenKind::Pound,
124            '~' => TokenKind::Tilde,
125            '?' => TokenKind::Question,
126            ':' => TokenKind::Colon,
127            '$' => {
128                if self.is_dollar_quote_start() {
129                    self.dollar_quoted_string()
130                } else {
131                    // Parameters
132                    while self.first().is_ascii_digit() {
133                        self.bump();
134                    }
135                    let trailing_junk_start = self.pos_within_token();
136                    self.eat_identifier();
137                    TokenKind::PositionalParam {
138                        trailing_junk_start,
139                    }
140                }
141            }
142            '`' => TokenKind::Backtick,
143            '=' => TokenKind::Eq,
144            '!' => TokenKind::Bang,
145            '<' => TokenKind::Lt,
146            '>' => TokenKind::Gt,
147            '&' => TokenKind::And,
148            '|' => TokenKind::Or,
149            '+' => TokenKind::Plus,
150            '*' => TokenKind::Star,
151            '^' => TokenKind::Caret,
152            '%' => TokenKind::Percent,
153
154            // String literal
155            '\'' => {
156                let terminated = self.single_quoted_string(false);
157                let kind = LiteralKind::Str { terminated };
158                TokenKind::Literal { kind }
159            }
160
161            // Quoted indentifiers
162            '"' => {
163                let terminated = self.double_quoted_string();
164                TokenKind::QuotedIdent {
165                    terminated,
166                    uescape: false,
167                }
168            }
169            _ => TokenKind::Unknown,
170        };
171        let res = Token::new(token_kind, self.pos_within_token());
172        self.reset_pos_within_token();
173        res
174    }
175    pub(crate) fn ident(&mut self) -> TokenKind {
176        self.eat_while(is_ident_cont);
177        TokenKind::Ident
178    }
179
180    pub(crate) fn whitespace(&mut self) -> TokenKind {
181        self.eat_while(is_whitespace);
182        TokenKind::Whitespace
183    }
184
185    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
186    // comment			("--"{non_newline}*)
187    pub(crate) fn line_comment(&mut self) -> TokenKind {
188        self.bump();
189
190        self.eat_while(|c| c != '\n' && c != '\r');
191        TokenKind::LineComment
192    }
193
194    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
195    pub(crate) fn block_comment(&mut self) -> TokenKind {
196        self.bump();
197
198        let mut depth = 1usize;
199        while let Some(c) = self.bump() {
200            match c {
201                '/' if self.first() == '*' => {
202                    self.bump();
203                    depth += 1;
204                }
205                '*' if self.first() == '/' => {
206                    self.bump();
207                    depth -= 1;
208                    if depth == 0 {
209                        // This block comment is closed, so for a construction like "/* */ */"
210                        // there will be a successfully parsed block comment "/* */"
211                        // and " */" will be processed separately.
212                        break;
213                    }
214                }
215                _ => (),
216            }
217        }
218
219        TokenKind::BlockComment {
220            terminated: depth == 0,
221        }
222    }
223
224    fn prefixed_string(
225        &mut self,
226        mk_kind: fn(bool) -> LiteralKind,
227        allows_double: bool,
228        backslash_escapes: bool,
229    ) -> TokenKind {
230        match self.first() {
231            '\'' => {
232                self.bump();
233                let terminated = self.single_quoted_string(backslash_escapes);
234                let kind = mk_kind(terminated);
235                TokenKind::Literal { kind }
236            }
237            '"' if allows_double => {
238                self.bump();
239                let terminated = self.double_quoted_string();
240                TokenKind::QuotedIdent {
241                    terminated,
242                    uescape: true,
243                }
244            }
245            _ => self.ident(),
246        }
247    }
248
249    fn number(&mut self, first_digit: char) -> LiteralKind {
250        let mut base = Base::Decimal;
251        if first_digit == '.' {
252            return self.eat_fractional();
253        }
254        if first_digit == '0' {
255            // Attempt to parse encoding base.
256            match self.first() {
257                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
258                'b' | 'B' => {
259                    base = Base::Binary;
260                    self.bump();
261                    let has_digits = self.eat_decimal_digits();
262                    return self.finish_base_prefixed_int(base, has_digits);
263                }
264                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
265                'o' | 'O' => {
266                    base = Base::Octal;
267                    self.bump();
268                    let has_digits = self.eat_decimal_digits();
269                    return self.finish_base_prefixed_int(base, has_digits);
270                }
271                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
272                'x' | 'X' => {
273                    base = Base::Hexadecimal;
274                    self.bump();
275                    let has_digits = self.eat_hexadecimal_digits();
276                    return self.finish_base_prefixed_int(base, has_digits);
277                }
278                // Not a base prefix; consume additional digits.
279                '0'..='9' | '_' => {
280                    self.eat_decimal_digits();
281                }
282
283                // Also not a base prefix; nothing more to do here.
284                '.' | 'e' | 'E' => {}
285
286                // Just a 0.
287                _ => {
288                    let trailing_junk_start = self.pos_within_token();
289                    self.eat_identifier();
290                    return LiteralKind::Int {
291                        base,
292                        empty_int: false,
293                        trailing_junk_start,
294                    };
295                }
296            }
297        } else {
298            // No base prefix, parse number in the usual way.
299            self.eat_decimal_digits();
300        };
301
302        match self.first() {
303            '.' => {
304                self.bump();
305                self.eat_fractional()
306            }
307            'e' | 'E' => {
308                let exponent_start = self.pos_within_token();
309                self.bump();
310                let empty_exponent_start = (!self.eat_numeric_exponent()).then_some(exponent_start);
311                let trailing_junk_start = self.pos_within_token();
312                self.eat_identifier();
313                LiteralKind::Numeric {
314                    empty_exponent_start,
315                    trailing_junk_start,
316                }
317            }
318            _ => {
319                let trailing_junk_start = self.pos_within_token();
320                self.eat_identifier();
321                LiteralKind::Int {
322                    base,
323                    empty_int: false,
324                    trailing_junk_start,
325                }
326            }
327        }
328    }
329
330    fn single_quoted_string(&mut self, backslash_escapes: bool) -> bool {
331        // Parse until either quotes are terminated or error is detected.
332        loop {
333            match self.first() {
334                '\\' if backslash_escapes => {
335                    // backslash
336                    self.bump();
337                    // escaped char
338                    self.bump();
339                }
340                // Quotes might be terminated.
341                '\'' => {
342                    self.bump();
343
344                    match self.first() {
345                        // encountered an escaped quote ''
346                        '\'' => {
347                            self.bump();
348                        }
349                        // encountered terminating quote
350                        _ => return true,
351                    }
352                }
353                // End of file, stop parsing.
354                EOF_CHAR if self.is_eof() => break,
355                // Skip the character.
356                _ => {
357                    self.bump();
358                }
359            }
360        }
361        // String was not terminated.
362        false
363    }
364
365    /// Eats double-quoted string and returns true
366    /// if string is terminated.
367    fn double_quoted_string(&mut self) -> bool {
368        while let Some(c) = self.bump() {
369            match c {
370                '"' if self.first() == '"' => {
371                    // Bump again to skip escaped character.
372                    self.bump();
373                }
374                '"' => {
375                    return true;
376                }
377                _ => (),
378            }
379        }
380        // End of file reached.
381        false
382    }
383
384    /// Check for `$$` and `$tag$`
385    fn is_dollar_quote_start(&self) -> bool {
386        let mut chars = self.chars();
387        match chars.next() {
388            // `$$...` -- empty tag
389            Some('$') => true,
390            // `$tag$...` -- tag chars terminated by `$`
391            Some(c) if is_ident_start(c) => {
392                for c in chars {
393                    if c == '$' {
394                        return true;
395                    }
396                    if !is_ident_cont(c) {
397                        return false;
398                    }
399                }
400                false
401            }
402            _ => false,
403        }
404    }
405
406    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
407    fn dollar_quoted_string(&mut self) -> TokenKind {
408        // Get the start sequence of the dollar quote, i.e., 'foo' in
409        // $foo$hello$foo$
410        let mut start = vec![];
411        while let Some(c) = self.bump() {
412            match c {
413                '$' => {
414                    break;
415                }
416                _ => {
417                    start.push(c);
418                }
419            }
420        }
421
422        // we have a dollar quoted string deliminated with `$$`
423        if start.is_empty() {
424            loop {
425                self.eat_while(|c| c != '$');
426                if self.is_eof() {
427                    return TokenKind::Literal {
428                        kind: LiteralKind::DollarQuotedString { terminated: false },
429                    };
430                }
431                // eat $
432                self.bump();
433                if self.first() == '$' {
434                    self.bump();
435                    return TokenKind::Literal {
436                        kind: LiteralKind::DollarQuotedString { terminated: true },
437                    };
438                }
439            }
440        } else {
441            loop {
442                self.eat_while(|c| c != '$');
443                if self.is_eof() {
444                    return TokenKind::Literal {
445                        kind: LiteralKind::DollarQuotedString { terminated: false },
446                    };
447                }
448
449                // Eat the leading '$' of a possible closing delimiter.
450                self.bump();
451
452                let mut matches_tag = true;
453                for start_char in &start {
454                    if self.first() == *start_char {
455                        self.bump();
456                    } else {
457                        matches_tag = false;
458                        break;
459                    }
460                }
461
462                if matches_tag && self.first() == '$' {
463                    self.bump();
464                    return TokenKind::Literal {
465                        kind: LiteralKind::DollarQuotedString { terminated: true },
466                    };
467                }
468            }
469        }
470    }
471
472    fn eat_decimal_digits(&mut self) -> bool {
473        let mut has_digits = false;
474        loop {
475            match self.first() {
476                '_' if self.second().is_ascii_digit() => {
477                    self.bump();
478                }
479                '0'..='9' => {
480                    has_digits = true;
481                    self.bump();
482                }
483                _ => break,
484            }
485        }
486        has_digits
487    }
488
489    fn finish_base_prefixed_int(&mut self, base: Base, has_digits: bool) -> LiteralKind {
490        let trailing_junk_start = self.pos_within_token();
491        self.eat_identifier();
492        let has_trailing_junk = self.pos_within_token() > trailing_junk_start;
493        LiteralKind::Int {
494            base,
495            empty_int: !has_digits && !has_trailing_junk,
496            trailing_junk_start,
497        }
498    }
499
500    fn eat_hexadecimal_digits(&mut self) -> bool {
501        let mut has_digits = false;
502        loop {
503            match self.first() {
504                '_' if self.second().is_ascii_hexdigit() => {
505                    self.bump();
506                }
507                '0'..='9' | 'a'..='f' | 'A'..='F' => {
508                    has_digits = true;
509                    self.bump();
510                }
511                _ => break,
512            }
513        }
514        has_digits
515    }
516
517    /// Eats the numeric exponent. Returns true if at least one digit was met,
518    /// and returns false otherwise.
519    fn eat_numeric_exponent(&mut self) -> bool {
520        if self.first() == '-' || self.first() == '+' {
521            if !self.second().is_ascii_digit() {
522                return false;
523            }
524            self.bump();
525        } else if !self.first().is_ascii_digit() {
526            return false;
527        }
528        self.eat_decimal_digits()
529    }
530
531    fn eat_identifier(&mut self) {
532        if is_ident_start(self.first()) {
533            self.eat_while(is_ident_cont);
534        }
535    }
536
537    pub(crate) fn eat_fractional(&mut self) -> crate::LiteralKind {
538        let mut empty_exponent_start = None;
539        if self.first().is_ascii_digit() {
540            self.eat_decimal_digits();
541        }
542        match self.first() {
543            'e' | 'E' => {
544                let exponent_start = self.pos_within_token();
545                self.bump();
546                if !self.eat_numeric_exponent() {
547                    empty_exponent_start = Some(exponent_start);
548                }
549            }
550            _ => (),
551        }
552        let trailing_junk_start = self.pos_within_token();
553        self.eat_identifier();
554        LiteralKind::Numeric {
555            empty_exponent_start,
556            trailing_junk_start,
557        }
558    }
559}
560
561/// Creates an iterator that produces tokens from the input string.
562pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
563    let mut cursor = Cursor::new(input);
564    std::iter::from_fn(move || {
565        let token = cursor.advance_token();
566        if token.kind != TokenKind::Eof {
567            Some(token)
568        } else {
569            None
570        }
571    })
572}
573
574#[cfg(test)]
575mod tests {
576    use std::fmt;
577
578    use super::*;
579    use insta::assert_debug_snapshot;
580
    /// Pairs a token with the slice of input text it covers, so snapshots can
    /// show both side by side.
    struct TokenDebug<'a> {
        // Source text spanned by `token`.
        content: &'a str,
        // The lexed token itself.
        token: Token,
    }
585    impl fmt::Debug for TokenDebug<'_> {
586        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
588        }
589    }
590
591    impl<'a> TokenDebug<'a> {
592        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
593            TokenDebug {
594                token,
595                content: &input[start as usize..(start + token.len) as usize],
596            }
597        }
598    }
599
600    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
601        let mut tokens = vec![];
602        let mut start = 0;
603
604        for token in tokenize(input) {
605            let length = token.len;
606            tokens.push(TokenDebug::new(token, input, start));
607            start += length;
608        }
609        tokens
610    }
    /// Snapshot of the tokens for a minimal `select 1;` statement.
    #[test]
    fn lex_statement() {
        let result = lex("select 1;");
        assert_debug_snapshot!(result);
    }
616
    /// Snapshot of a multi-line block comment that is properly closed.
    #[test]
    fn block_comment() {
        let result = lex(r#"
/*
 * foo
 * bar
*/"#);
        assert_debug_snapshot!(result);
    }
626
    /// Snapshot of a block comment with a nested `/*` opener and only one
    /// closing `*/`, so the outer comment never closes.
    #[test]
    fn block_comment_unterminated() {
        let result = lex(r#"
/*
 * foo
 * bar
 /*
*/"#);
        assert_debug_snapshot!(result);
    }
637
    /// Snapshot of a `--` line comment surrounded by newlines.
    #[test]
    fn line_comment() {
        let result = lex(r#"
-- foooooooooooo bar buzz
"#);
        assert_debug_snapshot!(result);
    }
645
    /// A bare carriage return ends a line comment, so the statement after it
    /// is lexed normally.
    #[test]
    fn line_comment_cr_newline() {
        assert_debug_snapshot!(lex("select 1; -- comment\rselect 2;"), @r#"
        [
            "select" @ Ident,
            " " @ Whitespace,
            "1" @ Literal { kind: Int { base: Decimal, empty_int: false, trailing_junk_start: 1 } },
            ";" @ Semi,
            " " @ Whitespace,
            "-- comment" @ LineComment,
            "\r" @ Whitespace,
            "select" @ Ident,
            " " @ Whitespace,
            "2" @ Literal { kind: Int { base: Decimal, empty_int: false, trailing_junk_start: 1 } },
            ";" @ Semi,
        ]
        "#);
    }
664
    /// Snapshot of two string literals separated by a line comment and a
    /// newline.
    #[test]
    fn line_comment_whitespace() {
        assert_debug_snapshot!(lex(r#"
select 'Hello' -- This is a comment
' World';"#))
    }
671
    /// Snapshot of dollar-quoted strings: empty tag, named tag, and a body
    /// that contains `$` characters.
    #[test]
    fn dollar_quoting() {
        assert_debug_snapshot!(lex(r#"
$$Dianne's horse$$
$SomeTag$Dianne's horse$SomeTag$

-- with dollar inside and matching tags
$foo$hello$world$bar$
"#))
    }
682
    /// Snapshot of a `DO` block whose body is a tagged dollar-quoted string
    /// spanning several lines.
    #[test]
    fn dollar_strings_part2() {
        assert_debug_snapshot!(lex(r#"
DO $doblock$
end
$doblock$;"#))
    }
690
    /// Snapshot of a dollar-quoted string whose closing tag differs from the
    /// opening tag.
    #[test]
    fn dollar_quote_mismatch_tags_simple() {
        assert_debug_snapshot!(lex(r#"
-- dollar quoting with mismatched tags
$foo$hello world$bar$
"#));
    }
698
    /// Snapshot of a dollar-quoted string with `$` inside the body and a
    /// closing tag that differs from the opening tag.
    #[test]
    fn dollar_quote_mismatch_tags_complex() {
        assert_debug_snapshot!(lex(r#"
-- with dollar inside but mismatched tags
$foo$hello$world$bar$
"#));
    }
706
    /// Snapshot of decimal numeric literals: integers, fractions, leading and
    /// trailing dots, and signed or unsigned exponents.
    #[test]
    fn numeric() {
        assert_debug_snapshot!(lex(r#"
42
3.5
4.
.001
.123e10
5e2
1.925e-3
1e-10
1e+10
1e10
4664.E+5
"#))
    }
723
    /// Snapshot of binary, octal and hexadecimal integer literals with both
    /// lower- and upper-case base prefixes.
    #[test]
    fn numeric_non_decimal() {
        assert_debug_snapshot!(lex(r#"
0b100101
0B10011001
0o273
0O755
0x42f
0XFFFF
"#))
    }
735
    /// A `$` directly after an integer literal is not trailing junk: it
    /// starts its own positional-parameter or dollar-quoted-string token.
    #[test]
    fn numeric_base_prefix_does_not_swallow_dollar_tokens() {
        assert_debug_snapshot!(lex("123$abc 0b101$2 0o12$abc 0x12$abc 0xFF$1 0x1$$foo$$ 123$$foo$$"), @r#"
        [
            "123" @ Literal { kind: Int { base: Decimal, empty_int: false, trailing_junk_start: 3 } },
            "$abc" @ PositionalParam { trailing_junk_start: 1 },
            " " @ Whitespace,
            "0b101" @ Literal { kind: Int { base: Binary, empty_int: false, trailing_junk_start: 5 } },
            "$2" @ PositionalParam { trailing_junk_start: 2 },
            " " @ Whitespace,
            "0o12" @ Literal { kind: Int { base: Octal, empty_int: false, trailing_junk_start: 4 } },
            "$abc" @ PositionalParam { trailing_junk_start: 1 },
            " " @ Whitespace,
            "0x12" @ Literal { kind: Int { base: Hexadecimal, empty_int: false, trailing_junk_start: 4 } },
            "$abc" @ PositionalParam { trailing_junk_start: 1 },
            " " @ Whitespace,
            "0xFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false, trailing_junk_start: 4 } },
            "$1" @ PositionalParam { trailing_junk_start: 2 },
            " " @ Whitespace,
            "0x1" @ Literal { kind: Int { base: Hexadecimal, empty_int: false, trailing_junk_start: 3 } },
            "$$foo$$" @ Literal { kind: DollarQuotedString { terminated: true } },
            " " @ Whitespace,
            "123" @ Literal { kind: Int { base: Decimal, empty_int: false, trailing_junk_start: 3 } },
            "$$foo$$" @ Literal { kind: DollarQuotedString { terminated: true } },
        ]
        "#);
    }
763
    /// Snapshot of numeric literals that use `_` digit separators in each
    /// base and in a fraction.
    // NOTE: the misspelled name is kept on purpose; the stored snapshot is
    // keyed by the test function name.
    #[test]
    fn numeric_with_seperators() {
        assert_debug_snapshot!(lex(r#"
1_500_000_000
0b10001000_00000000
0o_1_755
0xFFFF_FFFF
1.618_034
"#))
    }
774
    /// Literals that start with a dot accept `_` separators in the fraction,
    /// with or without an exponent.
    #[test]
    fn numeric_leading_dot_with_separators() {
        assert_debug_snapshot!(lex(".1_2 .5_5 .1_2e3"), @r#"
        [
            ".1_2" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 4 } },
            " " @ Whitespace,
            ".5_5" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 4 } },
            " " @ Whitespace,
            ".1_2e3" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 6 } },
        ]
        "#)
    }
787
    /// An exponent sign followed by `_` is not part of the number: the
    /// literal ends after the `e` with an empty exponent, and the sign and
    /// `_2` are lexed as separate tokens.
    #[test]
    fn numeric_exponent_underscore_after_sign() {
        assert_debug_snapshot!(lex("1e+_2 1e-_2 1.0e+_2 .1e+_2"), @r#"
        [
            "1e" @ Literal { kind: Numeric { empty_exponent_start: Some(1), trailing_junk_start: 2 } },
            "+" @ Plus,
            "_2" @ Ident,
            " " @ Whitespace,
            "1e" @ Literal { kind: Numeric { empty_exponent_start: Some(1), trailing_junk_start: 2 } },
            "-" @ Minus,
            "_2" @ Ident,
            " " @ Whitespace,
            "1.0e" @ Literal { kind: Numeric { empty_exponent_start: Some(3), trailing_junk_start: 4 } },
            "+" @ Plus,
            "_2" @ Ident,
            " " @ Whitespace,
            ".1e" @ Literal { kind: Numeric { empty_exponent_start: Some(2), trailing_junk_start: 3 } },
            "+" @ Plus,
            "_2" @ Ident,
        ]
        "#)
    }
810
    /// Snapshot of a dotted name (`public.users`), where the dot is not
    /// followed by a digit.
    #[test]
    fn select_with_period() {
        assert_debug_snapshot!(lex(r#"
select public.users;
"#))
    }
817
    /// Snapshot of `B'...'` and `X'...'` string literals with upper- and
    /// lower-case prefixes.
    #[test]
    fn bitstring() {
        assert_debug_snapshot!(lex(r#"
B'1001'
b'1001'
X'1FF'
x'1FF'
"#))
    }
827
    /// `N'...'` and `n'...'` are national strings, while an identifier that
    /// merely starts with `n` stays an identifier followed by a plain string.
    #[test]
    fn national_character_string() {
        assert_debug_snapshot!(lex("N'foo' n'bar' numeric'1'"), @r#"
        [
            "N'foo'" @ Literal { kind: NationalStr { terminated: true } },
            " " @ Whitespace,
            "n'bar'" @ Literal { kind: NationalStr { terminated: true } },
            " " @ Whitespace,
            "numeric" @ Ident,
            "'1'" @ Literal { kind: Str { terminated: true } },
        ]
        "#);
    }
841
    /// A prefix letter followed by more identifier characters is an ordinary
    /// identifier, and the string after it is a plain string literal.
    #[test]
    fn ident_prefix_then_string_is_consistent() {
        assert_debug_snapshot!(
            lex("N1'foo' E1'foo' B1'foo' X1'foo' U1'foo' uuid'00000000'"),
            @r#"
        [
            "N1" @ Ident,
            "'foo'" @ Literal { kind: Str { terminated: true } },
            " " @ Whitespace,
            "E1" @ Ident,
            "'foo'" @ Literal { kind: Str { terminated: true } },
            " " @ Whitespace,
            "B1" @ Ident,
            "'foo'" @ Literal { kind: Str { terminated: true } },
            " " @ Whitespace,
            "X1" @ Ident,
            "'foo'" @ Literal { kind: Str { terminated: true } },
            " " @ Whitespace,
            "U1" @ Ident,
            "'foo'" @ Literal { kind: Str { terminated: true } },
            " " @ Whitespace,
            "uuid" @ Ident,
            "'00000000'" @ Literal { kind: Str { terminated: true } },
        ]
        "#);
    }
868
    /// Snapshot of plain string literals: escaped `''` quotes, strings that
    /// span lines, backslashes in a non-escape string, and a string that is
    /// never closed.
    #[test]
    fn string() {
        assert_debug_snapshot!(lex(r#"
'Dianne''s horse'

select 'foo ''
bar';

select 'foooo'   
   'bar';


'foo \\ \n \tbar'

'forgot to close the string
"#))
    }
886
    /// Snapshot of positional parameters: `$1`, `$2`, a long number, and a
    /// bare `$`.
    #[test]
    fn params() {
        assert_debug_snapshot!(lex(r#"
select $1 + $2;

select $1123123123123;

select $;
"#))
    }
897
    /// Snapshot of `E'...'` escape strings covering character, octal, hex
    /// and Unicode backslash escapes.
    #[test]
    fn string_with_escapes() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE

        assert_debug_snapshot!(lex(r#"
E'foo'

e'bar'

e'\b\f\n\r\t'

e'\0\11\777'

e'\x0\x11\xFF'

e'\uAAAA \UFFFFFFFF'

"#))
    }
917
    /// In an escape string, `\'` is an escaped quote and does not end the
    /// literal.
    #[test]
    fn escape_string_with_backslash_escaped_quote() {
        assert_debug_snapshot!(lex(r"E'foo\'bar'"), @r#"
        [
            "E'foo\\'bar'" @ Literal { kind: EscStr { terminated: true } },
        ]
        "#);
    }
926
    /// When the only closing quote is backslash-escaped, the escape string
    /// runs to the end of input and is reported as unterminated.
    #[test]
    fn escape_string_with_escaped_terminal_quote_is_unterminated() {
        assert_debug_snapshot!(lex(r"E'foo\';"), @r#"
        [
            "E'foo\\';" @ Literal { kind: EscStr { terminated: false } },
        ]
        "#);
    }
935
    /// An escaped backslash (`\\`) right before the quote leaves that quote
    /// unescaped, so the string terminates.
    #[test]
    fn escape_string_with_even_backslashes_before_quote_is_terminated() {
        assert_debug_snapshot!(lex(r"E'foo\\'"), @r#"
        [
            "E'foo\\\\'" @ Literal { kind: EscStr { terminated: true } },
        ]
        "#);
    }
944
    /// Snapshot of `U&` Unicode-escape forms: double-quoted identifiers, a
    /// single-quoted string, and one followed by a `UESCAPE` clause.
    #[test]
    fn string_unicode_escape() {
        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE

        assert_debug_snapshot!(lex(r#"
U&"d\0061t\+000061"

U&"\0441\043B\043E\043D"

u&'\0441\043B'

U&"d!0061t!+000061" UESCAPE '!'
"#))
    }
959
    /// Snapshot of double-quoted identifiers: one closed, one left open
    /// until the end of input.
    #[test]
    fn quoted_ident() {
        assert_debug_snapshot!(lex(r#"
"hello &1 -world";


"hello-world
"#))
    }
969
    /// Snapshot of a double-quoted identifier that contains an escaped `""`.
    #[test]
    fn quoted_ident_with_escape_quote() {
        assert_debug_snapshot!(lex(r#"
"foo "" bar"
"#))
    }
976
    /// An empty dollar-quoted string (`$$$$`) is a single terminated literal.
    #[test]
    fn dollar_quoted_string() {
        assert_debug_snapshot!(lex("$$$$"), @r#"
        [
            "$$$$" @ Literal { kind: DollarQuotedString { terminated: true } },
        ]
        "#);
    }
985
    /// The closing delimiter must be `$` + tag + `$`: the tag text followed
    /// by `$` inside the body (`abcfoo$`) does not end the string.
    #[test]
    fn tagged_dollar_quote_requires_leading_dollar() {
        assert_debug_snapshot!(lex("select $foo$abcfoo$def$foo$;"), @r#"
        [
            "select" @ Ident,
            " " @ Whitespace,
            "$foo$abcfoo$def$foo$" @ Literal { kind: DollarQuotedString { terminated: true } },
            ";" @ Semi,
        ]
        "#);
    }
997
    /// A `$x` with no closing `$` after the tag is not a dollar quote, so
    /// the rest of the input is lexed normally.
    #[test]
    fn unclosed_dollar_tag_does_not_swallow_rest_of_input() {
        assert_debug_snapshot!(lex("select $x;\ndrop table users;"), @r#"
        [
            "select" @ Ident,
            " " @ Whitespace,
            "$x" @ PositionalParam { trailing_junk_start: 1 },
            ";" @ Semi,
            "\n" @ Whitespace,
            "drop" @ Ident,
            " " @ Whitespace,
            "table" @ Ident,
            " " @ Whitespace,
            "users" @ Ident,
            ";" @ Semi,
        ]
        "#);
    }
1016
    /// Non-ASCII characters, including ones above U+00FF and outside the
    /// Basic Multilingual Plane, are lexed as identifiers.
    #[test]
    fn ident_non_ascii_above_latin1() {
        assert_debug_snapshot!(lex("ẞ Ā 漢字 𐐷"), @r#"
        [
            "ẞ" @ Ident,
            " " @ Whitespace,
            "Ā" @ Ident,
            " " @ Whitespace,
            "漢字" @ Ident,
            " " @ Whitespace,
            "𐐷" @ Ident,
        ]
        "#);
    }
1031}