sqlformat/
tokenizer.rs

1use std::borrow::Cow;
2use unicode_categories::UnicodeCategories;
3use winnow::ascii::{digit0, digit1, till_line_ending, Caseless};
4use winnow::combinator::{alt, dispatch, eof, fail, opt, peek, terminated};
5use winnow::error::ContextError;
6use winnow::error::ParserError;
7use winnow::prelude::*;
8use winnow::token::{any, one_of, rest, take, take_until, take_while};
9use winnow::Result;
10
11use crate::{Dialect, FormatOptions};
12
13pub(crate) fn tokenize<'a>(
14    mut input: &'a str,
15    named_placeholders: bool,
16    options: &FormatOptions,
17) -> Vec<Token<'a>> {
18    let mut tokens: Vec<Token> = Vec::new();
19
20    let mut last_non_whitespace_token = None;
21    let mut last_reserved_token = None;
22    let mut last_reserved_top_level_token = None;
23
24    if let Ok(Some(result)) = opt(get_whitespace_token).parse_next(&mut input) {
25        tokens.push(result);
26    }
27
28    // Keep processing the string until it is empty
29    while let Ok(mut result) = get_next_token(
30        &mut input,
31        last_non_whitespace_token.clone(),
32        last_reserved_token.clone(),
33        last_reserved_top_level_token.clone(),
34        named_placeholders,
35        options.dialect,
36    ) {
37        match result.kind {
38            TokenKind::Reserved => {
39                last_reserved_token = Some(result.clone());
40            }
41            TokenKind::ReservedTopLevel => {
42                last_reserved_top_level_token = Some(result.clone());
43            }
44            TokenKind::Join => {
45                if options.joins_as_top_level {
46                    result.kind = TokenKind::ReservedTopLevel;
47                } else {
48                    result.kind = TokenKind::ReservedNewline;
49                }
50            }
51            _ => {}
52        }
53
54        if result.kind != TokenKind::Whitespace {
55            last_non_whitespace_token = Some(result.clone());
56        }
57
58        tokens.push(result);
59
60        if let Ok(Some(result)) = opt(get_whitespace_token).parse_next(&mut input) {
61            tokens.push(result);
62        }
63    }
64    tokens
65}
66
/// A single lexical token whose text is borrowed from the input SQL.
#[derive(Debug, Clone)]
pub(crate) struct Token<'a> {
    /// Classification of the token.
    pub kind: TokenKind,
    /// The token text as sliced from the input (original case and spacing).
    pub value: &'a str,
    // Placeholder name or index. Only used for placeholder tokens; the parsers in
    // this file set it to `None` for every other kind. There is a reason this
    // isn't on the enum.
    pub key: Option<PlaceholderKind<'a>>,
    /// Used to group the behaviour of variants of tokens. Most parsers set it to
    /// the same slice as `value`; top-level keywords beginning with `CREATE` or
    /// `SELECT` are aliased to `"CREATE"` / `"SELECT"`.
    pub alias: &'a str,
}
76
/// Classification assigned to every [`Token`] by the tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub(crate) enum TokenKind {
    /// A `::` or `[]` type specifier.
    TypeSpecifier,
    /// A run of one or more whitespace characters.
    Whitespace,
    /// A quoted string / quoted identifier literal.
    String,
    /// A reserved word with no special line-break classification.
    Reserved,
    /// A top-level keyword such as `SELECT`, `FROM` or `WHERE`.
    ReservedTopLevel,
    /// A top-level keyword such as `UNION`, `WITH` or `BEGIN`.
    ReservedTopLevelNoIndent,
    /// A keyword such as `AND`, `OR` or `WHEN`.
    ReservedNewline,
    /// A keyword such as `DO NOTHING` / `DO UPDATE SET`, or `SET` after `UPDATE`.
    ReservedNewlineAfter,
    /// An operator. NOTE(review): produced by `get_operator_token`, which is
    /// defined outside this part of the file.
    Operator,
    /// `(`, `CASE`, and `[` for the PostgreSQL dialect.
    OpenParen,
    /// `)`, `END`, and `]` for the PostgreSQL dialect.
    CloseParen,
    /// A `#` or `--` comment running to the end of the line.
    LineComment,
    /// A `/* ... */` comment.
    BlockComment,
    /// A numeric literal.
    Number,
    /// A query parameter placeholder such as `?`, `$1`, `:name` or `@name`.
    Placeholder,
    /// A plain word. NOTE(review): produced by `get_word_token`, which is
    /// defined outside this part of the file.
    Word,
    /// A join keyword. `tokenize` rewrites this kind to `ReservedTopLevel` or
    /// `ReservedNewline` (depending on `joins_as_top_level`) before storing it.
    Join,
}
97
/// Identifies which parameter a placeholder token refers to.
#[derive(Debug, Clone)]
pub(crate) enum PlaceholderKind<'a> {
    /// A placeholder addressed by name, e.g. `:name` or `@"quoted name"`.
    Named(Cow<'a, str>),
    /// A `?N` placeholder; `N` is already a zero-based index.
    ZeroIndexed(usize),
    /// A `$N` placeholder; `N` is a one-based index.
    OneIndexed(usize),
}

impl<'a> PlaceholderKind<'a> {
    /// Returns the placeholder name, or an empty string for indexed placeholders.
    pub fn named(&'a self) -> &'a str {
        match self {
            PlaceholderKind::Named(val) => val.as_ref(),
            _ => "",
        }
    }

    /// Returns the zero-based parameter index of an indexed placeholder.
    ///
    /// Returns `None` for named placeholders and for `$0`, which has no
    /// zero-based equivalent. The value comes straight from user-supplied SQL,
    /// so the subtraction must not be allowed to underflow.
    pub fn indexed(&self) -> Option<usize> {
        match self {
            PlaceholderKind::ZeroIndexed(val) => Some(*val),
            PlaceholderKind::OneIndexed(val) => val.checked_sub(1),
            PlaceholderKind::Named(_) => None,
        }
    }
}
121
122fn get_next_token<'a>(
123    input: &mut &'a str,
124    previous_token: Option<Token<'a>>,
125    last_reserved_token: Option<Token<'a>>,
126    last_reserved_top_level_token: Option<Token<'a>>,
127    named_placeholders: bool,
128    dialect: Dialect,
129) -> Result<Token<'a>> {
130    alt((
131        get_comment_token,
132        |input: &mut _| get_type_specifier_token(input, previous_token.clone()),
133        |input: &mut _| get_string_token(input, dialect),
134        |input: &mut _| get_open_paren_token(input, dialect),
135        |input: &mut _| get_close_paren_token(input, dialect),
136        get_number_token,
137        |input: &mut _| {
138            get_reserved_word_token(
139                input,
140                previous_token.clone(),
141                last_reserved_token.clone(),
142                last_reserved_top_level_token.clone(),
143            )
144        },
145        get_operator_token,
146        |input: &mut _| get_placeholder_token(input, named_placeholders, dialect),
147        get_word_token,
148        get_any_other_char,
149    ))
150    .parse_next(input)
151}
152fn get_type_specifier_token<'i>(
153    input: &mut &'i str,
154    previous_token: Option<Token<'i>>,
155) -> Result<Token<'i>> {
156    if previous_token.is_some_and(|token| {
157        ![
158            TokenKind::CloseParen,
159            TokenKind::Placeholder,
160            TokenKind::Reserved,
161            TokenKind::String,
162            TokenKind::Number,
163            TokenKind::TypeSpecifier,
164            TokenKind::Word,
165        ]
166        .contains(&token.kind)
167    }) {
168        fail.parse_next(input)
169    } else {
170        alt(("::", "[]")).parse_next(input).map(|token| Token {
171            kind: TokenKind::TypeSpecifier,
172            value: token,
173            key: None,
174            alias: token,
175        })
176    }
177}
178fn get_whitespace_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
179    take_while(1.., char::is_whitespace)
180        .parse_next(input)
181        .map(|token| Token {
182            kind: TokenKind::Whitespace,
183            value: token,
184            key: None,
185            alias: token,
186        })
187}
188
189fn get_comment_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
190    dispatch! {any;
191        '#' => till_line_ending.value(TokenKind::LineComment),
192        '-' => ('-', till_line_ending).value(TokenKind::LineComment),
193        '/' => ('*', alt((take_until(0.., "*/"), rest)), opt(take(2usize))).value(TokenKind::BlockComment),
194        _ => fail,
195    }
196        .with_taken()
197        .parse_next(input)
198        .map(|(kind, token)| Token {
199            kind,
200            value: token,
201            key: None,
202            alias: token,
203        })
204}
205
206pub fn take_till_escaping<'a>(
207    desired: char,
208    escapes: &'static [char],
209) -> impl Parser<&'a str, &'a str, ContextError> {
210    move |input: &mut &'a str| {
211        let mut chars = input.char_indices().peekable();
212        loop {
213            let item = chars.next();
214            let next = chars.peek().map(|item| item.1);
215            match item {
216                Some((byte_pos, item)) => {
217                    // escape?
218                    if escapes.contains(&item) && next.map(|n| n == desired).unwrap_or(false) {
219                        // consume this and next char
220                        chars.next();
221                        continue;
222                    }
223
224                    if item == desired {
225                        return Ok(input.next_slice(byte_pos));
226                    }
227                }
228                None => {
229                    return rest.parse_next(input);
230                }
231            }
232        }
233    }
234}
235
236// This enables the following string patterns:
237// 1. backtick quoted string using `` to escape
238// 2. square bracket quoted string (SQL Server) using ]] to escape
239// 3. double quoted string using "" or \" to escape
240// 4. single quoted string using '' or \' to escape
241// 5. national character quoted string using N'' or N\' to escape
242// 6. hex(blob literal) does not need to escape
243fn get_string_token<'i>(input: &mut &'i str, dialect: Dialect) -> Result<Token<'i>> {
244    dispatch! {any;
245        '`' => (take_till_escaping('`', &['`']), any).void(),
246        '[' if dialect == Dialect::SQLServer => (take_till_escaping(']', &[']']), any).void(),
247        '"' => (take_till_escaping('"', &['"', '\\']), any).void(),
248        '\'' => (take_till_escaping('\'', &['\'', '\\']), any).void(),
249        'N' => ('\'', take_till_escaping('\'', &['\'', '\\']), any).void(),
250        'E' => ('\'', take_till_escaping('\'', &['\'', '\\']), any).void(),
251        'x' => ('\'', take_till_escaping('\'', &[]), any).void(),
252        'X' => ('\'', take_till_escaping('\'', &[]), any).void(),
253        _ => fail,
254    }
255    .take()
256    .parse_next(input)
257    .map(|token| Token {
258        kind: TokenKind::String,
259        value: token,
260        key: None,
261        alias: token,
262    })
263}
264
265// Like above but it doesn't replace double quotes
266fn get_placeholder_string_token<'i>(input: &mut &'i str, dialect: Dialect) -> Result<Token<'i>> {
267    dispatch! {any;
268        '`'=>( take_till_escaping('`', &['`']), any).void(),
269        '[' if dialect == Dialect::SQLServer =>( take_till_escaping(']', &[']']), any).void(),
270        '"'=>( take_till_escaping('"', &['\\']), any).void(),
271        '\''=>( take_till_escaping('\'', &['\\']), any).void(),
272        'N' =>('\'', take_till_escaping('\'', &['\\']), any).void(),
273        _ => fail,
274    }
275    .take()
276    .parse_next(input)
277    .map(|token| Token {
278        kind: TokenKind::String,
279        value: token,
280        key: None,
281        alias: token,
282    })
283}
284
285fn get_open_paren_token<'i>(input: &mut &'i str, dialect: Dialect) -> Result<Token<'i>> {
286    let case = terminated(Caseless("CASE"), end_of_word);
287    let open_paren = if dialect == Dialect::PostgreSql {
288        ("(", "[", case)
289    } else {
290        ("(", "(", case)
291    };
292
293    alt(open_paren).parse_next(input).map(|token| Token {
294        kind: TokenKind::OpenParen,
295        value: token,
296        key: None,
297        alias: token,
298    })
299}
300
301fn get_close_paren_token<'i>(input: &mut &'i str, dialect: Dialect) -> Result<Token<'i>> {
302    let end = terminated(Caseless("END"), end_of_word);
303    let close_paren = if dialect == Dialect::PostgreSql {
304        (")", "]", end)
305    } else {
306        (")", ")", end)
307    };
308    alt(close_paren).parse_next(input).map(|token| Token {
309        kind: TokenKind::CloseParen,
310        value: token,
311        key: None,
312        alias: token,
313    })
314}
315
316fn get_placeholder_token<'i>(
317    input: &mut &'i str,
318    named_placeholders: bool,
319    dialect: Dialect,
320) -> Result<Token<'i>> {
321    // The precedence changes based on 'named_placeholders' but not the exhaustiveness.
322    // This is to ensure the formatting is the same even if parameters aren't used.
323
324    if named_placeholders {
325        alt((
326            get_ident_named_placeholder_token,
327            |input: &mut _| get_string_named_placeholder_token(input, dialect),
328            get_indexed_placeholder_token,
329        ))
330        .parse_next(input)
331    } else {
332        alt((
333            get_indexed_placeholder_token,
334            get_ident_named_placeholder_token,
335            |input: &mut _| get_string_named_placeholder_token(input, dialect),
336        ))
337        .parse_next(input)
338    }
339}
340
341fn get_indexed_placeholder_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
342    alt(((one_of(('?', '$')), digit1).take(), "?"))
343        .parse_next(input)
344        .map(|token| Token {
345            kind: TokenKind::Placeholder,
346            value: token,
347            key: if token.len() > 1 {
348                if let Ok(index) = token[1..].parse::<usize>() {
349                    Some(if token.starts_with('$') {
350                        PlaceholderKind::OneIndexed(index)
351                    } else {
352                        PlaceholderKind::ZeroIndexed(index)
353                    })
354                } else {
355                    None
356                }
357            } else {
358                None
359            },
360            alias: token,
361        })
362}
363
364fn get_ident_named_placeholder_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
365    (
366        one_of(('@', ':', '$')),
367        take_while(1.., |item: char| {
368            item.is_alphanumeric() || item == '.' || item == '_' || item == '$'
369        }),
370    )
371        .take()
372        .parse_next(input)
373        .map(|token| {
374            let index = Cow::Borrowed(&token[1..]);
375            Token {
376                kind: TokenKind::Placeholder,
377                value: token,
378                key: Some(PlaceholderKind::Named(index)),
379                alias: token,
380            }
381        })
382}
383
384fn get_string_named_placeholder_token<'i>(
385    input: &mut &'i str,
386    dialect: Dialect,
387) -> Result<Token<'i>> {
388    (one_of(('@', ':')), |input: &mut _| {
389        get_placeholder_string_token(input, dialect)
390    })
391        .take()
392        .parse_next(input)
393        .map(|token| {
394            let index =
395                get_escaped_placeholder_key(&token[2..token.len() - 1], &token[token.len() - 1..]);
396            Token {
397                kind: TokenKind::Placeholder,
398                value: token,
399                key: Some(PlaceholderKind::Named(index)),
400                alias: token,
401            }
402        })
403}
404
/// Removes the backslash from every backslash-escaped `quote_char` in `key`.
///
/// Returns `key` borrowed unchanged when it contains no escaped quote, so the
/// common case does not allocate a new string.
fn get_escaped_placeholder_key<'a>(key: &'a str, quote_char: &str) -> Cow<'a, str> {
    let escaped_quote = format!("\\{}", quote_char);
    if key.contains(escaped_quote.as_str()) {
        Cow::Owned(key.replace(escaped_quote.as_str(), quote_char))
    } else {
        Cow::Borrowed(key)
    }
}
408
409fn get_number_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
410    (opt("-"), alt((scientific_notation, decimal_number, digit1)))
411        .take()
412        .parse_next(input)
413        .map(|token| Token {
414            kind: TokenKind::Number,
415            value: token,
416            key: None,
417            alias: token,
418        })
419}
420
421fn decimal_number<'i>(input: &mut &'i str) -> Result<&'i str> {
422    (digit1, ".", digit0).take().parse_next(input)
423}
424
425fn scientific_notation<'i>(input: &mut &'i str) -> Result<&'i str> {
426    (
427        alt((decimal_number, digit1)),
428        "e",
429        opt(one_of(('-', '+'))),
430        digit1,
431    )
432        .take()
433        .parse_next(input)
434}
435
436fn get_reserved_word_token<'a>(
437    input: &mut &'a str,
438    previous_token: Option<Token<'a>>,
439    last_reserved_token: Option<Token<'a>>,
440    last_reserved_top_level_token: Option<Token<'a>>,
441) -> Result<Token<'a>> {
442    // A reserved word cannot be preceded by a "."
443    // this makes it so in "my_table.from", "from" is not considered a reserved word
444    if let Some(token) = previous_token {
445        if token.value == "." {
446            return Err(ParserError::from_input(input));
447        }
448    }
449
450    if !('a'..='z', 'A'..='Z', '$').contains_token(input.chars().next().unwrap_or('\0')) {
451        return Err(ParserError::from_input(input));
452    }
453
454    alt((
455        get_top_level_reserved_token(last_reserved_top_level_token),
456        get_newline_after_reserved_token(),
457        get_newline_reserved_token(last_reserved_token),
458        get_join_token(),
459        get_top_level_reserved_token_no_indent,
460        get_plain_reserved_token,
461    ))
462    .parse_next(input)
463}
464
/// Returns the first `words` whitespace-separated words of `input`, joined by
/// single spaces and ASCII-uppercased.
///
/// Keyword parsers match against this small normalized copy instead of the
/// original text, which makes them insensitive to case and to the amount of
/// whitespace between words.
fn get_uc_words(input: &str, words: usize) -> String {
    let mut normalized = String::new();
    for (position, word) in input.split_whitespace().take(words).enumerate() {
        if position > 0 {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.make_ascii_uppercase();
    normalized
}
474
/// Consumes from `input` the original text that corresponds to `token` and
/// returns it.
///
/// `token` is a keyword matched against the normalized text produced by
/// `get_uc_words`: the same words as the start of `input`, uppercased and
/// separated by single spaces, where the last word may be a prefix of a longer
/// input word (e.g. `SELECT` matched against `select(`). The returned slice
/// keeps the original case and inter-word whitespace.
///
/// The end position is found by walking the words of `token` from the start of
/// `input`, so the cost depends on the keyword length only, and a last word
/// that also occurs inside an earlier word cannot be matched at the wrong
/// place.
fn finalize<'a>(input: &mut &'a str, token: &str) -> &'a str {
    let source: &'a str = *input;

    let mut end = 0;
    for word in token.split_whitespace() {
        let remaining = &source[end..];
        // Whitespace in front of this word in the original text.
        let gap = remaining.len() - remaining.trim_start().len();
        // ASCII uppercasing never changes byte length, so the normalized word
        // is exactly as long as its original spelling.
        end += gap + word.len();
    }

    let (consumed, tail) = source.split_at(end);
    *input = tail;
    consumed
}
480
481fn get_top_level_reserved_token<'a>(
482    last_reserved_top_level_token: Option<Token<'a>>,
483) -> impl Parser<&'a str, Token<'a>, ContextError> {
484    move |input: &mut &'a str| {
485        let uc_input: String = get_uc_words(input, 4);
486        let mut uc_input = uc_input.as_str();
487
488        // First peek at the first character to determine which group to check
489        let first_char = peek(any).parse_next(input)?.to_ascii_uppercase();
490
491        // Match keywords based on their first letter
492        let result: Result<&str> = match first_char {
493            'A' => alt((
494                terminated("ADD", end_of_word),
495                terminated("AFTER", end_of_word),
496                terminated("ALTER COLUMN", end_of_word),
497                terminated("ALTER TABLE", end_of_word),
498            ))
499            .parse_next(&mut uc_input),
500
501            'C' => terminated(
502                (
503                    "CREATE ",
504                    opt(alt((
505                        "UNLOGGED ",
506                        (
507                            alt(("GLOBAL ", "LOCAL ")),
508                            opt(alt(("TEMPORARY ", "TEMP "))),
509                        )
510                            .take(),
511                    ))),
512                    "TABLE",
513                )
514                    .take(),
515                end_of_word,
516            )
517            .parse_next(&mut uc_input),
518
519            'D' => terminated("DELETE FROM", end_of_word).parse_next(&mut uc_input),
520
521            'E' => terminated("EXCEPT", end_of_word).parse_next(&mut uc_input),
522
523            'F' => alt((
524                terminated("FETCH FIRST", end_of_word),
525                terminated("FROM", end_of_word),
526                terminated(
527                    (
528                        "FOR ",
529                        alt(("UPDATE", "NO KEY UPDATE", "SHARE", "KEY SHARE")),
530                    )
531                        .take(),
532                    end_of_word,
533                ),
534            ))
535            .parse_next(&mut uc_input),
536
537            'G' => alt((
538                terminated("GROUP BY", end_of_word),
539                terminated("GO", end_of_word),
540            ))
541            .parse_next(&mut uc_input),
542
543            'H' => terminated("HAVING", end_of_word).parse_next(&mut uc_input),
544
545            'I' => alt((
546                terminated("INSERT INTO", end_of_word),
547                terminated("INSERT", end_of_word),
548            ))
549            .parse_next(&mut uc_input),
550
551            'L' => terminated("LIMIT", end_of_word).parse_next(&mut uc_input),
552
553            'M' => alt((
554                terminated("MODIFY", end_of_word),
555                terminated("MERGE INTO", end_of_word),
556            ))
557            .parse_next(&mut uc_input),
558
559            'O' => alt((
560                terminated("ORDER BY", end_of_word),
561                terminated("ON CONFLICT", end_of_word),
562            ))
563            .parse_next(&mut uc_input),
564
565            'P' => terminated("PARTITION BY", end_of_word).parse_next(&mut uc_input),
566
567            'R' => terminated("RETURNING", end_of_word).parse_next(&mut uc_input),
568
569            'S' => alt((
570                terminated("SELECT DISTINCT", end_of_word),
571                terminated("SELECT ALL", end_of_word),
572                terminated("SELECT", end_of_word),
573                terminated("SET CURRENT SCHEMA", end_of_word),
574                terminated("SET SCHEMA", end_of_word),
575                terminated("SET", end_of_word),
576            ))
577            .parse_next(&mut uc_input),
578
579            'U' => alt((
580                terminated("UPDATE", end_of_word),
581                terminated("USING", end_of_word),
582            ))
583            .parse_next(&mut uc_input),
584
585            'V' => terminated("VALUES", end_of_word).parse_next(&mut uc_input),
586
587            'W' => alt((
588                terminated("WHERE", end_of_word),
589                terminated("WINDOW", end_of_word),
590            ))
591            .parse_next(&mut uc_input),
592
593            // If the first character doesn't match any of our keywords, fail early
594            _ => Err(ParserError::from_input(&uc_input)),
595        };
596
597        if let Ok(token) = result {
598            let token = finalize(input, token);
599
600            let kind = match (
601                token,
602                last_reserved_top_level_token.as_ref().map(|v| v.alias),
603            ) {
604                ("EXCEPT", Some("SELECT")) =>
605                // If the query state doesn't allow EXCEPT, treat it as a reserved word
606                {
607                    TokenKind::Reserved
608                }
609                ("SET", Some("UPDATE")) => TokenKind::ReservedNewlineAfter,
610                ("USING", v) if v != Some("MERGE INTO") && v != Some("DELETE FROM") => {
611                    TokenKind::Reserved
612                }
613                _ => TokenKind::ReservedTopLevel,
614            };
615
616            let alias = if token.starts_with("CREATE") {
617                "CREATE"
618            } else if token.starts_with("SELECT") {
619                "SELECT"
620            } else {
621                token
622            };
623
624            Ok(Token {
625                kind,
626                value: token,
627                key: None,
628                alias,
629            })
630        } else {
631            Err(ParserError::from_input(input))
632        }
633    }
634}
635
636fn get_join_token<'a>() -> impl Parser<&'a str, Token<'a>, ContextError> {
637    move |input: &mut &'a str| {
638        let uc_input: String = get_uc_words(input, 3);
639        let mut uc_input = uc_input.as_str();
640
641        // Standard SQL joins
642        let standard_joins = alt((
643            terminated("JOIN", end_of_word),
644            terminated("INNER JOIN", end_of_word),
645            terminated("LEFT JOIN", end_of_word),
646            terminated("RIGHT JOIN", end_of_word),
647            terminated("FULL JOIN", end_of_word),
648            terminated("CROSS JOIN", end_of_word),
649            terminated("LEFT OUTER JOIN", end_of_word),
650            terminated("RIGHT OUTER JOIN", end_of_word),
651            terminated("FULL OUTER JOIN", end_of_word),
652        ));
653
654        // Warehouse-specific ANY/SEMI/ANTI joins
655        let specific_joins = alt((
656            terminated("INNER ANY JOIN", end_of_word),
657            terminated("LEFT ANY JOIN", end_of_word),
658            terminated("RIGHT ANY JOIN", end_of_word),
659            terminated("ANY JOIN", end_of_word),
660            terminated("SEMI JOIN", end_of_word),
661            terminated("LEFT SEMI JOIN", end_of_word),
662            terminated("RIGHT SEMI JOIN", end_of_word),
663            terminated("LEFT ANTI JOIN", end_of_word),
664            terminated("RIGHT ANTI JOIN", end_of_word),
665        ));
666
667        // Special joins and GLOBAL variants
668        let special_joins = alt((
669            terminated("ASOF JOIN", end_of_word),
670            terminated("LEFT ASOF JOIN", end_of_word),
671            terminated("PASTE JOIN", end_of_word),
672            terminated("GLOBAL INNER JOIN", end_of_word),
673            terminated("GLOBAL LEFT JOIN", end_of_word),
674            terminated("GLOBAL RIGHT JOIN", end_of_word),
675            terminated("GLOBAL FULL JOIN", end_of_word),
676        ));
677
678        // Combine all parsers
679        let result: Result<&str> =
680            alt((standard_joins, specific_joins, special_joins)).parse_next(&mut uc_input);
681
682        if let Ok(token) = result {
683            let final_word = token.split(' ').next_back().unwrap();
684            let input_end_pos =
685                input.to_ascii_uppercase().find(final_word).unwrap() + final_word.len();
686            let token = input.next_slice(input_end_pos);
687            let kind = TokenKind::Join;
688            Ok(Token {
689                kind,
690                value: token,
691                key: None,
692                alias: token,
693            })
694        } else {
695            Err(ParserError::from_input(input))
696        }
697    }
698}
699
700fn get_newline_after_reserved_token<'a>() -> impl Parser<&'a str, Token<'a>, ContextError> {
701    move |input: &mut &'a str| {
702        let uc_input: String = get_uc_words(input, 3);
703        let mut uc_input = uc_input.as_str();
704
705        let mut on_conflict = alt((
706            terminated("DO NOTHING", end_of_word),
707            terminated("DO UPDATE SET", end_of_word),
708        ));
709
710        let result: Result<&str> = on_conflict.parse_next(&mut uc_input);
711
712        if let Ok(token) = result {
713            let value = finalize(input, token);
714            Ok(Token {
715                kind: TokenKind::ReservedNewlineAfter,
716                value,
717                key: None,
718                alias: value,
719            })
720        } else {
721            Err(ParserError::from_input(input))
722        }
723    }
724}
725
726fn get_newline_reserved_token<'a>(
727    last_reserved_token: Option<Token<'a>>,
728) -> impl Parser<&'a str, Token<'a>, ContextError> {
729    move |input: &mut &'a str| {
730        let uc_input: String = get_uc_words(input, 3);
731        let mut uc_input = uc_input.as_str();
732
733        // We have to break up the alternatives into multiple subsets
734        // to avoid exceeding the alt() 21 element limit.
735
736        // Legacy and logical operators
737        let operators = alt((
738            terminated("CROSS APPLY", end_of_word),
739            terminated("OUTER APPLY", end_of_word),
740            terminated("AND", end_of_word),
741            terminated("OR", end_of_word),
742            terminated("XOR", end_of_word),
743            terminated("WHEN", end_of_word),
744            terminated("ELSE", end_of_word),
745        ));
746
747        let alter_table_actions = alt((
748            terminated("ADD", end_of_word),
749            terminated("DROP", end_of_word),
750            terminated("ALTER", end_of_word),
751            terminated("VALIDATE", end_of_word),
752            terminated("ENABLE", end_of_word),
753            terminated("DISABLE", end_of_word),
754        ));
755
756        // Combine all parsers
757        let result: Result<&str> = alt((operators, alter_table_actions)).parse_next(&mut uc_input);
758
759        if let Ok(token) = result {
760            let token = finalize(input, token);
761            let kind = if token == "AND"
762                && last_reserved_token.is_some()
763                && last_reserved_token.as_ref().unwrap().value == "BETWEEN"
764            {
765                // If the "AND" is part of a "BETWEEN" clause, we want to handle it as one clause by not adding a new line.
766                TokenKind::Reserved
767            } else {
768                TokenKind::ReservedNewline
769            };
770            Ok(Token {
771                kind,
772                value: token,
773                key: None,
774                alias: token,
775            })
776        } else {
777            Err(ParserError::from_input(input))
778        }
779    }
780}
781
782fn get_top_level_reserved_token_no_indent<'i>(input: &mut &'i str) -> Result<Token<'i>> {
783    let uc_input = get_uc_words(input, 2);
784    let mut uc_input = uc_input.as_str();
785
786    let result: Result<&str> = alt((
787        terminated("BEGIN", end_of_word),
788        terminated("DECLARE", end_of_word),
789        terminated("INTERSECT ALL", end_of_word),
790        terminated("INTERSECT", end_of_word),
791        terminated("MINUS", end_of_word),
792        terminated("UNION ALL", end_of_word),
793        terminated("UNION", end_of_word),
794        terminated("WITH", end_of_word),
795        terminated("$$", end_of_word),
796    ))
797    .parse_next(&mut uc_input);
798    if let Ok(token) = result {
799        let value = finalize(input, token);
800        Ok(Token {
801            kind: TokenKind::ReservedTopLevelNoIndent,
802            value,
803            key: None,
804            alias: value,
805        })
806    } else {
807        Err(ParserError::from_input(input))
808    }
809}
810fn get_plain_reserved_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
811    alt((get_plain_reserved_two_token, get_plain_reserved_one_token)).parse_next(input)
812}
813fn get_plain_reserved_one_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
814    let uc_input = get_uc_words(input, 1);
815    let mut uc_input = uc_input.as_str();
816
817    let first_char = peek(any).parse_next(input)?.to_ascii_uppercase();
818
819    let result: Result<&str> = match first_char {
820        'A' => alt((
821            terminated("ACCESSIBLE", end_of_word),
822            terminated("ACTION", end_of_word),
823            terminated("AGAINST", end_of_word),
824            terminated("AGGREGATE", end_of_word),
825            terminated("ALGORITHM", end_of_word),
826            terminated("ALL", end_of_word),
827            terminated("ALTER", end_of_word),
828            terminated("ANALYSE", end_of_word),
829            terminated("ANALYZE", end_of_word),
830            terminated("AS", end_of_word),
831            terminated("ASC", end_of_word),
832            terminated("AUTOCOMMIT", end_of_word),
833            terminated("AUTO_INCREMENT", end_of_word),
834        ))
835        .parse_next(&mut uc_input),
836
837        'B' => alt((
838            terminated("BACKUP", end_of_word),
839            terminated("BETWEEN", end_of_word),
840            terminated("BINLOG", end_of_word),
841            terminated("BOTH", end_of_word),
842        ))
843        .parse_next(&mut uc_input),
844
845        'C' => alt((
846            terminated("CASCADE", end_of_word),
847            terminated("CASE", end_of_word),
848            terminated("CHANGE", end_of_word),
849            terminated("CHANGED", end_of_word),
850            terminated("CHARSET", end_of_word),
851            terminated("CHECK", end_of_word),
852            terminated("CHECKSUM", end_of_word),
853            terminated("COLLATE", end_of_word),
854            terminated("COLLATION", end_of_word),
855            terminated("COLUMN", end_of_word),
856            terminated("COLUMNS", end_of_word),
857            terminated("COMMENT", end_of_word),
858            terminated("COMMIT", end_of_word),
859            terminated("COMMITTED", end_of_word),
860            terminated("COMPRESSED", end_of_word),
861            terminated("CONCURRENT", end_of_word),
862            terminated("CONSTRAINT", end_of_word),
863            terminated("CONTAINS", end_of_word),
864            alt((
865                terminated("CONVERT", end_of_word),
866                terminated("CREATE", end_of_word),
867                terminated("CROSS", end_of_word),
868                terminated("CURRENT_TIMESTAMP", end_of_word),
869            )),
870        ))
871        .parse_next(&mut uc_input),
872
873        'D' => alt((
874            terminated("DATABASE", end_of_word),
875            terminated("DATABASES", end_of_word),
876            terminated("DAY", end_of_word),
877            terminated("DAY_HOUR", end_of_word),
878            terminated("DAY_MINUTE", end_of_word),
879            terminated("DAY_SECOND", end_of_word),
880            terminated("DEFAULT", end_of_word),
881            terminated("DEFINER", end_of_word),
882            terminated("DELAYED", end_of_word),
883            terminated("DELETE", end_of_word),
884            terminated("DESC", end_of_word),
885            terminated("DESCRIBE", end_of_word),
886            terminated("DETERMINISTIC", end_of_word),
887            terminated("DISTINCT", end_of_word),
888            terminated("DISTINCTROW", end_of_word),
889            terminated("DIV", end_of_word),
890            terminated("DO", end_of_word),
891            terminated("DROP", end_of_word),
892            terminated("DUMPFILE", end_of_word),
893            terminated("DUPLICATE", end_of_word),
894            terminated("DYNAMIC", end_of_word),
895        ))
896        .parse_next(&mut uc_input),
897
898        'E' => alt((
899            terminated("ELSE", end_of_word),
900            terminated("ENCLOSED", end_of_word),
901            terminated("END", end_of_word),
902            terminated("ENGINE", end_of_word),
903            terminated("ENGINES", end_of_word),
904            terminated("ENGINE_TYPE", end_of_word),
905            terminated("ESCAPE", end_of_word),
906            terminated("ESCAPED", end_of_word),
907            terminated("EVENTS", end_of_word),
908            terminated("EXEC", end_of_word),
909            terminated("EXECUTE", end_of_word),
910            terminated("EXISTS", end_of_word),
911            terminated("EXPLAIN", end_of_word),
912            terminated("EXTENDED", end_of_word),
913        ))
914        .parse_next(&mut uc_input),
915
916        'F' => alt((
917            terminated("FAST", end_of_word),
918            terminated("FETCH", end_of_word),
919            terminated("FIELDS", end_of_word),
920            terminated("FILE", end_of_word),
921            terminated("FIRST", end_of_word),
922            terminated("FIXED", end_of_word),
923            terminated("FLUSH", end_of_word),
924            terminated("FOR", end_of_word),
925            terminated("FORCE", end_of_word),
926            terminated("FOREIGN", end_of_word),
927            terminated("FULL", end_of_word),
928            terminated("FULLTEXT", end_of_word),
929            terminated("FUNCTION", end_of_word),
930        ))
931        .parse_next(&mut uc_input),
932
933        'G' => alt((
934            terminated("GLOBAL", end_of_word),
935            terminated("GRANT", end_of_word),
936            terminated("GRANTS", end_of_word),
937            terminated("GROUP_CONCAT", end_of_word),
938        ))
939        .parse_next(&mut uc_input),
940
941        'H' => alt((
942            terminated("HEAP", end_of_word),
943            terminated("HIGH_PRIORITY", end_of_word),
944            terminated("HOSTS", end_of_word),
945            terminated("HOUR", end_of_word),
946            terminated("HOUR_MINUTE", end_of_word),
947            terminated("HOUR_SECOND", end_of_word),
948        ))
949        .parse_next(&mut uc_input),
950
951        'I' => alt((
952            terminated("IDENTIFIED", end_of_word),
953            terminated("IF", end_of_word),
954            terminated("IFNULL", end_of_word),
955            terminated("IGNORE", end_of_word),
956            terminated("IN", end_of_word),
957            terminated("INDEX", end_of_word),
958            terminated("INDEXES", end_of_word),
959            terminated("INFILE", end_of_word),
960            terminated("INSERT", end_of_word),
961            terminated("INSERT_ID", end_of_word),
962            terminated("INSERT_METHOD", end_of_word),
963            terminated("INTERVAL", end_of_word),
964            terminated("INTO", end_of_word),
965            terminated("INVOKER", end_of_word),
966            terminated("IS", end_of_word),
967            terminated("ISOLATION", end_of_word),
968        ))
969        .parse_next(&mut uc_input),
970
971        'K' => alt((
972            terminated("KEY", end_of_word),
973            terminated("KEYS", end_of_word),
974            terminated("KILL", end_of_word),
975        ))
976        .parse_next(&mut uc_input),
977
978        'L' => alt((
979            terminated("LAST_INSERT_ID", end_of_word),
980            terminated("LEADING", end_of_word),
981            terminated("LEVEL", end_of_word),
982            terminated("LIKE", end_of_word),
983            terminated("LINEAR", end_of_word),
984            terminated("LINES", end_of_word),
985            terminated("LOAD", end_of_word),
986            terminated("LOCAL", end_of_word),
987            terminated("LOCK", end_of_word),
988            terminated("LOCKS", end_of_word),
989            terminated("LOGS", end_of_word),
990            terminated("LOW_PRIORITY", end_of_word),
991        ))
992        .parse_next(&mut uc_input),
993
994        'M' => alt((
995            terminated("MARIA", end_of_word),
996            terminated("MASTER", end_of_word),
997            terminated("MASTER_CONNECT_RETRY", end_of_word),
998            terminated("MASTER_HOST", end_of_word),
999            terminated("MASTER_LOG_FILE", end_of_word),
1000            terminated("MATCH", end_of_word),
1001            terminated("MAX_CONNECTIONS_PER_HOUR", end_of_word),
1002            terminated("MAX_QUERIES_PER_HOUR", end_of_word),
1003            terminated("MAX_ROWS", end_of_word),
1004            terminated("MAX_UPDATES_PER_HOUR", end_of_word),
1005            terminated("MAX_USER_CONNECTIONS", end_of_word),
1006            terminated("MEDIUM", end_of_word),
1007            terminated("MERGE", end_of_word),
1008            terminated("MINUTE", end_of_word),
1009            terminated("MINUTE_SECOND", end_of_word),
1010            terminated("MIN_ROWS", end_of_word),
1011            terminated("MODE", end_of_word),
1012            terminated("MODIFY", end_of_word),
1013            terminated("MONTH", end_of_word),
1014            terminated("MRG_MYISAM", end_of_word),
1015            terminated("MYISAM", end_of_word),
1016        ))
1017        .parse_next(&mut uc_input),
1018
1019        'N' => alt((
1020            terminated("NAMES", end_of_word),
1021            terminated("NATURAL", end_of_word),
1022            terminated("NOT", end_of_word),
1023            terminated("NOW()", end_of_word),
1024            terminated("NULL", end_of_word),
1025        ))
1026        .parse_next(&mut uc_input),
1027
1028        'O' => alt((
1029            terminated("OFFSET", end_of_word),
1030            terminated("ON", end_of_word),
1031            terminated("ONLY", end_of_word),
1032            terminated("OPEN", end_of_word),
1033            terminated("OPTIMIZE", end_of_word),
1034            terminated("OPTION", end_of_word),
1035            terminated("OPTIONALLY", end_of_word),
1036            terminated("OUTFILE", end_of_word),
1037        ))
1038        .parse_next(&mut uc_input),
1039
1040        'P' => alt((
1041            terminated("PACK_KEYS", end_of_word),
1042            terminated("PAGE", end_of_word),
1043            terminated("PARTIAL", end_of_word),
1044            terminated("PARTITION", end_of_word),
1045            terminated("PARTITIONS", end_of_word),
1046            terminated("PASSWORD", end_of_word),
1047            terminated("PRIMARY", end_of_word),
1048            terminated("PRIVILEGES", end_of_word),
1049            terminated("PROCEDURE", end_of_word),
1050            terminated("PROCESS", end_of_word),
1051            terminated("PROCESSLIST", end_of_word),
1052            terminated("PURGE", end_of_word),
1053        ))
1054        .parse_next(&mut uc_input),
1055
1056        'Q' => terminated("QUICK", end_of_word).parse_next(&mut uc_input),
1057
1058        'R' => alt((
1059            terminated("RAID0", end_of_word),
1060            terminated("RAID_CHUNKS", end_of_word),
1061            terminated("RAID_CHUNKSIZE", end_of_word),
1062            terminated("RAID_TYPE", end_of_word),
1063            terminated("RANGE", end_of_word),
1064            terminated("READ", end_of_word),
1065            terminated("READ_ONLY", end_of_word),
1066            terminated("READ_WRITE", end_of_word),
1067            terminated("REFERENCES", end_of_word),
1068            terminated("REGEXP", end_of_word),
1069            terminated("RELOAD", end_of_word),
1070            terminated("RENAME", end_of_word),
1071            terminated("REPAIR", end_of_word),
1072            terminated("REPEATABLE", end_of_word),
1073            terminated("REPLACE", end_of_word),
1074            terminated("REPLICATION", end_of_word),
1075            terminated("RESET", end_of_word),
1076            alt((
1077                terminated("RESTORE", end_of_word),
1078                terminated("RESTRICT", end_of_word),
1079                terminated("RETURN", end_of_word),
1080                terminated("RETURNS", end_of_word),
1081                terminated("REVOKE", end_of_word),
1082                terminated("RLIKE", end_of_word),
1083                terminated("ROLLBACK", end_of_word),
1084                terminated("ROW", end_of_word),
1085                terminated("ROWS", end_of_word),
1086                terminated("ROW_FORMAT", end_of_word),
1087            )),
1088        ))
1089        .parse_next(&mut uc_input),
1090
1091        'S' => alt((
1092            terminated("SECOND", end_of_word),
1093            terminated("SECURITY", end_of_word),
1094            terminated("SEPARATOR", end_of_word),
1095            terminated("SERIALIZABLE", end_of_word),
1096            terminated("SESSION", end_of_word),
1097            terminated("SHARE", end_of_word),
1098            terminated("SHOW", end_of_word),
1099            terminated("SHUTDOWN", end_of_word),
1100            terminated("SLAVE", end_of_word),
1101            terminated("SONAME", end_of_word),
1102            terminated("SOUNDS", end_of_word),
1103            terminated("SQL", end_of_word),
1104            terminated("SQL_AUTO_IS_NULL", end_of_word),
1105            terminated("SQL_BIG_RESULT", end_of_word),
1106            terminated("SQL_BIG_SELECTS", end_of_word),
1107            terminated("SQL_BIG_TABLES", end_of_word),
1108            terminated("SQL_BUFFER_RESULT", end_of_word),
1109            terminated("SQL_CACHE", end_of_word),
1110            alt((
1111                terminated("SQL_CALC_FOUND_ROWS", end_of_word),
1112                terminated("SQL_LOG_BIN", end_of_word),
1113                terminated("SQL_LOG_OFF", end_of_word),
1114                terminated("SQL_LOG_UPDATE", end_of_word),
1115                terminated("SQL_LOW_PRIORITY_UPDATES", end_of_word),
1116                terminated("SQL_MAX_JOIN_SIZE", end_of_word),
1117                terminated("SQL_NO_CACHE", end_of_word),
1118                terminated("SQL_QUOTE_SHOW_CREATE", end_of_word),
1119                terminated("SQL_BIG_RESULT", end_of_word),
1120                terminated("SQL_BIG_SELECTS", end_of_word),
1121                terminated("SQL_BIG_TABLES", end_of_word),
1122                terminated("SQL_BUFFER_RESULT", end_of_word),
1123                terminated("SQL_CACHE", end_of_word),
1124                terminated("SQL_CALC_FOUND_ROWS", end_of_word),
1125                terminated("SQL_LOG_BIN", end_of_word),
1126                terminated("SQL_LOG_OFF", end_of_word),
1127                terminated("SQL_LOG_UPDATE", end_of_word),
1128                terminated("SQL_LOW_PRIORITY_UPDATES", end_of_word),
1129                terminated("SQL_MAX_JOIN_SIZE", end_of_word),
1130                alt((
1131                    terminated("SQL_NO_CACHE", end_of_word),
1132                    terminated("SQL_QUOTE_SHOW_CREATE", end_of_word),
1133                    terminated("SQL_SAFE_UPDATES", end_of_word),
1134                    terminated("SQL_SELECT_LIMIT", end_of_word),
1135                    terminated("SQL_SLAVE_SKIP_COUNTER", end_of_word),
1136                    terminated("SQL_SMALL_RESULT", end_of_word),
1137                    terminated("SQL_WARNINGS", end_of_word),
1138                    terminated("START", end_of_word),
1139                    terminated("STARTING", end_of_word),
1140                    terminated("STATUS", end_of_word),
1141                    terminated("STOP", end_of_word),
1142                    terminated("STORAGE", end_of_word),
1143                    terminated("STRAIGHT_JOIN", end_of_word),
1144                    terminated("STRING", end_of_word),
1145                    terminated("STRIPED", end_of_word),
1146                    terminated("SUPER", end_of_word),
1147                )),
1148            )),
1149        ))
1150        .parse_next(&mut uc_input),
1151
1152        'T' => alt((
1153            terminated("TABLE", end_of_word),
1154            terminated("TABLES", end_of_word),
1155            terminated("TEMPORARY", end_of_word),
1156            terminated("TERMINATED", end_of_word),
1157            terminated("THEN", end_of_word),
1158            terminated("TO", end_of_word),
1159            terminated("TRAILING", end_of_word),
1160            terminated("TRANSACTIONAL", end_of_word),
1161            terminated("TRUE", end_of_word),
1162            terminated("TRUNCATE", end_of_word),
1163            terminated("TYPE", end_of_word),
1164            terminated("TYPES", end_of_word),
1165            terminated("TBLPROPERTIES", end_of_word),
1166        ))
1167        .parse_next(&mut uc_input),
1168
1169        'U' => alt((
1170            terminated("UNCOMMITTED", end_of_word),
1171            terminated("UNIQUE", end_of_word),
1172            terminated("UNLOCK", end_of_word),
1173            terminated("UNSIGNED", end_of_word),
1174            terminated("USAGE", end_of_word),
1175            terminated("USE", end_of_word),
1176        ))
1177        .parse_next(&mut uc_input),
1178
1179        'V' => alt((
1180            terminated("VARIABLES", end_of_word),
1181            terminated("VIEW", end_of_word),
1182        ))
1183        .parse_next(&mut uc_input),
1184
1185        'W' => alt((
1186            terminated("WHEN", end_of_word),
1187            terminated("WORK", end_of_word),
1188            terminated("WRITE", end_of_word),
1189        ))
1190        .parse_next(&mut uc_input),
1191
1192        'Y' => alt((terminated("YEAR_MONTH", end_of_word),)).parse_next(&mut uc_input),
1193        // If the first character doesn't match any of our keywords, fail early
1194        _ => Err(ParserError::from_input(&uc_input)),
1195    };
1196    if let Ok(token) = result {
1197        let input_end_pos = token.len();
1198        let token = input.next_slice(input_end_pos);
1199        Ok(Token {
1200            kind: TokenKind::Reserved,
1201            value: token,
1202            key: None,
1203            alias: token,
1204        })
1205    } else {
1206        Err(ParserError::from_input(input))
1207    }
1208}
1209
1210fn get_plain_reserved_two_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
1211    let uc_input = get_uc_words(input, 2);
1212    let mut uc_input = uc_input.as_str();
1213    let result: Result<&str> = alt((
1214        terminated("CHARACTER SET", end_of_word),
1215        terminated("ON CONFLICT", end_of_word),
1216        terminated("ON CONSTRAINT", end_of_word),
1217        terminated("ON DELETE", end_of_word),
1218        terminated("ON UPDATE", end_of_word),
1219        terminated("DISTINCT FROM", end_of_word),
1220        terminated("PARTITIONED BY", end_of_word),
1221    ))
1222    .parse_next(&mut uc_input);
1223    if let Ok(token) = result {
1224        let value = finalize(input, token);
1225        Ok(Token {
1226            kind: TokenKind::Reserved,
1227            value,
1228            key: None,
1229            alias: value,
1230        })
1231    } else {
1232        Err(ParserError::from_input(input))
1233    }
1234}
1235
1236fn get_word_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
1237    take_while(1.., is_word_character)
1238        .parse_next(input)
1239        .map(|token| Token {
1240            kind: TokenKind::Word,
1241            value: token,
1242            key: None,
1243            alias: token,
1244        })
1245}
1246
1247fn get_operator_token<'i>(input: &mut &'i str) -> Result<Token<'i>> {
1248    // Define the allowed operator characters
1249    let allowed_operators = (
1250        '!', '<', '>', '=', '|', ':', '-', '~', '*', '&', '@', '^', '?', '#', '/', '%',
1251    );
1252
1253    take_while(2..=5, allowed_operators)
1254        .map(|token: &str| Token {
1255            kind: TokenKind::Operator,
1256            value: token,
1257            key: None,
1258            alias: token,
1259        })
1260        .parse_next(input)
1261}
1262fn get_any_other_char<'i>(input: &mut &'i str) -> Result<Token<'i>> {
1263    one_of(|token| token != '\n' && token != '\r')
1264        .take()
1265        .parse_next(input)
1266        .map(|token| Token {
1267            kind: TokenKind::Operator,
1268            value: token,
1269            key: None,
1270            alias: token,
1271        })
1272}
1273
1274fn end_of_word<'i>(input: &mut &'i str) -> Result<&'i str> {
1275    peek(alt((
1276        eof,
1277        one_of(|val: char| !is_word_character(val)).take(),
1278    )))
1279    .parse_next(input)
1280}
1281
1282fn is_word_character(item: char) -> bool {
1283    item.is_alphanumeric() || item.is_mark() || item.is_punctuation_connector()
1284}
sqlformat/tokenizer.rs

sqlformat/
tokenizer.rs