mysqldump_mutator/
tokenizer.rs

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
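//!
//! A minimal usage sketch (`GenericDialect` here is a stand-in for whatever
//! implementation of `Dialect` the surrounding crate provides):
//!
//! ```ignore
//! use std::io::BufReader;
//!
//! let mut input = BufReader::new("SELECT 1;".as_bytes());
//! let mut tokenizer = Tokenizer::new(GenericDialect {}, &mut input);
//! while let Ok(Some(token)) = tokenizer.next_token() {
//!     print!("{}", token);
//! }
//! ```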

use std::io::BufRead;
use std::iter::Peekable;
use utf8_chars::{BufReadCharsExt, Chars};

use super::dialect::keywords::ALL_KEYWORDS;
use super::dialect::Dialect;
use std::collections::VecDeque;
use std::fmt;

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e. 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e. N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e. X'deadbeef'
    HexStringLiteral(String),
    /// Comma `,`
    Comma,
    /// Whitespace (space, tab, etc.)
    Whitespace(Whitespace),
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq([char; 2]),
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// SemiColon `;` used as a separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n) => f.write_str(n),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Eq => f.write_str("="),
            Token::Neq(values) => write!(f, "{}{}", values[0], values[1]),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mult => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
        }
    }
}

impl Token {
    /*fn make_keyword(keyword: &str) -> Self {
        Token::new(keyword, None)
    }*/

    /// Creates a new `Word` token; `quote_style` is `None` for keywords and
    /// unquoted identifiers.
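    ///
    /// A sketch of the classification (assuming `SELECT` appears in
    /// `dialect::keywords::ALL_KEYWORDS`):
    ///
    /// ```ignore
    /// // An unquoted keyword is recognized case-insensitively...
    /// assert!(Token::new("select", None).is_keyword());
    /// // ...while a quoted word is always a plain identifier.
    /// assert!(!Token::new("select", Some('`')).is_keyword());
    /// ```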
    pub fn new(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        //TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
        // not fast but I want the simplicity for now while I experiment with pluggable
        // dialects
        let is_keyword = quote_style.is_none() && ALL_KEYWORDS.contains(&word_uppercase.as_str());
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if is_keyword {
                word_uppercase
            } else {
                "".to_string()
            },
        })
    }

    /// Gets the value of the column encoded as a string. For numbers use `get_number`.
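    ///
    /// For example (a sketch; the enclosing quotes are not included):
    ///
    /// ```ignore
    /// let token = Token::SingleQuotedString("abc".to_string());
    /// assert_eq!(token.get_value(), "abc");
    /// ```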
    pub fn get_value(&self) -> String {
        match self {
            Token::Word(word) => word.value.clone(),
            Token::SingleQuotedString(ref s)
            | Token::NationalStringLiteral(ref s)
            | Token::HexStringLiteral(ref s) => s.clone(),
            _ => format!("{}", self),
        }
    }

    /// Returns `Some(number)` if the token contains a number that can be parsed; `None` if not.
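    ///
    /// For example (a sketch):
    ///
    /// ```ignore
    /// assert_eq!(Token::Number("1.5".to_string()).get_number(), Some(1.5));
    /// assert_eq!(Token::Comma.get_number(), None);
    /// ```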
    pub fn get_number(&self) -> Option<f64> {
        match self {
            Token::Number(number) => number.parse().ok(),
            _ => None,
        }
    }

    /// Returns whether the Token contains a keyword
    pub fn is_keyword(&self) -> bool {
        match self {
            Token::Word(word) if !word.keyword.is_empty() => true,
            _ => false,
        }
    }

    /// Gets the quote style; `None` if unquoted or not applicable
    pub fn get_quote_style(&self) -> Option<char> {
        match self {
            Token::Word(word) => word.quote_style,
            _ => None,
        }
    }
}

impl From<Token> for String {
    fn from(token: Token) -> String {
        format!("{}", token)
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
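///
/// Displaying a quoted `Word` restores its delimiters, e.g. (a sketch using
/// the MySQL backtick style):
///
/// ```ignore
/// let token = Token::new("some column", Some('`'));
/// assert_eq!(token.to_string(), "`some column`");
/// ```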
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise empty
    pub keyword: String,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment(String),
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment(s) => write!(f, "--{}", s),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);

/// SQL Tokenizer
pub struct Tokenizer<'a, R: BufRead, D: Dialect> {
    dialect: D,
    pub query: Peekable<Chars<'a, R>>,
    pub line: u64,
    pub col: u64,
    peeked_tokens: VecDeque<Token>,
}

impl<'a, R: BufRead, D: Dialect> Tokenizer<'a, R, D> {
    /// Creates a new SQL tokenizer for the given SQL input
    pub fn new(dialect: D, query: &'a mut R) -> Self {
        Self {
            dialect,
            query: query.chars().peekable(),
            line: 1,
            col: 1,
            peeked_tokens: VecDeque::new(),
        }
    }

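    /// Looks ahead at the `n`-th upcoming token (0-based) without consuming
    /// it. Peeked tokens are buffered and later returned by `next_token`.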
    pub fn peek_token(&mut self, n: usize) -> Result<Option<Token>, TokenizerError> {
        // Make sure at least n + 1 tokens are buffered (even n = 0 requires
        // one peeked token).
        if self.peeked_tokens.len() <= n {
            // Peek just enough tokens to reach the one requested.
            let tokens_to_peek = n - self.peeked_tokens.len() + 1;
            for _ in 0..tokens_to_peek {
                match self.internal_next_token() {
                    Ok(Some(token)) => {
                        self.peeked_tokens.push_back(token);
                    }
                    _ => return Err(TokenizerError("Unexpected EOF.".to_string())),
                }
            }
        }
        Ok(Some(self.peeked_tokens[n].clone()))
    }

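    /// Pushes a token back to the front of the stream, so that the next call
    /// to `next_token` (or `peek_token(0)`) returns it again.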
    pub fn pushback_token(&mut self, token: Token) {
        self.peeked_tokens.push_front(token);
    }

    /// Returns the next token, or `Ok(None)` at end of input
    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.peeked_tokens.pop_front() {
            //println!("{:?}", token);
            return Ok(Some(token));
        }

        self.internal_next_token()
        //println!("{:?}", token);
    }

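    /// Dispatches on the next input character to produce a single token, or
    /// `Ok(None)` at end of input.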
    fn internal_next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        match self.query.peek() {
            Some(Ok(ch)) => match *ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    self.query.next();
                    if let Some(Ok('\n')) = self.query.peek() {
                        self.query.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    self.query.next(); // consume, to check the next char
                    match self.query.peek() {
                        Some(Ok('\'')) => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    self.query.next(); // consume, to check the next char
                    match self.query.peek() {
                        Some(Ok('\'')) => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.query.next(); // consume the first char
                    let s = self.tokenize_word(ch);
                    Ok(Some(Token::new(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string();
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    self.query.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|_tok, ch| ch != quote_end);
                    match self.query.next() {
                        Some(Ok(ch)) if ch == quote_end => {
                            Ok(Some(Token::new(&s, Some(quote_start))))
                        }
                        _ => Err(TokenizerError(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))),
                    }
                }
                // numbers
                '0'..='9' => {
                    // TODO: https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#unsigned-numeric-literal
                    let s = self.peeking_take_while(|_tok, ch| match ch {
                        '0'..='9' | '.' => true,
                        _ => false,
                    });
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                // operators
                '-' => {
                    self.query.next(); // consume the '-'
                    match self.query.peek() {
                        Some(Ok('-')) => {
                            self.query.next(); // consume the second '-', starting a single-line comment
                            let mut s = self.peeking_take_while(|_tok, ch| ch != '\n');
                            if let Some(Ok(ch)) = self.query.next() {
                                assert_eq!(ch, '\n');
                                s.push(ch);
                            }
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                        }
                        Some(Ok('0'..='9')) => {
                            let s = self.peeking_take_while(|_tok, ch| match ch {
                                '0'..='9' | '.' => true,
                                _ => false,
                            });
                            Ok(Some(Token::Number(format!("-{}", s))))
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    self.query.next(); // consume the '/'
                    match self.query.peek() {
                        Some(Ok('*')) => {
                            self.query.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment()
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(Token::Plus),
                '*' => self.consume_and_return(Token::Mult),
                '%' => self.consume_and_return(Token::Mod),
                '=' => self.consume_and_return(Token::Eq),
                '.' => self.consume_and_return(Token::Period),
                '!' => {
                    self.query.next(); // consume
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::Neq(['!', '='])),
                        _ => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                '<' => {
                    self.query.next(); // consume
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::LtEq),
                        Some(Ok('>')) => self.consume_and_return(Token::Neq(['<', '>'])),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    self.query.next(); // consume
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::GtEq),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok(':')) => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '&' => self.consume_and_return(Token::Ampersand),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                other => self.consume_and_return(Token::Char(other)),
            },
            _ => Ok(None),
        }
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&mut self, first_char: char) -> String {
        let mut s = first_char.to_string();
        s.push_str(&self.peeking_take_while(|tok, ch| tok.dialect.is_identifier_part(ch)));
        s
    }

    /// Read a single quoted string, starting with the opening quote.
    fn tokenize_single_quoted_string(&mut self) -> String {
        //TODO: handle newlines in string
        //TODO: handle EOF before terminating quote
        //TODO: handle 'string' <white space> 'string continuation'
        let chars = &mut self.query;
        let mut s = String::new();
        chars.next(); // consume the opening quote
        while let Some(Ok(ch)) = chars.peek() {
            match *ch {
                '\'' => {
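                    // A doubled quote (`''`) is an escaped quote and is kept
                    // verbatim so the literal round-trips unchanged.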
                    chars.next(); // consume
                    let escaped_quote = matches!(chars.peek(), Some(Ok('\'')));
                    if escaped_quote {
                        s.push('\'');
                        s.push('\'');
                        chars.next();
                    } else {
                        break;
                    }
                }
                '\\' => {
                    chars.next(); // consume the backslash
                    // Preserve recognized escape sequences verbatim; stop at
                    // EOF or an unrecognized escape rather than panicking.
                    let next_char = match chars.peek() {
                        Some(Ok(c)) => *c,
                        _ => break,
                    };
                    if ['\\', '\'', '"', 'n', 't', 'r', '0'].contains(&next_char) {
                        s.push('\\');
                        s.push(next_char);
                        chars.next();
                    } else {
                        break;
                    }
                }
                ch => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        s
    }

    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        // TODO: deal with nested comments
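        // `maybe_closing_comment` remembers that the previous character was
        // a '*': a following '/' closes the comment, otherwise the deferred
        // '*' is flushed into the comment body.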
        loop {
            match self.query.next() {
                Some(Ok(ch)) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                _ => {
                    break Err(TokenizerError(
                        "Unexpected EOF while in a multi-line comment".to_string(),
                    ));
                }
            }
        }
    }

    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.query.next();
        Ok(Some(t))
    }

    /// Read from `self.query` until `predicate` returns `false` or EOF is
    /// hit. Return the characters read as a String, and keep the first
    /// non-matching char available in the stream.
    fn peeking_take_while(
        &mut self,
        mut predicate: impl FnMut(&mut Tokenizer<'a, R, D>, char) -> bool,
    ) -> String {
        let mut s = String::new();
        while let Some(Ok(ch)) = self.query.peek() {
            let ch = *ch;
            if predicate(self, ch) {
                self.query.next(); // consume
                s.push(ch);
            } else {
                break;
            }
        }
        s
    }
}