1use fallible_iterator::FallibleIterator;
3use memchr::memchr;
4
5pub use crate::dialect::TokenType;
6use crate::dialect::TokenType::*;
7use crate::dialect::{
8 is_identifier_continue, is_identifier_start, keyword_token, sentinel, MAX_KEYWORD_LEN,
9};
10use crate::parser::ast::Cmd;
11use crate::parser::parse::{yyParser, YYCODETYPE};
12use crate::parser::Context;
13
14mod error;
15#[cfg(test)]
16mod test;
17
18use crate::lexer::scan::ScanError;
19use crate::lexer::scan::Splitter;
20use crate::lexer::Scanner;
21pub use crate::parser::ParserError;
22pub use error::Error;
23
24pub struct Parser<'input> {
29 input: &'input [u8],
30 scanner: Scanner<Tokenizer>,
31 parser: yyParser<'input>,
33 had_error: bool,
34}
35
36impl<'input> Parser<'input> {
37 pub fn new(input: &'input [u8]) -> Self {
39 let lexer = Tokenizer::new();
40 let scanner = Scanner::new(lexer);
41 let ctx = Context::new(input);
42 let parser = yyParser::new(ctx);
43 Parser {
44 input,
45 scanner,
46 parser,
47 had_error: false,
48 }
49 }
50 pub fn reset(&mut self, input: &'input [u8]) {
52 self.input = input;
53 self.scanner.reset();
54 self.had_error = false;
55 }
56 pub fn line(&self) -> u64 {
58 self.scanner.line()
59 }
60 pub fn column(&self) -> usize {
62 self.scanner.column()
63 }
64
65 pub fn offset(&self) -> usize {
67 self.scanner.offset()
68 }
69
70 pub fn finalize(&mut self) {
72 self.parser.sqlite3ParserFinalize();
73 }
74}
75
76fn get_token(scanner: &mut Scanner<Tokenizer>, input: &[u8]) -> Result<TokenType, Error> {
80 let mut t = {
81 let (_, token_type) = match scanner.scan(input)? {
82 (_, None, _) => {
83 return Ok(TK_EOF);
84 }
85 (_, Some(tuple), _) => tuple,
86 };
87 token_type
88 };
89 if t == TK_ID
90 || t == TK_STRING
91 || t == TK_JOIN_KW
92 || t == TK_WINDOW
93 || t == TK_OVER
94 || yyParser::parse_fallback(t as YYCODETYPE) == TK_ID as YYCODETYPE
95 {
96 t = TK_ID;
97 }
98 Ok(t)
99}
100
101fn analyze_window_keyword(
132 scanner: &mut Scanner<Tokenizer>,
133 input: &[u8],
134) -> Result<TokenType, Error> {
135 let t = get_token(scanner, input)?;
136 if t != TK_ID {
137 return Ok(TK_ID);
138 };
139 let t = get_token(scanner, input)?;
140 if t != TK_AS {
141 return Ok(TK_ID);
142 };
143 Ok(TK_WINDOW)
144}
145fn analyze_over_keyword(
146 scanner: &mut Scanner<Tokenizer>,
147 input: &[u8],
148 last_token: TokenType,
149) -> Result<TokenType, Error> {
150 if last_token == TK_RP {
151 let t = get_token(scanner, input)?;
152 if t == TK_LP || t == TK_ID {
153 return Ok(TK_OVER);
154 }
155 }
156 Ok(TK_ID)
157}
158fn analyze_filter_keyword(
159 scanner: &mut Scanner<Tokenizer>,
160 input: &[u8],
161 last_token: TokenType,
162) -> Result<TokenType, Error> {
163 if last_token == TK_RP && get_token(scanner, input)? == TK_LP {
164 return Ok(TK_FILTER);
165 }
166 Ok(TK_ID)
167}
168
/// Unwraps `$expr`, or returns early from the enclosing function with a positioned error.
///
/// On `Err`, the error is converted into [`Error`], stamped with the scanner's current
/// line, column and `offset - 1`, and returned.
macro_rules! try_with_position {
    ($scanner:expr, $expr:expr) => {
        match $expr {
            Ok(value) => value,
            Err(cause) => {
                let mut located = Error::from(cause);
                located.position($scanner.line(), $scanner.column(), $scanner.offset() - 1);
                return Err(located);
            }
        }
    };
}
181
182impl FallibleIterator for Parser<'_> {
183 type Item = Cmd;
184 type Error = Error;
185
186 fn next(&mut self) -> Result<Option<Cmd>, Error> {
187 if self.had_error {
190 return Ok(None);
191 }
192 self.parser.ctx.reset();
193 let mut last_token_parsed = TK_EOF;
194 let mut eof = false;
195 loop {
196 let (start, (value, mut token_type), end) = match self.scanner.scan(self.input)? {
197 (_, None, _) => {
198 eof = true;
199 break;
200 }
201 (start, Some(tuple), end) => (start, tuple, end),
202 };
203
204 if token_type == TK_ILLEGAL {
205 self.parser.sqlite3ParserFinalize();
207 self.had_error = true;
208 return Err(Error::UnrecognizedToken(
209 Some((self.scanner.line(), self.scanner.column())),
210 Some(start.into()),
211 ));
212 }
213
214 let token = if token_type >= TK_WINDOW {
215 debug_assert!(
216 token_type == TK_OVER || token_type == TK_FILTER || token_type == TK_WINDOW
217 );
218 self.scanner.mark();
219 if token_type == TK_WINDOW {
220 token_type = analyze_window_keyword(&mut self.scanner, self.input)?;
221 } else if token_type == TK_OVER {
222 token_type =
223 analyze_over_keyword(&mut self.scanner, self.input, last_token_parsed)?;
224 } else if token_type == TK_FILTER {
225 token_type =
226 analyze_filter_keyword(&mut self.scanner, self.input, last_token_parsed)?;
227 }
228 self.scanner.reset_to_mark();
229 token_type.to_token(start, value, end)
230 } else {
231 token_type.to_token(start, value, end)
232 };
233 try_with_position!(self.scanner, self.parser.sqlite3Parser(token_type, token));
235 last_token_parsed = token_type;
236 if self.parser.ctx.done() {
237 break;
239 }
240 }
241 if last_token_parsed == TK_EOF {
242 return Ok(None); }
244 if eof && self.parser.ctx.is_ok() {
247 if last_token_parsed != TK_SEMI {
248 try_with_position!(
249 self.scanner,
250 self.parser
251 .sqlite3Parser(TK_SEMI, sentinel(self.input.len()))
252 );
253 if self.parser.ctx.error().is_some() {
254 self.had_error = true;
255 }
256 }
257 try_with_position!(
258 self.scanner,
259 self.parser
260 .sqlite3Parser(TK_EOF, sentinel(self.input.len()))
261 );
262 if self.parser.ctx.error().is_some() {
263 self.had_error = true;
264 }
265 }
266 self.parser.sqlite3ParserFinalize();
267 if let Some(e) = self.parser.ctx.error() {
268 let err = Error::ParserError(
269 e,
270 Some((self.scanner.line(), self.scanner.column())),
271 Some((self.offset() - 1).into()),
272 );
273 self.had_error = true;
274 return Err(err);
275 }
276 let cmd = self.parser.ctx.cmd();
277 if let Some(ref cmd) = cmd {
278 if let Err(e) = cmd.check() {
279 let err = Error::ParserError(
280 e,
281 Some((self.scanner.line(), self.scanner.column())),
282 Some((self.offset() - 1).into()),
283 );
284 self.had_error = true;
285 return Err(err);
286 }
287 }
288 Ok(cmd)
289 }
290}
291
292pub type Token<'input> = (&'input [u8], TokenType);
294
295#[derive(Default)]
297pub struct Tokenizer {}
298
299impl Tokenizer {
300 pub fn new() -> Self {
302 Self {}
303 }
304}
305
306impl Splitter for Tokenizer {
318 type Error = Error;
319 type TokenType = TokenType;
320
321 fn split<'input>(
322 &mut self,
323 data: &'input [u8],
324 ) -> Result<(Option<Token<'input>>, usize), Error> {
325 if data[0].is_ascii_whitespace() {
326 return Ok((
328 None,
329 match data.iter().skip(1).position(|&b| !b.is_ascii_whitespace()) {
330 Some(i) => i + 1,
331 _ => data.len(),
332 },
333 ));
334 }
335 match data[0] {
336 b'-' => {
337 if let Some(b) = data.get(1) {
338 if *b == b'-' {
339 if let Some(i) = memchr(b'\n', data) {
341 Ok((None, i + 1))
342 } else {
343 Ok((None, data.len()))
344 }
345 } else if *b == b'>' {
346 if let Some(b) = data.get(2) {
347 if *b == b'>' {
348 return Ok((Some((&data[..3], TK_PTR)), 3));
349 }
350 }
351 Ok((Some((&data[..2], TK_PTR)), 2))
352 } else {
353 Ok((Some((&data[..1], TK_MINUS)), 1))
354 }
355 } else {
356 Ok((Some((&data[..1], TK_MINUS)), 1))
357 }
358 }
359 b'(' => Ok((Some((&data[..1], TK_LP)), 1)),
360 b')' => Ok((Some((&data[..1], TK_RP)), 1)),
361 b';' => Ok((Some((&data[..1], TK_SEMI)), 1)),
362 b'+' => Ok((Some((&data[..1], TK_PLUS)), 1)),
363 b'*' => Ok((Some((&data[..1], TK_STAR)), 1)),
364 b'/' => {
365 if let Some(b) = data.get(1) {
366 if *b == b'*' {
367 let mut pb = 0;
369 let mut end = None;
370 for (i, b) in data.iter().enumerate().skip(2) {
371 if *b == b'/' && pb == b'*' {
372 end = Some(i);
373 break;
374 }
375 pb = *b;
376 }
377 if let Some(i) = end {
378 Ok((None, i + 1))
379 } else {
380 Err(Error::UnterminatedBlockComment(None, None))
381 }
382 } else {
383 Ok((Some((&data[..1], TK_SLASH)), 1))
384 }
385 } else {
386 Ok((Some((&data[..1], TK_SLASH)), 1))
387 }
388 }
389 b'%' => Ok((Some((&data[..1], TK_REM)), 1)),
390 b'=' => {
391 if let Some(b) = data.get(1) {
392 Ok(if *b == b'=' {
393 (Some((&data[..2], TK_EQ)), 2)
394 } else {
395 (Some((&data[..1], TK_EQ)), 1)
396 })
397 } else {
398 Ok((Some((&data[..1], TK_EQ)), 1))
399 }
400 }
401 b'<' => {
402 if let Some(b) = data.get(1) {
403 Ok(match *b {
404 b'=' => (Some((&data[..2], TK_LE)), 2),
405 b'>' => (Some((&data[..2], TK_NE)), 2),
406 b'<' => (Some((&data[..2], TK_LSHIFT)), 2),
407 _ => (Some((&data[..1], TK_LT)), 1),
408 })
409 } else {
410 Ok((Some((&data[..1], TK_LT)), 1))
411 }
412 }
413 b'>' => {
414 if let Some(b) = data.get(1) {
415 Ok(match *b {
416 b'=' => (Some((&data[..2], TK_GE)), 2),
417 b'>' => (Some((&data[..2], TK_RSHIFT)), 2),
418 _ => (Some((&data[..1], TK_GT)), 1),
419 })
420 } else {
421 Ok((Some((&data[..1], TK_GT)), 1))
422 }
423 }
424 b'!' => {
425 if let Some(b) = data.get(1) {
426 if *b == b'=' {
427 Ok((Some((&data[..2], TK_NE)), 2))
428 } else {
429 Err(Error::ExpectedEqualsSign(None, None))
430 }
431 } else {
432 Err(Error::ExpectedEqualsSign(None, None))
433 }
434 }
435 b'|' => {
436 if let Some(b) = data.get(1) {
437 Ok(if *b == b'|' {
438 (Some((&data[..2], TK_CONCAT)), 2)
439 } else {
440 (Some((&data[..1], TK_BITOR)), 1)
441 })
442 } else {
443 Ok((Some((&data[..1], TK_BITOR)), 1))
444 }
445 }
446 b',' => Ok((Some((&data[..1], TK_COMMA)), 1)),
447 b'&' => Ok((Some((&data[..1], TK_BITAND)), 1)),
448 b'~' => Ok((Some((&data[..1], TK_BITNOT)), 1)),
449 quote @ (b'`' | b'\'' | b'"') => literal(data, quote),
450 b'.' => {
451 if let Some(b) = data.get(1) {
452 if b.is_ascii_digit() {
453 fractional_part(data, 0)
454 } else {
455 Ok((Some((&data[..1], TK_DOT)), 1))
456 }
457 } else {
458 Ok((Some((&data[..1], TK_DOT)), 1))
459 }
460 }
461 b'0'..=b'9' => number(data),
462 b'[' => {
463 if let Some(i) = memchr(b']', data) {
464 Ok((Some((&data[0..=i], TK_ID)), i + 1))
466 } else {
467 Err(Error::UnterminatedBracket(None, None))
468 }
469 }
470 b'?' => {
471 match data.iter().skip(1).position(|&b| !b.is_ascii_digit()) {
472 Some(i) => {
473 Ok((Some((&data[1..=i], TK_VARIABLE)), i + 1))
475 }
476 None => {
477 if !data[1..].is_empty() && data[1..].iter().all(|ch| *ch == b'0') {
478 return Err(Error::BadVariableName(None, None));
479 }
480 Ok((Some((&data[1..], TK_VARIABLE)), data.len()))
481 }
482 }
483 }
484 b'$' | b'@' | b'#' | b':' => {
485 match data
486 .iter()
487 .skip(1)
488 .position(|&b| !is_identifier_continue(b))
489 {
490 Some(0) => Err(Error::BadVariableName(None, None)),
491 Some(i) => {
492 Ok((Some((&data[..=i], TK_VARIABLE)), i + 1))
494 }
495 None => {
496 if data.len() == 1 {
497 return Err(Error::BadVariableName(None, None));
498 }
499 Ok((Some((data, TK_VARIABLE)), data.len()))
500 }
501 }
502 }
503 b if is_identifier_start(b) => {
504 if b == b'x' || b == b'X' {
505 if let Some(&b'\'') = data.get(1) {
506 blob_literal(data)
507 } else {
508 Ok(self.identifierish(data))
509 }
510 } else {
511 Ok(self.identifierish(data))
512 }
513 }
514 _ => handle_unrecognized(data),
516 }
517 }
518}
519
520fn handle_unrecognized(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
521 let mut end = 1;
522 while end < data.len() && !data[end].is_ascii_whitespace() {
523 end += 1;
524 }
525
526 Ok((Some((&data[..end], TokenType::TK_ILLEGAL)), end))
527}
528
529fn literal(data: &[u8], quote: u8) -> Result<(Option<Token<'_>>, usize), Error> {
530 debug_assert_eq!(data[0], quote);
531 let tt = if quote == b'\'' { TK_STRING } else { TK_ID };
532 let mut pb = 0;
533 let mut end = None;
534 for (i, b) in data.iter().enumerate().skip(1) {
536 if *b == quote {
537 if pb == quote {
538 pb = 0;
540 continue;
541 }
542 } else if pb == quote {
543 end = Some(i);
544 break;
545 }
546 pb = *b;
547 }
548 if end.is_some() || pb == quote {
549 let i = match end {
550 Some(i) => i,
551 _ => data.len(),
552 };
553 Ok((Some((&data[0..i], tt)), i))
555 } else {
556 Err(Error::UnterminatedLiteral(None, None))
557 }
558}
559
560fn blob_literal(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
561 debug_assert!(data[0] == b'x' || data[0] == b'X');
562 debug_assert_eq!(data[1], b'\'');
563
564 let mut end = 2;
565 let mut valid = true;
566 while end < data.len() && data[end] != b'\'' {
567 if !data[end].is_ascii_hexdigit() {
568 valid = false;
569 }
570 end += 1;
571 }
572
573 let total_len = if end < data.len() { end + 1 } else { end };
574
575 if !valid || (end - 2) % 2 != 0 || end >= data.len() {
576 return Ok((Some((&data[..total_len], TokenType::TK_ILLEGAL)), total_len));
577 }
578
579 Ok((Some((&data[2..end], TokenType::TK_BLOB)), total_len))
580}
581
582fn number(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
583 debug_assert!(data[0].is_ascii_digit());
584 if data[0] == b'0' {
585 if let Some(b) = data.get(1) {
586 if *b == b'x' || *b == b'X' {
587 return hex_integer(data);
588 }
589 } else {
590 return Ok((Some((data, TK_INTEGER)), data.len()));
591 }
592 }
593 if let Some((i, b)) = find_end_of_number(data, 1, u8::is_ascii_digit)? {
594 if b == b'.' {
595 return fractional_part(data, i);
596 } else if b == b'e' || b == b'E' {
597 return exponential_part(data, i);
598 } else if is_identifier_start(b) {
599 return Err(Error::BadNumber(None, None, Some(i + 1), unsafe {
600 String::from_utf8_unchecked(data[..i + 1].to_vec())
601 }));
602 }
603 Ok((Some((&data[..i], TK_INTEGER)), i))
604 } else {
605 Ok((Some((data, TK_INTEGER)), data.len()))
606 }
607}
608
609fn hex_integer(data: &[u8]) -> Result<(Option<Token<'_>>, usize), Error> {
610 debug_assert_eq!(data[0], b'0');
611 debug_assert!(data[1] == b'x' || data[1] == b'X');
612 if let Some((i, b)) = find_end_of_number(data, 2, u8::is_ascii_hexdigit)? {
613 if i == 2 || is_identifier_start(b) {
615 let (len, help) = if i == 2 && !is_identifier_start(b) {
616 (i, "Did you forget to add digits after '0x' or '0X'?")
617 } else {
618 (i + 1, "There are some invalid digits after '0x' or '0X'")
619 };
620 return Err(Error::MalformedHexInteger(
621 None,
622 None,
623 Some(len), Some(help), ));
626 }
627 Ok((Some((&data[..i], TK_INTEGER)), i))
628 } else {
629 if data.len() == 2 {
631 return Err(Error::MalformedHexInteger(
632 None,
633 None,
634 Some(2), Some("Did you forget to add digits after '0x' or '0X'?"), ));
637 }
638 Ok((Some((data, TK_INTEGER)), data.len()))
639 }
640}
641
642fn fractional_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize), Error> {
643 debug_assert_eq!(data[i], b'.');
644 if let Some((i, b)) = find_end_of_number(data, i + 1, u8::is_ascii_digit)? {
645 if b == b'e' || b == b'E' {
646 return exponential_part(data, i);
647 } else if is_identifier_start(b) {
648 return Err(Error::BadNumber(None, None, Some(i + 1), unsafe {
649 String::from_utf8_unchecked(data[..i + 1].to_vec())
650 }));
651 }
652 Ok((Some((&data[..i], TK_FLOAT)), i))
653 } else {
654 Ok((Some((data, TK_FLOAT)), data.len()))
655 }
656}
657
658fn exponential_part(data: &[u8], i: usize) -> Result<(Option<Token<'_>>, usize), Error> {
659 debug_assert!(data[i] == b'e' || data[i] == b'E');
660 if let Some(b) = data.get(i + 1) {
662 let i = if *b == b'+' || *b == b'-' { i + 1 } else { i };
663 if let Some((j, b)) = find_end_of_number(data, i + 1, u8::is_ascii_digit)? {
664 if j == i + 1 || is_identifier_start(b) {
665 let len = if is_identifier_start(b) { j + 1 } else { j };
666 return Err(Error::BadNumber(None, None, Some(len), unsafe {
667 String::from_utf8_unchecked(data[..len].to_vec())
668 }));
669 }
670 Ok((Some((&data[..j], TK_FLOAT)), j))
671 } else {
672 if data.len() == i + 1 {
673 return Err(Error::BadNumber(None, None, Some(i + 1), unsafe {
674 String::from_utf8_unchecked(data[..i + 1].to_vec())
675 }));
676 }
677 Ok((Some((data, TK_FLOAT)), data.len()))
678 }
679 } else {
680 Err(Error::BadNumber(None, None, Some(data.len()), unsafe {
681 String::from_utf8_unchecked(data.to_vec())
682 }))
683 }
684}
685
686fn find_end_of_number(
687 data: &[u8],
688 i: usize,
689 test: fn(&u8) -> bool,
690) -> Result<Option<(usize, u8)>, Error> {
691 for (j, &b) in data.iter().enumerate().skip(i) {
692 if test(&b) {
693 continue;
694 } else if b == b'_' {
695 if j >= 1 && data.get(j - 1).map_or(false, test) && data.get(j + 1).map_or(false, test)
696 {
697 continue;
698 }
699 return Err(Error::BadNumber(None, None, Some(j), unsafe {
700 String::from_utf8_unchecked(data[..j].to_vec())
701 }));
702 } else {
703 return Ok(Some((j, b)));
704 }
705 }
706 Ok(None)
707}
708
709impl Tokenizer {
710 fn identifierish<'input>(&mut self, data: &'input [u8]) -> (Option<Token<'input>>, usize) {
711 debug_assert!(is_identifier_start(data[0]));
712 let end = data
714 .iter()
715 .skip(1)
716 .position(|&b| !is_identifier_continue(b));
717 let i = match end {
718 Some(i) => i + 1,
719 _ => data.len(),
720 };
721 let word = &data[..i];
722 let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() {
723 keyword_token(word).unwrap_or(TK_ID)
724 } else {
725 TK_ID
726 };
727 (Some((word, tt)), i)
728 }
729}
730
#[cfg(test)]
mod tests {
    use super::Tokenizer;
    use crate::dialect::TokenType;
    use crate::lexer::sql::Error;
    use crate::lexer::Scanner;

    /// The scanner yields a keyword token followed by an identifier token.
    #[test]
    fn fallible_iterator() -> Result<(), Error> {
        let input = b"PRAGMA parser_trace=ON;";
        let mut scanner = Scanner::new(Tokenizer::new());
        expect_token(&mut scanner, input, b"PRAGMA", TokenType::TK_PRAGMA)?;
        expect_token(&mut scanner, input, b"parser_trace", TokenType::TK_ID)?;
        Ok(())
    }

    /// An exponent marker without digits is reported as a bad number.
    #[test]
    fn invalid_number_literal() -> Result<(), Error> {
        let input = b"SELECT 1E;";
        let mut scanner = Scanner::new(Tokenizer::new());
        expect_token(&mut scanner, input, b"SELECT", TokenType::TK_SELECT)?;
        let failure = scanner.scan(input).unwrap_err();
        assert!(matches!(failure, Error::BadNumber(_, _, _, _)));
        Ok(())
    }

    /// Scans one token and checks both its text and its type.
    fn expect_token(
        s: &mut Scanner<Tokenizer>,
        input: &[u8],
        token: &[u8],
        token_type: TokenType,
    ) -> Result<(), Error> {
        let (_, scanned, _) = s.scan(input)?;
        let (text, kind) = scanned.unwrap();
        assert_eq!(token, text);
        assert_eq!(token_type, kind);
        Ok(())
    }
}