#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerState {
Normal,
SingleQuoted,
SingleQuotedBackslash,
SingleQuotedMaybeEnd,
Backtick,
DoubleQuoted,
SquareBracket,
MaybeDash,
LineComment,
MaybeBlockStart,
BlockComment,
MaybeBlockEnd,
ConditionalComment,
ConditionalMaybeEnd,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerEvent {
None,
Semicolon,
}
impl LexerState {
#[inline(always)]
pub fn feed(self, b: u8) -> (LexerState, LexerEvent) {
use LexerState::*;
match (self, b) {
(Normal, b'\'') => (SingleQuoted, LexerEvent::None),
(Normal, b'`') => (Backtick, LexerEvent::None),
(Normal, b'"') => (DoubleQuoted, LexerEvent::None),
(Normal, b'[') => (SquareBracket, LexerEvent::None),
(Normal, b'-') => (MaybeDash, LexerEvent::None),
(Normal, b'/') => (MaybeBlockStart, LexerEvent::None),
(Normal, b';') => (Normal, LexerEvent::Semicolon),
(Normal, _) => (Normal, LexerEvent::None),
(SingleQuoted, b'\\') => (SingleQuotedBackslash, LexerEvent::None),
(SingleQuoted, b'\'') => (SingleQuotedMaybeEnd, LexerEvent::None),
(SingleQuoted, _) => (SingleQuoted, LexerEvent::None),
(SingleQuotedBackslash, _) => (SingleQuoted, LexerEvent::None),
(SingleQuotedMaybeEnd, b'\'') => (SingleQuoted, LexerEvent::None),
(SingleQuotedMaybeEnd, b';') => (Normal, LexerEvent::Semicolon),
(SingleQuotedMaybeEnd, b'`') => (Backtick, LexerEvent::None),
(SingleQuotedMaybeEnd, b'"') => (DoubleQuoted, LexerEvent::None),
(SingleQuotedMaybeEnd, b'[') => (SquareBracket, LexerEvent::None),
(SingleQuotedMaybeEnd, b'-') => (MaybeDash, LexerEvent::None),
(SingleQuotedMaybeEnd, b'/') => (MaybeBlockStart, LexerEvent::None),
(SingleQuotedMaybeEnd, _) => (Normal, LexerEvent::None),
(Backtick, b'`') => (Normal, LexerEvent::None),
(Backtick, _) => (Backtick, LexerEvent::None),
(DoubleQuoted, b'"') => (Normal, LexerEvent::None),
(DoubleQuoted, _) => (DoubleQuoted, LexerEvent::None),
(SquareBracket, b']') => (Normal, LexerEvent::None),
(SquareBracket, _) => (SquareBracket, LexerEvent::None),
(MaybeDash, b'-') => (LineComment, LexerEvent::None),
(MaybeDash, b';') => (Normal, LexerEvent::Semicolon),
(MaybeDash, b'\'') => (SingleQuoted, LexerEvent::None),
(MaybeDash, _) => (Normal, LexerEvent::None),
(LineComment, b'\n') => (Normal, LexerEvent::None),
(LineComment, _) => (LineComment, LexerEvent::None),
(MaybeBlockStart, b'*') => (BlockComment, LexerEvent::None),
(MaybeBlockStart, b';') => (Normal, LexerEvent::Semicolon),
(MaybeBlockStart, b'\'') => (SingleQuoted, LexerEvent::None),
(MaybeBlockStart, _) => (Normal, LexerEvent::None),
(BlockComment, b'*') => (MaybeBlockEnd, LexerEvent::None),
(BlockComment, _) => (BlockComment, LexerEvent::None),
(MaybeBlockEnd, b'/') => (Normal, LexerEvent::None),
(MaybeBlockEnd, b'*') => (MaybeBlockEnd, LexerEvent::None),
(MaybeBlockEnd, _) => (BlockComment, LexerEvent::None),
(ConditionalComment, b'*') => (ConditionalMaybeEnd, LexerEvent::None),
(ConditionalComment, b'\'') => (SingleQuoted, LexerEvent::None),
(ConditionalComment, b'`') => (Backtick, LexerEvent::None),
(ConditionalComment, b'"') => (DoubleQuoted, LexerEvent::None),
(ConditionalComment, _) => (ConditionalComment, LexerEvent::None),
(ConditionalMaybeEnd, b'/') => (Normal, LexerEvent::None),
(ConditionalMaybeEnd, b'*') => (ConditionalMaybeEnd, LexerEvent::None),
(ConditionalMaybeEnd, b'\'') => (SingleQuoted, LexerEvent::None),
(ConditionalMaybeEnd, _) => (ConditionalComment, LexerEvent::None),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn scan(input: &[u8]) -> (LexerState, Vec<usize>) {
let mut state = LexerState::Normal;
let mut semicolons = Vec::new();
for (i, &b) in input.iter().enumerate() {
let (new_state, event) = state.feed(b);
state = new_state;
if event == LexerEvent::Semicolon {
semicolons.push(i);
}
}
(state, semicolons)
}
#[test]
fn simple_statements() {
let (_, scs) = scan(b"SELECT 1; SELECT 2;");
assert_eq!(scs, vec![8, 18]);
}
#[test]
fn semicolon_in_single_quoted_string() {
let input = b"INSERT INTO t VALUES ('a;b');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn double_single_quote_escape() {
let input = b"INSERT INTO t VALUES ('it''s');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn backslash_escape_in_string() {
let input = b"INSERT INTO t VALUES ('a\\'b');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn backslash_at_end_of_string() {
let input = b"SELECT 'hello\\\\';";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn backtick_identifier() {
let input = b"INSERT INTO `ta;ble` VALUES (1);";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn double_quoted_identifier() {
let input = b"INSERT INTO \"ta;ble\" VALUES (1);";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn square_bracket_identifier() {
let input = b"INSERT INTO [ta;ble] VALUES (1);";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn mssql_nstring() {
let input = b"INSERT INTO [t] VALUES (N'hello;world');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn line_comment() {
let input = b"-- this is ; a comment\nSELECT 1;";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn block_comment() {
let input = b"/* ; */ SELECT 1;";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn nested_star_in_block_comment() {
let input = b"/* * ; ** */ SELECT 1;";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn conditional_comment_no_semicolons() {
let input = b"/*!50001 SET NAMES utf8; */;";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn string_with_parentheses_and_commas() {
let input = b"INSERT INTO t VALUES ('(a,b)','c');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn empty_string() {
let input = b"INSERT INTO t VALUES ('');";
let (_, scs) = scan(input);
assert_eq!(scs, vec![input.len() - 1]);
}
#[test]
fn multiple_inserts() {
let input = b"INSERT INTO t VALUES (1); INSERT INTO t VALUES (2);";
let (_, scs) = scan(input);
assert_eq!(scs, vec![24, 50]);
}
}