use super::token::{
is_keyword, is_operator, is_operator_char, is_punctuator, punctuator_str, Position, Token,
TokenType,
};
use crate::common::SmartString;
/// Hand-written lexer that walks UTF-8 source text one character at a time
/// and hands out tokens through `next_token`.
pub struct Lexer {
/// Raw bytes of the source text, copied from the `&str` passed to `new`.
input: Box<[u8]>,
/// Byte offset in `input` of the character currently held in `ch`.
position: usize,
/// Byte offset in `input` of the next character to decode.
read_position: usize,
/// Character under the cursor. It is `'\0'` once the end of input has been
/// reached; a literal NUL byte in the input also decodes to `'\0'`.
ch: char,
/// Offset / line / column of `ch`; copied into each token as its position.
pos: Position,
/// Most recent error message recorded by one of the `read_*` helpers, if any.
last_error: Option<String>,
}
impl Lexer {
pub fn new(input: &str) -> Self {
let bytes: Box<[u8]> = input.as_bytes().into();
let mut lexer = Self {
input: bytes,
position: 0,
read_position: 0,
ch: '\0',
pos: Position::new(0, 1, 1),
last_error: None,
};
lexer.read_char();
lexer
}
#[inline]
fn decode_char_at(&self, pos: usize) -> (char, usize) {
if pos >= self.input.len() {
return ('\0', 0);
}
let b = self.input[pos];
if b < 0x80 {
return (b as char, 1);
}
let remaining = &self.input[pos..];
match std::str::from_utf8(remaining) {
Ok(s) => {
if let Some(c) = s.chars().next() {
(c, c.len_utf8())
} else {
('\0', 0)
}
}
Err(_) => {
('\u{FFFD}', 1)
}
}
}
/// Advances the cursor by one character.
///
/// Line and column are updated according to the character being left
/// behind: a newline starts a new line at column 1, any other character
/// except `'\0'` moves one column to the right. At the end of input `ch`
/// becomes `'\0'`.
fn read_char(&mut self) {
    match self.ch {
        '\n' => {
            self.pos.line += 1;
            self.pos.column = 1;
        }
        '\0' => {}
        _ => self.pos.column += 1,
    }
    // The new current character starts where the previous read stopped.
    self.position = self.read_position;
    if self.position < self.input.len() {
        let (next, width) = self.decode_char_at(self.position);
        self.ch = next;
        self.read_position += width;
    } else {
        self.ch = '\0';
    }
    self.pos.offset = self.position;
}
/// Returns the character right after the current one without consuming
/// anything (`'\0'` when there is none).
#[inline]
fn peek_char(&self) -> char {
    let (upcoming, _width) = self.decode_char_at(self.read_position);
    upcoming
}
/// Looks `n` characters ahead of the current one without consuming input.
///
/// `n == 0` yields the current character and `n == 1` is the same as
/// `peek_char`. Returns `'\0'` when the input ends before that character.
fn peek_char_n(&self, n: usize) -> char {
    if n == 0 {
        return self.ch;
    }
    // Skip the n - 1 characters that sit between the cursor and the target.
    let mut offset = self.read_position;
    let mut to_skip = n - 1;
    while to_skip > 0 {
        if offset >= self.input.len() {
            return '\0';
        }
        offset += self.decode_char_at(offset).1;
        to_skip -= 1;
    }
    self.decode_char_at(offset).0
}
/// Reports whether the character two positions ahead of the cursor is
/// whitespace (space, tab, CR, LF) or the end of input.
///
/// `next_token` calls this with the cursor on the first dash of `--`, so the
/// inspected character is the one following the second dash.
fn is_comment_start_after_dashes(&self) -> bool {
    matches!(self.peek_char_n(2), '\0' | ' ' | '\t' | '\n' | '\r')
}
/// Scans and returns the next token, skipping any leading whitespace.
///
/// Returns an EOF token once the input is exhausted. Whenever one of the
/// `read_*` helpers records an error while scanning (unterminated string,
/// quoted identifier or block comment, exponent without digits, ...), the
/// result is an error token positioned at the start of the offending lexeme
/// and the pending error is cleared, so it can never be attributed to a
/// later token.
pub fn next_token(&mut self) -> Token {
    self.skip_whitespace();
    let pos = self.pos;
    let token = match self.ch {
        // `ch` is '\0' both at the real end of input and on an embedded NUL
        // byte. Only the former is EOF; a NUL byte falls through to the
        // "unrecognized character" arm instead of silently truncating the
        // rest of the input.
        '\0' if self.position >= self.input.len() => Token::eof(pos),
        '\'' => {
            let literal = self.read_string_literal();
            Token::new(TokenType::String, literal, pos)
        }
        '"' => {
            let literal = self.read_quoted_identifier('"');
            Token::new_quoted(TokenType::Identifier, literal, pos)
        }
        '`' => {
            let literal = self.read_quoted_identifier('`');
            Token::new(TokenType::Identifier, literal, pos)
        }
        c if c.is_ascii_digit() => {
            let literal = self.read_number();
            // A fraction or an exponent makes the literal a float.
            if literal.contains('.') || literal.contains('e') || literal.contains('E') {
                Token::new(TokenType::Float, literal, pos)
            } else {
                Token::new(TokenType::Integer, literal, pos)
            }
        }
        '#' => {
            let literal = self.read_line_comment();
            Token::new(TokenType::Comment, literal, pos)
        }
        // `--` only opens a comment when followed by whitespace or the end
        // of input; otherwise it is lexed as two minus operators.
        '-' if self.peek_char() == '-' && self.is_comment_start_after_dashes() => {
            let literal = self.read_line_comment();
            Token::new(TokenType::Comment, literal, pos)
        }
        '/' if self.peek_char() == '*' => {
            let literal = self.read_block_comment();
            Token::new(TokenType::Comment, literal, pos)
        }
        '$' if self.peek_char().is_ascii_digit() => {
            let literal = self.read_parameter();
            Token::new(TokenType::Parameter, literal, pos)
        }
        '?' => {
            self.read_char();
            Token::new(TokenType::Parameter, "?", pos)
        }
        ':' if self.peek_char().is_alphabetic() || self.peek_char() == '_' => {
            let literal = self.read_named_parameter();
            Token::new(TokenType::Parameter, literal, pos)
        }
        '*' => {
            self.read_char();
            Token::new(TokenType::Operator, "*", pos)
        }
        c if is_punctuator(c) => {
            self.read_char();
            Token::new(TokenType::Punctuator, punctuator_str(c).unwrap(), pos)
        }
        c if is_operator_char(c) => {
            let literal = self.read_operator();
            Token::new(TokenType::Operator, literal, pos)
        }
        c if c.is_alphabetic() || c == '_' => {
            let literal = self.read_identifier();
            if is_keyword(&literal) {
                // Keywords are normalised to upper case.
                Token::new(TokenType::Keyword, literal.to_uppercase(), pos)
            } else {
                Token::new(TokenType::Identifier, literal, pos)
            }
        }
        c => {
            // Consume the offending character so lexing can continue.
            self.read_char();
            Token::error(
                format!("unrecognized character: {:?}", c),
                c.to_string(),
                pos,
            )
        }
    };
    // The read_* helpers report failures out of band through `last_error`.
    // Checking it here, for every token kind, guarantees each error surfaces
    // on the token that caused it.
    match self.last_error.take() {
        Some(err) => Token::error(err, "", pos),
        None => token,
    }
}
/// Advances the cursor past any run of whitespace characters.
fn skip_whitespace(&mut self) {
    // "\r\n" needs no special handling: both characters are whitespace, so
    // the loop steps over them one after the other.
    while self.ch.is_whitespace() {
        self.read_char();
    }
}
/// Reads an unquoted identifier or keyword starting at the current
/// character.
///
/// The first character is taken unconditionally; every following
/// alphanumeric character, `_` or `$` is appended.
fn read_identifier(&mut self) -> SmartString {
    let mut ident = SmartString::new("");
    loop {
        ident.push(self.ch);
        self.read_char();
        let next = self.ch;
        if !(next.is_alphanumeric() || next == '_' || next == '$') {
            break;
        }
    }
    ident
}
/// Reads a numeric literal: digits, an optional `.digits` fraction and an
/// optional exponent (`e`/`E`, optional sign, digits).
///
/// When the exponent marker is not followed by at least one digit,
/// `last_error` is set and the text consumed so far is returned.
fn read_number(&mut self) -> SmartString {
    let mut text = SmartString::new("");

    // Integer part; the first character is taken unconditionally.
    loop {
        text.push(self.ch);
        self.read_char();
        if !self.ch.is_ascii_digit() {
            break;
        }
    }

    // Fraction: the dot only belongs to the number when a digit follows it.
    if self.ch == '.' && self.peek_char().is_ascii_digit() {
        text.push(self.ch);
        self.read_char();
        while self.ch.is_ascii_digit() {
            text.push(self.ch);
            self.read_char();
        }
    }

    // Exponent.
    if !matches!(self.ch, 'e' | 'E') {
        return text;
    }
    text.push(self.ch);
    self.read_char();
    if matches!(self.ch, '+' | '-') {
        text.push(self.ch);
        self.read_char();
    }
    if !self.ch.is_ascii_digit() {
        self.last_error = Some(String::from(
            "invalid number format: exponent has no digits",
        ));
        return text;
    }
    while self.ch.is_ascii_digit() {
        text.push(self.ch);
        self.read_char();
    }
    text
}
/// Reads a quoted string literal that starts at the current quote
/// character.
///
/// The returned text includes the surrounding quotes. A doubled quote inside
/// the literal is collapsed into a single quote; a backslash and the
/// character following it are copied verbatim.
///
/// On failure `last_error` is set ("unterminated string literal" at end of
/// input, or a NUL-byte message) and the text read so far is returned with a
/// closing quote appended.
fn read_string_literal(&mut self) -> SmartString {
    let quote = self.ch;
    let quote_byte = quote as u8;
    let start_pos = self.position;
    let content_start = self.read_position;
    let input_len = self.input.len();

    // Fast path: scan the raw bytes for the closing quote. If the literal
    // holds no backslash, NUL byte or doubled quote, the token text is just
    // a slice of the input and no character-by-character copy is needed.
    let mut scan_pos = content_start;
    while scan_pos < input_len {
        let b = self.input[scan_pos];
        if b == b'\\' || b == 0 {
            break;
        }
        if b == quote_byte {
            if self.input.get(scan_pos + 1) == Some(&quote_byte) {
                // Doubled quote: needs the slow path to collapse it.
                break;
            }
            let end_pos = scan_pos + 1;

            // Replay the bookkeeping `read_char` would have done: one step
            // for leaving the opening quote, one per content *character*
            // (UTF-8 continuation bytes are skipped), and one for leaving
            // the closing quote.
            self.pos.column += 1;
            for &byte in &self.input[content_start..scan_pos] {
                if byte == b'\n' {
                    self.pos.line += 1;
                    self.pos.column = 1;
                } else if byte & 0xC0 != 0x80 {
                    self.pos.column += 1;
                }
            }
            self.pos.column += 1;

            // Load the character that follows the closing quote.
            self.position = end_pos;
            if end_pos < input_len {
                let (ch, len) = self.decode_char_at(end_pos);
                self.ch = ch;
                self.read_position = end_pos + len;
            } else {
                self.ch = '\0';
                self.read_position = end_pos;
            }
            self.pos.offset = self.position;

            // SAFETY: `input` holds the bytes of the `&str` given to `new`
            // and is never modified afterwards, so it is valid UTF-8. Both
            // ends of the range sit on ASCII quote bytes, which are always
            // character boundaries.
            let slice =
                unsafe { std::str::from_utf8_unchecked(&self.input[start_pos..end_pos]) };
            return SmartString::new(slice);
        }
        scan_pos += 1;
    }

    // Slow path: copy character by character, handling escapes and errors.
    let estimated_len = if scan_pos > content_start {
        scan_pos - start_pos + 2
    } else {
        32
    };
    let mut result = String::with_capacity(estimated_len);
    result.push(quote);
    self.read_char();
    loop {
        match self.ch {
            '\0' => {
                // '\0' means either end of input or a literal NUL byte.
                let message = if self.position >= self.input.len() {
                    "unterminated string literal"
                } else {
                    "NULL byte (0x00) is not allowed in string literals"
                };
                self.last_error = Some(message.to_string());
                result.push(quote);
                break;
            }
            c if c == quote => {
                result.push(quote);
                let doubled = self.peek_char() == quote;
                self.read_char();
                if !doubled {
                    // That was the closing quote.
                    break;
                }
                // Skip the second quote of the doubled pair.
                self.read_char();
            }
            '\\' => {
                // Keep the backslash and the escaped character as written.
                result.push('\\');
                self.read_char();
                if self.ch != '\0' {
                    result.push(self.ch);
                    self.read_char();
                }
            }
            c => {
                result.push(c);
                self.read_char();
            }
        }
    }
    SmartString::from_string(result)
}
/// Reads an identifier delimited by `quote`, starting at the opening
/// delimiter, and returns its content without the delimiters.
///
/// A doubled delimiter inside the identifier stands for one literal
/// delimiter character. If the closing delimiter is missing, `last_error`
/// is set (unterminated identifier at end of input, NUL-byte message
/// otherwise) and the text read so far is returned.
fn read_quoted_identifier(&mut self, quote: char) -> SmartString {
    let mut name = SmartString::new("");
    // Step over the opening delimiter.
    self.read_char();
    loop {
        let current = self.ch;
        if current == '\0' {
            break;
        }
        if current == quote {
            if self.peek_char() != quote {
                // Lone delimiter: end of the identifier.
                break;
            }
            name.push(current);
            self.read_char();
            self.read_char();
        } else {
            name.push(current);
            self.read_char();
        }
    }
    if self.ch == quote {
        // Consume the closing delimiter.
        self.read_char();
        return name;
    }
    let message = if self.position >= self.input.len() {
        format!("unterminated quoted identifier starting with {}", quote)
    } else {
        "NULL byte (0x00) is not allowed in quoted identifiers".to_string()
    };
    self.last_error = Some(message);
    name
}
/// Reads a line comment (`-- ...` or `# ...`) up to, but not including, the
/// terminating newline or the end of input. The comment marker is part of
/// the returned text.
fn read_line_comment(&mut self) -> SmartString {
    let mut text = SmartString::new("");
    let opener = self.ch;
    text.push(opener);
    if opener == '-' && self.peek_char() == '-' {
        // Two-character marker: take the second dash as well.
        self.read_char();
        text.push(self.ch);
        self.read_char();
    } else if opener == '#' {
        self.read_char();
    }
    loop {
        match self.ch {
            '\n' | '\0' => break,
            c => {
                text.push(c);
                self.read_char();
            }
        }
    }
    text
}
/// Reads a `/* ... */` block comment starting at the `/`, returning the text
/// including both delimiters.
///
/// If a `'\0'` is reached before the closing `*/`, `last_error` is set to
/// "unterminated block comment" and the text read so far is returned.
fn read_block_comment(&mut self) -> SmartString {
    let mut text = SmartString::new("");
    // Opening "/*".
    for _ in 0..2 {
        text.push(self.ch);
        self.read_char();
    }
    loop {
        if self.ch == '\0' {
            self.last_error = Some("unterminated block comment".to_string());
            return text;
        }
        if self.ch == '*' && self.peek_char() == '/' {
            // Closing "*/".
            for _ in 0..2 {
                text.push(self.ch);
                self.read_char();
            }
            return text;
        }
        text.push(self.ch);
        self.read_char();
    }
}
/// Reads an operator of one to three characters starting at the current
/// character.
///
/// The first character is always taken. A second character is added only if
/// the pair is a known operator, and a third only if the triple is one too.
fn read_operator(&mut self) -> SmartString {
    let head = self.ch;
    let mut op = SmartString::new("");
    op.push(head);
    self.read_char();
    if self.ch == '\0' {
        return op;
    }

    let pair: SmartString = SmartString::from_iter([head, self.ch].iter().copied());
    if !is_operator(&pair) {
        return op;
    }
    op.push(self.ch);
    self.read_char();
    if self.ch == '\0' {
        return op;
    }

    let mut triple = pair.clone();
    triple.push(self.ch);
    if is_operator(&triple) {
        op.push(self.ch);
        self.read_char();
    }
    op
}
/// Reads a positional parameter: the current character followed by any run
/// of ASCII digits (for example `$12`).
///
/// Sets `last_error` when nothing was appended after the first character.
fn read_parameter(&mut self) -> SmartString {
    let mut param = SmartString::new("");
    param.push(self.ch);
    self.read_char();
    loop {
        let c = self.ch;
        if !c.is_ascii_digit() {
            break;
        }
        param.push(c);
        self.read_char();
    }
    if param.len() == 1 {
        self.last_error = Some(String::from("parameter number expected after $"));
    }
    param
}
/// Reads a named parameter: the current character (the `:` marker when
/// called from `next_token`) followed by any run of alphanumeric characters
/// or underscores.
fn read_named_parameter(&mut self) -> SmartString {
    let mut param = SmartString::new("");
    param.push(self.ch);
    self.read_char();
    loop {
        let c = self.ch;
        if !(c.is_alphanumeric() || c == '_') {
            break;
        }
        param.push(c);
        self.read_char();
    }
    param
}
/// Returns the currently recorded error message, if there is one.
pub fn get_error(&self) -> Option<&str> {
    self.last_error.as_ref().map(String::as_str)
}
/// Returns the next token without consuming it.
///
/// The complete lexer state is snapshotted and restored around the scan,
/// including the pending error, so peeking has no observable effect on
/// later calls to `next_token` or `get_error`.
pub fn peek_token(&mut self) -> Token {
    let saved_position = self.position;
    let saved_read_position = self.read_position;
    let saved_ch = self.ch;
    let saved_pos = self.pos;
    // Cloned rather than taken, so the peeked scan starts from exactly the
    // state a real `next_token` call would see.
    let saved_error = self.last_error.clone();

    let token = self.next_token();

    self.position = saved_position;
    self.read_position = saved_read_position;
    self.ch = saved_ch;
    self.pos = saved_pos;
    self.last_error = saved_error;
    token
}
/// Returns up to `n` upcoming tokens without consuming them.
///
/// Scanning stops early at the end of input; in that case the EOF token is
/// the last element of the result. The complete lexer state, including the
/// pending error, is restored before returning.
pub fn peek_tokens(&mut self, n: usize) -> Vec<Token> {
    if n == 0 {
        return Vec::new();
    }
    let saved_position = self.position;
    let saved_read_position = self.read_position;
    let saved_ch = self.ch;
    let saved_pos = self.pos;
    let saved_error = self.last_error.clone();

    // Every non-EOF token consumes at least one byte, so the result can never
    // hold more than (remaining bytes + 1) tokens. Capping the reservation
    // keeps a huge `n` from triggering a capacity-overflow panic.
    let remaining = self.input.len().saturating_sub(self.position);
    let mut tokens = Vec::with_capacity(n.min(remaining.saturating_add(1)));
    for _ in 0..n {
        let token = self.next_token();
        let at_eof = token.is_eof();
        tokens.push(token);
        if at_eof {
            break;
        }
    }

    self.position = saved_position;
    self.read_position = saved_read_position;
    self.ch = saved_ch;
    self.pos = saved_pos;
    self.last_error = saved_error;
    tokens
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_select() {
    // A minimal query yields keyword, operator, keyword, identifier, EOF.
    let mut lexer = Lexer::new("SELECT * FROM users");
    let expected = vec![
        (TokenType::Keyword, "SELECT"),
        (TokenType::Operator, "*"),
        (TokenType::Keyword, "FROM"),
        (TokenType::Identifier, "users"),
    ];
    for (kind, text) in expected {
        let token = lexer.next_token();
        assert_eq!(token.token_type, kind);
        assert_eq!(token.literal, text);
    }
    assert!(lexer.next_token().is_eof());
}
#[test]
fn test_numbers() {
    // Integers, decimals and exponent forms; a leading minus is a separate
    // operator token.
    let mut lexer = Lexer::new("123 45.67 -89 3.14e10 1.5E-3");
    let expected = vec![
        (TokenType::Integer, "123"),
        (TokenType::Float, "45.67"),
        (TokenType::Operator, "-"),
        (TokenType::Integer, "89"),
        (TokenType::Float, "3.14e10"),
        (TokenType::Float, "1.5E-3"),
    ];
    for (kind, text) in expected {
        let token = lexer.next_token();
        assert_eq!(token.token_type, kind);
        assert_eq!(token.literal, text);
    }
}
#[test]
fn test_string_literals() {
    // Plain string, doubled-quote escape, and a backslash sequence that is
    // kept verbatim.
    let mut lexer = Lexer::new("'hello' 'world''s' 'escaped\\ntext'");
    let expected = vec!["'hello'", "'world's'", "'escaped\\ntext'"];
    for text in expected {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::String);
        assert_eq!(token.literal, text);
    }
}
#[test]
fn test_quoted_identifiers() {
    // Double-quoted and backtick-quoted identifiers lose their delimiters.
    let mut lexer = Lexer::new("\"table name\" `column`");
    let expected = vec!["table name", "column"];
    for name in expected {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::Identifier);
        assert_eq!(token.literal, name);
    }
}
#[test]
fn test_operators() {
    // Each whitespace-separated piece of the source is one expected operator.
    let source = "= <> >= <= != + - * / || -> ->>";
    let mut lexer = Lexer::new(source);
    for expected in source.split_whitespace() {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::Operator);
        assert_eq!(token.literal, expected);
    }
}
#[test]
fn test_punctuators() {
    // Each whitespace-separated piece of the source is one expected
    // punctuator.
    let source = "( ) , ; . [ ]";
    let mut lexer = Lexer::new(source);
    for expected in source.split_whitespace() {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::Punctuator);
        assert_eq!(token.literal, expected);
    }
}
#[test]
fn test_comments() {
    let mut lexer = Lexer::new("-- line comment\nSELECT /* block */ 1");

    let line_comment = lexer.next_token();
    assert_eq!(line_comment.token_type, TokenType::Comment);
    assert!(line_comment.literal.contains("line comment"));

    let keyword = lexer.next_token();
    assert_eq!(keyword.token_type, TokenType::Keyword);
    assert_eq!(keyword.literal, "SELECT");

    let block_comment = lexer.next_token();
    assert_eq!(block_comment.token_type, TokenType::Comment);
    assert!(block_comment.literal.contains("block"));

    let number = lexer.next_token();
    assert_eq!(number.token_type, TokenType::Integer);
    assert_eq!(number.literal, "1");
}
#[test]
fn test_double_negation() {
    // `--` directly followed by a digit is two minus operators.
    let mut lexer = Lexer::new("SELECT --5");
    let keyword = lexer.next_token();
    assert_eq!(keyword.token_type, TokenType::Keyword);
    assert_eq!(keyword.literal, "SELECT");
    for _ in 0..2 {
        let minus = lexer.next_token();
        assert_eq!(
            minus.token_type,
            TokenType::Operator,
            "Expected Operator, got {:?} with literal '{}'",
            minus.token_type,
            minus.literal
        );
        assert_eq!(minus.literal, "-");
    }
    let operand = lexer.next_token();
    assert_eq!(operand.token_type, TokenType::Integer);
    assert_eq!(operand.literal, "5");

    // `--` directly followed by an identifier is two minus operators as well.
    let mut lexer = Lexer::new("SELECT --val");
    let keyword = lexer.next_token();
    assert_eq!(keyword.token_type, TokenType::Keyword);
    assert_eq!(keyword.literal, "SELECT");
    let first_minus = lexer.next_token();
    assert_eq!(
        first_minus.token_type,
        TokenType::Operator,
        "Expected Operator for first -, got {:?} with literal '{}'",
        first_minus.token_type,
        first_minus.literal
    );
    assert_eq!(first_minus.literal, "-");
    let second_minus = lexer.next_token();
    assert_eq!(
        second_minus.token_type,
        TokenType::Operator,
        "Expected Operator for second -, got {:?} with literal '{}'",
        second_minus.token_type,
        second_minus.literal
    );
    assert_eq!(second_minus.literal, "-");
    let operand = lexer.next_token();
    assert_eq!(operand.token_type, TokenType::Identifier);
    assert_eq!(operand.literal, "val");

    // `--` followed by a space starts a line comment.
    let mut lexer = Lexer::new("SELECT -- comment");
    assert_eq!(lexer.next_token().token_type, TokenType::Keyword);
    assert_eq!(lexer.next_token().token_type, TokenType::Comment);
}
#[test]
fn test_parameters() {
    // Positional (`$n`), anonymous (`?`) and named (`:name`) parameters.
    // Each whitespace-separated piece of the source is one expected token.
    let source = "$1 $23 ? :name :user_id :_private";
    let mut lexer = Lexer::new(source);
    for expected in source.split_whitespace() {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::Parameter);
        assert_eq!(token.literal, expected);
    }
}
#[test]
fn test_keywords_case_insensitive() {
    // Every spelling is recognised as a keyword and reported in upper case.
    let source = "select SELECT Select";
    let mut lexer = Lexer::new(source);
    for _spelling in source.split_whitespace() {
        let token = lexer.next_token();
        assert_eq!(token.token_type, TokenType::Keyword);
        assert_eq!(token.literal, "SELECT");
    }
}
#[test]
fn test_position_tracking() {
    // One token per line; both start in the first column.
    let mut lexer = Lexer::new("SELECT\nFROM");
    let expected = vec![(1, 1), (2, 1)];
    for (line, column) in expected {
        let token = lexer.next_token();
        assert_eq!(token.position.line, line);
        assert_eq!(token.position.column, column);
    }
}
#[test]
fn test_peek_token() {
    let mut lexer = Lexer::new("SELECT FROM");
    // Peeking repeatedly keeps returning the same token.
    for _ in 0..2 {
        assert_eq!(lexer.peek_token().literal, "SELECT");
    }
    // Consuming afterwards starts from that same token.
    assert_eq!(lexer.next_token().literal, "SELECT");
    assert_eq!(lexer.next_token().literal, "FROM");
}
#[test]
fn test_peek_tokens() {
    let mut lexer = Lexer::new("SELECT * FROM users");
    let peeked = lexer.peek_tokens(3);
    assert_eq!(peeked.len(), 3);
    for (token, expected) in peeked.iter().zip(vec!["SELECT", "*", "FROM"]) {
        assert_eq!(token.literal, expected);
    }
    // The look-ahead must not have consumed anything.
    assert_eq!(lexer.next_token().literal, "SELECT");
}
#[test]
fn test_complex_query() {
    let query = r#"
SELECT u.id, u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.active = TRUE AND o.amount >= 100.50
GROUP BY u.id, u.name
HAVING COUNT(o.id) > 0
ORDER BY order_count DESC
LIMIT 10
"#;
    let mut lexer = Lexer::new(query);

    // Drain the lexer up to, but not including, the EOF token.
    let mut tokens = Vec::new();
    let mut token = lexer.next_token();
    while !token.is_eof() {
        tokens.push(token);
        token = lexer.next_token();
    }

    assert!(tokens.len() > 30);
    let keywords = vec![
        "SELECT", "FROM", "JOIN", "WHERE", "GROUP", "HAVING", "ORDER", "LIMIT",
    ];
    for keyword in keywords {
        assert!(tokens.iter().any(|t| t.is_keyword(keyword)));
    }
}
#[test]
fn test_error_token() {
    // A character the lexer does not recognise becomes an error token that
    // carries the offending text.
    let mut lexer = Lexer::new("SELECT © FROM");
    assert_eq!(lexer.next_token().token_type, TokenType::Keyword);

    let unexpected = lexer.next_token();
    assert_eq!(unexpected.token_type, TokenType::Error);
    assert!(!unexpected.literal.is_empty());
}
}