vibesql_parser/lexer/mod.rs

//! SQL Lexer module - tokenizes SQL text into a stream of tokens.
//!
//! The lexer is organized into focused submodules:
//! - `keywords`: SQL keyword recognition and mapping
//! - `numbers`: Numeric literal parsing (integers, decimals, scientific notation)
//! - `strings`: String literal parsing with escape handling
//! - `identifiers`: Regular and delimited identifier handling
//! - `operators`: Multi-character operator recognition
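//!
//! # Example
//!
//! A minimal sketch of driving the lexer (not compiled as a doctest; the
//! `Token` variants shown are the ones this module constructs):
//!
//! ```ignore
//! let mut lexer = Lexer::new("SELECT 1;");
//! let tokens = lexer.tokenize().expect("lexing failed");
//! assert!(matches!(tokens.last(), Some(Token::Eof)));
//! ```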

use std::fmt;

use crate::token::Token;

mod identifiers;
mod keywords;
mod numbers;
mod operators;
mod strings;

/// Lexer error returned when tokenization fails.
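///
/// # Example
///
/// A sketch of the `Display` output (fields and format string are defined
/// just below; not compiled as a doctest):
///
/// ```ignore
/// let err = LexerError { message: "Unexpected character: '#'".to_string(), position: 7 };
/// assert_eq!(err.to_string(), "Lexer error at position 7: Unexpected character: '#'");
/// ```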
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub message: String,
    pub position: usize,
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Lexer error at position {}: {}", self.position, self.message)
    }
}

/// SQL Lexer - converts SQL text into tokens.
///
/// Uses direct `&str` access for zero-copy tokenization.
/// Tracks byte position for efficient slicing.
pub struct Lexer<'a> {
    input: &'a str,
    byte_pos: usize,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer from SQL input.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Lexer { input, byte_pos: 0 }
    }

    /// Tokenize the entire input.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
        // Pre-allocate based on estimated token count (~1 token per 6 bytes)
        let estimated_tokens = (self.input.len() / 6).max(4);
        let mut tokens = Vec::with_capacity(estimated_tokens);

        loop {
            self.skip_whitespace_and_comments();

            if self.is_eof() {
                tokens.push(Token::Eof);
                break;
            }

            let token = self.next_token()?;
            tokens.push(token);
        }

        Ok(tokens)
    }

    /// Get the next token.
    fn next_token(&mut self) -> Result<Token, LexerError> {
        let ch = self.current_char();

        match ch {
            ';' => {
                self.advance();
                Ok(Token::Semicolon)
            }
            ',' => {
                self.advance();
                Ok(Token::Comma)
            }
            '(' => {
                self.advance();
                Ok(Token::LParen)
            }
            ')' => {
                self.advance();
                Ok(Token::RParen)
            }
            '=' | '<' | '>' | '!' | '|' => self.tokenize_operator(ch),
            '@' => {
                // Check for @@ (session variable) or @ (user variable)
                if self.peek_byte(1) == Some(b'@') {
                    self.tokenize_session_variable()
                } else {
                    self.tokenize_user_variable()
                }
            }
            '.' => {
                // Check if this is the start of a decimal number (e.g., .2, .5E+10)
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_number()
                } else {
                    self.advance();
                    Ok(Token::Symbol('.'))
                }
            }
            '+' | '-' | '*' | '/' => {
                let symbol = ch;
                self.advance();
                Ok(Token::Symbol(symbol))
            }
            '\'' => self.tokenize_string(),
            '"' => self.tokenize_delimited_identifier(),
            '`' => self.tokenize_backtick_identifier(),
            '0'..='9' => self.tokenize_number(),
            'a'..='z' | 'A'..='Z' | '_' => self.tokenize_identifier_or_keyword(),
            '?' => {
                self.advance();
                Ok(Token::Placeholder)
            }
            '$' => {
                // Check if followed by digits for numbered placeholder ($1, $2, etc.)
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_numbered_placeholder()
                } else {
                    Err(LexerError {
                        message: "Expected digit after '$' for numbered placeholder".to_string(),
                        position: self.position(),
                    })
                }
            }
            ':' => {
                // Check if followed by alphabetic character or underscore for named placeholder
                if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false)
                {
                    self.tokenize_named_placeholder()
                } else {
                    // Just a colon symbol (could be used in other contexts)
                    self.advance();
                    Ok(Token::Symbol(':'))
                }
            }
            _ => Err(LexerError {
                message: format!("Unexpected character: '{}'", ch),
                position: self.byte_pos,
            }),
        }
    }

    /// Skip whitespace characters.
    #[inline]
    fn skip_whitespace(&mut self) {
        while let Some(b) = self.peek_byte(0) {
            if b.is_ascii_whitespace() {
                self.byte_pos += 1;
            } else {
                break;
            }
        }
    }

    /// Skip whitespace and SQL comments.
    /// SQL supports line comments starting with -- until end of line.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            self.skip_whitespace();

            if self.is_eof() {
                break;
            }

            // Check for -- line comment
            if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
                // Skip until end of line
                while let Some(b) = self.peek_byte(0) {
                    self.byte_pos += 1;
                    if b == b'\n' {
                        break;
                    }
                }
                // Continue loop to skip the newline and any following whitespace/comments
                continue;
            }

            // No more whitespace or comments
            break;
        }
    }

    /// Get current character without advancing.
    #[inline]
    pub(super) fn current_char(&self) -> char {
        if self.byte_pos >= self.input.len() {
            '\0'
        } else {
            // Fast path for ASCII (most common case in SQL)
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                b as char
            } else {
                // Slow path for multi-byte UTF-8
                self.input[self.byte_pos..].chars().next().unwrap_or('\0')
            }
        }
    }

    /// Peek ahead n bytes without advancing (for ASCII characters).
    #[inline]
    pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
        let peek_pos = self.byte_pos + n;
        if peek_pos < self.input.len() {
            Some(self.input.as_bytes()[peek_pos])
        } else {
            None
        }
    }

    /// Advance to next character.
    #[inline]
    pub(super) fn advance(&mut self) {
        if self.byte_pos < self.input.len() {
            // Fast path for ASCII
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                self.byte_pos += 1;
            } else {
                // Slow path for multi-byte UTF-8
                if let Some(ch) = self.input[self.byte_pos..].chars().next() {
                    self.byte_pos += ch.len_utf8();
                }
            }
        }
    }

    /// Check if we've reached end of input.
    #[inline]
    pub(super) fn is_eof(&self) -> bool {
        self.byte_pos >= self.input.len()
    }

    /// Get the current byte position (for error reporting).
    #[inline]
    pub(super) fn position(&self) -> usize {
        self.byte_pos
    }

    /// Get a slice of the input from start to current position.
    #[inline]
    pub(super) fn slice_from(&self, start: usize) -> &'a str {
        &self.input[start..self.byte_pos]
    }

    /// Tokenize a session variable (@@variable, @@session.variable, @@global.variable).
    fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip first @
        self.advance(); // Skip second @

        let start = self.byte_pos;

        // Read the variable name (which may include scope prefix like 'global' or 'session')
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @@".to_string(),
                position: self.byte_pos,
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::SessionVariable(var_name))
    }

    /// Tokenize a user variable (@variable).
    fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip @

        let start = self.byte_pos;

        // Read the variable name
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @".to_string(),
                position: self.byte_pos,
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::UserVariable(var_name))
    }

    /// Tokenize a numbered placeholder ($1, $2, etc.).
    /// PostgreSQL-style: 1-indexed ($1 = first parameter).
    fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume '$'

        let start_pos = self.position();
        let mut num_str = String::new();

        // Read all digits
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_digit() {
                num_str.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if num_str.is_empty() {
            return Err(LexerError {
                message: "Expected digit after '$' for numbered placeholder".to_string(),
                position: start_pos,
            });
        }

        let index: usize = num_str.parse().map_err(|_| LexerError {
            message: format!("Invalid numbered placeholder: ${}", num_str),
            position: start_pos,
        })?;

        // PostgreSQL requires $1 or higher (no $0)
        if index == 0 {
            return Err(LexerError {
                message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
                position: start_pos,
            });
        }

        Ok(Token::NumberedPlaceholder(index))
    }

    /// Tokenize a named placeholder (:name, :user_id, etc.).
    fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume ':'

        let mut name = String::new();

        // Read the identifier (alphanumeric or underscore)
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                name.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if name.is_empty() {
            return Err(LexerError {
                message: "Expected identifier after ':' for named placeholder".to_string(),
                position: self.position(),
            });
        }

        Ok(Token::NamedPlaceholder(name))
    }
}