vibesql_parser/lexer/mod.rs

//! SQL Lexer module - tokenizes SQL text into a stream of tokens.
//!
//! The lexer is organized into focused submodules:
//! - `keywords`: SQL keyword recognition and mapping
//! - `numbers`: Numeric literal parsing (integers, decimals, scientific notation)
//! - `strings`: String literal parsing with escape handling
//! - `identifiers`: Regular and delimited identifier handling
//! - `operators`: Multi-character operator recognition
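//!
//! A minimal usage sketch (marked `ignore` because the `vibesql_parser::lexer::Lexer`
//! path is assumed from this file's location and may differ in the actual crate layout):
//!
//! ```ignore
//! use vibesql_parser::lexer::Lexer; // path assumed, adjust to the real crate layout
//!
//! let mut lexer = Lexer::new("(,);");
//! let tokens = lexer.tokenize().expect("punctuation should tokenize");
//! // LParen, Comma, RParen, Semicolon, Eof
//! assert_eq!(tokens.len(), 5);
//! ```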

use std::fmt;

use crate::token::Token;

mod identifiers;
mod keywords;
mod numbers;
mod operators;
mod strings;

/// Lexer error returned when tokenization fails.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub message: String,
    pub position: usize,
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Lexer error at position {}: {}", self.position, self.message)
    }
}

/// SQL Lexer - converts SQL text into tokens.
///
/// Uses direct &str access for zero-copy tokenization.
/// Tracks byte position for efficient slicing.
pub struct Lexer<'a> {
    input: &'a str,
    byte_pos: usize,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer from SQL input.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Lexer { input, byte_pos: 0 }
    }

    /// Tokenize the entire input.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
        // Pre-allocate based on estimated token count (~1 token per 6 bytes)
        let estimated_tokens = (self.input.len() / 6).max(4);
        let mut tokens = Vec::with_capacity(estimated_tokens);

        loop {
            self.skip_whitespace_and_comments();

            if self.is_eof() {
                tokens.push(Token::Eof);
                break;
            }

            let token = self.next_token()?;
            tokens.push(token);
        }

        Ok(tokens)
    }

    /// Get the next token.
    fn next_token(&mut self) -> Result<Token, LexerError> {
        let ch = self.current_char();

        match ch {
            ';' => {
                self.advance();
                Ok(Token::Semicolon)
            }
            ',' => {
                self.advance();
                Ok(Token::Comma)
            }
            '(' => {
                self.advance();
                Ok(Token::LParen)
            }
            ')' => {
                self.advance();
                Ok(Token::RParen)
            }
            '=' | '<' | '>' | '!' | '|' => self.tokenize_operator(ch),
            '@' => {
                // Check for @@ (session variable) or @ (user variable)
                if self.peek_byte(1) == Some(b'@') {
                    self.tokenize_session_variable()
                } else {
                    self.tokenize_user_variable()
                }
            }
            '.' => {
                // Check if this is the start of a decimal number (e.g., .2, .5E+10)
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_number()
                } else {
                    self.advance();
                    Ok(Token::Symbol('.'))
                }
            }
            '+' | '-' | '*' | '/' => {
                let symbol = ch;
                self.advance();
                Ok(Token::Symbol(symbol))
            }
            '\'' => self.tokenize_string(),
            '"' => self.tokenize_delimited_identifier(),
            '`' => self.tokenize_backtick_identifier(),
            '0'..='9' => self.tokenize_number(),
            'a'..='z' | 'A'..='Z' | '_' => self.tokenize_identifier_or_keyword(),
            '?' => {
                self.advance();
                Ok(Token::Placeholder)
            }
            '$' => {
                // Check if followed by digits for numbered placeholder ($1, $2, etc.)
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_numbered_placeholder()
                } else {
                    Err(LexerError {
                        message: "Expected digit after '$' for numbered placeholder".to_string(),
                        position: self.position(),
                    })
                }
            }
            ':' => {
                // Check if followed by alphabetic character or underscore for named placeholder
                if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false) {
                    self.tokenize_named_placeholder()
                } else {
                    // Just a colon symbol (could be used in other contexts)
                    self.advance();
                    Ok(Token::Symbol(':'))
                }
            }
            _ => Err(LexerError {
                message: format!("Unexpected character: '{}'", ch),
                position: self.byte_pos,
            }),
        }
    }

    /// Skip whitespace characters.
    #[inline]
    fn skip_whitespace(&mut self) {
        while let Some(b) = self.peek_byte(0) {
            if b.is_ascii_whitespace() {
                self.byte_pos += 1;
            } else {
                break;
            }
        }
    }

    /// Skip whitespace and SQL comments.
    /// SQL line comments start with `--` and run until the end of the line.
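    ///
    /// For example, given `"-- note\n  SELECT"`, the comment, its trailing
    /// newline, and the leading whitespace of the next line are all consumed,
    /// leaving the cursor on the `S`.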
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            self.skip_whitespace();

            if self.is_eof() {
                break;
            }

            // Check for -- line comment
            if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
                // Skip until end of line
                while let Some(b) = self.peek_byte(0) {
                    self.byte_pos += 1;
                    if b == b'\n' {
                        break;
                    }
                }
                // The newline (if present) was consumed above; loop again to skip any following whitespace/comments
                continue;
            }

            // No more whitespace or comments
            break;
        }
    }

    /// Get current character without advancing.
    #[inline]
    pub(super) fn current_char(&self) -> char {
        if self.byte_pos >= self.input.len() {
            '\0'
        } else {
            // Fast path for ASCII (most common case in SQL)
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                b as char
            } else {
                // Slow path for multi-byte UTF-8
                self.input[self.byte_pos..].chars().next().unwrap_or('\0')
            }
        }
    }

    /// Peek ahead n bytes without advancing (for ASCII characters).
    #[inline]
    pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
        let peek_pos = self.byte_pos + n;
        if peek_pos < self.input.len() {
            Some(self.input.as_bytes()[peek_pos])
        } else {
            None
        }
    }

    /// Advance to next character.
    #[inline]
    pub(super) fn advance(&mut self) {
        if self.byte_pos < self.input.len() {
            // Fast path for ASCII
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                self.byte_pos += 1;
            } else {
                // Slow path for multi-byte UTF-8
                if let Some(ch) = self.input[self.byte_pos..].chars().next() {
                    self.byte_pos += ch.len_utf8();
                }
            }
        }
    }

    /// Check if we've reached end of input.
    #[inline]
    pub(super) fn is_eof(&self) -> bool {
        self.byte_pos >= self.input.len()
    }

    /// Get the current byte position (for error reporting).
    #[inline]
    pub(super) fn position(&self) -> usize {
        self.byte_pos
    }

    /// Get a slice of the input from start to current position.
    #[inline]
    pub(super) fn slice_from(&self, start: usize) -> &'a str {
        &self.input[start..self.byte_pos]
    }

    /// Tokenize a session variable (@@variable, @@session.variable, @@global.variable).
    fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip first @
        self.advance(); // Skip second @

        let start = self.byte_pos;

        // Read the variable name (which may include scope prefix like 'global' or 'session')
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @@".to_string(),
                position: self.byte_pos,
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::SessionVariable(var_name))
    }

    /// Tokenize a user variable (@variable).
    fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip @

        let start = self.byte_pos;

        // Read the variable name
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @".to_string(),
                position: self.byte_pos,
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::UserVariable(var_name))
    }

    /// Tokenize a numbered placeholder ($1, $2, etc.).
    /// PostgreSQL-style: 1-indexed ($1 = first parameter).
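    ///
    /// For example, `$1` produces `Token::NumberedPlaceholder(1)`, while `$0`
    /// is rejected with a `LexerError`.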
    fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume '$'

        let start_pos = self.position();
        let mut num_str = String::new();

        // Read all digits
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_digit() {
                num_str.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if num_str.is_empty() {
            return Err(LexerError {
                message: "Expected digit after '$' for numbered placeholder".to_string(),
                position: start_pos,
            });
        }

        let index: usize = num_str.parse().map_err(|_| LexerError {
            message: format!("Invalid numbered placeholder: ${}", num_str),
            position: start_pos,
        })?;

        // PostgreSQL requires $1 or higher (no $0)
        if index == 0 {
            return Err(LexerError {
                message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
                position: start_pos,
            });
        }

        Ok(Token::NumberedPlaceholder(index))
    }

    /// Tokenize a named placeholder (:name, :user_id, etc.).
    fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume ':'

        let mut name = String::new();

        // Read the identifier (alphanumeric or underscore)
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                name.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if name.is_empty() {
            return Err(LexerError {
                message: "Expected identifier after ':' for named placeholder".to_string(),
                position: self.position(),
            });
        }

        Ok(Token::NamedPlaceholder(name))
    }
}
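
// A minimal test sketch for the placeholder, variable, and comment paths in this
// module. It assumes `Token` derives `PartialEq` and `Debug` (not visible from
// here); if it does not, adjust the assertions accordingly.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn numbered_and_named_placeholders() {
        // Assumes Token: PartialEq + Debug for assert_eq!.
        let tokens = Lexer::new("$1 :user_id ?").tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::NumberedPlaceholder(1),
                Token::NamedPlaceholder("user_id".to_string()),
                Token::Placeholder,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn session_and_user_variables() {
        let tokens = Lexer::new("@@session.sql_mode @name").tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::SessionVariable("session.sql_mode".to_string()),
                Token::UserVariable("name".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comments_are_skipped() {
        let tokens = Lexer::new("-- leading comment\n;").tokenize().unwrap();
        assert_eq!(tokens, vec![Token::Semicolon, Token::Eof]);
    }
}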