vibesql_parser/lexer/
mod.rs

//! SQL Lexer module - tokenizes SQL text into a stream of tokens.
//!
//! The lexer is organized into focused submodules:
//! - `keywords`: SQL keyword recognition and mapping
//! - `numbers`: Numeric literal parsing (integers, decimals, scientific notation)
//! - `strings`: String literal parsing with escape handling
//! - `identifiers`: Regular and delimited identifier handling
//! - `operators`: Multi-character operator recognition
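//!
//! # Example
//!
//! A minimal sketch of end-to-end use (marked `ignore`: the crate path is
//! assumed; adjust to the actual re-exports):
//!
//! ```ignore
//! use vibesql_parser::lexer::Lexer;
//!
//! let mut lexer = Lexer::new("SELECT name FROM users WHERE id = $1;");
//! let tokens = lexer.tokenize().expect("input tokenizes cleanly");
//! // The stream always ends with Token::Eof.
//! ```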

use std::fmt;

use crate::token::Token;

mod identifiers;
mod keywords;
mod numbers;
mod operators;
mod strings;

/// Byte range span in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Start byte offset (inclusive)
    pub start: usize,
    /// End byte offset (exclusive)
    pub end: usize,
}

impl Span {
    /// Create a new span.
    pub fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }

    /// Extract the text covered by this span from the source.
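    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; offsets chosen by hand):
    ///
    /// ```ignore
    /// let sql = "SELECT 1";
    /// let span = Span::new(0, 6);
    /// assert_eq!(span.extract(sql), "SELECT");
    /// ```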
    pub fn extract<'a>(&self, source: &'a str) -> &'a str {
        &source[self.start..self.end]
    }
}

/// Lexer error returned when tokenization fails.
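///
/// # Example
///
/// A minimal sketch of the two `Display` forms (marked `ignore`; field
/// values are illustrative):
///
/// ```ignore
/// let err = LexerError {
///     message: "Unexpected character: '#'".to_string(),
///     position: 7,
///     near_token: Some("#1".to_string()),
/// };
/// // With a near token, the SQLite-compatible form is used:
/// assert_eq!(err.to_string(), "near \"#1\": syntax error");
/// ```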
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub message: String,
    pub position: usize,
    /// The token text that caused the error (for SQLite-compatible error messages)
    pub near_token: Option<String>,
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // SQLite-compatible error format: near "TOKEN": syntax error
        if let Some(ref token) = self.near_token {
            write!(f, "near \"{}\": syntax error", token)
        } else {
            write!(f, "Lexer error at position {}: {}", self.position, self.message)
        }
    }
}

/// SQL Lexer - converts SQL text into tokens.
///
/// Uses direct `&str` access for zero-copy tokenization.
/// Tracks byte position for efficient slicing.
pub struct Lexer<'a> {
    input: &'a str,
    byte_pos: usize,
}

impl<'a> Lexer<'a> {
    /// Create a new lexer from SQL input.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Lexer { input, byte_pos: 0 }
    }

    /// Returns the original source input.
    pub fn input(&self) -> &'a str {
        self.input
    }

    /// Tokenize the entire input.
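    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; token variants as defined in
    /// `crate::token`):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new("SELECT 1;");
    /// let tokens = lexer.tokenize().unwrap();
    /// assert!(matches!(tokens.last(), Some(Token::Eof)));
    /// ```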
    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
        // Pre-allocate based on estimated token count (~1 token per 6 bytes)
        let estimated_tokens = (self.input.len() / 6).max(4);
        let mut tokens = Vec::with_capacity(estimated_tokens);

        loop {
            self.skip_whitespace_and_comments();

            if self.is_eof() {
                tokens.push(Token::Eof);
                break;
            }

            let token = self.next_token()?;
            tokens.push(token);
        }

        Ok(tokens)
    }

    /// Tokenize the entire input, returning tokens with their byte spans.
    ///
    /// This is useful when you need to extract the original source text
    /// for a token (e.g., for preserving original identifier case).
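    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; assumes the identifier is the
    /// second token of this input):
    ///
    /// ```ignore
    /// let sql = "SELECT Name";
    /// let mut lexer = Lexer::new(sql);
    /// let tokens = lexer.tokenize_with_spans().unwrap();
    /// // Recover the original case of the identifier from its span:
    /// let (_token, span) = &tokens[1];
    /// assert_eq!(span.extract(sql), "Name");
    /// ```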
    pub fn tokenize_with_spans(&mut self) -> Result<Vec<(Token, Span)>, LexerError> {
        let estimated_tokens = (self.input.len() / 6).max(4);
        let mut tokens = Vec::with_capacity(estimated_tokens);

        loop {
            self.skip_whitespace_and_comments();

            let start = self.byte_pos;

            if self.is_eof() {
                tokens.push((Token::Eof, Span::new(start, start)));
                break;
            }

            let token = self.next_token()?;
            let end = self.byte_pos;
            tokens.push((token, Span::new(start, end)));
        }

        Ok(tokens)
    }

    /// Get the next token.
    fn next_token(&mut self) -> Result<Token, LexerError> {
        let ch = self.current_char();

        match ch {
            ';' => {
                self.advance();
                Ok(Token::Semicolon)
            }
            ',' => {
                self.advance();
                Ok(Token::Comma)
            }
            '(' => {
                self.advance();
                Ok(Token::LParen)
            }
            ')' => {
                self.advance();
                Ok(Token::RParen)
            }
            '=' | '<' | '>' | '!' | '|' | '&' | '~' => self.tokenize_operator(ch),
            '@' => {
                // Check for @@ (session variable) or @ (user variable)
                if self.peek_byte(1) == Some(b'@') {
                    self.tokenize_session_variable()
                } else {
                    self.tokenize_user_variable()
                }
            }
            '.' => {
                // Check if this is the start of a decimal number (e.g., .2, .5E+10)
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_number()
                } else {
                    self.advance();
                    Ok(Token::Symbol('.'))
                }
            }
            '+' | '*' | '/' | '%' => {
                let symbol = ch;
                self.advance();
                Ok(Token::Symbol(symbol))
            }
            '-' => {
                // Could be -> or ->> (JSON extract operators) or just -
                self.tokenize_operator(ch)
            }
            '\'' => self.tokenize_string(),
            '"' => self.tokenize_delimited_identifier(),
            '`' => self.tokenize_backtick_identifier(),
            '[' => self.tokenize_bracket_identifier(),
            '0'..='9' => self.tokenize_number(),
            'x' | 'X' => {
                // Check if this is a hex blob literal (x'...' or X'...')
                if self.peek_byte(1) == Some(b'\'') {
                    self.tokenize_blob_literal()
                } else {
                    self.tokenize_identifier_or_keyword()
                }
            }
            'a'..='w' | 'y'..='z' | 'A'..='W' | 'Y'..='Z' | '_' => {
                self.tokenize_identifier_or_keyword()
            }
            '?' => {
                self.advance();
                Ok(Token::Placeholder)
            }
            '$' => {
                // Check what follows the $
                let next = self.peek_byte(1);
                if next.map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    // PostgreSQL-style numbered placeholder ($1, $2, etc.)
                    self.tokenize_numbered_placeholder()
                } else if next.map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false) {
                    // SQLite/TCL-style named placeholder ($name, $x1, etc.)
                    self.tokenize_dollar_named_placeholder()
                } else if next == Some(b':') {
                    // TCL global variable syntax ($::name) - treat as named placeholder
                    self.tokenize_tcl_global_placeholder()
                } else {
                    let token = self.extract_error_token();
                    Err(LexerError {
                        message: "Expected digit or identifier after '$' for placeholder"
                            .to_string(),
                        position: self.position(),
                        near_token: Some(token),
                    })
                }
            }
            ':' => {
                // Check if followed by alphabetic character or underscore for named placeholder
                if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false)
                {
                    self.tokenize_named_placeholder()
                } else {
                    // Just a colon symbol (could be used in other contexts)
                    self.advance();
                    Ok(Token::Symbol(':'))
                }
            }
            _ => {
                // SQLite-compatible error: extract a token-like span for "near" message
                let token = self.extract_error_token();
                Err(LexerError {
                    message: format!("Unexpected character: '{}'", ch),
                    position: self.byte_pos,
                    near_token: Some(token),
                })
            }
        }
    }

    /// Skip whitespace characters.
    #[inline]
    fn skip_whitespace(&mut self) {
        while let Some(b) = self.peek_byte(0) {
            if b.is_ascii_whitespace() {
                self.byte_pos += 1;
            } else {
                break;
            }
        }
    }

    /// Skip whitespace and SQL comments.
    /// Line comments start with `--` and run to the end of the line.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            self.skip_whitespace();

            if self.is_eof() {
                break;
            }

            // Check for -- line comment
            if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
                // Skip until end of line
                while let Some(b) = self.peek_byte(0) {
                    self.byte_pos += 1;
                    if b == b'\n' {
                        break;
                    }
                }
                // Continue loop to skip the newline and any following whitespace/comments
                continue;
            }

            // No more whitespace or comments
            break;
        }
    }

    /// Get current character without advancing.
    /// Returns `'\0'` at end of input.
    #[inline]
    pub(super) fn current_char(&self) -> char {
        if self.byte_pos >= self.input.len() {
            '\0'
        } else {
            // Fast path for ASCII (most common case in SQL)
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                b as char
            } else {
                // Slow path for multi-byte UTF-8
                self.input[self.byte_pos..].chars().next().unwrap_or('\0')
            }
        }
    }

    /// Peek ahead n bytes without advancing (for ASCII characters).
    #[inline]
    pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
        let peek_pos = self.byte_pos + n;
        if peek_pos < self.input.len() {
            Some(self.input.as_bytes()[peek_pos])
        } else {
            None
        }
    }

    /// Advance to next character.
    #[inline]
    pub(super) fn advance(&mut self) {
        if self.byte_pos < self.input.len() {
            // Fast path for ASCII
            let b = self.input.as_bytes()[self.byte_pos];
            if b.is_ascii() {
                self.byte_pos += 1;
            } else {
                // Slow path for multi-byte UTF-8
                if let Some(ch) = self.input[self.byte_pos..].chars().next() {
                    self.byte_pos += ch.len_utf8();
                }
            }
        }
    }

    /// Check if we've reached end of input.
    #[inline]
    pub(super) fn is_eof(&self) -> bool {
        self.byte_pos >= self.input.len()
    }

    /// Get the current byte position (for error reporting).
    #[inline]
    pub(super) fn position(&self) -> usize {
        self.byte_pos
    }

    /// Get a slice of the input from start to current position.
    #[inline]
    pub(super) fn slice_from(&self, start: usize) -> &'a str {
        &self.input[start..self.byte_pos]
    }

    /// Extract a token-like span from current position for error messages.
    /// SQLite shows the problematic token in error messages like: near "#1": syntax error
    /// This reads ahead to capture a reasonable token span (alphanumeric, symbols, etc.)
    fn extract_error_token(&self) -> String {
        let start = self.byte_pos;
        let mut end = start;
        let bytes = self.input.as_bytes();

        // Read ahead to capture a token-like span
        while end < bytes.len() {
            let b = bytes[end];
            // Continue for alphanumeric, symbols that could be part of a token
            if b.is_ascii_alphanumeric() || b == b'_' || b == b'#' || b == b'$' || b == b'@' {
                end += 1;
            } else if end == start {
                // Include at least one character; advance by its full UTF-8
                // width so the slice below stays on a char boundary
                end += self.input[start..].chars().next().map_or(1, |c| c.len_utf8());
                break;
            } else {
                break;
            }
        }

        self.input[start..end].to_string()
    }

    /// Tokenize a session variable (@@variable, @@session.variable, @@global.variable).
    fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip first @
        self.advance(); // Skip second @

        let start = self.byte_pos;

        // Read the variable name (which may include scope prefix like 'global' or 'session')
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @@".to_string(),
                position: self.byte_pos,
                near_token: Some("@@".to_string()),
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::SessionVariable(var_name))
    }

    /// Tokenize a user variable (@variable).
    fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
        self.advance(); // Skip @

        let start = self.byte_pos;

        // Read the variable name
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                self.advance();
            } else {
                break;
            }
        }

        if self.byte_pos == start {
            return Err(LexerError {
                message: "Expected variable name after @".to_string(),
                position: self.byte_pos,
                near_token: Some("@".to_string()),
            });
        }

        let var_name = self.slice_from(start).to_string();
        Ok(Token::UserVariable(var_name))
    }

    /// Tokenize a numbered placeholder ($1, $2, etc.).
    /// PostgreSQL-style: 1-indexed ($1 = first parameter).
    fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume '$'

        let start_pos = self.position();
        let mut num_str = String::new();

        // Read all digits
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_digit() {
                num_str.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if num_str.is_empty() {
            return Err(LexerError {
                message: "Expected digit after '$' for numbered placeholder".to_string(),
                position: start_pos,
                near_token: Some("$".to_string()),
            });
        }

        let index: usize = num_str.parse().map_err(|_| LexerError {
            message: format!("Invalid numbered placeholder: ${}", num_str),
            position: start_pos,
            near_token: Some(format!("${}", num_str)),
        })?;

        // PostgreSQL requires $1 or higher (no $0)
        if index == 0 {
            return Err(LexerError {
                message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
                position: start_pos,
                near_token: Some("$0".to_string()),
            });
        }

        Ok(Token::NumberedPlaceholder(index))
    }

    /// Tokenize a named placeholder (:name, :user_id, etc.).
    fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume ':'

        let mut name = String::new();

        // Read the identifier (alphanumeric or underscore)
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                name.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if name.is_empty() {
            return Err(LexerError {
                message: "Expected identifier after ':' for named placeholder".to_string(),
                position: self.position(),
                near_token: Some(":".to_string()),
            });
        }

        Ok(Token::NamedPlaceholder(name))
    }

    /// Tokenize a dollar-prefixed named placeholder ($name, $x1, $user_id, etc.).
    /// SQLite/TCL style - same as :name but with $ prefix.
    fn tokenize_dollar_named_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume '$'

        let mut name = String::new();

        // Read the identifier (alphanumeric or underscore)
        while !self.is_eof() {
            let ch = self.current_char();
            if ch.is_ascii_alphanumeric() || ch == '_' {
                name.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if name.is_empty() {
            return Err(LexerError {
                message: "Expected identifier after '$' for named placeholder".to_string(),
                position: self.position(),
                near_token: Some("$".to_string()),
            });
        }

        // Use NamedPlaceholder since it's functionally equivalent to :name
        Ok(Token::NamedPlaceholder(name))
    }

    /// Tokenize a TCL global variable placeholder ($::name, $::namespace::var, etc.).
    /// TCL uses $::name for global namespace variables. We treat these as named placeholders.
    fn tokenize_tcl_global_placeholder(&mut self) -> Result<Token, LexerError> {
        self.advance(); // consume '$'

        let mut name = String::new();

        // Consume the :: prefix and any subsequent ::namespace:: parts
        while !self.is_eof() {
            let ch = self.current_char();
            if ch == ':' || ch.is_ascii_alphanumeric() || ch == '_' {
                name.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if name.is_empty() || name == ":" || name == "::" {
            return Err(LexerError {
                message: "Expected identifier after '$::' for TCL global placeholder".to_string(),
                position: self.position(),
                near_token: Some(format!("${}", name)),
            });
        }

        // Use NamedPlaceholder - the name includes the :: prefix for uniqueness
        Ok(Token::NamedPlaceholder(name))
    }
}