vibesql_parser/lexer/
mod.rs1use std::fmt;
11
12use crate::token::Token;
13
14mod identifiers;
15mod keywords;
16mod numbers;
17mod operators;
18mod strings;
19
/// Error produced when the lexer cannot turn the input into a token.
///
/// Implements [`std::error::Error`], so it can be boxed as `dyn Error` or propagated with `?`
/// into broader error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Byte offset into the input that the error is reported at.
    pub position: usize,
}

impl fmt::Display for LexerError {
    /// Formats the error as `Lexer error at position <position>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Lexer error at position {}: {}", self.position, self.message)
    }
}

// `Debug` and `Display` are all the trait requires; there is no underlying source error.
impl std::error::Error for LexerError {}
32
/// Cursor over a borrowed source string that is consumed token by token.
///
/// Derives `Debug` so the scanner state can be printed in diagnostics, and `Clone` so a
/// snapshot of the current position can be taken cheaply (both fields are trivially copyable).
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// Byte offset into `input` of the next unread byte.
    byte_pos: usize,
}
41
42impl<'a> Lexer<'a> {
43 #[inline]
45 pub fn new(input: &'a str) -> Self {
46 Lexer { input, byte_pos: 0 }
47 }
48
49 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
51 let estimated_tokens = (self.input.len() / 6).max(4);
53 let mut tokens = Vec::with_capacity(estimated_tokens);
54
55 loop {
56 self.skip_whitespace_and_comments();
57
58 if self.is_eof() {
59 tokens.push(Token::Eof);
60 break;
61 }
62
63 let token = self.next_token()?;
64 tokens.push(token);
65 }
66
67 Ok(tokens)
68 }
69
70 fn next_token(&mut self) -> Result<Token, LexerError> {
72 let ch = self.current_char();
73
74 match ch {
75 ';' => {
76 self.advance();
77 Ok(Token::Semicolon)
78 }
79 ',' => {
80 self.advance();
81 Ok(Token::Comma)
82 }
83 '(' => {
84 self.advance();
85 Ok(Token::LParen)
86 }
87 ')' => {
88 self.advance();
89 Ok(Token::RParen)
90 }
91 '=' | '<' | '>' | '!' | '|' => self.tokenize_operator(ch),
92 '@' => {
93 if self.peek_byte(1) == Some(b'@') {
95 self.tokenize_session_variable()
96 } else {
97 self.tokenize_user_variable()
98 }
99 }
100 '.' => {
101 if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
103 self.tokenize_number()
104 } else {
105 self.advance();
106 Ok(Token::Symbol('.'))
107 }
108 }
109 '+' | '-' | '*' | '/' => {
110 let symbol = ch;
111 self.advance();
112 Ok(Token::Symbol(symbol))
113 }
114 '\'' => self.tokenize_string(),
115 '"' => self.tokenize_delimited_identifier(),
116 '`' => self.tokenize_backtick_identifier(),
117 '0'..='9' => self.tokenize_number(),
118 'a'..='z' | 'A'..='Z' | '_' => self.tokenize_identifier_or_keyword(),
119 '?' => {
120 self.advance();
121 Ok(Token::Placeholder)
122 }
123 '$' => {
124 if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
126 self.tokenize_numbered_placeholder()
127 } else {
128 Err(LexerError {
129 message: "Expected digit after '$' for numbered placeholder".to_string(),
130 position: self.position(),
131 })
132 }
133 }
134 ':' => {
135 if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false) {
137 self.tokenize_named_placeholder()
138 } else {
139 self.advance();
141 Ok(Token::Symbol(':'))
142 }
143 }
144 _ => Err(LexerError {
145 message: format!("Unexpected character: '{}'", ch),
146 position: self.byte_pos,
147 }),
148 }
149 }
150
151 #[inline]
153 fn skip_whitespace(&mut self) {
154 while let Some(b) = self.peek_byte(0) {
155 if b.is_ascii_whitespace() {
156 self.byte_pos += 1;
157 } else {
158 break;
159 }
160 }
161 }
162
163 fn skip_whitespace_and_comments(&mut self) {
166 loop {
167 self.skip_whitespace();
168
169 if self.is_eof() {
170 break;
171 }
172
173 if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
175 while let Some(b) = self.peek_byte(0) {
177 self.byte_pos += 1;
178 if b == b'\n' {
179 break;
180 }
181 }
182 continue;
184 }
185
186 break;
188 }
189 }
190
191 #[inline]
193 pub(super) fn current_char(&self) -> char {
194 if self.byte_pos >= self.input.len() {
195 '\0'
196 } else {
197 let b = self.input.as_bytes()[self.byte_pos];
199 if b.is_ascii() {
200 b as char
201 } else {
202 self.input[self.byte_pos..].chars().next().unwrap_or('\0')
204 }
205 }
206 }
207
208 #[inline]
210 pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
211 let peek_pos = self.byte_pos + n;
212 if peek_pos < self.input.len() {
213 Some(self.input.as_bytes()[peek_pos])
214 } else {
215 None
216 }
217 }
218
219 #[inline]
221 pub(super) fn advance(&mut self) {
222 if self.byte_pos < self.input.len() {
223 let b = self.input.as_bytes()[self.byte_pos];
225 if b.is_ascii() {
226 self.byte_pos += 1;
227 } else {
228 if let Some(ch) = self.input[self.byte_pos..].chars().next() {
230 self.byte_pos += ch.len_utf8();
231 }
232 }
233 }
234 }
235
236 #[inline]
238 pub(super) fn is_eof(&self) -> bool {
239 self.byte_pos >= self.input.len()
240 }
241
242 #[inline]
244 pub(super) fn position(&self) -> usize {
245 self.byte_pos
246 }
247
248 #[inline]
250 pub(super) fn slice_from(&self, start: usize) -> &'a str {
251 &self.input[start..self.byte_pos]
252 }
253
254 fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
256 self.advance(); self.advance(); let start = self.byte_pos;
260
261 while !self.is_eof() {
263 let ch = self.current_char();
264 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
265 self.advance();
266 } else {
267 break;
268 }
269 }
270
271 if self.byte_pos == start {
272 return Err(LexerError {
273 message: "Expected variable name after @@".to_string(),
274 position: self.byte_pos,
275 });
276 }
277
278 let var_name = self.slice_from(start).to_string();
279 Ok(Token::SessionVariable(var_name))
280 }
281
282 fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
284 self.advance(); let start = self.byte_pos;
287
288 while !self.is_eof() {
290 let ch = self.current_char();
291 if ch.is_ascii_alphanumeric() || ch == '_' {
292 self.advance();
293 } else {
294 break;
295 }
296 }
297
298 if self.byte_pos == start {
299 return Err(LexerError {
300 message: "Expected variable name after @".to_string(),
301 position: self.byte_pos,
302 });
303 }
304
305 let var_name = self.slice_from(start).to_string();
306 Ok(Token::UserVariable(var_name))
307 }
308
309 fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
312 self.advance(); let start_pos = self.position();
315 let mut num_str = String::new();
316
317 while !self.is_eof() {
319 let ch = self.current_char();
320 if ch.is_ascii_digit() {
321 num_str.push(ch);
322 self.advance();
323 } else {
324 break;
325 }
326 }
327
328 if num_str.is_empty() {
329 return Err(LexerError {
330 message: "Expected digit after '$' for numbered placeholder".to_string(),
331 position: start_pos,
332 });
333 }
334
335 let index: usize = num_str.parse().map_err(|_| LexerError {
336 message: format!("Invalid numbered placeholder: ${}", num_str),
337 position: start_pos,
338 })?;
339
340 if index == 0 {
342 return Err(LexerError {
343 message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
344 position: start_pos,
345 });
346 }
347
348 Ok(Token::NumberedPlaceholder(index))
349 }
350
351 fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
353 self.advance(); let mut name = String::new();
356
357 while !self.is_eof() {
359 let ch = self.current_char();
360 if ch.is_ascii_alphanumeric() || ch == '_' {
361 name.push(ch);
362 self.advance();
363 } else {
364 break;
365 }
366 }
367
368 if name.is_empty() {
369 return Err(LexerError {
370 message: "Expected identifier after ':' for named placeholder".to_string(),
371 position: self.position(),
372 });
373 }
374
375 Ok(Token::NamedPlaceholder(name))
376 }
377}