// vibesql_parser/lexer/mod.rs

use std::fmt;

use crate::token::Token;

mod identifiers;
mod keywords;
mod numbers;
mod operators;
mod strings;
/// Half-open byte range `[start, end)` locating a token in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

impl Span {
    /// Builds a span covering bytes `start..end`.
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Returns the text this span covers within `source`.
    ///
    /// Panics if the span is out of bounds for `source` or does not fall on
    /// UTF-8 character boundaries.
    pub fn extract<'a>(&self, source: &'a str) -> &'a str {
        let Span { start, end } = *self;
        &source[start..end]
    }
}
40
/// Error produced while tokenizing, carrying the byte offset where scanning
/// failed and, when available, the offending token text.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub message: String,
    pub position: usize,
    pub near_token: Option<String>,
}

impl fmt::Display for LexerError {
    /// SQLite-style rendering: prefer `near "TOKEN": syntax error` when the
    /// offending token is known, otherwise fall back to position + message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.near_token {
            Some(ref token) => write!(f, "near \"{}\": syntax error", token),
            None => write!(f, "Lexer error at position {}: {}", self.position, self.message),
        }
    }
}
60
/// Streaming SQL lexer over a borrowed input string.
///
/// Holds a single byte-offset cursor into `input`; every scanning method in
/// the `impl` advances this cursor.
pub struct Lexer<'a> {
    // Full source text being tokenized (borrowed for 'a).
    input: &'a str,
    // Current cursor position as a byte offset into `input`.
    byte_pos: usize,
}
69
70impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Lexer { input, byte_pos: 0 }
    }
76
    /// Returns the full source text this lexer was created over.
    pub fn input(&self) -> &'a str {
        self.input
    }
81
82 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
84 let estimated_tokens = (self.input.len() / 6).max(4);
86 let mut tokens = Vec::with_capacity(estimated_tokens);
87
88 loop {
89 self.skip_whitespace_and_comments();
90
91 if self.is_eof() {
92 tokens.push(Token::Eof);
93 break;
94 }
95
96 let token = self.next_token()?;
97 tokens.push(token);
98 }
99
100 Ok(tokens)
101 }
102
103 pub fn tokenize_with_spans(&mut self) -> Result<Vec<(Token, Span)>, LexerError> {
108 let estimated_tokens = (self.input.len() / 6).max(4);
109 let mut tokens = Vec::with_capacity(estimated_tokens);
110
111 loop {
112 self.skip_whitespace_and_comments();
113
114 let start = self.byte_pos;
115
116 if self.is_eof() {
117 tokens.push((Token::Eof, Span::new(start, start)));
118 break;
119 }
120
121 let token = self.next_token()?;
122 let end = self.byte_pos;
123 tokens.push((token, Span::new(start, end)));
124 }
125
126 Ok(tokens)
127 }
128
    /// Scans the single token beginning at the current cursor position.
    ///
    /// Precondition: the caller has already skipped whitespace/comments and
    /// verified the input is not at EOF (see `tokenize`).
    fn next_token(&mut self) -> Result<Token, LexerError> {
        let ch = self.current_char();

        match ch {
            ';' => {
                self.advance();
                Ok(Token::Semicolon)
            }
            ',' => {
                self.advance();
                Ok(Token::Comma)
            }
            '(' => {
                self.advance();
                Ok(Token::LParen)
            }
            ')' => {
                self.advance();
                Ok(Token::RParen)
            }
            // Characters that can start multi-char operators are dispatched to
            // the operators submodule (not visible here).
            '=' | '<' | '>' | '!' | '|' | '&' | '~' => self.tokenize_operator(ch),
            '@' => {
                // "@@name" is a session variable; a single "@name" is a user
                // variable.
                if self.peek_byte(1) == Some(b'@') {
                    self.tokenize_session_variable()
                } else {
                    self.tokenize_user_variable()
                }
            }
            '.' => {
                // ".5"-style numeric literal vs. the bare "." symbol (e.g.
                // table.column qualification).
                if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_number()
                } else {
                    self.advance();
                    Ok(Token::Symbol('.'))
                }
            }
            '+' | '*' | '/' | '%' => {
                let symbol = ch;
                self.advance();
                Ok(Token::Symbol(symbol))
            }
            '-' => {
                // "--" line comments were already consumed by
                // skip_whitespace_and_comments, so '-' here is an operator.
                // NOTE(review): dispatched to tokenize_operator, presumably to
                // handle multi-char forms such as "->" — confirm in operators.
                self.tokenize_operator(ch)
            }
            '\'' => self.tokenize_string(),
            '"' => self.tokenize_delimited_identifier(),
            '`' => self.tokenize_backtick_identifier(),
            '[' => self.tokenize_bracket_identifier(),
            '0'..='9' => self.tokenize_number(),
            'x' | 'X' => {
                // x'...' / X'...' is a hex blob literal; otherwise an ordinary
                // identifier or keyword that happens to start with x.
                if self.peek_byte(1) == Some(b'\'') {
                    self.tokenize_blob_literal()
                } else {
                    self.tokenize_identifier_or_keyword()
                }
            }
            // Remaining identifier-leading characters ('x'/'X' handled above).
            'a'..='w' | 'y'..='z' | 'A'..='W' | 'Y'..='Z' | '_' => {
                self.tokenize_identifier_or_keyword()
            }
            '?' => {
                self.advance();
                Ok(Token::Placeholder)
            }
            '$' => {
                // Placeholder family: $1 (numbered), $name (named), or
                // $::name (TCL global). Anything else after '$' is an error.
                let next = self.peek_byte(1);
                if next.map(|b| b.is_ascii_digit()).unwrap_or(false) {
                    self.tokenize_numbered_placeholder()
                } else if next.map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false) {
                    self.tokenize_dollar_named_placeholder()
                } else if next == Some(b':') {
                    self.tokenize_tcl_global_placeholder()
                } else {
                    let token = self.extract_error_token();
                    Err(LexerError {
                        message: "Expected digit or identifier after '$' for placeholder"
                            .to_string(),
                        position: self.position(),
                        near_token: Some(token),
                    })
                }
            }
            ':' => {
                // ":name" named placeholder vs. a bare ':' symbol.
                if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false)
                {
                    self.tokenize_named_placeholder()
                } else {
                    self.advance();
                    Ok(Token::Symbol(':'))
                }
            }
            _ => {
                // Any character with no rule above (including non-ASCII) is a
                // lexing error reported with the surrounding token text.
                let token = self.extract_error_token();
                Err(LexerError {
                    message: format!("Unexpected character: '{}'", ch),
                    position: self.byte_pos,
                    near_token: Some(token),
                })
            }
        }
    }
241
242 #[inline]
244 fn skip_whitespace(&mut self) {
245 while let Some(b) = self.peek_byte(0) {
246 if b.is_ascii_whitespace() {
247 self.byte_pos += 1;
248 } else {
249 break;
250 }
251 }
252 }
253
254 fn skip_whitespace_and_comments(&mut self) {
257 loop {
258 self.skip_whitespace();
259
260 if self.is_eof() {
261 break;
262 }
263
264 if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
266 while let Some(b) = self.peek_byte(0) {
268 self.byte_pos += 1;
269 if b == b'\n' {
270 break;
271 }
272 }
273 continue;
275 }
276
277 break;
279 }
280 }
281
282 #[inline]
284 pub(super) fn current_char(&self) -> char {
285 if self.byte_pos >= self.input.len() {
286 '\0'
287 } else {
288 let b = self.input.as_bytes()[self.byte_pos];
290 if b.is_ascii() {
291 b as char
292 } else {
293 self.input[self.byte_pos..].chars().next().unwrap_or('\0')
295 }
296 }
297 }
298
299 #[inline]
301 pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
302 let peek_pos = self.byte_pos + n;
303 if peek_pos < self.input.len() {
304 Some(self.input.as_bytes()[peek_pos])
305 } else {
306 None
307 }
308 }
309
310 #[inline]
312 pub(super) fn advance(&mut self) {
313 if self.byte_pos < self.input.len() {
314 let b = self.input.as_bytes()[self.byte_pos];
316 if b.is_ascii() {
317 self.byte_pos += 1;
318 } else {
319 if let Some(ch) = self.input[self.byte_pos..].chars().next() {
321 self.byte_pos += ch.len_utf8();
322 }
323 }
324 }
325 }
326
    /// True once the cursor has reached (or passed) the end of input.
    #[inline]
    pub(super) fn is_eof(&self) -> bool {
        self.byte_pos >= self.input.len()
    }
332
    /// Current cursor position as a byte offset into the input.
    #[inline]
    pub(super) fn position(&self) -> usize {
        self.byte_pos
    }
338
    /// Returns the input slice from `start` (a byte offset captured earlier
    /// via `position()`) up to the current cursor.
    ///
    /// Panics if `start` is past the cursor or not on a UTF-8 char boundary,
    /// so only offsets obtained from this lexer should be passed.
    #[inline]
    pub(super) fn slice_from(&self, start: usize) -> &'a str {
        &self.input[start..self.byte_pos]
    }
344
345 fn extract_error_token(&self) -> String {
349 let start = self.byte_pos;
350 let mut end = start;
351 let bytes = self.input.as_bytes();
352
353 while end < bytes.len() {
355 let b = bytes[end];
356 if b.is_ascii_alphanumeric() || b == b'_' || b == b'#' || b == b'$' || b == b'@' {
358 end += 1;
359 } else if end == start {
360 end += 1;
362 break;
363 } else {
364 break;
365 }
366 }
367
368 self.input[start..end].to_string()
369 }
370
371 fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
373 self.advance(); self.advance(); let start = self.byte_pos;
377
378 while !self.is_eof() {
380 let ch = self.current_char();
381 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
382 self.advance();
383 } else {
384 break;
385 }
386 }
387
388 if self.byte_pos == start {
389 return Err(LexerError {
390 message: "Expected variable name after @@".to_string(),
391 position: self.byte_pos,
392 near_token: Some("@@".to_string()),
393 });
394 }
395
396 let var_name = self.slice_from(start).to_string();
397 Ok(Token::SessionVariable(var_name))
398 }
399
400 fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
402 self.advance(); let start = self.byte_pos;
405
406 while !self.is_eof() {
408 let ch = self.current_char();
409 if ch.is_ascii_alphanumeric() || ch == '_' {
410 self.advance();
411 } else {
412 break;
413 }
414 }
415
416 if self.byte_pos == start {
417 return Err(LexerError {
418 message: "Expected variable name after @".to_string(),
419 position: self.byte_pos,
420 near_token: Some("@".to_string()),
421 });
422 }
423
424 let var_name = self.slice_from(start).to_string();
425 Ok(Token::UserVariable(var_name))
426 }
427
428 fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
431 self.advance(); let start_pos = self.position();
434 let mut num_str = String::new();
435
436 while !self.is_eof() {
438 let ch = self.current_char();
439 if ch.is_ascii_digit() {
440 num_str.push(ch);
441 self.advance();
442 } else {
443 break;
444 }
445 }
446
447 if num_str.is_empty() {
448 return Err(LexerError {
449 message: "Expected digit after '$' for numbered placeholder".to_string(),
450 position: start_pos,
451 near_token: Some("$".to_string()),
452 });
453 }
454
455 let index: usize = num_str.parse().map_err(|_| LexerError {
456 message: format!("Invalid numbered placeholder: ${}", num_str),
457 position: start_pos,
458 near_token: Some(format!("${}", num_str)),
459 })?;
460
461 if index == 0 {
463 return Err(LexerError {
464 message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
465 position: start_pos,
466 near_token: Some("$0".to_string()),
467 });
468 }
469
470 Ok(Token::NumberedPlaceholder(index))
471 }
472
473 fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
475 self.advance(); let mut name = String::new();
478
479 while !self.is_eof() {
481 let ch = self.current_char();
482 if ch.is_ascii_alphanumeric() || ch == '_' {
483 name.push(ch);
484 self.advance();
485 } else {
486 break;
487 }
488 }
489
490 if name.is_empty() {
491 return Err(LexerError {
492 message: "Expected identifier after ':' for named placeholder".to_string(),
493 position: self.position(),
494 near_token: Some(":".to_string()),
495 });
496 }
497
498 Ok(Token::NamedPlaceholder(name))
499 }
500
501 fn tokenize_dollar_named_placeholder(&mut self) -> Result<Token, LexerError> {
504 self.advance(); let mut name = String::new();
507
508 while !self.is_eof() {
510 let ch = self.current_char();
511 if ch.is_ascii_alphanumeric() || ch == '_' {
512 name.push(ch);
513 self.advance();
514 } else {
515 break;
516 }
517 }
518
519 if name.is_empty() {
520 return Err(LexerError {
521 message: "Expected identifier after '$' for named placeholder".to_string(),
522 position: self.position(),
523 near_token: Some("$".to_string()),
524 });
525 }
526
527 Ok(Token::NamedPlaceholder(name))
529 }
530
531 fn tokenize_tcl_global_placeholder(&mut self) -> Result<Token, LexerError> {
534 self.advance(); let mut name = String::new();
537
538 while !self.is_eof() {
540 let ch = self.current_char();
541 if ch == ':' || ch.is_ascii_alphanumeric() || ch == '_' {
542 name.push(ch);
543 self.advance();
544 } else {
545 break;
546 }
547 }
548
549 if name.is_empty() || name == ":" || name == "::" {
550 return Err(LexerError {
551 message: "Expected identifier after '$::' for TCL global placeholder".to_string(),
552 position: self.position(),
553 near_token: Some(format!("${}", name)),
554 });
555 }
556
557 Ok(Token::NamedPlaceholder(name))
559 }
560}