vibesql_parser/lexer/
mod.rs1use std::fmt;
11
12use crate::token::Token;
13
14mod identifiers;
15mod keywords;
16mod numbers;
17mod operators;
18mod strings;
19
/// Error reported when the lexer cannot produce a token.
///
/// `Eq` is derived alongside `PartialEq` because both fields (`String` and
/// `usize`) have total equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Offset into the input that the error refers to (the lexer in this
    /// module fills it from its byte position).
    pub position: usize,
}
26
27impl fmt::Display for LexerError {
28 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29 write!(f, "Lexer error at position {}: {}", self.position, self.message)
30 }
31}
32
/// Cursor over a borrowed input string that is turned into tokens.
///
/// `Debug` and `Clone` are derived so the public type can be inspected in
/// diagnostics and cheaply duplicated (it only holds a borrow and an offset).
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    /// The full text being lexed.
    input: &'a str,
    /// Current read offset into `input`, measured in bytes.
    byte_pos: usize,
}
41
42impl<'a> Lexer<'a> {
43 #[inline]
45 pub fn new(input: &'a str) -> Self {
46 Lexer { input, byte_pos: 0 }
47 }
48
49 pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
51 let estimated_tokens = (self.input.len() / 6).max(4);
53 let mut tokens = Vec::with_capacity(estimated_tokens);
54
55 loop {
56 self.skip_whitespace_and_comments();
57
58 if self.is_eof() {
59 tokens.push(Token::Eof);
60 break;
61 }
62
63 let token = self.next_token()?;
64 tokens.push(token);
65 }
66
67 Ok(tokens)
68 }
69
70 fn next_token(&mut self) -> Result<Token, LexerError> {
72 let ch = self.current_char();
73
74 match ch {
75 ';' => {
76 self.advance();
77 Ok(Token::Semicolon)
78 }
79 ',' => {
80 self.advance();
81 Ok(Token::Comma)
82 }
83 '(' => {
84 self.advance();
85 Ok(Token::LParen)
86 }
87 ')' => {
88 self.advance();
89 Ok(Token::RParen)
90 }
91 '=' | '<' | '>' | '!' | '|' => self.tokenize_operator(ch),
92 '@' => {
93 if self.peek_byte(1) == Some(b'@') {
95 self.tokenize_session_variable()
96 } else {
97 self.tokenize_user_variable()
98 }
99 }
100 '.' => {
101 if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
103 self.tokenize_number()
104 } else {
105 self.advance();
106 Ok(Token::Symbol('.'))
107 }
108 }
109 '+' | '-' | '*' | '/' => {
110 let symbol = ch;
111 self.advance();
112 Ok(Token::Symbol(symbol))
113 }
114 '\'' => self.tokenize_string(),
115 '"' => self.tokenize_delimited_identifier(),
116 '`' => self.tokenize_backtick_identifier(),
117 '0'..='9' => self.tokenize_number(),
118 'a'..='z' | 'A'..='Z' | '_' => self.tokenize_identifier_or_keyword(),
119 '?' => {
120 self.advance();
121 Ok(Token::Placeholder)
122 }
123 '$' => {
124 if self.peek_byte(1).map(|b| b.is_ascii_digit()).unwrap_or(false) {
126 self.tokenize_numbered_placeholder()
127 } else {
128 Err(LexerError {
129 message: "Expected digit after '$' for numbered placeholder".to_string(),
130 position: self.position(),
131 })
132 }
133 }
134 ':' => {
135 if self.peek_byte(1).map(|b| b.is_ascii_alphabetic() || b == b'_').unwrap_or(false)
137 {
138 self.tokenize_named_placeholder()
139 } else {
140 self.advance();
142 Ok(Token::Symbol(':'))
143 }
144 }
145 _ => Err(LexerError {
146 message: format!("Unexpected character: '{}'", ch),
147 position: self.byte_pos,
148 }),
149 }
150 }
151
152 #[inline]
154 fn skip_whitespace(&mut self) {
155 while let Some(b) = self.peek_byte(0) {
156 if b.is_ascii_whitespace() {
157 self.byte_pos += 1;
158 } else {
159 break;
160 }
161 }
162 }
163
164 fn skip_whitespace_and_comments(&mut self) {
167 loop {
168 self.skip_whitespace();
169
170 if self.is_eof() {
171 break;
172 }
173
174 if self.peek_byte(0) == Some(b'-') && self.peek_byte(1) == Some(b'-') {
176 while let Some(b) = self.peek_byte(0) {
178 self.byte_pos += 1;
179 if b == b'\n' {
180 break;
181 }
182 }
183 continue;
185 }
186
187 break;
189 }
190 }
191
192 #[inline]
194 pub(super) fn current_char(&self) -> char {
195 if self.byte_pos >= self.input.len() {
196 '\0'
197 } else {
198 let b = self.input.as_bytes()[self.byte_pos];
200 if b.is_ascii() {
201 b as char
202 } else {
203 self.input[self.byte_pos..].chars().next().unwrap_or('\0')
205 }
206 }
207 }
208
209 #[inline]
211 pub(super) fn peek_byte(&self, n: usize) -> Option<u8> {
212 let peek_pos = self.byte_pos + n;
213 if peek_pos < self.input.len() {
214 Some(self.input.as_bytes()[peek_pos])
215 } else {
216 None
217 }
218 }
219
220 #[inline]
222 pub(super) fn advance(&mut self) {
223 if self.byte_pos < self.input.len() {
224 let b = self.input.as_bytes()[self.byte_pos];
226 if b.is_ascii() {
227 self.byte_pos += 1;
228 } else {
229 if let Some(ch) = self.input[self.byte_pos..].chars().next() {
231 self.byte_pos += ch.len_utf8();
232 }
233 }
234 }
235 }
236
237 #[inline]
239 pub(super) fn is_eof(&self) -> bool {
240 self.byte_pos >= self.input.len()
241 }
242
243 #[inline]
245 pub(super) fn position(&self) -> usize {
246 self.byte_pos
247 }
248
249 #[inline]
251 pub(super) fn slice_from(&self, start: usize) -> &'a str {
252 &self.input[start..self.byte_pos]
253 }
254
255 fn tokenize_session_variable(&mut self) -> Result<Token, LexerError> {
257 self.advance(); self.advance(); let start = self.byte_pos;
261
262 while !self.is_eof() {
264 let ch = self.current_char();
265 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
266 self.advance();
267 } else {
268 break;
269 }
270 }
271
272 if self.byte_pos == start {
273 return Err(LexerError {
274 message: "Expected variable name after @@".to_string(),
275 position: self.byte_pos,
276 });
277 }
278
279 let var_name = self.slice_from(start).to_string();
280 Ok(Token::SessionVariable(var_name))
281 }
282
283 fn tokenize_user_variable(&mut self) -> Result<Token, LexerError> {
285 self.advance(); let start = self.byte_pos;
288
289 while !self.is_eof() {
291 let ch = self.current_char();
292 if ch.is_ascii_alphanumeric() || ch == '_' {
293 self.advance();
294 } else {
295 break;
296 }
297 }
298
299 if self.byte_pos == start {
300 return Err(LexerError {
301 message: "Expected variable name after @".to_string(),
302 position: self.byte_pos,
303 });
304 }
305
306 let var_name = self.slice_from(start).to_string();
307 Ok(Token::UserVariable(var_name))
308 }
309
310 fn tokenize_numbered_placeholder(&mut self) -> Result<Token, LexerError> {
313 self.advance(); let start_pos = self.position();
316 let mut num_str = String::new();
317
318 while !self.is_eof() {
320 let ch = self.current_char();
321 if ch.is_ascii_digit() {
322 num_str.push(ch);
323 self.advance();
324 } else {
325 break;
326 }
327 }
328
329 if num_str.is_empty() {
330 return Err(LexerError {
331 message: "Expected digit after '$' for numbered placeholder".to_string(),
332 position: start_pos,
333 });
334 }
335
336 let index: usize = num_str.parse().map_err(|_| LexerError {
337 message: format!("Invalid numbered placeholder: ${}", num_str),
338 position: start_pos,
339 })?;
340
341 if index == 0 {
343 return Err(LexerError {
344 message: "Numbered placeholder must be $1 or higher (no $0)".to_string(),
345 position: start_pos,
346 });
347 }
348
349 Ok(Token::NumberedPlaceholder(index))
350 }
351
352 fn tokenize_named_placeholder(&mut self) -> Result<Token, LexerError> {
354 self.advance(); let mut name = String::new();
357
358 while !self.is_eof() {
360 let ch = self.current_char();
361 if ch.is_ascii_alphanumeric() || ch == '_' {
362 name.push(ch);
363 self.advance();
364 } else {
365 break;
366 }
367 }
368
369 if name.is_empty() {
370 return Err(LexerError {
371 message: "Expected identifier after ':' for named placeholder".to_string(),
372 position: self.position(),
373 });
374 }
375
376 Ok(Token::NamedPlaceholder(name))
377 }
378}