1use anyhow::anyhow;
2
3use crate::ast::{Token, TokenType};
4
5pub struct Scanner {
6 source_chars: Vec<char>,
7 tokens: Vec<Token>,
8 start: usize,
9 current: usize,
10 line: u32,
11 col: u32,
12 open_type_brackets: Option<u32>,
13}
14
15impl Scanner {
16 pub fn new(source: &str) -> Self {
17 Self {
18 source_chars: source.chars().collect(),
19 tokens: vec![],
20 start: 0,
21 current: 0,
22 line: 1,
23 col: 0,
24 open_type_brackets: None,
25 }
26 }
27
28 pub fn tokens(&self) -> &Vec<Token> {
29 &self.tokens
30 }
31
32 fn advance(&mut self) -> char {
33 let c = self.source_chars[self.current];
34 self.current += 1;
35 self.col += 1;
36 c
37 }
38
39 fn n_advance(&mut self, n: usize) -> char {
40 debug_assert!(n > 0);
41 let mut c = self.advance();
42 for _ in 1..n {
43 c = self.advance();
44 }
45 c
46 }
47
48 fn is_at_end(&self) -> bool {
49 self.current >= self.source_chars.len()
50 }
51
52 fn peek(&self) -> char {
53 if self.is_at_end() {
54 '\0'
55 } else {
56 self.source_chars[self.current]
57 }
58 }
59
60 fn peek_prev(&self) -> Option<char> {
61 self.peek_prev_i(1)
62 }
63
64 fn peek_prev_i(&self, i: usize) -> Option<char> {
65 let idx = self.current.checked_sub(i)?;
66 Some(self.source_chars[idx])
67 }
68
69 fn peek_next_i(&mut self, i: usize) -> char {
70 if self.current + i >= self.source_chars.len() {
71 '\0'
72 } else {
73 self.source_chars[self.current + i]
74 }
75 }
76
77 fn n_peek(&mut self, n: usize) -> Option<&[char]> {
78 self.source_chars.get(self.current..self.current + n)
79 }
80
81 fn match_char(&mut self, expected: char) -> bool {
82 if self.peek() != expected {
83 return false;
84 };
85
86 self.current += 1;
87 true
88 }
89
90 fn add_token(&mut self, token_type: TokenType) {
91 self.tokens.push(Token {
92 kind: token_type,
93 lexeme: self.source_chars[self.start..self.current].iter().collect(),
94 line: self.line,
95 col: self.col,
96 });
97 }
98
99 fn current_source_str(&self) -> String {
100 self.source_chars[self.start..self.current].iter().collect()
101 }
102
103 fn reset(&mut self) {
104 self.tokens.clear();
105 self.start = 0;
106 self.current = 0;
107 self.col = 1;
108 self.line = 1;
109 }
110
111 fn new_line(&mut self) {
112 self.line += 1;
113 self.col = 1;
114 }
115
116 pub fn scan(&mut self) -> anyhow::Result<()> {
117 self.reset();
118 while self.current < self.source_chars.len() {
119 self.start = self.current;
120 self.scan_token()?;
121 }
122 self.tokens.push(Token {
123 kind: TokenType::Eof,
124 lexeme: String::from("eof"),
125 line: self.line,
126 col: self.col,
127 });
128
129 Ok(())
130 }
131
132 fn is_raw_string(&mut self, c: char) -> bool {
133 let next_c = self.peek();
134 (c == 'r' || c == 'R') && (next_c == '\'' || next_c == '"')
135 }
136
137 fn is_bytes(&mut self, c: char) -> bool {
138 let next_c = self.peek();
139 (c == 'b' || c == 'B') && (next_c == '\'' || next_c == '"')
140 }
141
142 fn is_raw_bytes(&mut self, c: char) -> bool {
143 let next_c = self.peek();
144 let next_next_c = self.peek_next_i(1);
145 (c == 'b' || c == 'B' || c == 'r' || c == 'R')
146 && ((c == 'b' && next_c == 'r')
147 || (c == 'B' && next_c == 'R')
148 || (c == 'r' && next_c == 'b')
149 || (c == 'R' && next_c == 'B'))
150 && (next_next_c == '\'' || next_next_c == '"')
151 }
152
153 fn scan_string(&mut self, delimiter: char) -> anyhow::Result<()> {
154 loop {
155 let peek_char = self.peek();
156 if peek_char == '\0' {
157 return Err(anyhow!(self.error_str("Found unterminated string")));
158 }
159 let escaped = self.peek_prev().is_some_and(|prev| {
160 prev == '\\' && self.peek_prev_i(2).is_some_and(|prev_2| prev_2 != '\\')
161 });
162 if !escaped && self.match_char(delimiter) {
163 break;
164 }
165 self.advance();
166 }
167 Ok(())
168 }
169
170 fn scan_triple_quoted_string(&mut self, delimiter: char) -> anyhow::Result<()> {
171 loop {
172 let peek_char = self.peek();
173 if peek_char == '\0' {
174 return Err(anyhow!(self.error_str("Found unterminated string")));
175 }
176 let escaped = self.peek_prev().is_some_and(|prev| {
177 prev == '\\' && self.peek_prev_i(2).is_some_and(|prev_2| prev_2 != '\\')
178 });
179 if !escaped && self.match_char(delimiter) {
180 let curr = self.current - 1;
181 if self.match_char(delimiter) && self.match_char(delimiter) {
182 break;
183 } else {
184 self.current = curr;
185 }
186 }
187 if peek_char == '\n' {
188 self.new_line();
189 }
190 self.advance();
191 }
192 Ok(())
193 }
194
195 fn string_slice(&mut self, start_offset: usize, end_offset: usize) -> String {
196 self.source_chars[self.start + 1 + start_offset..self.current - 1 - end_offset]
197 .iter()
198 .collect::<String>()
199 }
200
201 fn match_number(&mut self) -> anyhow::Result<()> {
202 let mut found_dot = false;
203 let mut found_e = false;
204 loop {
205 let peek_char = self.peek();
206
207 if peek_char == '\0' {
208 self.add_token(TokenType::Number(
209 self.source_chars[self.start..self.current]
210 .iter()
211 .collect::<String>(),
212 ));
213 break;
214 }
215
216 if peek_char == '.' {
217 if found_dot || found_e {
218 return Err(anyhow!(self.error_str("Found invalid number")));
219 }
220 found_dot = true;
221 self.advance();
222 } else if peek_char == 'e' || peek_char == 'E' {
223 if found_e {
224 return Err(anyhow!(self.error_str("Found invalid number")));
225 }
226 found_e = true;
227 let peek_next_char = self.peek_next_i(1);
228 if peek_next_char == '+' || peek_next_char == '-' {
229 self.advance();
230 if !(self.peek_next_i(1).is_ascii_digit()) {
231 return Err(anyhow!(self.error_str("Found invalid number")));
232 }
233 self.advance();
234 } else if peek_next_char.is_ascii_digit() {
235 self.advance();
236 } else {
237 return Err(anyhow!(self.error_str("Found invalid number")));
238 }
239 } else if peek_char.is_ascii_digit() {
240 self.advance();
241 } else {
242 self.add_token(TokenType::Number(
243 self.source_chars[self.start..self.current]
244 .iter()
245 .collect::<String>(),
246 ));
247 break;
248 }
249 }
250
251 Ok(())
252 }
253
254 fn match_string(&mut self, delimiter: char) -> anyhow::Result<()> {
255 self.scan_string(delimiter)?;
256 let str_slice = self.string_slice(0, 0);
257 self.add_token(TokenType::String(str_slice));
258 Ok(())
259 }
260
261 fn match_triple_quoted_string(&mut self, delimiter: char) -> anyhow::Result<()> {
262 self.scan_triple_quoted_string(delimiter)?;
263 let str_slice = self.string_slice(2, 2);
264 self.add_token(TokenType::String(str_slice));
265 Ok(())
266 }
267
268 fn match_bytes(&mut self, delimiter: char) -> anyhow::Result<()> {
269 self.scan_string(delimiter)?;
270 let str_slice = self.string_slice(1, 0);
271 self.add_token(TokenType::Bytes(str_slice));
272 Ok(())
273 }
274
275 fn match_triple_quoted_bytes(&mut self, delimiter: char) -> anyhow::Result<()> {
276 self.scan_triple_quoted_string(delimiter)?;
277 let str_slice = self.string_slice(3, 2);
278 self.add_token(TokenType::Bytes(str_slice));
279 Ok(())
280 }
281
282 fn match_raw_bytes(&mut self, delimiter: char) -> anyhow::Result<()> {
283 self.scan_string(delimiter)?;
284 let str_slice = self.string_slice(2, 0);
285 self.add_token(TokenType::RawBytes(str_slice));
286 Ok(())
287 }
288
289 fn match_triple_quoted_raw_bytes(&mut self, delimiter: char) -> anyhow::Result<()> {
290 self.scan_triple_quoted_string(delimiter)?;
291 let str_slice = self.string_slice(4, 2);
292 self.add_token(TokenType::RawBytes(str_slice));
293 Ok(())
294 }
295
296 fn match_raw_string(&mut self, delimiter: char) -> anyhow::Result<()> {
297 self.scan_string(delimiter)?;
298 let str_slice = self.string_slice(1, 0);
299 self.add_token(TokenType::RawString(str_slice));
300 Ok(())
301 }
302
303 fn match_triple_quoted_raw_string(&mut self, delimiter: char) -> anyhow::Result<()> {
304 self.scan_triple_quoted_string(delimiter)?;
305 let str_slice = self.string_slice(3, 2);
306 self.add_token(TokenType::RawString(str_slice));
307 Ok(())
308 }
309
310 fn match_reserved_keyword(&mut self, token_type: TokenType) {
311 if let Some(Token {
312 kind: TokenType::Dot,
313 ..
314 }) = self.tokens.last()
315 {
316 self.add_token(TokenType::Identifier(self.current_source_str()));
317 return;
318 }
319 self.add_token(token_type);
320 }
321
322 fn match_keyword_or_identifier(&mut self) {
323 loop {
324 let peek_char = self.peek();
325 if !(peek_char.is_alphanumeric() || peek_char == '_') {
326 break;
327 }
328 self.advance();
329 }
330 let identifer: String = self.source_chars[self.start..self.current].iter().collect();
331
332 match identifer.to_lowercase().as_str() {
333 "array" => {
334 self.match_reserved_keyword(TokenType::Array);
335 if self.peek() == '<' && self.open_type_brackets.is_none() {
336 self.open_type_brackets = Some(0);
337 }
338 }
339 "struct" => {
340 self.match_reserved_keyword(TokenType::Struct);
341 if self.peek() == '<' && self.open_type_brackets.is_none() {
342 self.open_type_brackets = Some(0)
343 }
344 }
345 "all" => self.match_reserved_keyword(TokenType::All),
346 "and" => self.match_reserved_keyword(TokenType::And),
347 "any" => self.match_reserved_keyword(TokenType::Any),
348 "as" => self.match_reserved_keyword(TokenType::As),
349 "asc" => self.match_reserved_keyword(TokenType::Asc),
350 "assert_rows_modified" => self.match_reserved_keyword(TokenType::AssertRowsModified),
351 "at" => self.match_reserved_keyword(TokenType::At),
352 "between" => self.match_reserved_keyword(TokenType::Between),
353 "by" => self.match_reserved_keyword(TokenType::By),
354 "case" => self.match_reserved_keyword(TokenType::Case),
355 "cast" => self.match_reserved_keyword(TokenType::Cast),
356 "collate" => self.match_reserved_keyword(TokenType::Collate),
357 "contains" => self.match_reserved_keyword(TokenType::Contains),
358 "create" => self.match_reserved_keyword(TokenType::Create),
359 "cross" => self.match_reserved_keyword(TokenType::Cross),
360 "cube" => self.match_reserved_keyword(TokenType::Cube),
361 "current" => self.match_reserved_keyword(TokenType::Current),
362 "default" => self.match_reserved_keyword(TokenType::Default),
363 "define" => self.match_reserved_keyword(TokenType::Define),
364 "desc" => self.match_reserved_keyword(TokenType::Desc),
365 "distinct" => self.match_reserved_keyword(TokenType::Distinct),
366 "else" => self.match_reserved_keyword(TokenType::Else),
367 "end" => self.match_reserved_keyword(TokenType::End),
368 "enum" => self.match_reserved_keyword(TokenType::Enum),
369 "escape" => self.match_reserved_keyword(TokenType::Escape),
370 "except" => self.match_reserved_keyword(TokenType::Except),
371 "exclude" => self.match_reserved_keyword(TokenType::Exclude),
372 "exists" => self.match_reserved_keyword(TokenType::Exists),
373 "extract" => self.match_reserved_keyword(TokenType::Extract),
374 "false" => self.match_reserved_keyword(TokenType::False),
375 "fetch" => self.match_reserved_keyword(TokenType::Fetch),
376 "following" => self.match_reserved_keyword(TokenType::Following),
377 "for" => self.match_reserved_keyword(TokenType::For),
378 "from" => self.match_reserved_keyword(TokenType::From),
379 "full" => self.match_reserved_keyword(TokenType::Full),
380 "group" => self.match_reserved_keyword(TokenType::Group),
381 "grouping" => self.match_reserved_keyword(TokenType::Grouping),
382 "groups" => self.match_reserved_keyword(TokenType::Groups),
383 "hash" => self.match_reserved_keyword(TokenType::Hash),
384 "having" => self.match_reserved_keyword(TokenType::Having),
385 "if" => self.match_reserved_keyword(TokenType::If),
386 "ignore" => self.match_reserved_keyword(TokenType::Ignore),
387 "in" => self.match_reserved_keyword(TokenType::In),
388 "inner" => self.match_reserved_keyword(TokenType::Inner),
389 "intersect" => self.match_reserved_keyword(TokenType::Intersect),
390 "interval" => self.match_reserved_keyword(TokenType::Interval),
391 "into" => self.match_reserved_keyword(TokenType::Into),
392 "is" => self.match_reserved_keyword(TokenType::Is),
393 "join" => self.match_reserved_keyword(TokenType::Join),
394 "lateral" => self.match_reserved_keyword(TokenType::Lateral),
395 "left" => self.match_reserved_keyword(TokenType::Left),
396 "like" => self.match_reserved_keyword(TokenType::Like),
397 "limit" => self.match_reserved_keyword(TokenType::Limit),
398 "lookup" => self.match_reserved_keyword(TokenType::Lookup),
399 "merge" => self.match_reserved_keyword(TokenType::Merge),
400 "natural" => self.match_reserved_keyword(TokenType::Natural),
401 "new" => self.match_reserved_keyword(TokenType::New),
402 "no" => self.match_reserved_keyword(TokenType::No),
403 "not" => self.match_reserved_keyword(TokenType::Not),
404 "null" => self.match_reserved_keyword(TokenType::Null),
405 "nulls" => self.match_reserved_keyword(TokenType::Nulls),
406 "of" => self.match_reserved_keyword(TokenType::Of),
407 "on" => self.match_reserved_keyword(TokenType::On),
408 "or" => self.match_reserved_keyword(TokenType::Or),
409 "order" => self.match_reserved_keyword(TokenType::Order),
410 "outer" => self.match_reserved_keyword(TokenType::Outer),
411 "over" => self.match_reserved_keyword(TokenType::Over),
412 "partition" => self.match_reserved_keyword(TokenType::Partition),
413 "preceding" => self.match_reserved_keyword(TokenType::Preceding),
414 "proto" => self.match_reserved_keyword(TokenType::Proto),
415 "qualify" => self.match_reserved_keyword(TokenType::Qualify),
416 "range" => self.match_reserved_keyword(TokenType::Range),
417 "recursive" => self.match_reserved_keyword(TokenType::Recursive),
418 "respect" => self.match_reserved_keyword(TokenType::Respect),
419 "right" => self.match_reserved_keyword(TokenType::Right),
420 "rollup" => self.match_reserved_keyword(TokenType::Rollup),
421 "rows" => self.match_reserved_keyword(TokenType::Rows),
422 "select" => self.match_reserved_keyword(TokenType::Select),
423 "set" => self.match_reserved_keyword(TokenType::Set),
424 "some" => self.match_reserved_keyword(TokenType::Some),
425 "tablesample" => self.match_reserved_keyword(TokenType::Tablesample),
426 "then" => self.match_reserved_keyword(TokenType::Then),
427 "to" => self.match_reserved_keyword(TokenType::To),
428 "treat" => self.match_reserved_keyword(TokenType::Treat),
429 "true" => self.match_reserved_keyword(TokenType::True),
430 "union" => self.match_reserved_keyword(TokenType::Union),
431 "unnest" => self.match_reserved_keyword(TokenType::Unnest),
432 "using" => self.match_reserved_keyword(TokenType::Using),
433 "when" => self.match_reserved_keyword(TokenType::When),
434 "where" => self.match_reserved_keyword(TokenType::Where),
435 "window" => self.match_reserved_keyword(TokenType::Window),
436 "with" => self.match_reserved_keyword(TokenType::With),
437 "within" => self.match_reserved_keyword(TokenType::Within),
438 _ => self.add_token(TokenType::Identifier(self.current_source_str())),
439 }
440 }
441
442 fn scan_token(&mut self) -> anyhow::Result<()> {
443 let curr_char = self.advance();
444 match curr_char {
445 '(' => self.add_token(TokenType::LeftParen),
446 ')' => self.add_token(TokenType::RightParen),
447 '[' => self.add_token(TokenType::LeftSquare),
448 ']' => self.add_token(TokenType::RightSquare),
449 '*' => self.add_token(TokenType::Star),
450 ',' => self.add_token(TokenType::Comma),
451 ':' => self.add_token(TokenType::Colon),
452 ';' => self.add_token(TokenType::Semicolon),
453 '.' => {
454 let peek_char = self.peek();
455 if peek_char.is_ascii_digit() {
456 self.match_number()?;
457 } else {
458 self.add_token(TokenType::Dot);
459 }
460 }
461 '+' => self.add_token(TokenType::Plus),
462 '=' => {
463 if self.match_char('>') {
464 self.add_token(TokenType::RightArrow);
465 } else {
466 self.add_token(TokenType::Equal)
467 }
468 }
469 '/' => {
470 if self.match_char('*') {
471 loop {
472 if self.peek() == '\0' {
473 return Err(anyhow!(self.error_str("Found unterminated comment")));
474 }
475 if self.peek() == '\n' {
476 self.new_line();
477 }
478 let peek_chars = self.n_peek(2);
479 if peek_chars.is_some()
480 && peek_chars
481 .unwrap()
482 .iter()
483 .zip("*/".chars())
484 .all(|(&c1, c2)| c1 == c2)
485 {
486 self.n_advance(2);
487 break;
488 }
489 self.advance();
490 }
491 } else {
492 self.add_token(TokenType::Slash)
493 }
494 }
495 '#' => loop {
496 let peek_char = self.peek();
497 if peek_char == '\n' || peek_char == '\0' {
498 break;
499 }
500 self.advance();
501 },
502 '-' => {
503 if self.match_char('-') {
504 loop {
505 let peek_char = self.peek();
506 if peek_char == '\n' || peek_char == '\0' {
507 break;
508 }
509 self.advance();
510 }
511 } else {
512 self.add_token(TokenType::Minus)
513 }
514 }
515 '<' => {
516 if self.match_char('>') {
517 self.add_token(TokenType::NotEqual);
518 } else if self.match_char('=') {
519 self.add_token(TokenType::LessEqual);
520 } else if self.match_char('<') {
521 self.add_token(TokenType::BitwiseLeftShift);
522 } else {
523 if self.open_type_brackets.is_some() {
524 self.open_type_brackets = self.open_type_brackets.map(|n| n + 1);
525 }
526 self.add_token(TokenType::Less);
527 }
528 }
529 '!' => {
530 if self.match_char('=') {
531 self.add_token(TokenType::BangEqual);
532 } else {
533 self.add_token(TokenType::Bang);
534 }
535 }
536 '>' => {
537 if self.match_char('=') {
538 self.add_token(TokenType::GreaterEqual);
539 } else if self.peek() == '>' {
540 if self.open_type_brackets.is_some() {
541 self.open_type_brackets = self.open_type_brackets.map(|n| n - 1);
542 self.add_token(TokenType::Greater);
543 } else {
544 self.match_char('>');
545 self.add_token(TokenType::BitwiseRightShift);
546 }
547 } else {
548 if self.open_type_brackets.is_some() {
549 self.open_type_brackets = self.open_type_brackets.and_then(|n| {
550 let new_n = n - 1;
551 if new_n == 0 { None } else { Some(new_n) }
552 });
553 }
554 self.add_token(TokenType::Greater);
555 }
556 }
557 '~' => {
558 self.add_token(TokenType::BitwiseNot);
559 }
560 '&' => {
561 self.add_token(TokenType::BitwiseAnd);
562 }
563 '|' => {
564 if self.match_char('|') {
565 self.add_token(TokenType::ConcatOperator);
566 } else {
567 self.add_token(TokenType::BitwiseOr);
568 }
569 }
570 '^' => {
571 self.add_token(TokenType::BitwiseXor);
572 }
573 '\n' => {
574 self.new_line();
575 }
576 '\r' | ' ' | '\t' => {}
577
578 c if c == '\'' || c == '"' => {
580 let peek = self.peek();
581 if peek == c && peek == self.peek_next_i(1) {
582 self.advance();
583 self.advance();
584 self.match_triple_quoted_string(c)?;
585 } else {
586 self.match_string(c)?;
587 }
588 }
589
590 c if self.is_raw_string(c) => {
592 let peek_next = self.peek_next_i(1);
593 if self.peek() == peek_next && peek_next == self.peek_next_i(2) {
594 self.advance();
595 self.advance();
596 let delimiter = self.advance();
597 self.match_triple_quoted_raw_string(delimiter)?;
598 } else {
599 let delimiter = self.advance();
600 self.match_raw_string(delimiter)?;
601 }
602 }
603
604 c if self.is_bytes(c) => {
606 let peek_next = self.peek_next_i(1);
607 if self.peek() == peek_next && peek_next == self.peek_next_i(2) {
608 self.advance();
609 let delimiter = self.advance();
610 self.match_triple_quoted_bytes(delimiter)?;
611 } else {
612 let delimiter = self.advance();
613 self.match_bytes(delimiter)?;
614 }
615 }
616
617 c if self.is_raw_bytes(c) => {
619 let peek_next_next = self.peek_next_i(2);
620 if self.peek_next_i(1) == peek_next_next && peek_next_next == self.peek_next_i(3) {
621 self.advance();
622 self.advance();
623 self.advance();
624 let delimiter = self.advance();
625 self.match_triple_quoted_raw_bytes(delimiter)?;
626 } else {
627 self.advance();
628 let delimiter = self.advance();
629 self.match_raw_bytes(delimiter)?;
630 }
631 }
632
633 c if c.is_ascii_digit() => {
635 self.match_number()?;
636 }
637
638 c if c.is_alphabetic() || c == '_' => {
640 self.match_keyword_or_identifier();
641 }
642
643 '@' => {
645 let is_system_variable = self.match_char('@');
646 loop {
647 let peek_char = self.peek();
648 if !(peek_char.is_alphanumeric() || peek_char == '_') {
649 break;
650 }
651 self.advance();
652 }
653 if is_system_variable {
654 self.add_token(TokenType::SystemVariable(
655 self.source_chars[self.start + 2..self.current]
656 .iter()
657 .collect(),
658 ));
659 } else {
660 self.add_token(TokenType::QueryNamedParameter(
661 self.source_chars[self.start + 1..self.current]
662 .iter()
663 .collect(),
664 ));
665 }
666 }
667
668 '?' => {
670 self.advance();
671 self.add_token(TokenType::QueryPositionalParameter);
672 }
673
674 '`' => {
675 let quoted_ident_start_idx = self.current - 1;
676 loop {
677 let curr_char = self.advance();
678 if curr_char == '`' {
679 let quoted_ident_end_idx = self.current - 1;
680 if quoted_ident_end_idx == quoted_ident_start_idx + 1 {
681 return Err(anyhow!(self.error_str("Found empty quoted identifier.")));
682 }
683 self.add_token(TokenType::QuotedIdentifier(
684 self.source_chars[(quoted_ident_start_idx + 1)..quoted_ident_end_idx]
685 .iter()
686 .collect::<String>(),
687 ));
688 break;
689 }
690 if self.peek() == '\0' {
691 return Err(anyhow!(
692 self.error_str("Found unterminated quoted identifier")
693 ));
694 }
695 }
696 }
697
698 _ => {
699 return Err(anyhow!(self.error_str(&format!(
700 "Found unexpected character while scanning: {}",
701 curr_char
702 ))));
703 }
704 }
705 Ok(())
706 }
707
708 fn error_str(&mut self, error: &str) -> String {
709 format!(
710 "[line: {}, col: {}] Scanner error: {}",
711 self.line, self.col, error
712 )
713 }
714}