1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Hand-rolled SQL tokenizer that turns an input string into a flat
/// [`Token`] stream with line/column position tracking.
pub struct Tokenizer {
    /// Source text as a vector of chars for O(1) indexed access.
    input: Vec<char>,
    /// Current index into `input`.
    pos: usize,
    /// 1-based line number at `pos`.
    line: usize,
    /// 1-based column number at `pos`.
    col: usize,
    /// When true, `tokenize` keeps line/block comment tokens instead of
    /// discarding them.
    pub preserve_comments: bool,
}
20
21impl Tokenizer {
    /// Creates a tokenizer over `input` that discards comment tokens.
    #[must_use]
    pub fn new(input: &str) -> Self {
        Self {
            input: input.chars().collect(),
            pos: 0,
            // Positions are 1-based, matching editor conventions.
            line: 1,
            col: 1,
            preserve_comments: false,
        }
    }
33
34 #[must_use]
36 pub fn with_comments(input: &str) -> Self {
37 Self {
38 input: input.chars().collect(),
39 pos: 0,
40 line: 1,
41 col: 1,
42 preserve_comments: true,
43 }
44 }
45
46 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
50 let mut tokens = Vec::new();
51 loop {
52 let token = self.next_token()?;
53 match token.token_type {
54 TokenType::Eof => {
55 tokens.push(token);
56 break;
57 }
58 TokenType::Whitespace => continue,
59 TokenType::LineComment | TokenType::BlockComment => {
60 if self.preserve_comments {
61 tokens.push(token);
62 }
63 }
64 _ => tokens.push(token),
65 }
66 }
67 Ok(tokens)
68 }
69
    /// Returns the current character without consuming it, or `None` at EOF.
    fn peek(&self) -> Option<char> {
        self.input.get(self.pos).copied()
    }
73
    /// Returns the character `offset` positions ahead of the cursor without
    /// consuming anything, or `None` if that runs past the end of input.
    fn peek_at(&self, offset: usize) -> Option<char> {
        self.input.get(self.pos + offset).copied()
    }
77
78 fn advance(&mut self) -> Option<char> {
79 let ch = self.input.get(self.pos).copied();
80 if let Some(c) = ch {
81 self.pos += 1;
82 if c == '\n' {
83 self.line += 1;
84 self.col = 1;
85 } else {
86 self.col += 1;
87 }
88 }
89 ch
90 }
91
    /// Builds a token anchored at the recorded start offset/line/column.
    ///
    /// Thin wrapper over [`Token::with_location`], kept as a method so all
    /// token construction in this tokenizer flows through one place.
    fn make_token(
        &self,
        token_type: TokenType,
        value: impl Into<String>,
        start: usize,
        start_line: usize,
        start_col: usize,
    ) -> Token {
        Token::with_location(token_type, value, start, start_line, start_col)
    }
102
    /// Scans and returns the next token, skipping any leading whitespace.
    ///
    /// Once input is exhausted this returns an `Eof` token with an empty
    /// value; repeated calls keep returning `Eof`.
    ///
    /// # Errors
    /// Returns [`SqlglotError::TokenizerError`] for an unexpected character,
    /// an unterminated block comment, string literal, or quoted identifier.
    fn next_token(&mut self) -> Result<Token> {
        // Skip whitespace via `advance` so line/col bookkeeping stays exact.
        while self.peek().is_some_and(|c| c.is_whitespace()) {
            self.advance();
        }

        // Record the token's start position before consuming its first char.
        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;

        let Some(ch) = self.advance() else {
            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
        };

        match ch {
            // Single-character punctuation.
            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
            // `[` is ambiguous: T-SQL quoted identifier (`[name]`) vs array
            // subscript/constructor. Heuristic: treat it as a quoted
            // identifier when the text after `[` starts like an identifier
            // and reaches `]` before any `,` or newline.
            '[' => {
                let mut looks_like_ident = false;
                if let Some(first_inner) = self.peek()
                    && (first_inner.is_ascii_alphabetic() || first_inner == '_')
                {
                    // Lookahead scan; does not move the cursor.
                    let mut scan = self.pos;
                    while scan < self.input.len() {
                        if self.input[scan] == ']' {
                            // Require at least one char between the brackets.
                            looks_like_ident = scan > self.pos;
                            break;
                        }
                        if self.input[scan] == ',' || self.input[scan] == '\n' {
                            break;
                        }
                        scan += 1;
                    }
                }
                if looks_like_ident {
                    self.read_quoted_identifier(start, start_line, start_col, '[')
                } else {
                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
                }
            }
            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
            '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
            '@' => Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col)),
            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
            // NOTE(review): `Percent2` presumably denotes the modulo
            // operator — confirm the variant naming in `TokenType`.
            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),

            // `::` cast operator vs a lone `:`.
            ':' => {
                if self.peek() == Some(':') {
                    self.advance();
                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
                }
            }

            // `--` line comment, `->` / `->>` JSON arrows, or minus.
            '-' => {
                if self.peek() == Some('-') {
                    self.advance();
                    // Consume to end of line; the newline itself is left for
                    // the whitespace skipper.
                    let mut value = String::from("--");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(
                        self.make_token(
                            TokenType::LineComment,
                            value,
                            start,
                            start_line,
                            start_col,
                        ),
                    )
                } else if self.peek() == Some('>') {
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(
                            TokenType::DoubleArrow,
                            "->>",
                            start,
                            start_line,
                            start_col,
                        ))
                    } else {
                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
                    }
                } else {
                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
                }
            }

            // `/* ... */` block comment (nested comments supported via a
            // depth counter) or division.
            '/' => {
                if self.peek() == Some('*') {
                    self.advance();
                    let mut value = String::from("/*");
                    let mut depth = 1;
                    while depth > 0 {
                        match self.advance() {
                            Some('*') if self.peek() == Some('/') => {
                                self.advance();
                                depth -= 1;
                                value.push_str("*/");
                            }
                            Some('/') if self.peek() == Some('*') => {
                                self.advance();
                                depth += 1;
                                value.push_str("/*");
                            }
                            Some(c) => value.push(c),
                            // EOF before the comment closed.
                            None => {
                                return Err(SqlglotError::TokenizerError {
                                    message: "Unterminated block comment".into(),
                                    position: start,
                                });
                            }
                        }
                    }
                    Ok(self.make_token(
                        TokenType::BlockComment,
                        value,
                        start,
                        start_line,
                        start_col,
                    ))
                } else {
                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
                }
            }

            // `<=`, `<>`, `<<`, or `<`.
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
                } else if self.peek() == Some('<') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
                }
            }

            // `>=`, `>>`, or `>`.
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
                }
            }

            // `!` is only valid as part of `!=`.
            '!' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
                } else {
                    Err(SqlglotError::TokenizerError {
                        message: format!("Unexpected character: {ch}"),
                        position: start,
                    })
                }
            }

            // `||` string concat or single `|` bitwise OR.
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
                }
            }

            // Single `&` only; `&&` is not recognized as an operator here.
            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),

            // `#>` / `#>>` JSONB path arrows, otherwise a MySQL-style `#`
            // line comment running to end of line.
            '#' => {
                if self.peek() == Some('>') {
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(
                            TokenType::HashDoubleArrow,
                            "#>>",
                            start,
                            start_line,
                            start_col,
                        ))
                    } else {
                        Ok(self.make_token(
                            TokenType::HashArrow,
                            "#>",
                            start,
                            start_line,
                            start_col,
                        ))
                    }
                } else {
                    let mut value = String::from("#");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(
                        self.make_token(
                            TokenType::LineComment,
                            value,
                            start,
                            start_line,
                            start_col,
                        ),
                    )
                }
            }

            // String literal.
            '\'' => self.read_string(start, start_line, start_col),

            // Numeric literal.
            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),

            // Unquoted identifier or keyword.
            c if c.is_ascii_alphabetic() || c == '_' => {
                self.read_identifier(start, start_line, start_col, c)
            }

            // Quoted identifiers: ANSI double quotes and MySQL backticks.
            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),

            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),

            // `$1`-style positional parameter, or a bare `$`.
            '$' => {
                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
                    let mut value = String::from("$");
                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
                        value.push(self.advance().unwrap());
                    }
                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
                }
            }

            // `?` positional placeholder.
            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),

            _ => Err(SqlglotError::TokenizerError {
                message: format!("Unexpected character: {ch}"),
                position: start,
            }),
        }
    }
378
    /// Reads a single-quoted string literal; the opening `'` has already
    /// been consumed by the caller. The token value is the unescaped
    /// contents with the surrounding quotes stripped.
    ///
    /// Escapes handled:
    /// - `''` -> one literal single quote (SQL-standard doubling)
    /// - `\\`, `\n`, `\t`, `\r` -> C-style escapes
    /// - any other `\x` keeps the backslash literally and leaves `x` for the
    ///   next iteration — note this means `\'` does NOT escape a quote; only
    ///   doubling does.
    ///
    /// # Errors
    /// Returns [`SqlglotError::TokenizerError`] if input ends before the
    /// closing quote.
    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
        let mut value = String::new();
        loop {
            match self.advance() {
                Some('\'') => {
                    // Doubled quote is an escaped quote; otherwise the
                    // string terminates here.
                    if self.peek() == Some('\'') {
                        self.advance();
                        value.push('\'');
                    } else {
                        return Ok(self.make_token(
                            TokenType::String,
                            value,
                            start,
                            start_line,
                            start_col,
                        ));
                    }
                }
                Some('\\') => match self.peek() {
                    Some('\\') => {
                        self.advance();
                        value.push('\\');
                    }
                    Some('n') => {
                        self.advance();
                        value.push('\n');
                    }
                    Some('t') => {
                        self.advance();
                        value.push('\t');
                    }
                    Some('r') => {
                        self.advance();
                        value.push('\r');
                    }
                    // Unknown escape: keep the backslash, do not consume the
                    // following character.
                    _ => {
                        value.push('\\');
                    }
                },
                Some(c) => value.push(c),
                None => {
                    return Err(SqlglotError::TokenizerError {
                        message: "Unterminated string literal".into(),
                        position: start,
                    });
                }
            }
        }
    }
428
429 fn read_number(
430 &mut self,
431 start: usize,
432 start_line: usize,
433 start_col: usize,
434 first: char,
435 ) -> Result<Token> {
436 let mut value = String::new();
437 value.push(first);
438
439 if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
440 value.push(self.advance().unwrap());
441 while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
442 value.push(self.advance().unwrap());
443 }
444 return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
445 }
446
447 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
448 value.push(self.advance().unwrap());
449 }
450
451 if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
452 value.push(self.advance().unwrap());
453 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
454 value.push(self.advance().unwrap());
455 }
456 }
457
458 if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
459 value.push(self.advance().unwrap());
460 if self.peek().is_some_and(|c| c == '+' || c == '-') {
461 value.push(self.advance().unwrap());
462 }
463 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
464 value.push(self.advance().unwrap());
465 }
466 }
467
468 Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
469 }
470
471 fn read_identifier(
472 &mut self,
473 start: usize,
474 start_line: usize,
475 start_col: usize,
476 first: char,
477 ) -> Result<Token> {
478 let mut value = String::new();
479 value.push(first);
480 while self
481 .peek()
482 .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_')
483 {
484 value.push(self.advance().unwrap());
485 }
486
487 let token_type = Self::keyword_type(&value);
488 Ok(self.make_token(token_type, value, start, start_line, start_col))
489 }
490
    /// Maps a raw word to its keyword [`TokenType`], or
    /// [`TokenType::Identifier`] if it is not a keyword.
    ///
    /// Matching is case-insensitive via an uppercased copy of `word`; only
    /// ASCII words reach here from `read_identifier`.
    fn keyword_type(word: &str) -> TokenType {
        match word.to_uppercase().as_str() {
            // Core query / clause keywords.
            "SELECT" => TokenType::Select,
            "FROM" => TokenType::From,
            "WHERE" => TokenType::Where,
            "AND" => TokenType::And,
            "OR" => TokenType::Or,
            "NOT" => TokenType::Not,
            "AS" => TokenType::As,
            // Joins.
            "JOIN" => TokenType::Join,
            "INNER" => TokenType::Inner,
            "LEFT" => TokenType::Left,
            "RIGHT" => TokenType::Right,
            "FULL" => TokenType::Full,
            "OUTER" => TokenType::Outer,
            "CROSS" => TokenType::Cross,
            "ON" => TokenType::On,
            // DML.
            "INSERT" => TokenType::Insert,
            "INTO" => TokenType::Into,
            "VALUES" => TokenType::Values,
            "UPDATE" => TokenType::Update,
            "SET" => TokenType::Set,
            "DELETE" => TokenType::Delete,
            // DDL.
            "CREATE" => TokenType::Create,
            "TABLE" => TokenType::Table,
            "DROP" => TokenType::Drop,
            "ALTER" => TokenType::Alter,
            "INDEX" => TokenType::Index,
            "IF" => TokenType::If,
            "EXISTS" => TokenType::Exists,
            // Predicates and expressions.
            "IN" => TokenType::In,
            "IS" => TokenType::Is,
            "NULL" => TokenType::Null,
            "LIKE" => TokenType::Like,
            "ILIKE" => TokenType::ILike,
            "ESCAPE" => TokenType::Escape,
            "BETWEEN" => TokenType::Between,
            "CASE" => TokenType::Case,
            "WHEN" => TokenType::When,
            "THEN" => TokenType::Then,
            "ELSE" => TokenType::Else,
            "END" => TokenType::End,
            // Ordering / grouping / limiting.
            "ORDER" => TokenType::Order,
            "BY" => TokenType::By,
            "ASC" => TokenType::Asc,
            "DESC" => TokenType::Desc,
            "GROUP" => TokenType::Group,
            "HAVING" => TokenType::Having,
            "LIMIT" => TokenType::Limit,
            "OFFSET" => TokenType::Offset,
            // Set operations and literals.
            "UNION" => TokenType::Union,
            "ALL" => TokenType::All,
            "DISTINCT" => TokenType::Distinct,
            "TRUE" => TokenType::True,
            "FALSE" => TokenType::False,
            "INTERSECT" => TokenType::Intersect,
            "EXCEPT" => TokenType::Except,
            // CTEs, subquery quantifiers, casts.
            "WITH" => TokenType::With,
            "RECURSIVE" => TokenType::Recursive,
            "ANY" => TokenType::Any,
            "SOME" => TokenType::Some,
            "CAST" => TokenType::Cast,
            // Window functions.
            "OVER" => TokenType::Over,
            "PARTITION" => TokenType::Partition,
            "WINDOW" => TokenType::Window,
            "ROWS" => TokenType::Rows,
            "RANGE" => TokenType::Range,
            "UNBOUNDED" => TokenType::Unbounded,
            "PRECEDING" => TokenType::Preceding,
            "FOLLOWING" => TokenType::Following,
            "FILTER" => TokenType::Filter,
            // Data types.
            "INT" => TokenType::Int,
            "INTEGER" => TokenType::Integer,
            "BIGINT" => TokenType::BigInt,
            "SMALLINT" => TokenType::SmallInt,
            "TINYINT" => TokenType::TinyInt,
            "FLOAT" => TokenType::Float,
            "DOUBLE" => TokenType::Double,
            "DECIMAL" => TokenType::Decimal,
            "NUMERIC" => TokenType::Numeric,
            "REAL" => TokenType::Real,
            "VARCHAR" => TokenType::Varchar,
            "CHAR" | "CHARACTER" => TokenType::Char,
            "TEXT" => TokenType::Text,
            "BOOLEAN" | "BOOL" => TokenType::Boolean,
            "DATE" => TokenType::Date,
            "TIMESTAMP" => TokenType::Timestamp,
            "TIMESTAMPTZ" => TokenType::TimestampTz,
            "TIME" => TokenType::Time,
            "INTERVAL" => TokenType::Interval,
            "BLOB" => TokenType::Blob,
            "BYTEA" => TokenType::Bytea,
            "JSON" => TokenType::Json,
            "JSONB" => TokenType::Jsonb,
            "UUID" => TokenType::Uuid,
            "ARRAY" => TokenType::Array,
            "MAP" => TokenType::Map,
            "STRUCT" => TokenType::Struct,
            // Constraints.
            "PRIMARY" => TokenType::Primary,
            "KEY" => TokenType::Key,
            "FOREIGN" => TokenType::Foreign,
            "REFERENCES" => TokenType::References,
            "UNIQUE" => TokenType::Unique,
            "CHECK" => TokenType::Check,
            "DEFAULT" => TokenType::Default,
            "CONSTRAINT" => TokenType::Constraint,
            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
            "CASCADE" => TokenType::Cascade,
            "RESTRICT" => TokenType::Restrict,
            // Upsert / merge.
            "RETURNING" => TokenType::Returning,
            "CONFLICT" => TokenType::Conflict,
            "DO" => TokenType::Do,
            "NOTHING" => TokenType::Nothing,
            "REPLACE" => TokenType::Replace,
            "IGNORE" => TokenType::Ignore,
            "MERGE" => TokenType::Merge,
            "MATCHED" => TokenType::Matched,
            "USING" => TokenType::Using,
            "TRUNCATE" => TokenType::Truncate,
            // Schema objects.
            "SCHEMA" => TokenType::Schema,
            "DATABASE" => TokenType::Database,
            "VIEW" => TokenType::View,
            "MATERIALIZED" => TokenType::Materialized,
            "TEMPORARY" => TokenType::Temporary,
            "TEMP" => TokenType::Temp,
            // Transactions.
            "BEGIN" => TokenType::Begin,
            "COMMIT" => TokenType::Commit,
            "ROLLBACK" => TokenType::Rollback,
            "SAVEPOINT" => TokenType::Savepoint,
            "TRANSACTION" => TokenType::Transaction,
            // Utility statements.
            "EXPLAIN" => TokenType::Explain,
            "ANALYZE" => TokenType::Analyze,
            "SHOW" => TokenType::Show,
            "USE" => TokenType::Use,
            "GRANT" => TokenType::Grant,
            "REVOKE" => TokenType::Revoke,
            // Table expressions and misc.
            "LATERAL" => TokenType::Lateral,
            "UNNEST" => TokenType::Unnest,
            "PIVOT" => TokenType::Pivot,
            "UNPIVOT" => TokenType::Unpivot,
            "TABLESAMPLE" => TokenType::Tablesample,
            "FETCH" => TokenType::Fetch,
            "FIRST" => TokenType::First,
            "NEXT" => TokenType::Next,
            "ONLY" => TokenType::Only,
            "NULLS" => TokenType::Nulls,
            "RESPECT" => TokenType::Respect,
            "TOP" => TokenType::Top,
            "COLLATE" => TokenType::Collate,
            "QUALIFY" => TokenType::Qualify,
            "CUBE" => TokenType::Cube,
            "ROLLUP" => TokenType::Rollup,
            "GROUPING" => TokenType::Grouping,
            "SETS" => TokenType::Sets,
            "XOR" => TokenType::Xor,
            // EXTRACT and date parts.
            "EXTRACT" => TokenType::Extract,
            "EPOCH" => TokenType::Epoch,
            "YEAR" => TokenType::Year,
            "MONTH" => TokenType::Month,
            "DAY" => TokenType::Day,
            "HOUR" => TokenType::Hour,
            "MINUTE" => TokenType::Minute,
            "SECOND" => TokenType::Second,
            _ => TokenType::Identifier,
        }
    }
658
    /// Reads a quoted identifier whose opening quote has already been
    /// consumed. `quote` is the opening character (`"`, a backtick, or `[`);
    /// brackets close with `]`, the others close with themselves. The quote
    /// style is recorded on the token via [`Token::with_quote`].
    ///
    /// A doubled closing quote is treated as an escaped quote character for
    /// `"` and backtick only.
    /// NOTE(review): T-SQL also escapes `]` as `]]` inside bracket
    /// identifiers; that is deliberately excluded here (the `[`-detection
    /// heuristic in `next_token` scans to the first `]`) — confirm intended.
    ///
    /// # Errors
    /// Returns [`SqlglotError::TokenizerError`] if input ends before the
    /// closing quote.
    fn read_quoted_identifier(
        &mut self,
        start: usize,
        start_line: usize,
        start_col: usize,
        quote: char,
    ) -> Result<Token> {
        let end_char = if quote == '[' { ']' } else { quote };
        let mut value = String::new();
        loop {
            match self.advance() {
                Some(c) if c == end_char => {
                    // Doubled delimiter -> escaped quote (not for brackets).
                    if self.peek() == Some(end_char) && end_char != ']' {
                        self.advance();
                        value.push(end_char);
                    } else {
                        return Ok(Token::with_quote(
                            TokenType::Identifier,
                            value,
                            start,
                            start_line,
                            start_col,
                            quote,
                        ));
                    }
                }
                Some(c) => value.push(c),
                None => {
                    return Err(SqlglotError::TokenizerError {
                        message: format!("Unterminated quoted identifier (expected {end_char})"),
                        position: start,
                    });
                }
            }
        }
    }
695}
696
// Unit tests: token stream shape, literals, operators, comments, and
// line/column tracking.
#[cfg(test)]
mod tests {
    use super::*;

    // Basic projection + FROM clause, including the trailing Eof token.
    #[test]
    fn test_tokenize_simple_select() {
        let mut tokenizer = Tokenizer::new("SELECT a, b FROM t");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Identifier);
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Identifier);
        assert_eq!(tokens[5].value, "t");
        assert_eq!(tokens[6].token_type, TokenType::Eof);
    }

    // String token value has the surrounding quotes stripped.
    #[test]
    fn test_tokenize_string_literal() {
        let mut tokenizer = Tokenizer::new("'hello world'");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::String);
        assert_eq!(tokens[0].value, "hello world");
    }

    // Two-character comparison operators.
    #[test]
    fn test_tokenize_operators() {
        let mut tokenizer = Tokenizer::new("a >= 1 AND b != 2");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // Decimal literal is lexed as a single Number token.
    #[test]
    fn test_tokenize_number() {
        let mut tokenizer = Tokenizer::new("123.45");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].value, "123.45");
    }

    // `--` comments survive only in with_comments mode.
    #[test]
    fn test_tokenize_line_comment() {
        let mut tok = Tokenizer::with_comments("SELECT 1 -- comment\nFROM t");
        let tokens = tok.tokenize().unwrap();
        assert!(
            tokens
                .iter()
                .any(|t| t.token_type == TokenType::LineComment)
        );
    }

    // `/* */` comments survive only in with_comments mode.
    #[test]
    fn test_tokenize_block_comment() {
        let mut tok = Tokenizer::with_comments("SELECT /* hello */ 1");
        let tokens = tok.tokenize().unwrap();
        assert!(
            tokens
                .iter()
                .any(|t| t.token_type == TokenType::BlockComment)
        );
    }

    // WITH/AS keyword recognition in a CTE.
    #[test]
    fn test_tokenize_cte_keywords() {
        let mut tok = Tokenizer::new("WITH cte AS (SELECT 1) SELECT * FROM cte");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    // `::` lexes as one DoubleColon token, not two Colons.
    #[test]
    fn test_tokenize_double_colon() {
        let mut tok = Tokenizer::new("x::int");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    // CAST is a keyword, not an identifier.
    #[test]
    fn test_tokenize_cast() {
        let mut tok = Tokenizer::new("CAST(x AS INT)");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    // Window-function keywords.
    #[test]
    fn test_tokenize_window() {
        let mut tok = Tokenizer::new("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        let tokens = tok.tokenize().unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    // Newlines advance the recorded line number.
    #[test]
    fn test_line_tracking() {
        let mut tok = Tokenizer::new("SELECT\n 1");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    // Set-operation keywords.
    #[test]
    fn test_tokenize_union_intersect_except() {
        let mut tok = Tokenizer::new("UNION INTERSECT EXCEPT");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }
}