1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Reports whether `c` may begin an unquoted identifier: an underscore or any
/// character Unicode classifies as alphabetic.
#[inline]
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
11
/// Reports whether `c` may appear after the first character of an unquoted
/// identifier: an underscore, a dollar sign, or any alphanumeric character.
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        other => other.is_alphanumeric(),
    }
}
19
/// A hand-written lexer that turns SQL text into a sequence of [`Token`]s.
pub struct Tokenizer {
    /// The input text decoded into `char`s, so that `pos` can index it directly.
    input: Vec<char>,
    /// Index into `input` of the next character to be consumed.
    pos: usize,
    /// Line number of the next character; starts at 1 and grows with every
    /// `'\n'` consumed.
    line: usize,
    /// Column of the next character; starts at 1 and is reset to 1 after a
    /// newline is consumed.
    col: usize,
    /// When `true`, `tokenize` keeps line and block comment tokens in its
    /// output; when `false`, they are dropped.
    pub preserve_comments: bool,
}
36
37impl Tokenizer {
38 #[must_use]
40 pub fn new(input: &str) -> Self {
41 Self {
42 input: input.chars().collect(),
43 pos: 0,
44 line: 1,
45 col: 1,
46 preserve_comments: false,
47 }
48 }
49
50 #[must_use]
52 pub fn with_comments(input: &str) -> Self {
53 Self {
54 input: input.chars().collect(),
55 pos: 0,
56 line: 1,
57 col: 1,
58 preserve_comments: true,
59 }
60 }
61
62 pub fn tokenize(&mut self) -> Result<Vec<Token>> {
66 let mut tokens = Vec::new();
67 loop {
68 let token = self.next_token()?;
69 match token.token_type {
70 TokenType::Eof => {
71 tokens.push(token);
72 break;
73 }
74 TokenType::Whitespace => continue,
75 TokenType::LineComment | TokenType::BlockComment => {
76 if self.preserve_comments {
77 tokens.push(token);
78 }
79 }
80 _ => tokens.push(token),
81 }
82 }
83 Ok(tokens)
84 }
85
86 fn peek(&self) -> Option<char> {
87 self.input.get(self.pos).copied()
88 }
89
90 fn peek_at(&self, offset: usize) -> Option<char> {
91 self.input.get(self.pos + offset).copied()
92 }
93
94 fn advance(&mut self) -> Option<char> {
95 let ch = self.input.get(self.pos).copied();
96 if let Some(c) = ch {
97 self.pos += 1;
98 if c == '\n' {
99 self.line += 1;
100 self.col = 1;
101 } else {
102 self.col += 1;
103 }
104 }
105 ch
106 }
107
108 fn make_token(
109 &self,
110 token_type: TokenType,
111 value: impl Into<String>,
112 start: usize,
113 start_line: usize,
114 start_col: usize,
115 ) -> Token {
116 Token::with_location(token_type, value, start, start_line, start_col)
117 }
118
119 fn next_token(&mut self) -> Result<Token> {
120 while self.peek().is_some_and(|c| c.is_whitespace()) {
122 self.advance();
123 }
124
125 let start = self.pos;
126 let start_line = self.line;
127 let start_col = self.col;
128
129 let Some(ch) = self.advance() else {
130 return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
131 };
132
133 match ch {
134 '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
136 ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
137 '[' => {
138 let mut looks_like_ident = false;
142 if let Some(first_inner) = self.peek()
143 && (first_inner.is_ascii_alphabetic() || first_inner == '_')
144 {
145 let mut scan = self.pos;
146 while scan < self.input.len() {
147 if self.input[scan] == ']' {
148 looks_like_ident = scan > self.pos;
149 break;
150 }
151 if self.input[scan] == ',' || self.input[scan] == '\n' {
152 break;
153 }
154 scan += 1;
155 }
156 }
157 if looks_like_ident {
158 self.read_quoted_identifier(start, start_line, start_col, '[')
159 } else {
160 Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
161 }
162 }
163 ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
164 '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
165 '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
166 ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
167 ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
168 '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
169 '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
170 '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
171 '@' => {
172 if self.peek() == Some('>') {
173 self.advance();
174 Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
175 } else {
176 Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
177 }
178 }
179 '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
180 '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
181 '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
182 '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
183
184 ':' => {
186 if self.peek() == Some(':') {
187 self.advance();
188 Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
189 } else {
190 Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
191 }
192 }
193
194 '-' => {
196 if self.peek() == Some('-') {
197 self.advance();
198 let mut value = String::from("--");
199 while self.peek().is_some_and(|c| c != '\n') {
200 value.push(self.advance().unwrap());
201 }
202 Ok(
203 self.make_token(
204 TokenType::LineComment,
205 value,
206 start,
207 start_line,
208 start_col,
209 ),
210 )
211 } else if self.peek() == Some('>') {
212 self.advance();
213 if self.peek() == Some('>') {
214 self.advance();
215 Ok(self.make_token(
216 TokenType::DoubleArrow,
217 "->>",
218 start,
219 start_line,
220 start_col,
221 ))
222 } else {
223 Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
224 }
225 } else {
226 Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
227 }
228 }
229
230 '/' => {
232 if self.peek() == Some('*') {
233 self.advance();
234 let mut value = String::from("/*");
235 let mut depth = 1;
236 while depth > 0 {
237 match self.advance() {
238 Some('*') if self.peek() == Some('/') => {
239 self.advance();
240 depth -= 1;
241 value.push_str("*/");
242 }
243 Some('/') if self.peek() == Some('*') => {
244 self.advance();
245 depth += 1;
246 value.push_str("/*");
247 }
248 Some(c) => value.push(c),
249 None => {
250 return Err(SqlglotError::TokenizerError {
251 message: "Unterminated block comment".into(),
252 position: start,
253 });
254 }
255 }
256 }
257 Ok(self.make_token(
258 TokenType::BlockComment,
259 value,
260 start,
261 start_line,
262 start_col,
263 ))
264 } else {
265 Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
266 }
267 }
268
269 '<' => {
271 if self.peek() == Some('=') {
272 self.advance();
273 Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
274 } else if self.peek() == Some('>') {
275 self.advance();
276 Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
277 } else if self.peek() == Some('<') {
278 self.advance();
279 Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col)) } else if self.peek() == Some('@') {
280 self.advance();
281 Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col)) } else {
282 Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
283 }
284 }
285
286 '>' => {
288 if self.peek() == Some('=') {
289 self.advance();
290 Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
291 } else if self.peek() == Some('>') {
292 self.advance();
293 Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
294 } else {
295 Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
296 }
297 }
298
299 '!' => {
301 if self.peek() == Some('=') {
302 self.advance();
303 Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
304 } else {
305 Err(SqlglotError::TokenizerError {
306 message: format!("Unexpected character: {ch}"),
307 position: start,
308 })
309 }
310 }
311
312 '|' => {
314 if self.peek() == Some('|') {
315 self.advance();
316 Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
317 } else {
318 Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
319 }
320 }
321
322 '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
324
325 '#' => {
327 if self.peek() == Some('>') {
328 self.advance();
329 if self.peek() == Some('>') {
330 self.advance();
331 Ok(self.make_token(
332 TokenType::HashDoubleArrow,
333 "#>>",
334 start,
335 start_line,
336 start_col,
337 ))
338 } else {
339 Ok(self.make_token(
340 TokenType::HashArrow,
341 "#>",
342 start,
343 start_line,
344 start_col,
345 ))
346 }
347 } else {
348 let mut value = String::from("#");
349 while self.peek().is_some_and(|c| c != '\n') {
350 value.push(self.advance().unwrap());
351 }
352 Ok(
353 self.make_token(
354 TokenType::LineComment,
355 value,
356 start,
357 start_line,
358 start_col,
359 ),
360 )
361 }
362 }
363
364 '\'' => self.read_string(start, start_line, start_col),
366
367 c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
369
370 c if is_identifier_start(c) => {
372 self.read_identifier(start, start_line, start_col, c)
373 }
374
375 '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
377
378 '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
380
381 '$' => {
383 if self.peek().is_some_and(|c| c.is_ascii_digit()) {
384 let mut value = String::from("$");
385 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
386 value.push(self.advance().unwrap());
387 }
388 Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
389 } else {
390 Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
391 }
392 }
393
394 '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
395
396 _ => Err(SqlglotError::TokenizerError {
397 message: format!("Unexpected character: {ch}"),
398 position: start,
399 }),
400 }
401 }
402
403 fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
404 let mut value = String::new();
405 loop {
406 match self.advance() {
407 Some('\'') => {
408 if self.peek() == Some('\'') {
409 self.advance();
410 value.push('\'');
411 } else {
412 return Ok(self.make_token(
413 TokenType::String,
414 value,
415 start,
416 start_line,
417 start_col,
418 ));
419 }
420 }
421 Some('\\') => match self.peek() {
422 Some('\\') => {
423 self.advance();
424 value.push('\\');
425 }
426 Some('n') => {
427 self.advance();
428 value.push('\n');
429 }
430 Some('t') => {
431 self.advance();
432 value.push('\t');
433 }
434 Some('r') => {
435 self.advance();
436 value.push('\r');
437 }
438 _ => {
439 value.push('\\');
440 }
441 },
442 Some(c) => value.push(c),
443 None => {
444 return Err(SqlglotError::TokenizerError {
445 message: "Unterminated string literal".into(),
446 position: start,
447 });
448 }
449 }
450 }
451 }
452
453 fn read_number(
454 &mut self,
455 start: usize,
456 start_line: usize,
457 start_col: usize,
458 first: char,
459 ) -> Result<Token> {
460 let mut value = String::new();
461 value.push(first);
462
463 if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
464 value.push(self.advance().unwrap());
465 while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
466 value.push(self.advance().unwrap());
467 }
468 return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
469 }
470
471 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
472 value.push(self.advance().unwrap());
473 }
474
475 if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
476 value.push(self.advance().unwrap());
477 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
478 value.push(self.advance().unwrap());
479 }
480 }
481
482 if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
483 value.push(self.advance().unwrap());
484 if self.peek().is_some_and(|c| c == '+' || c == '-') {
485 value.push(self.advance().unwrap());
486 }
487 while self.peek().is_some_and(|c| c.is_ascii_digit()) {
488 value.push(self.advance().unwrap());
489 }
490 }
491
492 Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
493 }
494
495 fn read_identifier(
496 &mut self,
497 start: usize,
498 start_line: usize,
499 start_col: usize,
500 first: char,
501 ) -> Result<Token> {
502 let mut value = String::new();
503 value.push(first);
504 while self
505 .peek()
506 .is_some_and(is_identifier_continue)
507 {
508 value.push(self.advance().unwrap());
509 }
510
511 if value.len() == 1
514 && value
515 .as_bytes()
516 .first()
517 .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
518 && self.peek() == Some('\'')
519 {
520 self.advance(); let mut token = self.read_string(start, start_line, start_col)?;
522 token.token_type = TokenType::NationalString;
523 return Ok(token);
524 }
525
526 let token_type = Self::keyword_type(&value);
527 Ok(self.make_token(token_type, value, start, start_line, start_col))
528 }
529
530 fn keyword_type(word: &str) -> TokenType {
532 match word.to_uppercase().as_str() {
533 "SELECT" => TokenType::Select,
534 "FROM" => TokenType::From,
535 "WHERE" => TokenType::Where,
536 "AND" => TokenType::And,
537 "OR" => TokenType::Or,
538 "NOT" => TokenType::Not,
539 "AS" => TokenType::As,
540 "JOIN" => TokenType::Join,
541 "INNER" => TokenType::Inner,
542 "LEFT" => TokenType::Left,
543 "RIGHT" => TokenType::Right,
544 "FULL" => TokenType::Full,
545 "OUTER" => TokenType::Outer,
546 "CROSS" => TokenType::Cross,
547 "ON" => TokenType::On,
548 "INSERT" => TokenType::Insert,
549 "INTO" => TokenType::Into,
550 "VALUES" => TokenType::Values,
551 "UPDATE" => TokenType::Update,
552 "SET" => TokenType::Set,
553 "DELETE" => TokenType::Delete,
554 "CREATE" => TokenType::Create,
555 "TABLE" => TokenType::Table,
556 "DROP" => TokenType::Drop,
557 "ALTER" => TokenType::Alter,
558 "INDEX" => TokenType::Index,
559 "IF" => TokenType::If,
560 "EXISTS" => TokenType::Exists,
561 "IN" => TokenType::In,
562 "IS" => TokenType::Is,
563 "NULL" => TokenType::Null,
564 "LIKE" => TokenType::Like,
565 "ILIKE" => TokenType::ILike,
566 "ESCAPE" => TokenType::Escape,
567 "BETWEEN" => TokenType::Between,
568 "CASE" => TokenType::Case,
569 "WHEN" => TokenType::When,
570 "THEN" => TokenType::Then,
571 "ELSE" => TokenType::Else,
572 "END" => TokenType::End,
573 "ORDER" => TokenType::Order,
574 "BY" => TokenType::By,
575 "ASC" => TokenType::Asc,
576 "DESC" => TokenType::Desc,
577 "GROUP" => TokenType::Group,
578 "HAVING" => TokenType::Having,
579 "LIMIT" => TokenType::Limit,
580 "OFFSET" => TokenType::Offset,
581 "UNION" => TokenType::Union,
582 "ALL" => TokenType::All,
583 "DISTINCT" => TokenType::Distinct,
584 "TRUE" => TokenType::True,
585 "FALSE" => TokenType::False,
586 "INTERSECT" => TokenType::Intersect,
587 "EXCEPT" => TokenType::Except,
588 "WITH" => TokenType::With,
589 "RECURSIVE" => TokenType::Recursive,
590 "ANY" => TokenType::Any,
591 "SOME" => TokenType::Some,
592 "CAST" => TokenType::Cast,
593 "OVER" => TokenType::Over,
594 "PARTITION" => TokenType::Partition,
595 "WINDOW" => TokenType::Window,
596 "ROWS" => TokenType::Rows,
597 "RANGE" => TokenType::Range,
598 "UNBOUNDED" => TokenType::Unbounded,
599 "PRECEDING" => TokenType::Preceding,
600 "FOLLOWING" => TokenType::Following,
601 "FILTER" => TokenType::Filter,
602 "INT" => TokenType::Int,
603 "INTEGER" => TokenType::Integer,
604 "BIGINT" => TokenType::BigInt,
605 "SMALLINT" => TokenType::SmallInt,
606 "TINYINT" => TokenType::TinyInt,
607 "FLOAT" => TokenType::Float,
608 "DOUBLE" => TokenType::Double,
609 "DECIMAL" => TokenType::Decimal,
610 "NUMERIC" => TokenType::Numeric,
611 "REAL" => TokenType::Real,
612 "VARCHAR" => TokenType::Varchar,
613 "CHAR" | "CHARACTER" => TokenType::Char,
614 "TEXT" => TokenType::Text,
615 "BOOLEAN" | "BOOL" => TokenType::Boolean,
616 "DATE" => TokenType::Date,
617 "TIMESTAMP" => TokenType::Timestamp,
618 "TIMESTAMPTZ" => TokenType::TimestampTz,
619 "TIME" => TokenType::Time,
620 "INTERVAL" => TokenType::Interval,
621 "BLOB" => TokenType::Blob,
622 "BYTEA" => TokenType::Bytea,
623 "JSON" => TokenType::Json,
624 "JSONB" => TokenType::Jsonb,
625 "UUID" => TokenType::Uuid,
626 "ARRAY" => TokenType::Array,
627 "MAP" => TokenType::Map,
628 "STRUCT" => TokenType::Struct,
629 "PRIMARY" => TokenType::Primary,
630 "KEY" => TokenType::Key,
631 "FOREIGN" => TokenType::Foreign,
632 "REFERENCES" => TokenType::References,
633 "UNIQUE" => TokenType::Unique,
634 "CHECK" => TokenType::Check,
635 "DEFAULT" => TokenType::Default,
636 "CONSTRAINT" => TokenType::Constraint,
637 "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
638 "CASCADE" => TokenType::Cascade,
639 "RESTRICT" => TokenType::Restrict,
640 "RETURNING" => TokenType::Returning,
641 "CONFLICT" => TokenType::Conflict,
642 "DO" => TokenType::Do,
643 "NOTHING" => TokenType::Nothing,
644 "REPLACE" => TokenType::Replace,
645 "IGNORE" => TokenType::Ignore,
646 "MERGE" => TokenType::Merge,
647 "MATCHED" => TokenType::Matched,
648 "USING" => TokenType::Using,
649 "TRUNCATE" => TokenType::Truncate,
650 "SCHEMA" => TokenType::Schema,
651 "DATABASE" => TokenType::Database,
652 "VIEW" => TokenType::View,
653 "MATERIALIZED" => TokenType::Materialized,
654 "TEMPORARY" => TokenType::Temporary,
655 "TEMP" => TokenType::Temp,
656 "BEGIN" => TokenType::Begin,
657 "COMMIT" => TokenType::Commit,
658 "ROLLBACK" => TokenType::Rollback,
659 "SAVEPOINT" => TokenType::Savepoint,
660 "TRANSACTION" => TokenType::Transaction,
661 "EXPLAIN" => TokenType::Explain,
662 "ANALYZE" => TokenType::Analyze,
663 "SHOW" => TokenType::Show,
664 "USE" => TokenType::Use,
665 "GRANT" => TokenType::Grant,
666 "REVOKE" => TokenType::Revoke,
667 "LATERAL" => TokenType::Lateral,
668 "UNNEST" => TokenType::Unnest,
669 "PIVOT" => TokenType::Pivot,
670 "UNPIVOT" => TokenType::Unpivot,
671 "TABLESAMPLE" => TokenType::Tablesample,
672 "FETCH" => TokenType::Fetch,
673 "FIRST" => TokenType::First,
674 "NEXT" => TokenType::Next,
675 "ONLY" => TokenType::Only,
676 "NULLS" => TokenType::Nulls,
677 "RESPECT" => TokenType::Respect,
678 "TOP" => TokenType::Top,
679 "COLLATE" => TokenType::Collate,
680 "QUALIFY" => TokenType::Qualify,
681 "CUBE" => TokenType::Cube,
682 "ROLLUP" => TokenType::Rollup,
683 "GROUPING" => TokenType::Grouping,
684 "SETS" => TokenType::Sets,
685 "XOR" => TokenType::Xor,
686 "EXTRACT" => TokenType::Extract,
687 "EPOCH" => TokenType::Epoch,
688 "YEAR" => TokenType::Year,
689 "MONTH" => TokenType::Month,
690 "DAY" => TokenType::Day,
691 "HOUR" => TokenType::Hour,
692 "MINUTE" => TokenType::Minute,
693 "SECOND" => TokenType::Second,
694 _ => TokenType::Identifier,
695 }
696 }
697
698 fn read_quoted_identifier(
699 &mut self,
700 start: usize,
701 start_line: usize,
702 start_col: usize,
703 quote: char,
704 ) -> Result<Token> {
705 let end_char = if quote == '[' { ']' } else { quote };
706 let mut value = String::new();
707 loop {
708 match self.advance() {
709 Some(c) if c == end_char => {
710 if self.peek() == Some(end_char) && end_char != ']' {
711 self.advance();
712 value.push(end_char);
713 } else {
714 return Ok(Token::with_quote(
715 TokenType::Identifier,
716 value,
717 start,
718 start_line,
719 start_col,
720 quote,
721 ));
722 }
723 }
724 Some(c) => value.push(c),
725 None => {
726 return Err(SqlglotError::TokenizerError {
727 message: format!("Unterminated quoted identifier (expected {end_char})"),
728 position: start,
729 });
730 }
731 }
732 }
733 }
734}
735
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `sql` with comments dropped, panicking if tokenization fails.
    fn lex(sql: &str) -> Vec<Token> {
        Tokenizer::new(sql).tokenize().unwrap()
    }

    /// Tokenizes `sql` with comment tokens kept, panicking if tokenization fails.
    fn lex_keeping_comments(sql: &str) -> Vec<Token> {
        Tokenizer::with_comments(sql).tokenize().unwrap()
    }

    /// Reports whether any token in `tokens` has type `kind`.
    fn contains(tokens: &[Token], kind: TokenType) -> bool {
        tokens.iter().any(|t| t.token_type == kind)
    }

    /// Asserts that `sql` starts with a national string literal whose decoded
    /// text is `expected`.
    #[track_caller]
    fn assert_national_string(sql: &str, expected: &str) {
        let tokens = lex(sql);
        assert_eq!(tokens[0].token_type, TokenType::NationalString);
        assert_eq!(tokens[0].value, expected);
    }

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex("SELECT a, b FROM t");
        let expected_kinds = [
            TokenType::Select,
            TokenType::Identifier,
            TokenType::Comma,
            TokenType::Identifier,
            TokenType::From,
            TokenType::Identifier,
            TokenType::Eof,
        ];
        for (idx, kind) in expected_kinds.iter().enumerate() {
            assert_eq!(&tokens[idx].token_type, kind);
        }
        for (idx, text) in [(1, "a"), (3, "b"), (5, "t")] {
            assert_eq!(tokens[idx].value, text);
        }
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex("'hello world'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::String);
        assert_eq!(literal.value, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex("a >= 1 AND b != 2");
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex("123.45");
        let number = &tokens[0];
        assert_eq!(number.token_type, TokenType::Number);
        assert_eq!(number.value, "123.45");
    }

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex_keeping_comments("SELECT 1 -- comment\nFROM t");
        assert!(contains(&tokens, TokenType::LineComment));
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex_keeping_comments("SELECT /* hello */ 1");
        assert!(contains(&tokens, TokenType::BlockComment));
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex("WITH cte AS (SELECT 1) SELECT * FROM cte");
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex("x::int");
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex("CAST(x AS INT)");
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(contains(&tokens, TokenType::Over));
        assert!(contains(&tokens, TokenType::Partition));
    }

    #[test]
    fn test_line_tracking() {
        let tokens = lex("SELECT\n 1");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex("UNION INTERSECT EXCEPT");
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        assert_national_string("N'Hello'", "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        assert_national_string("n'hello'", "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        assert_national_string("N'can''t stop'", "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        assert_national_string("N'テスト'", "テスト");
    }

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        let tokens = lex("SELECT N FROM t");
        let ident = &tokens[1];
        assert_eq!(ident.token_type, TokenType::Identifier);
        assert_eq!(ident.value, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        let tokens = lex("SELECT NAME FROM t");
        let ident = &tokens[1];
        assert_eq!(ident.token_type, TokenType::Identifier);
        assert_eq!(ident.value, "NAME");
    }
}