1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical unit produced by [`tokenize`].
///
/// Keyword variants are recognised case-insensitively; every other variant
/// notes the source spelling that produces it.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords ------------------------------------------------------
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // ---- Names ---------------------------------------------------------
    /// Unquoted identifier, folded to ASCII lowercase.
    Ident(String),
    /// Identifier written between double quotes or backticks; the text is
    /// kept as written, with a doubled delimiter collapsed to one.
    QuotedIdent(String),
    /// `@name` / `@@name` variable reference, stored including its `@` prefix.
    SessionVar(String),

    // ---- Literals ------------------------------------------------------
    /// Number literal with neither a `.` nor an exponent.
    Integer(i64),
    /// Number literal containing a `.` and/or an exponent.
    Float(f64),
    /// Body of a single-quoted or dollar-quoted string.
    String(String),

    // ---- Arithmetic and comparison operators ---------------------------
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<<`
    InetContainedBy,
    /// `<<=`
    InetContainedByEq,
    /// `>>`
    InetContains,
    /// `>>=`
    InetContainsEq,
    /// `&&`
    InetOverlap,

    // ---- Punctuation ---------------------------------------------------
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.` that does not start a number literal.
    Dot,
    /// `@` that is not part of `@>`, `@@` or a session variable.
    At,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` not followed by a letter or `_`.
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:` not followed by `:` or `=`.
    Colon,
    /// `||`
    Concat,

    // ---- More keywords -------------------------------------------------
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; the lexer only accepts `N` in `1..=65535`.
    Placeholder(u16),

    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker, always the last token returned by [`tokenize`].
    Eof,
}
197
/// The category of failure reported by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not start any token.
    UnknownChar(char),
    /// A string literal whose closing delimiter was never found.
    UnterminatedString,
    /// A quoted identifier whose closing delimiter was never found.
    UnterminatedQuotedIdent,
    /// A `/*` comment with no matching `*/`.
    UnterminatedBlockComment,
    /// A numeric literal or placeholder that could not be converted; carries
    /// the offending source text.
    BadNumber(String),
}
206
207#[derive(Debug, Clone, PartialEq, Eq)]
208pub struct LexError {
209 pub kind: LexErrorKind,
210 pub pos: usize,
211}
212
213impl fmt::Display for LexError {
214 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215 match &self.kind {
216 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
217 LexErrorKind::UnterminatedString => {
218 write!(f, "unterminated string literal at byte {}", self.pos)
219 }
220 LexErrorKind::UnterminatedQuotedIdent => {
221 write!(f, "unterminated quoted identifier at byte {}", self.pos)
222 }
223 LexErrorKind::UnterminatedBlockComment => {
224 write!(f, "unterminated /* */ comment at byte {}", self.pos)
225 }
226 LexErrorKind::BadNumber(s) => {
227 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
228 }
229 }
230 }
231}
232
233#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
236 let bytes = input.as_bytes();
237 let mut i = 0usize;
238 let mut out = Vec::new();
239
240 while i < bytes.len() {
241 let b = bytes[i];
242 match b {
243 b' ' | b'\t' | b'\n' | b'\r' => {
244 i += 1;
245 }
246 b'-' if peek_eq(bytes, i + 1, b'-') => {
247 i += 2;
248 while i < bytes.len() && bytes[i] != b'\n' {
249 i += 1;
250 }
251 }
252 b'/' if peek_eq(bytes, i + 1, b'*') => {
253 let start = i;
254 if peek_eq(bytes, i + 2, b'!') {
264 let mut j = i + 3;
265 while j < bytes.len() && bytes[j].is_ascii_digit() {
268 j += 1;
269 }
270 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
271 j += 1;
272 }
273 i = j;
274 continue;
275 }
276 i += 2;
277 let mut closed = false;
278 while i + 1 < bytes.len() {
279 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
280 i += 2;
281 closed = true;
282 break;
283 }
284 i += 1;
285 }
286 if !closed {
287 return Err(LexError {
288 kind: LexErrorKind::UnterminatedBlockComment,
289 pos: start,
290 });
291 }
292 }
293 b'*' if peek_eq(bytes, i + 1, b'/') => {
298 i += 2;
299 }
300 b'\'' => {
301 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
302 out.push(tok);
303 i += consumed;
304 }
305 b'"' => {
306 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
307 out.push(tok);
308 i += consumed;
309 }
310 b'`' => {
314 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
315 out.push(tok);
316 i += consumed;
317 }
318 b if b.is_ascii_alphabetic() || b == b'_' => {
319 let start = i;
320 i += 1;
321 while i < bytes.len() {
322 let c = bytes[i];
323 if c.is_ascii_alphanumeric() || c == b'_' {
324 i += 1;
325 } else {
326 break;
327 }
328 }
329 let raw = &input[start..i];
330 out.push(keyword_or_ident_raw(raw));
334 }
335 b if b.is_ascii_digit() => {
336 let (tok, consumed) =
337 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
338 out.push(tok);
339 i += consumed;
340 }
341 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
342 let (tok, consumed) =
343 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
344 out.push(tok);
345 i += consumed;
346 }
347 b'+' => single(&mut out, Token::Plus, &mut i),
348 b'-' => {
349 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
352 out.push(Token::JsonGetText);
353 i += 3;
354 } else if peek_eq(bytes, i + 1, b'>') {
355 out.push(Token::JsonGet);
356 i += 2;
357 } else {
358 single(&mut out, Token::Minus, &mut i);
359 }
360 }
361 b'#' => {
363 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
364 out.push(Token::JsonGetPathText);
365 i += 3;
366 } else if peek_eq(bytes, i + 1, b'>') {
367 out.push(Token::JsonGetPath);
368 i += 2;
369 } else {
370 return Err(LexError {
371 kind: LexErrorKind::UnknownChar('#'),
372 pos: i,
373 });
374 }
375 }
376 b'@' => {
385 if peek_eq(bytes, i + 1, b'>') {
386 out.push(Token::JsonContains);
387 i += 2;
388 } else if peek_eq(bytes, i + 1, b'@')
389 && !is_session_var_ident_start(bytes.get(i + 2).copied())
390 {
391 out.push(Token::TsMatch);
394 i += 2;
395 } else {
396 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
401 i + 2
402 } else {
403 i + 1
404 };
405 let mut end = prefix_end;
406 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
407 end += 1;
408 }
409 if end == prefix_end {
410 out.push(Token::At);
419 i = prefix_end;
420 continue;
421 }
422 out.push(Token::SessionVar(input[i..end].to_string()));
423 i = end;
424 }
425 }
426 b'*' => single(&mut out, Token::Star, &mut i),
427 b'/' => single(&mut out, Token::Slash, &mut i),
428 b'(' => single(&mut out, Token::LParen, &mut i),
429 b')' => single(&mut out, Token::RParen, &mut i),
430 b'[' => single(&mut out, Token::LBracket, &mut i),
431 b']' => single(&mut out, Token::RBracket, &mut i),
432 b',' => single(&mut out, Token::Comma, &mut i),
433 b';' => single(&mut out, Token::Semicolon, &mut i),
434 b'.' => single(&mut out, Token::Dot, &mut i),
435 b'=' => single(&mut out, Token::Eq, &mut i),
436 b'<' => {
437 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
438 out.push(Token::CosineDistance);
439 i += 3;
440 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
441 out.push(Token::InnerProduct);
442 i += 3;
443 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
444 out.push(Token::L2Distance);
445 i += 3;
446 } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
447 out.push(Token::InetContainedByEq);
449 i += 3;
450 } else if peek_eq(bytes, i + 1, b'<') {
451 out.push(Token::InetContainedBy);
453 i += 2;
454 } else if peek_eq(bytes, i + 1, b'=') {
455 out.push(Token::LtEq);
456 i += 2;
457 } else if peek_eq(bytes, i + 1, b'>') {
458 out.push(Token::NotEq);
459 i += 2;
460 } else {
461 out.push(Token::Lt);
462 i += 1;
463 }
464 }
465 b':' if peek_eq(bytes, i + 1, b':') => {
466 out.push(Token::DoubleColon);
467 i += 2;
468 }
469 b':' if peek_eq(bytes, i + 1, b'=') => {
470 out.push(Token::ColonEq);
472 i += 2;
473 }
474 b':' => {
475 out.push(Token::Colon);
479 i += 1;
480 }
481 b'|' if peek_eq(bytes, i + 1, b'|') => {
482 out.push(Token::Concat);
483 i += 2;
484 }
485 b'>' => {
486 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
487 out.push(Token::InetContainsEq);
489 i += 3;
490 } else if peek_eq(bytes, i + 1, b'>') {
491 out.push(Token::InetContains);
493 i += 2;
494 } else if peek_eq(bytes, i + 1, b'=') {
495 out.push(Token::GtEq);
496 i += 2;
497 } else {
498 out.push(Token::Gt);
499 i += 1;
500 }
501 }
502 b'&' if peek_eq(bytes, i + 1, b'&') => {
503 out.push(Token::InetOverlap);
505 i += 2;
506 }
507 b'!' if peek_eq(bytes, i + 1, b'=') => {
508 out.push(Token::NotEq);
509 i += 2;
510 }
511 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
519 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
521 let body = match end {
522 Some(e) => &input[i + 2..e],
523 None => {
524 return Err(LexError {
525 kind: LexErrorKind::UnterminatedString,
526 pos: i,
527 });
528 }
529 };
530 out.push(Token::String(body.to_string()));
531 i = end.unwrap() + 2;
532 }
533 b'$' if i + 1 < bytes.len()
534 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
535 {
536 let mut j = i + 1;
539 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
540 j += 1;
541 }
542 if j >= bytes.len() || bytes[j] != b'$' {
543 let ch = input[i..].chars().next().unwrap_or('?');
546 return Err(LexError {
547 kind: LexErrorKind::UnknownChar(ch),
548 pos: i,
549 });
550 }
551 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
552 let end = find_dollar_tag_end(bytes, j + 1, &close);
553 let body = match end {
554 Some(e) => &input[j + 1..e],
555 None => {
556 return Err(LexError {
557 kind: LexErrorKind::UnterminatedString,
558 pos: i,
559 });
560 }
561 };
562 out.push(Token::String(body.to_string()));
563 i = end.unwrap() + close.len();
564 }
565 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
569 let mut j = i + 1;
570 let mut n: u32 = 0;
571 while j < bytes.len() && bytes[j].is_ascii_digit() {
572 n = n
573 .saturating_mul(10)
574 .saturating_add(u32::from(bytes[j] - b'0'));
575 j += 1;
576 }
577 if n == 0 || n > u32::from(u16::MAX) {
578 return Err(LexError {
579 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
580 pos: i,
581 });
582 }
583 #[allow(clippy::cast_possible_truncation)]
584 out.push(Token::Placeholder(n as u16));
585 i = j;
586 }
587 _ => {
588 let ch = input[i..].chars().next().unwrap_or('?');
589 return Err(LexError {
590 kind: LexErrorKind::UnknownChar(ch),
591 pos: i,
592 });
593 }
594 }
595 }
596 out.push(Token::Eof);
597 Ok(out)
598}
599
/// Returns `true` when `bytes` has a byte at index `i` and it equals `target`.
///
/// An out-of-range `i` yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i).map_or(false, |&found| found == target)
}
603
/// Returns `true` when `b` holds an ASCII letter or `_`.
///
/// `None` (no byte available) yields `false`.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(c) => c.is_ascii_alphabetic(),
        None => false,
    }
}
611
/// Returns `true` when `b` is an ASCII letter, an ASCII digit, `_`, `.` or `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
619
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`.
///
/// Returns the index (relative to the start of `bytes`) at which the match
/// begins, or `None` when `tag` is empty, `from` lies past the end of
/// `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so reject the empty tag up front.
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
635
/// Returns `true` when `bytes` has a byte at index `i` and `pred` accepts it.
///
/// `pred` is not called when `i` is out of range.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
639
640fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
641 out.push(tok);
642 *i += 1;
643}
644
645fn keyword_or_ident_raw(raw: &str) -> Token {
655 let b = raw.as_bytes();
656 let tok = match b.len() {
657 2 => kw_len2(b),
658 3 => kw_len3(b),
659 4 => kw_len4(b),
660 5 => kw_len5(b),
661 6 => kw_len6(b),
662 7 => kw_len7(b),
663 8 => kw_len8(b),
664 9 => kw_len9(b),
665 10 => kw_len10(b),
666 11 => kw_len11(b),
667 12 => kw_len12(b),
668 _ => None,
669 };
670 match tok {
671 Some(t) => t,
672 None => Token::Ident(raw.to_ascii_lowercase()),
674 }
675}
676
/// Compares `input` with `lower`, ignoring the ASCII case of `input` only.
///
/// `lower` is compared as-is, so it must already be lowercase for a match to
/// be possible on letters. Slices of different lengths never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(given, expected)| given.to_ascii_lowercase() == *expected)
}
694
695#[inline]
696fn kw_len2(b: &[u8]) -> Option<Token> {
697 if eq_ci(b, b"as") {
699 return Some(Token::As);
700 }
701 if eq_ci(b, b"by") {
702 return Some(Token::By);
703 }
704 if eq_ci(b, b"in") {
705 return Some(Token::In);
706 }
707 if eq_ci(b, b"is") {
708 return Some(Token::Is);
709 }
710 if eq_ci(b, b"on") {
711 return Some(Token::On);
712 }
713 if eq_ci(b, b"or") {
714 return Some(Token::Or);
715 }
716 if eq_ci(b, b"to") {
717 return Some(Token::To);
718 }
719 None
720}
721
722#[inline]
723fn kw_len3(b: &[u8]) -> Option<Token> {
724 if eq_ci(b, b"for") {
726 return Some(Token::For);
727 }
728 if eq_ci(b, b"all") {
729 return Some(Token::All);
730 }
731 if eq_ci(b, b"and") {
732 return Some(Token::And);
733 }
734 if eq_ci(b, b"asc") {
735 return Some(Token::Asc);
736 }
737 if eq_ci(b, b"not") {
738 return Some(Token::Not);
739 }
740 None
741}
742
743#[inline]
744fn kw_len4(b: &[u8]) -> Option<Token> {
745 if eq_ci(b, b"from") {
747 return Some(Token::From);
748 }
749 if eq_ci(b, b"drop") {
750 return Some(Token::Drop);
751 }
752 if eq_ci(b, b"null") {
753 return Some(Token::Null);
754 }
755 if eq_ci(b, b"true") {
756 return Some(Token::True);
757 }
758 if eq_ci(b, b"into") {
759 return Some(Token::Into);
760 }
761 if eq_ci(b, b"like") {
762 return Some(Token::Like);
763 }
764 if eq_ci(b, b"join") {
765 return Some(Token::Join);
766 }
767 if eq_ci(b, b"left") {
768 return Some(Token::Left);
769 }
770 if eq_ci(b, b"show") {
771 return Some(Token::Show);
772 }
773 if eq_ci(b, b"desc") {
774 return Some(Token::Desc);
775 }
776 None
777}
778
779#[inline]
780fn kw_len5(b: &[u8]) -> Option<Token> {
781 if eq_ci(b, b"false") {
784 return Some(Token::False);
785 }
786 if eq_ci(b, b"where") {
787 return Some(Token::Where);
788 }
789 if eq_ci(b, b"table") {
790 return Some(Token::Table);
791 }
792 if eq_ci(b, b"index") {
793 return Some(Token::Index);
794 }
795 if eq_ci(b, b"begin") {
796 return Some(Token::Begin);
797 }
798 if eq_ci(b, b"order") {
799 return Some(Token::Order);
800 }
801 if eq_ci(b, b"limit") {
802 return Some(Token::Limit);
803 }
804 if eq_ci(b, b"group") {
805 return Some(Token::Group);
806 }
807 if eq_ci(b, b"union") {
808 return Some(Token::Union);
809 }
810 if eq_ci(b, b"inner") {
811 return Some(Token::Inner);
812 }
813 if eq_ci(b, b"cross") {
814 return Some(Token::Cross);
815 }
816 if eq_ci(b, b"outer") {
817 return Some(Token::Outer);
818 }
819 None
820}
821
822#[inline]
823fn kw_len6(b: &[u8]) -> Option<Token> {
824 if eq_ci(b, b"select") {
826 return Some(Token::Select);
827 }
828 if eq_ci(b, b"tables") {
829 return Some(Token::Tables);
830 }
831 if eq_ci(b, b"except") {
832 return Some(Token::Except);
833 }
834 if eq_ci(b, b"create") {
835 return Some(Token::Create);
836 }
837 if eq_ci(b, b"insert") {
838 return Some(Token::Insert);
839 }
840 if eq_ci(b, b"values") {
841 return Some(Token::Values);
842 }
843 if eq_ci(b, b"commit") {
844 return Some(Token::Commit);
845 }
846 if eq_ci(b, b"having") {
847 return Some(Token::Having);
848 }
849 if eq_ci(b, b"offset") {
850 return Some(Token::Offset);
851 }
852 None
853}
854
855#[inline]
856fn kw_len7(b: &[u8]) -> Option<Token> {
857 if eq_ci(b, b"between") {
859 return Some(Token::Between);
860 }
861 if eq_ci(b, b"default") {
862 return Some(Token::Default);
863 }
864 if eq_ci(b, b"release") {
865 return Some(Token::Release);
866 }
867 if eq_ci(b, b"extract") {
868 return Some(Token::Extract);
869 }
870 None
871}
872
873#[inline]
874fn kw_len8(b: &[u8]) -> Option<Token> {
875 if eq_ci(b, b"rollback") {
877 return Some(Token::Rollback);
878 }
879 if eq_ci(b, b"distinct") {
880 return Some(Token::Distinct);
881 }
882 if eq_ci(b, b"interval") {
883 return Some(Token::Interval);
884 }
885 None
886}
887
888#[inline]
889fn kw_len9(b: &[u8]) -> Option<Token> {
890 if eq_ci(b, b"savepoint") {
892 return Some(Token::Savepoint);
893 }
894 None
895}
896
897#[inline]
898fn kw_len10(b: &[u8]) -> Option<Token> {
899 if eq_ci(b, b"connection") {
901 return Some(Token::Connection);
902 }
903 None
904}
905
906#[inline]
907fn kw_len11(b: &[u8]) -> Option<Token> {
908 if eq_ci(b, b"publication") {
910 return Some(Token::Publication);
911 }
912 None
913}
914
915#[inline]
916fn kw_len12(b: &[u8]) -> Option<Token> {
917 if eq_ci(b, b"subscription") {
919 return Some(Token::Subscription);
920 }
921 None
922}
923
924fn lex_quoted(
931 input: &str,
932 start: usize,
933 quote: u8,
934 is_ident: bool,
935) -> Result<(Token, usize), LexError> {
936 let bytes = input.as_bytes();
937 let mut i = start + 1;
938 let mut s = String::new();
939 loop {
940 if i >= bytes.len() {
941 return Err(LexError {
942 kind: if is_ident {
943 LexErrorKind::UnterminatedQuotedIdent
944 } else {
945 LexErrorKind::UnterminatedString
946 },
947 pos: start,
948 });
949 }
950 if bytes[i] == quote {
951 if peek_eq(bytes, i + 1, quote) {
952 s.push(quote as char);
953 i += 2;
954 } else {
955 i += 1;
956 break;
957 }
958 } else {
959 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
960 s.push(ch);
961 i += ch.len_utf8();
962 }
963 }
964 let tok = if is_ident {
965 Token::QuotedIdent(s)
966 } else {
967 Token::String(s)
968 };
969 Ok((tok, i - start))
970}
971
972fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
973 let bytes = s.as_bytes();
974 let mut i = 0usize;
975 let mut is_float = false;
976
977 while i < bytes.len() && bytes[i].is_ascii_digit() {
978 i += 1;
979 }
980 if i < bytes.len() && bytes[i] == b'.' {
981 is_float = true;
982 i += 1;
983 while i < bytes.len() && bytes[i].is_ascii_digit() {
984 i += 1;
985 }
986 }
987 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
988 is_float = true;
989 i += 1;
990 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
991 i += 1;
992 }
993 let exp_start = i;
994 while i < bytes.len() && bytes[i].is_ascii_digit() {
995 i += 1;
996 }
997 if exp_start == i {
998 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
999 }
1000 }
1001
1002 let lit = &s[..i];
1003 if is_float {
1004 lit.parse::<f64>()
1005 .map(|v| (Token::Float(v), i))
1006 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1007 } else {
1008 lit.parse::<i64>()
1009 .map(|v| (Token::Integer(v), i))
1010 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1011 }
1012}
1013
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking if the lexer rejects it.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    /// Asserts that `src` lexes to exactly `body` followed by `Token::Eof`.
    fn check(src: &str, body: Vec<Token>) {
        let mut expected = body;
        expected.push(Token::Eof);
        assert_eq!(lex(src), expected);
    }

    /// Shorthand for an unquoted-identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Shorthand for a string-literal token.
    fn text(value: &str) -> Token {
        Token::String(value.into())
    }

    #[test]
    fn empty_yields_only_eof() {
        check("", vec![]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        check(" \t\n ", vec![]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        check("SELECT select Select", vec![Token::Select; 3]);
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        check(
            "hello WORLD _x x1",
            vec![ident("hello"), ident("world"), ident("_x"), ident("x1")],
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        check(
            r#""User Name" "a""b""#,
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
            ],
        );
    }

    #[test]
    fn integer_and_float_literals() {
        check(
            "0 42 1.5 .5 1e10 2.5e-3",
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
            ],
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        check("-42", vec![Token::Minus, Token::Integer(42)]);
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        check("'hello' 'it''s'", vec![text("hello"), text("it's")]);
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        check(
            "= <> != < <= > >= + - * /",
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
            ],
        );
    }

    #[test]
    fn punctuation() {
        check(
            "( ) , ; .",
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
            ],
        );
    }

    #[test]
    fn line_comment_skipped() {
        check(
            "SELECT -- trailing junk\nFROM",
            vec![Token::Select, Token::From],
        );
    }

    #[test]
    fn block_comment_skipped() {
        check(
            "SELECT /* skipped */ 1",
            vec![Token::Select, Token::Integer(1)],
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let LexError { kind, pos } = tokenize("'oops").unwrap_err();
        assert_eq!(kind, LexErrorKind::UnterminatedString);
        assert_eq!(pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let failure = tokenize("/* never closed").unwrap_err();
        assert_eq!(failure.kind, LexErrorKind::UnterminatedBlockComment);
    }

    #[test]
    fn unknown_char_errors() {
        let failure = tokenize("\x07").unwrap_err();
        assert!(matches!(failure.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        check("'u'@'h'", vec![text("u"), Token::At, text("h")]);
    }

    #[test]
    fn dot_in_qualified_column() {
        check("t.col", vec![ident("t"), Token::Dot, ident("col")]);
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        check("[ ]", vec![Token::LBracket, Token::RBracket]);
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        check("a <-> b", vec![ident("a"), Token::L2Distance, ident("b")]);
        // Without the trailing `>` the pieces fall back to `<` and `-`.
        check(
            "a <- b",
            vec![ident("a"), Token::Lt, Token::Minus, ident("b")],
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        check(
            "ORDER BY LIMIT",
            vec![Token::Order, Token::By, Token::Limit],
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        check("a <#> b", vec![ident("a"), Token::InnerProduct, ident("b")]);
    }

    #[test]
    fn cosine_distance_operator_3char() {
        check(
            "a <=> b",
            vec![ident("a"), Token::CosineDistance, ident("b")],
        );
        // Without the trailing `>` this is plain less-than-or-equal.
        check("a <= b", vec![ident("a"), Token::LtEq, ident("b")]);
    }

    #[test]
    fn double_colon_cast_token() {
        check(
            "x::INT",
            vec![ident("x"), Token::DoubleColon, ident("int")],
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks.first(), Some(&Token::Colon));
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert_eq!(toks.get(1), Some(&Token::ColonEq));
    }
}