1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`].
///
/// Keyword variants are recognised case-insensitively (ASCII only). Any bare
/// word that is not a keyword becomes [`Token::Ident`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords (matched case-insensitively) ----
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    /// Unquoted identifier, stored ASCII-lowercased.
    Ident(String),
    /// Identifier written in double quotes or backticks; the text is kept
    /// as written and a doubled quote character stands for one literal quote.
    QuotedIdent(String),
    /// `@name` or `@@name` variable reference; the stored text includes the
    /// leading `@` / `@@` prefix.
    SessionVar(String),

    /// Integer literal that fits in an `i64`.
    Integer(i64),
    /// Literal containing a decimal point and/or an exponent.
    Float(f64),
    /// String literal: either single-quoted (with `''` as an escaped quote)
    /// or dollar-quoted (`$$...$$` / `$tag$...$tag$`, body taken verbatim).
    String(String),

    // ---- Arithmetic and comparison operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.`
    Dot,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when it is not immediately followed by an identifier start
    /// (otherwise the lexer emits [`Token::SessionVar`]).
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// A lone `:`
    Colon,
    /// `||`
    Concat,

    // ---- More keywords ----
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; the lexer only accepts `N` in `1..=65535`.
    Placeholder(u16),

    // ---- More keywords ----
    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; always the last token returned by [`tokenize`].
    Eof,
}
173
/// The reason a call to [`tokenize`] failed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start (or complete) any token at this
    /// position, e.g. a lone `#`, a bare `@`, or a `$tag` opener that is
    /// missing its second `$`.
    UnknownChar(char),
    /// A single-quoted or dollar-quoted string whose closing delimiter was
    /// never found.
    UnterminatedString,
    /// A double-quoted or backtick-quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* ... */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A numeric literal (or `$N` placeholder) that could not be accepted;
    /// carries the offending source text.
    BadNumber(String),
}
182
/// Error returned by [`tokenize`]: what went wrong and where.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// The category of failure.
    pub kind: LexErrorKind,
    /// Byte offset into the input at which the offending construct starts.
    pub pos: usize,
}
188
189impl fmt::Display for LexError {
190 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
191 match &self.kind {
192 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
193 LexErrorKind::UnterminatedString => {
194 write!(f, "unterminated string literal at byte {}", self.pos)
195 }
196 LexErrorKind::UnterminatedQuotedIdent => {
197 write!(f, "unterminated quoted identifier at byte {}", self.pos)
198 }
199 LexErrorKind::UnterminatedBlockComment => {
200 write!(f, "unterminated /* */ comment at byte {}", self.pos)
201 }
202 LexErrorKind::BadNumber(s) => {
203 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
204 }
205 }
206 }
207}
208
209#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
212 let bytes = input.as_bytes();
213 let mut i = 0usize;
214 let mut out = Vec::new();
215
216 while i < bytes.len() {
217 let b = bytes[i];
218 match b {
219 b' ' | b'\t' | b'\n' | b'\r' => {
220 i += 1;
221 }
222 b'-' if peek_eq(bytes, i + 1, b'-') => {
223 i += 2;
224 while i < bytes.len() && bytes[i] != b'\n' {
225 i += 1;
226 }
227 }
228 b'/' if peek_eq(bytes, i + 1, b'*') => {
229 let start = i;
230 if peek_eq(bytes, i + 2, b'!') {
240 let mut j = i + 3;
241 while j < bytes.len() && bytes[j].is_ascii_digit() {
244 j += 1;
245 }
246 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
247 j += 1;
248 }
249 i = j;
250 continue;
251 }
252 i += 2;
253 let mut closed = false;
254 while i + 1 < bytes.len() {
255 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
256 i += 2;
257 closed = true;
258 break;
259 }
260 i += 1;
261 }
262 if !closed {
263 return Err(LexError {
264 kind: LexErrorKind::UnterminatedBlockComment,
265 pos: start,
266 });
267 }
268 }
269 b'*' if peek_eq(bytes, i + 1, b'/') => {
274 i += 2;
275 }
276 b'\'' => {
277 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
278 out.push(tok);
279 i += consumed;
280 }
281 b'"' => {
282 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
283 out.push(tok);
284 i += consumed;
285 }
286 b'`' => {
290 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
291 out.push(tok);
292 i += consumed;
293 }
294 b if b.is_ascii_alphabetic() || b == b'_' => {
295 let start = i;
296 i += 1;
297 while i < bytes.len() {
298 let c = bytes[i];
299 if c.is_ascii_alphanumeric() || c == b'_' {
300 i += 1;
301 } else {
302 break;
303 }
304 }
305 let raw = &input[start..i];
306 out.push(keyword_or_ident_raw(raw));
310 }
311 b if b.is_ascii_digit() => {
312 let (tok, consumed) =
313 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
314 out.push(tok);
315 i += consumed;
316 }
317 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
318 let (tok, consumed) =
319 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
320 out.push(tok);
321 i += consumed;
322 }
323 b'+' => single(&mut out, Token::Plus, &mut i),
324 b'-' => {
325 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
328 out.push(Token::JsonGetText);
329 i += 3;
330 } else if peek_eq(bytes, i + 1, b'>') {
331 out.push(Token::JsonGet);
332 i += 2;
333 } else {
334 single(&mut out, Token::Minus, &mut i);
335 }
336 }
337 b'#' => {
339 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
340 out.push(Token::JsonGetPathText);
341 i += 3;
342 } else if peek_eq(bytes, i + 1, b'>') {
343 out.push(Token::JsonGetPath);
344 i += 2;
345 } else {
346 return Err(LexError {
347 kind: LexErrorKind::UnknownChar('#'),
348 pos: i,
349 });
350 }
351 }
352 b'@' => {
361 if peek_eq(bytes, i + 1, b'>') {
362 out.push(Token::JsonContains);
363 i += 2;
364 } else if peek_eq(bytes, i + 1, b'@')
365 && !is_session_var_ident_start(bytes.get(i + 2).copied())
366 {
367 out.push(Token::TsMatch);
370 i += 2;
371 } else {
372 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
377 i + 2
378 } else {
379 i + 1
380 };
381 let mut end = prefix_end;
382 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
383 end += 1;
384 }
385 if end == prefix_end {
386 return Err(LexError {
387 kind: LexErrorKind::UnknownChar('@'),
388 pos: i,
389 });
390 }
391 out.push(Token::SessionVar(input[i..end].to_string()));
392 i = end;
393 }
394 }
395 b'*' => single(&mut out, Token::Star, &mut i),
396 b'/' => single(&mut out, Token::Slash, &mut i),
397 b'(' => single(&mut out, Token::LParen, &mut i),
398 b')' => single(&mut out, Token::RParen, &mut i),
399 b'[' => single(&mut out, Token::LBracket, &mut i),
400 b']' => single(&mut out, Token::RBracket, &mut i),
401 b',' => single(&mut out, Token::Comma, &mut i),
402 b';' => single(&mut out, Token::Semicolon, &mut i),
403 b'.' => single(&mut out, Token::Dot, &mut i),
404 b'=' => single(&mut out, Token::Eq, &mut i),
405 b'<' => {
406 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
407 out.push(Token::CosineDistance);
408 i += 3;
409 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
410 out.push(Token::InnerProduct);
411 i += 3;
412 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
413 out.push(Token::L2Distance);
414 i += 3;
415 } else if peek_eq(bytes, i + 1, b'=') {
416 out.push(Token::LtEq);
417 i += 2;
418 } else if peek_eq(bytes, i + 1, b'>') {
419 out.push(Token::NotEq);
420 i += 2;
421 } else {
422 out.push(Token::Lt);
423 i += 1;
424 }
425 }
426 b':' if peek_eq(bytes, i + 1, b':') => {
427 out.push(Token::DoubleColon);
428 i += 2;
429 }
430 b':' if peek_eq(bytes, i + 1, b'=') => {
431 out.push(Token::ColonEq);
433 i += 2;
434 }
435 b':' => {
436 out.push(Token::Colon);
440 i += 1;
441 }
442 b'|' if peek_eq(bytes, i + 1, b'|') => {
443 out.push(Token::Concat);
444 i += 2;
445 }
446 b'>' => {
447 if peek_eq(bytes, i + 1, b'=') {
448 out.push(Token::GtEq);
449 i += 2;
450 } else {
451 out.push(Token::Gt);
452 i += 1;
453 }
454 }
455 b'!' if peek_eq(bytes, i + 1, b'=') => {
456 out.push(Token::NotEq);
457 i += 2;
458 }
459 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
467 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
469 let body = match end {
470 Some(e) => &input[i + 2..e],
471 None => {
472 return Err(LexError {
473 kind: LexErrorKind::UnterminatedString,
474 pos: i,
475 });
476 }
477 };
478 out.push(Token::String(body.to_string()));
479 i = end.unwrap() + 2;
480 }
481 b'$' if i + 1 < bytes.len()
482 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
483 {
484 let mut j = i + 1;
487 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
488 j += 1;
489 }
490 if j >= bytes.len() || bytes[j] != b'$' {
491 let ch = input[i..].chars().next().unwrap_or('?');
494 return Err(LexError {
495 kind: LexErrorKind::UnknownChar(ch),
496 pos: i,
497 });
498 }
499 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
500 let end = find_dollar_tag_end(bytes, j + 1, &close);
501 let body = match end {
502 Some(e) => &input[j + 1..e],
503 None => {
504 return Err(LexError {
505 kind: LexErrorKind::UnterminatedString,
506 pos: i,
507 });
508 }
509 };
510 out.push(Token::String(body.to_string()));
511 i = end.unwrap() + close.len();
512 }
513 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
517 let mut j = i + 1;
518 let mut n: u32 = 0;
519 while j < bytes.len() && bytes[j].is_ascii_digit() {
520 n = n
521 .saturating_mul(10)
522 .saturating_add(u32::from(bytes[j] - b'0'));
523 j += 1;
524 }
525 if n == 0 || n > u32::from(u16::MAX) {
526 return Err(LexError {
527 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
528 pos: i,
529 });
530 }
531 #[allow(clippy::cast_possible_truncation)]
532 out.push(Token::Placeholder(n as u16));
533 i = j;
534 }
535 _ => {
536 let ch = input[i..].chars().next().unwrap_or('?');
537 return Err(LexError {
538 kind: LexErrorKind::UnknownChar(ch),
539 pos: i,
540 });
541 }
542 }
543 }
544 out.push(Token::Eof);
545 Ok(out)
546}
547
/// Returns `true` when `bytes` has an element at index `i` and that element
/// equals `target`; an out-of-range index yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
551
/// Reports whether `b` holds a byte that may begin a variable name after
/// `@` / `@@`: an ASCII letter or an underscore. `None` is never a start.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(c) => c.is_ascii_alphabetic(),
        None => false,
    }
}
559
/// Reports whether `b` may appear inside a variable name after its prefix:
/// ASCII letters and digits plus `_`, `.` and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
567
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`
/// and returns the index at which it starts.
///
/// Returns `None` when `tag` is empty, when `from` lies beyond the end of
/// `bytes`, or when no occurrence exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty tag is rejected up front.
    if tag.is_empty() {
        return None;
    }
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
583
/// Applies `pred` to the byte at index `i`, if there is one; an
/// out-of-range index yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
587
588fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
589 out.push(tok);
590 *i += 1;
591}
592
593fn keyword_or_ident_raw(raw: &str) -> Token {
603 let b = raw.as_bytes();
604 let tok = match b.len() {
605 2 => kw_len2(b),
606 3 => kw_len3(b),
607 4 => kw_len4(b),
608 5 => kw_len5(b),
609 6 => kw_len6(b),
610 7 => kw_len7(b),
611 8 => kw_len8(b),
612 9 => kw_len9(b),
613 10 => kw_len10(b),
614 11 => kw_len11(b),
615 12 => kw_len12(b),
616 _ => None,
617 };
618 match tok {
619 Some(t) => t,
620 None => Token::Ident(raw.to_ascii_lowercase()),
622 }
623}
624
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// Only `input` is case-folded, so `lower` must already be lowercase for a
/// match to be possible.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
642
643#[inline]
644fn kw_len2(b: &[u8]) -> Option<Token> {
645 if eq_ci(b, b"as") {
647 return Some(Token::As);
648 }
649 if eq_ci(b, b"by") {
650 return Some(Token::By);
651 }
652 if eq_ci(b, b"in") {
653 return Some(Token::In);
654 }
655 if eq_ci(b, b"is") {
656 return Some(Token::Is);
657 }
658 if eq_ci(b, b"on") {
659 return Some(Token::On);
660 }
661 if eq_ci(b, b"or") {
662 return Some(Token::Or);
663 }
664 if eq_ci(b, b"to") {
665 return Some(Token::To);
666 }
667 None
668}
669
670#[inline]
671fn kw_len3(b: &[u8]) -> Option<Token> {
672 if eq_ci(b, b"for") {
674 return Some(Token::For);
675 }
676 if eq_ci(b, b"all") {
677 return Some(Token::All);
678 }
679 if eq_ci(b, b"and") {
680 return Some(Token::And);
681 }
682 if eq_ci(b, b"asc") {
683 return Some(Token::Asc);
684 }
685 if eq_ci(b, b"not") {
686 return Some(Token::Not);
687 }
688 None
689}
690
691#[inline]
692fn kw_len4(b: &[u8]) -> Option<Token> {
693 if eq_ci(b, b"from") {
695 return Some(Token::From);
696 }
697 if eq_ci(b, b"drop") {
698 return Some(Token::Drop);
699 }
700 if eq_ci(b, b"null") {
701 return Some(Token::Null);
702 }
703 if eq_ci(b, b"true") {
704 return Some(Token::True);
705 }
706 if eq_ci(b, b"into") {
707 return Some(Token::Into);
708 }
709 if eq_ci(b, b"like") {
710 return Some(Token::Like);
711 }
712 if eq_ci(b, b"join") {
713 return Some(Token::Join);
714 }
715 if eq_ci(b, b"left") {
716 return Some(Token::Left);
717 }
718 if eq_ci(b, b"show") {
719 return Some(Token::Show);
720 }
721 if eq_ci(b, b"desc") {
722 return Some(Token::Desc);
723 }
724 None
725}
726
727#[inline]
728fn kw_len5(b: &[u8]) -> Option<Token> {
729 if eq_ci(b, b"false") {
732 return Some(Token::False);
733 }
734 if eq_ci(b, b"where") {
735 return Some(Token::Where);
736 }
737 if eq_ci(b, b"table") {
738 return Some(Token::Table);
739 }
740 if eq_ci(b, b"index") {
741 return Some(Token::Index);
742 }
743 if eq_ci(b, b"begin") {
744 return Some(Token::Begin);
745 }
746 if eq_ci(b, b"order") {
747 return Some(Token::Order);
748 }
749 if eq_ci(b, b"limit") {
750 return Some(Token::Limit);
751 }
752 if eq_ci(b, b"group") {
753 return Some(Token::Group);
754 }
755 if eq_ci(b, b"union") {
756 return Some(Token::Union);
757 }
758 if eq_ci(b, b"inner") {
759 return Some(Token::Inner);
760 }
761 if eq_ci(b, b"cross") {
762 return Some(Token::Cross);
763 }
764 if eq_ci(b, b"outer") {
765 return Some(Token::Outer);
766 }
767 None
768}
769
770#[inline]
771fn kw_len6(b: &[u8]) -> Option<Token> {
772 if eq_ci(b, b"select") {
774 return Some(Token::Select);
775 }
776 if eq_ci(b, b"tables") {
777 return Some(Token::Tables);
778 }
779 if eq_ci(b, b"except") {
780 return Some(Token::Except);
781 }
782 if eq_ci(b, b"create") {
783 return Some(Token::Create);
784 }
785 if eq_ci(b, b"insert") {
786 return Some(Token::Insert);
787 }
788 if eq_ci(b, b"values") {
789 return Some(Token::Values);
790 }
791 if eq_ci(b, b"commit") {
792 return Some(Token::Commit);
793 }
794 if eq_ci(b, b"having") {
795 return Some(Token::Having);
796 }
797 if eq_ci(b, b"offset") {
798 return Some(Token::Offset);
799 }
800 None
801}
802
803#[inline]
804fn kw_len7(b: &[u8]) -> Option<Token> {
805 if eq_ci(b, b"between") {
807 return Some(Token::Between);
808 }
809 if eq_ci(b, b"default") {
810 return Some(Token::Default);
811 }
812 if eq_ci(b, b"release") {
813 return Some(Token::Release);
814 }
815 if eq_ci(b, b"extract") {
816 return Some(Token::Extract);
817 }
818 None
819}
820
821#[inline]
822fn kw_len8(b: &[u8]) -> Option<Token> {
823 if eq_ci(b, b"rollback") {
825 return Some(Token::Rollback);
826 }
827 if eq_ci(b, b"distinct") {
828 return Some(Token::Distinct);
829 }
830 if eq_ci(b, b"interval") {
831 return Some(Token::Interval);
832 }
833 None
834}
835
836#[inline]
837fn kw_len9(b: &[u8]) -> Option<Token> {
838 if eq_ci(b, b"savepoint") {
840 return Some(Token::Savepoint);
841 }
842 None
843}
844
845#[inline]
846fn kw_len10(b: &[u8]) -> Option<Token> {
847 if eq_ci(b, b"connection") {
849 return Some(Token::Connection);
850 }
851 None
852}
853
854#[inline]
855fn kw_len11(b: &[u8]) -> Option<Token> {
856 if eq_ci(b, b"publication") {
858 return Some(Token::Publication);
859 }
860 None
861}
862
863#[inline]
864fn kw_len12(b: &[u8]) -> Option<Token> {
865 if eq_ci(b, b"subscription") {
867 return Some(Token::Subscription);
868 }
869 None
870}
871
872fn lex_quoted(
879 input: &str,
880 start: usize,
881 quote: u8,
882 is_ident: bool,
883) -> Result<(Token, usize), LexError> {
884 let bytes = input.as_bytes();
885 let mut i = start + 1;
886 let mut s = String::new();
887 loop {
888 if i >= bytes.len() {
889 return Err(LexError {
890 kind: if is_ident {
891 LexErrorKind::UnterminatedQuotedIdent
892 } else {
893 LexErrorKind::UnterminatedString
894 },
895 pos: start,
896 });
897 }
898 if bytes[i] == quote {
899 if peek_eq(bytes, i + 1, quote) {
900 s.push(quote as char);
901 i += 2;
902 } else {
903 i += 1;
904 break;
905 }
906 } else {
907 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
908 s.push(ch);
909 i += ch.len_utf8();
910 }
911 }
912 let tok = if is_ident {
913 Token::QuotedIdent(s)
914 } else {
915 Token::String(s)
916 };
917 Ok((tok, i - start))
918}
919
920fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
921 let bytes = s.as_bytes();
922 let mut i = 0usize;
923 let mut is_float = false;
924
925 while i < bytes.len() && bytes[i].is_ascii_digit() {
926 i += 1;
927 }
928 if i < bytes.len() && bytes[i] == b'.' {
929 is_float = true;
930 i += 1;
931 while i < bytes.len() && bytes[i].is_ascii_digit() {
932 i += 1;
933 }
934 }
935 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
936 is_float = true;
937 i += 1;
938 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
939 i += 1;
940 }
941 let exp_start = i;
942 while i < bytes.len() && bytes[i].is_ascii_digit() {
943 i += 1;
944 }
945 if exp_start == i {
946 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
947 }
948 }
949
950 let lit = &s[..i];
951 if is_float {
952 lit.parse::<f64>()
953 .map(|v| (Token::Float(v), i))
954 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
955 } else {
956 lit.parse::<i64>()
957 .map(|v| (Token::Integer(v), i))
958 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
959 }
960}
961
/// Unit tests for the lexer. Each test feeds a small SQL fragment to
/// `tokenize` and checks the exact token sequence or error.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex(" \t\n "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // The sign is a separate token; folding it is left to the parser.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        let err = tokenize("@").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Without the trailing `>` the pieces lex separately.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // The two-byte prefix on its own is still `<=`.
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert!(matches!(toks[1], Token::ColonEq));
    }

    // ---- Coverage for lexer paths that previously had no test ----

    #[test]
    fn dollar_quoted_strings_keep_body_verbatim() {
        assert_eq!(
            lex("$$a'b$$ $tag$x$$y$tag$"),
            vec![
                Token::String("a'b".into()),
                Token::String("x$$y".into()),
                Token::Eof,
            ]
        );
        let err = tokenize("$$never closed").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedString);
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn placeholders_accept_one_through_u16_max() {
        assert_eq!(
            lex("$1 $65535"),
            vec![
                Token::Placeholder(1),
                Token::Placeholder(65535),
                Token::Eof,
            ]
        );
        assert_eq!(
            tokenize("$0").unwrap_err().kind,
            LexErrorKind::BadNumber("$0".into())
        );
        assert_eq!(
            tokenize("$65536").unwrap_err().kind,
            LexErrorKind::BadNumber("$65536".into())
        );
    }

    #[test]
    fn session_variables_and_ts_match() {
        assert_eq!(
            lex("@x @@global.y a @@ b"),
            vec![
                Token::SessionVar("@x".into()),
                Token::SessionVar("@@global.y".into()),
                Token::Ident("a".into()),
                Token::TsMatch,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn json_operators_and_concat() {
        assert_eq!(
            lex("a -> b ->> c #> d #>> e @> f || g"),
            vec![
                Token::Ident("a".into()),
                Token::JsonGet,
                Token::Ident("b".into()),
                Token::JsonGetText,
                Token::Ident("c".into()),
                Token::JsonGetPath,
                Token::Ident("d".into()),
                Token::JsonGetPathText,
                Token::Ident("e".into()),
                Token::JsonContains,
                Token::Ident("f".into()),
                Token::Concat,
                Token::Ident("g".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn backtick_identifier_keeps_case() {
        assert_eq!(
            lex("`My Col`"),
            vec![Token::QuotedIdent("My Col".into()), Token::Eof]
        );
        let err = tokenize("`open").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedQuotedIdent);
    }

    #[test]
    fn versioned_comment_body_is_lexed() {
        assert_eq!(
            lex("/*!50000 SELECT */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn exponent_without_digits_is_bad_number() {
        let err = tokenize("1e").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::BadNumber("1e".into()));
        assert_eq!(err.pos, 0);
    }
}