1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`].
///
/// Keyword variants are recognised ASCII case-insensitively. Only
/// `PartialEq` (not `Eq`) is derived because [`Token::Float`] carries an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords: each variant below is emitted for the word of the same name.
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    /// Unquoted identifier, stored ASCII-lower-cased.
    Ident(String),
    /// Identifier written between `"` or backticks; the text is kept as
    /// written, with a doubled quote character collapsed to one.
    QuotedIdent(String),
    /// `@name` / `@@name` variable reference; the stored text includes the
    /// leading `@` or `@@`.
    SessionVar(String),

    /// Integer literal that fits in an `i64`.
    Integer(i64),
    /// Literal containing a `.` and/or an exponent.
    Float(f64),
    /// Contents of a `'...'` literal (with `''` collapsed to `'`) or of a
    /// dollar-quoted `$$...$$` / `$tag$...$tag$` literal (taken verbatim).
    String(String),

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,

    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.` when it is not followed by a digit (otherwise it starts a number).
    Dot,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when it is not followed by a letter or `_` (otherwise it starts a
    /// [`Token::SessionVar`]).
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:` that is not part of `::` or `:=`.
    Colon,
    /// `||`
    Concat,
    // More keywords, again one variant per word of the same name.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; the lexer only accepts `N` in `1..=65535`.
    Placeholder(u16),

    // Further keywords.
    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; always the final token [`tokenize`] returns.
    Eof,
}
173
/// The category of failure reported by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start (or complete) any token, e.g. a lone
    /// `#`, `@`, `|`, `!` or a non-ASCII character outside a literal.
    UnknownChar(char),
    /// A `'...'` or dollar-quoted string with no closing delimiter.
    UnterminatedString,
    /// A `"..."` or backtick-quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* ...` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A numeric literal or `$N` placeholder that could not be converted;
    /// carries the offending source text.
    BadNumber(String),
}
182
/// A lexing failure together with where it happened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset into the input at which the offending construct starts.
    pub pos: usize,
}
188
189impl fmt::Display for LexError {
190 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
191 match &self.kind {
192 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
193 LexErrorKind::UnterminatedString => {
194 write!(f, "unterminated string literal at byte {}", self.pos)
195 }
196 LexErrorKind::UnterminatedQuotedIdent => {
197 write!(f, "unterminated quoted identifier at byte {}", self.pos)
198 }
199 LexErrorKind::UnterminatedBlockComment => {
200 write!(f, "unterminated /* */ comment at byte {}", self.pos)
201 }
202 LexErrorKind::BadNumber(s) => {
203 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
204 }
205 }
206 }
207}
208
209#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
212 let bytes = input.as_bytes();
213 let mut i = 0usize;
214 let mut out = Vec::new();
215
216 while i < bytes.len() {
217 let b = bytes[i];
218 match b {
219 b' ' | b'\t' | b'\n' | b'\r' => {
220 i += 1;
221 }
222 b'-' if peek_eq(bytes, i + 1, b'-') => {
223 i += 2;
224 while i < bytes.len() && bytes[i] != b'\n' {
225 i += 1;
226 }
227 }
228 b'/' if peek_eq(bytes, i + 1, b'*') => {
229 let start = i;
230 if peek_eq(bytes, i + 2, b'!') {
240 let mut j = i + 3;
241 while j < bytes.len() && bytes[j].is_ascii_digit() {
244 j += 1;
245 }
246 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
247 j += 1;
248 }
249 i = j;
250 continue;
251 }
252 i += 2;
253 let mut closed = false;
254 while i + 1 < bytes.len() {
255 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
256 i += 2;
257 closed = true;
258 break;
259 }
260 i += 1;
261 }
262 if !closed {
263 return Err(LexError {
264 kind: LexErrorKind::UnterminatedBlockComment,
265 pos: start,
266 });
267 }
268 }
269 b'*' if peek_eq(bytes, i + 1, b'/') => {
274 i += 2;
275 }
276 b'\'' => {
277 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
278 out.push(tok);
279 i += consumed;
280 }
281 b'"' => {
282 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
283 out.push(tok);
284 i += consumed;
285 }
286 b'`' => {
290 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
291 out.push(tok);
292 i += consumed;
293 }
294 b if b.is_ascii_alphabetic() || b == b'_' => {
295 let start = i;
296 i += 1;
297 while i < bytes.len() {
298 let c = bytes[i];
299 if c.is_ascii_alphanumeric() || c == b'_' {
300 i += 1;
301 } else {
302 break;
303 }
304 }
305 let raw = &input[start..i];
306 out.push(keyword_or_ident_raw(raw));
310 }
311 b if b.is_ascii_digit() => {
312 let (tok, consumed) =
313 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
314 out.push(tok);
315 i += consumed;
316 }
317 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
318 let (tok, consumed) =
319 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
320 out.push(tok);
321 i += consumed;
322 }
323 b'+' => single(&mut out, Token::Plus, &mut i),
324 b'-' => {
325 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
328 out.push(Token::JsonGetText);
329 i += 3;
330 } else if peek_eq(bytes, i + 1, b'>') {
331 out.push(Token::JsonGet);
332 i += 2;
333 } else {
334 single(&mut out, Token::Minus, &mut i);
335 }
336 }
337 b'#' => {
339 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
340 out.push(Token::JsonGetPathText);
341 i += 3;
342 } else if peek_eq(bytes, i + 1, b'>') {
343 out.push(Token::JsonGetPath);
344 i += 2;
345 } else {
346 return Err(LexError {
347 kind: LexErrorKind::UnknownChar('#'),
348 pos: i,
349 });
350 }
351 }
352 b'@' => {
361 if peek_eq(bytes, i + 1, b'>') {
362 out.push(Token::JsonContains);
363 i += 2;
364 } else if peek_eq(bytes, i + 1, b'@')
365 && !is_session_var_ident_start(bytes.get(i + 2).copied())
366 {
367 out.push(Token::TsMatch);
370 i += 2;
371 } else {
372 let prefix_end = if peek_eq(bytes, i + 1, b'@') { i + 2 } else { i + 1 };
377 let mut end = prefix_end;
378 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
379 end += 1;
380 }
381 if end == prefix_end {
382 return Err(LexError {
383 kind: LexErrorKind::UnknownChar('@'),
384 pos: i,
385 });
386 }
387 out.push(Token::SessionVar(input[i..end].to_string()));
388 i = end;
389 }
390 }
391 b'*' => single(&mut out, Token::Star, &mut i),
392 b'/' => single(&mut out, Token::Slash, &mut i),
393 b'(' => single(&mut out, Token::LParen, &mut i),
394 b')' => single(&mut out, Token::RParen, &mut i),
395 b'[' => single(&mut out, Token::LBracket, &mut i),
396 b']' => single(&mut out, Token::RBracket, &mut i),
397 b',' => single(&mut out, Token::Comma, &mut i),
398 b';' => single(&mut out, Token::Semicolon, &mut i),
399 b'.' => single(&mut out, Token::Dot, &mut i),
400 b'=' => single(&mut out, Token::Eq, &mut i),
401 b'<' => {
402 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
403 out.push(Token::CosineDistance);
404 i += 3;
405 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
406 out.push(Token::InnerProduct);
407 i += 3;
408 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
409 out.push(Token::L2Distance);
410 i += 3;
411 } else if peek_eq(bytes, i + 1, b'=') {
412 out.push(Token::LtEq);
413 i += 2;
414 } else if peek_eq(bytes, i + 1, b'>') {
415 out.push(Token::NotEq);
416 i += 2;
417 } else {
418 out.push(Token::Lt);
419 i += 1;
420 }
421 }
422 b':' if peek_eq(bytes, i + 1, b':') => {
423 out.push(Token::DoubleColon);
424 i += 2;
425 }
426 b':' if peek_eq(bytes, i + 1, b'=') => {
427 out.push(Token::ColonEq);
429 i += 2;
430 }
431 b':' => {
432 out.push(Token::Colon);
436 i += 1;
437 }
438 b'|' if peek_eq(bytes, i + 1, b'|') => {
439 out.push(Token::Concat);
440 i += 2;
441 }
442 b'>' => {
443 if peek_eq(bytes, i + 1, b'=') {
444 out.push(Token::GtEq);
445 i += 2;
446 } else {
447 out.push(Token::Gt);
448 i += 1;
449 }
450 }
451 b'!' if peek_eq(bytes, i + 1, b'=') => {
452 out.push(Token::NotEq);
453 i += 2;
454 }
455 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
463 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
465 let body = match end {
466 Some(e) => &input[i + 2..e],
467 None => {
468 return Err(LexError {
469 kind: LexErrorKind::UnterminatedString,
470 pos: i,
471 });
472 }
473 };
474 out.push(Token::String(body.to_string()));
475 i = end.unwrap() + 2;
476 }
477 b'$' if i + 1 < bytes.len()
478 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
479 {
480 let mut j = i + 1;
483 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
484 j += 1;
485 }
486 if j >= bytes.len() || bytes[j] != b'$' {
487 let ch = input[i..].chars().next().unwrap_or('?');
490 return Err(LexError {
491 kind: LexErrorKind::UnknownChar(ch),
492 pos: i,
493 });
494 }
495 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
496 let end = find_dollar_tag_end(bytes, j + 1, &close);
497 let body = match end {
498 Some(e) => &input[j + 1..e],
499 None => {
500 return Err(LexError {
501 kind: LexErrorKind::UnterminatedString,
502 pos: i,
503 });
504 }
505 };
506 out.push(Token::String(body.to_string()));
507 i = end.unwrap() + close.len();
508 }
509 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
513 let mut j = i + 1;
514 let mut n: u32 = 0;
515 while j < bytes.len() && bytes[j].is_ascii_digit() {
516 n = n
517 .saturating_mul(10)
518 .saturating_add(u32::from(bytes[j] - b'0'));
519 j += 1;
520 }
521 if n == 0 || n > u32::from(u16::MAX) {
522 return Err(LexError {
523 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
524 pos: i,
525 });
526 }
527 #[allow(clippy::cast_possible_truncation)]
528 out.push(Token::Placeholder(n as u16));
529 i = j;
530 }
531 _ => {
532 let ch = input[i..].chars().next().unwrap_or('?');
533 return Err(LexError {
534 kind: LexErrorKind::UnknownChar(ch),
535 pos: i,
536 });
537 }
538 }
539 }
540 out.push(Token::Eof);
541 Ok(out)
542}
543
/// Returns `true` when `bytes` has a byte at index `i` and that byte equals
/// `target`; an out-of-range index yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    matches!(bytes.get(i), Some(&found) if found == target)
}
547
/// Reports whether `b` is present and is an ASCII letter or `_`, i.e. a byte
/// that may begin the name part of an `@`/`@@` variable.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    b.is_some_and(|c| c == b'_' || c.is_ascii_alphabetic())
}
555
/// Reports whether `b` may appear inside the name part of an `@`/`@@`
/// variable: ASCII letters, ASCII digits, `_`, `.` and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b'$'
    )
}
563
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`.
///
/// Returns the absolute start index of the match, or `None` when `tag` is
/// empty, `from` lies beyond the end of `bytes`, or there is no occurrence.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty tag is rejected up front.
    if tag.is_empty() {
        return None;
    }
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
579
/// Applies `pred` to the byte at index `i`, returning `false` when `i` is
/// out of range (in which case `pred` is not called).
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
583
584fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
585 out.push(tok);
586 *i += 1;
587}
588
589fn keyword_or_ident_raw(raw: &str) -> Token {
599 let b = raw.as_bytes();
600 let tok = match b.len() {
601 2 => kw_len2(b),
602 3 => kw_len3(b),
603 4 => kw_len4(b),
604 5 => kw_len5(b),
605 6 => kw_len6(b),
606 7 => kw_len7(b),
607 8 => kw_len8(b),
608 9 => kw_len9(b),
609 10 => kw_len10(b),
610 11 => kw_len11(b),
611 12 => kw_len12(b),
612 _ => None,
613 };
614 match tok {
615 Some(t) => t,
616 None => Token::Ident(raw.to_ascii_lowercase()),
618 }
619}
620
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// `lower` is expected to be lower-case already: only `input` is folded, so
/// an upper-case byte in `lower` can never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
638
639#[inline]
640fn kw_len2(b: &[u8]) -> Option<Token> {
641 if eq_ci(b, b"as") {
643 return Some(Token::As);
644 }
645 if eq_ci(b, b"by") {
646 return Some(Token::By);
647 }
648 if eq_ci(b, b"in") {
649 return Some(Token::In);
650 }
651 if eq_ci(b, b"is") {
652 return Some(Token::Is);
653 }
654 if eq_ci(b, b"on") {
655 return Some(Token::On);
656 }
657 if eq_ci(b, b"or") {
658 return Some(Token::Or);
659 }
660 if eq_ci(b, b"to") {
661 return Some(Token::To);
662 }
663 None
664}
665
666#[inline]
667fn kw_len3(b: &[u8]) -> Option<Token> {
668 if eq_ci(b, b"for") {
670 return Some(Token::For);
671 }
672 if eq_ci(b, b"all") {
673 return Some(Token::All);
674 }
675 if eq_ci(b, b"and") {
676 return Some(Token::And);
677 }
678 if eq_ci(b, b"asc") {
679 return Some(Token::Asc);
680 }
681 if eq_ci(b, b"not") {
682 return Some(Token::Not);
683 }
684 None
685}
686
687#[inline]
688fn kw_len4(b: &[u8]) -> Option<Token> {
689 if eq_ci(b, b"from") {
691 return Some(Token::From);
692 }
693 if eq_ci(b, b"drop") {
694 return Some(Token::Drop);
695 }
696 if eq_ci(b, b"null") {
697 return Some(Token::Null);
698 }
699 if eq_ci(b, b"true") {
700 return Some(Token::True);
701 }
702 if eq_ci(b, b"into") {
703 return Some(Token::Into);
704 }
705 if eq_ci(b, b"like") {
706 return Some(Token::Like);
707 }
708 if eq_ci(b, b"join") {
709 return Some(Token::Join);
710 }
711 if eq_ci(b, b"left") {
712 return Some(Token::Left);
713 }
714 if eq_ci(b, b"show") {
715 return Some(Token::Show);
716 }
717 if eq_ci(b, b"desc") {
718 return Some(Token::Desc);
719 }
720 None
721}
722
723#[inline]
724fn kw_len5(b: &[u8]) -> Option<Token> {
725 if eq_ci(b, b"false") {
728 return Some(Token::False);
729 }
730 if eq_ci(b, b"where") {
731 return Some(Token::Where);
732 }
733 if eq_ci(b, b"table") {
734 return Some(Token::Table);
735 }
736 if eq_ci(b, b"index") {
737 return Some(Token::Index);
738 }
739 if eq_ci(b, b"begin") {
740 return Some(Token::Begin);
741 }
742 if eq_ci(b, b"order") {
743 return Some(Token::Order);
744 }
745 if eq_ci(b, b"limit") {
746 return Some(Token::Limit);
747 }
748 if eq_ci(b, b"group") {
749 return Some(Token::Group);
750 }
751 if eq_ci(b, b"union") {
752 return Some(Token::Union);
753 }
754 if eq_ci(b, b"inner") {
755 return Some(Token::Inner);
756 }
757 if eq_ci(b, b"cross") {
758 return Some(Token::Cross);
759 }
760 if eq_ci(b, b"outer") {
761 return Some(Token::Outer);
762 }
763 None
764}
765
766#[inline]
767fn kw_len6(b: &[u8]) -> Option<Token> {
768 if eq_ci(b, b"select") {
770 return Some(Token::Select);
771 }
772 if eq_ci(b, b"tables") {
773 return Some(Token::Tables);
774 }
775 if eq_ci(b, b"except") {
776 return Some(Token::Except);
777 }
778 if eq_ci(b, b"create") {
779 return Some(Token::Create);
780 }
781 if eq_ci(b, b"insert") {
782 return Some(Token::Insert);
783 }
784 if eq_ci(b, b"values") {
785 return Some(Token::Values);
786 }
787 if eq_ci(b, b"commit") {
788 return Some(Token::Commit);
789 }
790 if eq_ci(b, b"having") {
791 return Some(Token::Having);
792 }
793 if eq_ci(b, b"offset") {
794 return Some(Token::Offset);
795 }
796 None
797}
798
799#[inline]
800fn kw_len7(b: &[u8]) -> Option<Token> {
801 if eq_ci(b, b"between") {
803 return Some(Token::Between);
804 }
805 if eq_ci(b, b"default") {
806 return Some(Token::Default);
807 }
808 if eq_ci(b, b"release") {
809 return Some(Token::Release);
810 }
811 if eq_ci(b, b"extract") {
812 return Some(Token::Extract);
813 }
814 None
815}
816
817#[inline]
818fn kw_len8(b: &[u8]) -> Option<Token> {
819 if eq_ci(b, b"rollback") {
821 return Some(Token::Rollback);
822 }
823 if eq_ci(b, b"distinct") {
824 return Some(Token::Distinct);
825 }
826 if eq_ci(b, b"interval") {
827 return Some(Token::Interval);
828 }
829 None
830}
831
832#[inline]
833fn kw_len9(b: &[u8]) -> Option<Token> {
834 if eq_ci(b, b"savepoint") {
836 return Some(Token::Savepoint);
837 }
838 None
839}
840
841#[inline]
842fn kw_len10(b: &[u8]) -> Option<Token> {
843 if eq_ci(b, b"connection") {
845 return Some(Token::Connection);
846 }
847 None
848}
849
850#[inline]
851fn kw_len11(b: &[u8]) -> Option<Token> {
852 if eq_ci(b, b"publication") {
854 return Some(Token::Publication);
855 }
856 None
857}
858
859#[inline]
860fn kw_len12(b: &[u8]) -> Option<Token> {
861 if eq_ci(b, b"subscription") {
863 return Some(Token::Subscription);
864 }
865 None
866}
867
868fn lex_quoted(
875 input: &str,
876 start: usize,
877 quote: u8,
878 is_ident: bool,
879) -> Result<(Token, usize), LexError> {
880 let bytes = input.as_bytes();
881 let mut i = start + 1;
882 let mut s = String::new();
883 loop {
884 if i >= bytes.len() {
885 return Err(LexError {
886 kind: if is_ident {
887 LexErrorKind::UnterminatedQuotedIdent
888 } else {
889 LexErrorKind::UnterminatedString
890 },
891 pos: start,
892 });
893 }
894 if bytes[i] == quote {
895 if peek_eq(bytes, i + 1, quote) {
896 s.push(quote as char);
897 i += 2;
898 } else {
899 i += 1;
900 break;
901 }
902 } else {
903 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
904 s.push(ch);
905 i += ch.len_utf8();
906 }
907 }
908 let tok = if is_ident {
909 Token::QuotedIdent(s)
910 } else {
911 Token::String(s)
912 };
913 Ok((tok, i - start))
914}
915
916fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
917 let bytes = s.as_bytes();
918 let mut i = 0usize;
919 let mut is_float = false;
920
921 while i < bytes.len() && bytes[i].is_ascii_digit() {
922 i += 1;
923 }
924 if i < bytes.len() && bytes[i] == b'.' {
925 is_float = true;
926 i += 1;
927 while i < bytes.len() && bytes[i].is_ascii_digit() {
928 i += 1;
929 }
930 }
931 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
932 is_float = true;
933 i += 1;
934 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
935 i += 1;
936 }
937 let exp_start = i;
938 while i < bytes.len() && bytes[i].is_ascii_digit() {
939 i += 1;
940 }
941 if exp_start == i {
942 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
943 }
944 }
945
946 let lit = &s[..i];
947 if is_float {
948 lit.parse::<f64>()
949 .map(|v| (Token::Float(v), i))
950 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
951 } else {
952 lit.parse::<i64>()
953 .map(|v| (Token::Integer(v), i))
954 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
955 }
956}
957
// Unit tests for the lexer: each test feeds a small SQL fragment to
// `tokenize` and checks the exact token sequence or error it yields.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    // The token stream always ends with `Eof`, even for empty input.
    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    // Whitespace produces no tokens of its own.
    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex(" \t\n "), vec![Token::Eof]);
    }

    // Keywords match regardless of ASCII case.
    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    // Unquoted identifiers are stored lower-cased.
    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    // Quoted identifiers keep their case; `""` inside stands for one `"`.
    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    // Plain digits are integers; a `.` or an exponent makes a float.
    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    // The lexer never folds a sign into the literal.
    #[test]
    fn negative_number_is_minus_then_integer() {
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    // `''` inside a string literal stands for one `'`.
    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    // Both `<>` and `!=` map to `NotEq`.
    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    // `--` comments run to the end of the line.
    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    // The error position points at the opening quote.
    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    // A lone `@` is neither an operator nor a variable reference.
    #[test]
    fn unknown_char_errors() {
        let err = tokenize("@").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar('@')));
    }

    // A `.` not followed by a digit is a separate `Dot` token.
    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    // `<->` is one token, but `<-` alone falls back to `Lt` + `Minus`.
    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `<=>` is one token, and it must not break plain `<=`.
    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // The word after `::` is an ordinary (lower-cased) identifier.
    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    // A single `:` is a token in its own right rather than an error.
    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    // `:=` is one token, not `Colon` followed by `Eq`.
    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert!(matches!(toks[1], Token::ColonEq));
    }
}