1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by `tokenize` / `tokenize_with`.
///
/// Keyword variants are recognised case-insensitively by the `kw_len*`
/// tables; the variant name is the keyword it stands for. Operator variants
/// document the exact spelling the lexer emits them for.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords ----
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // ---- Identifiers ----
    /// Unquoted identifier, stored ASCII-lowercased.
    Ident(String),
    /// Identifier quoted with `"` or a backtick; case is kept and a doubled
    /// quote character is collapsed to one.
    QuotedIdent(String),
    /// `@name` or `@@name`; the stored text includes the `@` prefix.
    SessionVar(String),

    // ---- Literals ----
    /// Integer literal that fits in an `i64`.
    Integer(i64),
    /// Literal containing a `.` and/or an exponent.
    Float(f64),
    /// Decoded contents of a `'...'`, `E'...'`, `$$...$$` or `$tag$...$tag$`
    /// literal.
    String(String),

    // ---- Arithmetic and comparison operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<<`
    InetContainedBy,
    /// `<<=`
    InetContainedByEq,
    /// `>>`
    InetContains,
    /// `>>=`
    InetContainsEq,
    /// `&&`
    InetOverlap,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.`
    Dot,
    /// A lone `@` that is not followed by session-variable characters.
    At,

    // ---- Extended operators ----
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when not followed by an identifier-start character.
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:`
    Colon,
    /// `||`
    Concat,
    /// `|`
    Pipe,
    /// `&`
    Amp,
    /// `~`
    Tilde,

    // ---- More keywords ----
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; the lexer only accepts `1..=65535`.
    Placeholder(u16),

    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker, always the last token of a successful lex.
    Eof,
}
203
/// The reason a lex attempt failed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that starts no known token.
    UnknownChar(char),
    /// A string literal (quoted or dollar-quoted) with no closing delimiter.
    UnterminatedString,
    /// A quoted identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* ...` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A numeric literal or `$N` placeholder that could not be accepted; the
    /// payload is the offending source text.
    BadNumber(String),
}
212
213#[derive(Debug, Clone, PartialEq, Eq)]
214pub struct LexError {
215 pub kind: LexErrorKind,
216 pub pos: usize,
217}
218
219impl fmt::Display for LexError {
220 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
221 match &self.kind {
222 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
223 LexErrorKind::UnterminatedString => {
224 write!(f, "unterminated string literal at byte {}", self.pos)
225 }
226 LexErrorKind::UnterminatedQuotedIdent => {
227 write!(f, "unterminated quoted identifier at byte {}", self.pos)
228 }
229 LexErrorKind::UnterminatedBlockComment => {
230 write!(f, "unterminated /* */ comment at byte {}", self.pos)
231 }
232 LexErrorKind::BadNumber(s) => {
233 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
234 }
235 }
236 }
237}
238
239pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
243 tokenize_with(input, false)
244}
245
246#[allow(clippy::too_many_lines)] pub fn tokenize_with(input: &str, backslash_escapes: bool) -> Result<Vec<Token>, LexError> {
255 let bytes = input.as_bytes();
256 let mut i = 0usize;
257 let mut out = Vec::new();
258
259 while i < bytes.len() {
260 let b = bytes[i];
261 match b {
262 b' ' | b'\t' | b'\n' | b'\r' => {
263 i += 1;
264 }
265 b'-' if peek_eq(bytes, i + 1, b'-') => {
266 i += 2;
267 while i < bytes.len() && bytes[i] != b'\n' {
268 i += 1;
269 }
270 }
271 b'/' if peek_eq(bytes, i + 1, b'*') => {
272 let start = i;
273 if peek_eq(bytes, i + 2, b'!') {
283 let mut j = i + 3;
284 while j < bytes.len() && bytes[j].is_ascii_digit() {
287 j += 1;
288 }
289 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
290 j += 1;
291 }
292 i = j;
293 continue;
294 }
295 i += 2;
296 let mut closed = false;
297 while i + 1 < bytes.len() {
298 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
299 i += 2;
300 closed = true;
301 break;
302 }
303 i += 1;
304 }
305 if !closed {
306 return Err(LexError {
307 kind: LexErrorKind::UnterminatedBlockComment,
308 pos: start,
309 });
310 }
311 }
312 b'*' if peek_eq(bytes, i + 1, b'/') => {
317 i += 2;
318 }
319 b'\'' => {
320 let (tok, consumed) = if backslash_escapes {
321 lex_escape_string(input, i)?
324 } else {
325 lex_quoted(input, i, b'\'', false)?
326 };
327 out.push(tok);
328 i += consumed;
329 }
330 b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
338 let (tok, consumed) = lex_escape_string(input, i + 1)?;
339 out.push(tok);
340 i += 1 + consumed;
341 }
342 b'"' => {
343 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
344 out.push(tok);
345 i += consumed;
346 }
347 b'`' => {
351 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
352 out.push(tok);
353 i += consumed;
354 }
355 b if b.is_ascii_alphabetic() || b == b'_' => {
356 let start = i;
357 i += 1;
358 while i < bytes.len() {
359 let c = bytes[i];
360 if c.is_ascii_alphanumeric() || c == b'_' {
361 i += 1;
362 } else {
363 break;
364 }
365 }
366 let raw = &input[start..i];
367 out.push(keyword_or_ident_raw(raw));
371 }
372 b if b.is_ascii_digit() => {
373 let (tok, consumed) =
374 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
375 out.push(tok);
376 i += consumed;
377 }
378 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
379 let (tok, consumed) =
380 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
381 out.push(tok);
382 i += consumed;
383 }
384 b'+' => single(&mut out, Token::Plus, &mut i),
385 b'-' => {
386 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
389 out.push(Token::JsonGetText);
390 i += 3;
391 } else if peek_eq(bytes, i + 1, b'>') {
392 out.push(Token::JsonGet);
393 i += 2;
394 } else {
395 single(&mut out, Token::Minus, &mut i);
396 }
397 }
398 b'#' => {
400 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
401 out.push(Token::JsonGetPathText);
402 i += 3;
403 } else if peek_eq(bytes, i + 1, b'>') {
404 out.push(Token::JsonGetPath);
405 i += 2;
406 } else {
407 return Err(LexError {
408 kind: LexErrorKind::UnknownChar('#'),
409 pos: i,
410 });
411 }
412 }
413 b'@' => {
422 if peek_eq(bytes, i + 1, b'>') {
423 out.push(Token::JsonContains);
424 i += 2;
425 } else if peek_eq(bytes, i + 1, b'@')
426 && !is_session_var_ident_start(bytes.get(i + 2).copied())
427 {
428 out.push(Token::TsMatch);
431 i += 2;
432 } else {
433 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
438 i + 2
439 } else {
440 i + 1
441 };
442 let mut end = prefix_end;
443 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
444 end += 1;
445 }
446 if end == prefix_end {
447 out.push(Token::At);
456 i = prefix_end;
457 continue;
458 }
459 out.push(Token::SessionVar(input[i..end].to_string()));
460 i = end;
461 }
462 }
463 b'*' => single(&mut out, Token::Star, &mut i),
464 b'/' => single(&mut out, Token::Slash, &mut i),
465 b'(' => single(&mut out, Token::LParen, &mut i),
466 b')' => single(&mut out, Token::RParen, &mut i),
467 b'[' => single(&mut out, Token::LBracket, &mut i),
468 b']' => single(&mut out, Token::RBracket, &mut i),
469 b',' => single(&mut out, Token::Comma, &mut i),
470 b';' => single(&mut out, Token::Semicolon, &mut i),
471 b'.' => single(&mut out, Token::Dot, &mut i),
472 b'=' => single(&mut out, Token::Eq, &mut i),
473 b'<' => {
474 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
475 out.push(Token::CosineDistance);
476 i += 3;
477 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
478 out.push(Token::InnerProduct);
479 i += 3;
480 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
481 out.push(Token::L2Distance);
482 i += 3;
483 } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
484 out.push(Token::InetContainedByEq);
486 i += 3;
487 } else if peek_eq(bytes, i + 1, b'<') {
488 out.push(Token::InetContainedBy);
490 i += 2;
491 } else if peek_eq(bytes, i + 1, b'=') {
492 out.push(Token::LtEq);
493 i += 2;
494 } else if peek_eq(bytes, i + 1, b'>') {
495 out.push(Token::NotEq);
496 i += 2;
497 } else {
498 out.push(Token::Lt);
499 i += 1;
500 }
501 }
502 b':' if peek_eq(bytes, i + 1, b':') => {
503 out.push(Token::DoubleColon);
504 i += 2;
505 }
506 b':' if peek_eq(bytes, i + 1, b'=') => {
507 out.push(Token::ColonEq);
509 i += 2;
510 }
511 b':' => {
512 out.push(Token::Colon);
516 i += 1;
517 }
518 b'|' if peek_eq(bytes, i + 1, b'|') => {
519 out.push(Token::Concat);
520 i += 2;
521 }
522 b'|' => {
525 single(&mut out, Token::Pipe, &mut i);
526 }
527 b'~' => {
528 single(&mut out, Token::Tilde, &mut i);
529 }
530 b'>' => {
531 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
532 out.push(Token::InetContainsEq);
534 i += 3;
535 } else if peek_eq(bytes, i + 1, b'>') {
536 out.push(Token::InetContains);
538 i += 2;
539 } else if peek_eq(bytes, i + 1, b'=') {
540 out.push(Token::GtEq);
541 i += 2;
542 } else {
543 out.push(Token::Gt);
544 i += 1;
545 }
546 }
547 b'&' if peek_eq(bytes, i + 1, b'&') => {
548 out.push(Token::InetOverlap);
550 i += 2;
551 }
552 b'&' => {
553 single(&mut out, Token::Amp, &mut i);
554 }
555 b'!' if peek_eq(bytes, i + 1, b'=') => {
556 out.push(Token::NotEq);
557 i += 2;
558 }
559 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
567 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
569 let body = match end {
570 Some(e) => &input[i + 2..e],
571 None => {
572 return Err(LexError {
573 kind: LexErrorKind::UnterminatedString,
574 pos: i,
575 });
576 }
577 };
578 out.push(Token::String(body.to_string()));
579 i = end.unwrap() + 2;
580 }
581 b'$' if i + 1 < bytes.len()
582 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
583 {
584 let mut j = i + 1;
587 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
588 j += 1;
589 }
590 if j >= bytes.len() || bytes[j] != b'$' {
591 let ch = input[i..].chars().next().unwrap_or('?');
594 return Err(LexError {
595 kind: LexErrorKind::UnknownChar(ch),
596 pos: i,
597 });
598 }
599 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
600 let end = find_dollar_tag_end(bytes, j + 1, &close);
601 let body = match end {
602 Some(e) => &input[j + 1..e],
603 None => {
604 return Err(LexError {
605 kind: LexErrorKind::UnterminatedString,
606 pos: i,
607 });
608 }
609 };
610 out.push(Token::String(body.to_string()));
611 i = end.unwrap() + close.len();
612 }
613 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
617 let mut j = i + 1;
618 let mut n: u32 = 0;
619 while j < bytes.len() && bytes[j].is_ascii_digit() {
620 n = n
621 .saturating_mul(10)
622 .saturating_add(u32::from(bytes[j] - b'0'));
623 j += 1;
624 }
625 if n == 0 || n > u32::from(u16::MAX) {
626 return Err(LexError {
627 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
628 pos: i,
629 });
630 }
631 #[allow(clippy::cast_possible_truncation)]
632 out.push(Token::Placeholder(n as u16));
633 i = j;
634 }
635 _ => {
636 let ch = input[i..].chars().next().unwrap_or('?');
637 return Err(LexError {
638 kind: LexErrorKind::UnknownChar(ch),
639 pos: i,
640 });
641 }
642 }
643 }
644 out.push(Token::Eof);
645 Ok(out)
646}
647
/// Returns `true` when `bytes[i]` exists and equals `target`; an
/// out-of-range `i` yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    bytes.get(i).map_or(false, |&found| found == target)
}
651
/// Returns `true` when `b` holds an ASCII letter or `_`, i.e. a byte that
/// may start the name part of a session variable. `None` yields `false`.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(byte) => byte == b'_' || byte.is_ascii_alphabetic(),
        None => false,
    }
}
659
/// Returns `true` for bytes allowed inside a session-variable name: ASCII
/// letters, ASCII digits, `_`, `.` and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b'$')
}
667
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`
/// and returns its starting index.
///
/// Returns `None` when `tag` is empty, when `from` lies beyond the end of
/// `bytes`, or when `tag` does not occur in the searched range.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, and an empty tag is rejected anyway.
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
683
/// Returns `true` when `bytes[i]` exists and satisfies `pred`; an
/// out-of-range `i` yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
687
688fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
689 out.push(tok);
690 *i += 1;
691}
692
693fn keyword_or_ident_raw(raw: &str) -> Token {
703 let b = raw.as_bytes();
704 let tok = match b.len() {
705 2 => kw_len2(b),
706 3 => kw_len3(b),
707 4 => kw_len4(b),
708 5 => kw_len5(b),
709 6 => kw_len6(b),
710 7 => kw_len7(b),
711 8 => kw_len8(b),
712 9 => kw_len9(b),
713 10 => kw_len10(b),
714 11 => kw_len11(b),
715 12 => kw_len12(b),
716 _ => None,
717 };
718 match tok {
719 Some(t) => t,
720 None => Token::Ident(raw.to_ascii_lowercase()),
722 }
723}
724
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// Only `input` is case-folded, so `lower` must already be lowercase for a
/// match to be possible. Slices of different lengths never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(got, want)| got.to_ascii_lowercase() == *want)
}
742
743#[inline]
744fn kw_len2(b: &[u8]) -> Option<Token> {
745 if eq_ci(b, b"as") {
747 return Some(Token::As);
748 }
749 if eq_ci(b, b"by") {
750 return Some(Token::By);
751 }
752 if eq_ci(b, b"in") {
753 return Some(Token::In);
754 }
755 if eq_ci(b, b"is") {
756 return Some(Token::Is);
757 }
758 if eq_ci(b, b"on") {
759 return Some(Token::On);
760 }
761 if eq_ci(b, b"or") {
762 return Some(Token::Or);
763 }
764 if eq_ci(b, b"to") {
765 return Some(Token::To);
766 }
767 None
768}
769
770#[inline]
771fn kw_len3(b: &[u8]) -> Option<Token> {
772 if eq_ci(b, b"for") {
774 return Some(Token::For);
775 }
776 if eq_ci(b, b"all") {
777 return Some(Token::All);
778 }
779 if eq_ci(b, b"and") {
780 return Some(Token::And);
781 }
782 if eq_ci(b, b"asc") {
783 return Some(Token::Asc);
784 }
785 if eq_ci(b, b"not") {
786 return Some(Token::Not);
787 }
788 None
789}
790
791#[inline]
792fn kw_len4(b: &[u8]) -> Option<Token> {
793 if eq_ci(b, b"from") {
795 return Some(Token::From);
796 }
797 if eq_ci(b, b"drop") {
798 return Some(Token::Drop);
799 }
800 if eq_ci(b, b"null") {
801 return Some(Token::Null);
802 }
803 if eq_ci(b, b"true") {
804 return Some(Token::True);
805 }
806 if eq_ci(b, b"into") {
807 return Some(Token::Into);
808 }
809 if eq_ci(b, b"like") {
810 return Some(Token::Like);
811 }
812 if eq_ci(b, b"join") {
813 return Some(Token::Join);
814 }
815 if eq_ci(b, b"left") {
816 return Some(Token::Left);
817 }
818 if eq_ci(b, b"show") {
819 return Some(Token::Show);
820 }
821 if eq_ci(b, b"desc") {
822 return Some(Token::Desc);
823 }
824 None
825}
826
827#[inline]
828fn kw_len5(b: &[u8]) -> Option<Token> {
829 if eq_ci(b, b"false") {
832 return Some(Token::False);
833 }
834 if eq_ci(b, b"where") {
835 return Some(Token::Where);
836 }
837 if eq_ci(b, b"table") {
838 return Some(Token::Table);
839 }
840 if eq_ci(b, b"index") {
841 return Some(Token::Index);
842 }
843 if eq_ci(b, b"begin") {
844 return Some(Token::Begin);
845 }
846 if eq_ci(b, b"order") {
847 return Some(Token::Order);
848 }
849 if eq_ci(b, b"limit") {
850 return Some(Token::Limit);
851 }
852 if eq_ci(b, b"group") {
853 return Some(Token::Group);
854 }
855 if eq_ci(b, b"union") {
856 return Some(Token::Union);
857 }
858 if eq_ci(b, b"inner") {
859 return Some(Token::Inner);
860 }
861 if eq_ci(b, b"cross") {
862 return Some(Token::Cross);
863 }
864 if eq_ci(b, b"outer") {
865 return Some(Token::Outer);
866 }
867 None
868}
869
870#[inline]
871fn kw_len6(b: &[u8]) -> Option<Token> {
872 if eq_ci(b, b"select") {
874 return Some(Token::Select);
875 }
876 if eq_ci(b, b"tables") {
877 return Some(Token::Tables);
878 }
879 if eq_ci(b, b"except") {
880 return Some(Token::Except);
881 }
882 if eq_ci(b, b"create") {
883 return Some(Token::Create);
884 }
885 if eq_ci(b, b"insert") {
886 return Some(Token::Insert);
887 }
888 if eq_ci(b, b"values") {
889 return Some(Token::Values);
890 }
891 if eq_ci(b, b"commit") {
892 return Some(Token::Commit);
893 }
894 if eq_ci(b, b"having") {
895 return Some(Token::Having);
896 }
897 if eq_ci(b, b"offset") {
898 return Some(Token::Offset);
899 }
900 None
901}
902
903#[inline]
904fn kw_len7(b: &[u8]) -> Option<Token> {
905 if eq_ci(b, b"between") {
907 return Some(Token::Between);
908 }
909 if eq_ci(b, b"default") {
910 return Some(Token::Default);
911 }
912 if eq_ci(b, b"release") {
913 return Some(Token::Release);
914 }
915 if eq_ci(b, b"extract") {
916 return Some(Token::Extract);
917 }
918 None
919}
920
921#[inline]
922fn kw_len8(b: &[u8]) -> Option<Token> {
923 if eq_ci(b, b"rollback") {
925 return Some(Token::Rollback);
926 }
927 if eq_ci(b, b"distinct") {
928 return Some(Token::Distinct);
929 }
930 if eq_ci(b, b"interval") {
931 return Some(Token::Interval);
932 }
933 None
934}
935
936#[inline]
937fn kw_len9(b: &[u8]) -> Option<Token> {
938 if eq_ci(b, b"savepoint") {
940 return Some(Token::Savepoint);
941 }
942 None
943}
944
945#[inline]
946fn kw_len10(b: &[u8]) -> Option<Token> {
947 if eq_ci(b, b"connection") {
949 return Some(Token::Connection);
950 }
951 None
952}
953
954#[inline]
955fn kw_len11(b: &[u8]) -> Option<Token> {
956 if eq_ci(b, b"publication") {
958 return Some(Token::Publication);
959 }
960 None
961}
962
963#[inline]
964fn kw_len12(b: &[u8]) -> Option<Token> {
965 if eq_ci(b, b"subscription") {
967 return Some(Token::Subscription);
968 }
969 None
970}
971
972fn lex_quoted(
979 input: &str,
980 start: usize,
981 quote: u8,
982 is_ident: bool,
983) -> Result<(Token, usize), LexError> {
984 let bytes = input.as_bytes();
985 let mut i = start + 1;
986 let mut s = String::new();
987 loop {
988 if i >= bytes.len() {
989 return Err(LexError {
990 kind: if is_ident {
991 LexErrorKind::UnterminatedQuotedIdent
992 } else {
993 LexErrorKind::UnterminatedString
994 },
995 pos: start,
996 });
997 }
998 if bytes[i] == quote {
999 if peek_eq(bytes, i + 1, quote) {
1000 s.push(quote as char);
1001 i += 2;
1002 } else {
1003 i += 1;
1004 break;
1005 }
1006 } else {
1007 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1008 s.push(ch);
1009 i += ch.len_utf8();
1010 }
1011 }
1012 let tok = if is_ident {
1013 Token::QuotedIdent(s)
1014 } else {
1015 Token::String(s)
1016 };
1017 Ok((tok, i - start))
1018}
1019
1020fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1036 let bytes = input.as_bytes();
1037 debug_assert_eq!(bytes[start], b'\'');
1038 let mut i = start + 1;
1039 let mut s = String::new();
1040 loop {
1041 if i >= bytes.len() {
1042 return Err(LexError {
1043 kind: LexErrorKind::UnterminatedString,
1044 pos: start,
1045 });
1046 }
1047 let b = bytes[i];
1048 if b == b'\'' {
1049 if peek_eq(bytes, i + 1, b'\'') {
1050 s.push('\'');
1051 i += 2;
1052 continue;
1053 }
1054 i += 1;
1055 break;
1056 }
1057 if b == b'\\' && i + 1 < bytes.len() {
1058 let n = bytes[i + 1];
1059 match n {
1060 b'\\' => {
1061 s.push('\\');
1062 i += 2;
1063 }
1064 b'\'' => {
1065 s.push('\'');
1066 i += 2;
1067 }
1068 b'"' => {
1069 s.push('"');
1070 i += 2;
1071 }
1072 b'n' => {
1073 s.push('\n');
1074 i += 2;
1075 }
1076 b'r' => {
1077 s.push('\r');
1078 i += 2;
1079 }
1080 b't' => {
1081 s.push('\t');
1082 i += 2;
1083 }
1084 b'b' => {
1085 s.push('\u{0008}');
1086 i += 2;
1087 }
1088 b'f' => {
1089 s.push('\u{000C}');
1090 i += 2;
1091 }
1092 b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1093 s.push('\0');
1094 i += 2;
1095 }
1096 b'x' => {
1097 let h1 = bytes.get(i + 2).copied();
1099 let h2 = bytes.get(i + 3).copied();
1100 let n1 = h1.and_then(hex_digit_value);
1101 let n2 = h2.and_then(hex_digit_value);
1102 match (n1, n2) {
1103 (Some(a), Some(b2)) => {
1104 s.push((((a << 4) | b2) as u8) as char);
1105 i += 4;
1106 }
1107 (Some(a), _) => {
1108 s.push((a as u8) as char);
1109 i += 3;
1110 }
1111 _ => {
1112 s.push('x');
1114 i += 2;
1115 }
1116 }
1117 }
1118 d if d.is_ascii_digit() && d < b'8' => {
1119 let mut value: u32 = u32::from(d - b'0');
1121 let mut take = 2;
1122 while take < 4 {
1123 let next = bytes.get(i + take).copied();
1124 match next {
1125 Some(c) if c.is_ascii_digit() && c < b'8' => {
1126 value = (value << 3) | u32::from(c - b'0');
1127 take += 1;
1128 }
1129 _ => break,
1130 }
1131 }
1132 if let Some(c) = char::from_u32(value) {
1133 s.push(c);
1134 } else {
1135 s.push((value & 0xFF) as u8 as char);
1137 }
1138 i += take;
1139 }
1140 other => {
1141 s.push(other as char);
1145 i += 2;
1146 }
1147 }
1148 } else {
1149 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1150 s.push(ch);
1151 i += ch.len_utf8();
1152 }
1153 }
1154 Ok((Token::String(s), i - start))
1155}
1156
/// Returns the numeric value (`0..=15`) of an ASCII hexadecimal digit in
/// either case, or `None` when `b` is not a hex digit.
fn hex_digit_value(b: u8) -> Option<u32> {
    char::from(b).to_digit(16)
}
1165
1166fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1167 let bytes = s.as_bytes();
1168 let mut i = 0usize;
1169 let mut is_float = false;
1170
1171 while i < bytes.len() && bytes[i].is_ascii_digit() {
1172 i += 1;
1173 }
1174 if i < bytes.len() && bytes[i] == b'.' {
1175 is_float = true;
1176 i += 1;
1177 while i < bytes.len() && bytes[i].is_ascii_digit() {
1178 i += 1;
1179 }
1180 }
1181 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1182 is_float = true;
1183 i += 1;
1184 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1185 i += 1;
1186 }
1187 let exp_start = i;
1188 while i < bytes.len() && bytes[i].is_ascii_digit() {
1189 i += 1;
1190 }
1191 if exp_start == i {
1192 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1193 }
1194 }
1195
1196 let lit = &s[..i];
1197 if is_float {
1198 lit.parse::<f64>()
1199 .map(|v| (Token::Float(v), i))
1200 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1201 } else {
1202 lit.parse::<i64>()
1203 .map(|v| (Token::Integer(v), i))
1204 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1205 }
1206}
1207
// Unit tests for the lexer. Each test feeds a small SQL fragment through
// `tokenize` and compares the resulting token list (or error) exactly.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    // Lexes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    // Even empty input produces the trailing `Eof` marker.
    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    // Spaces, tabs and newlines produce no tokens.
    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex("  \t\n  "), vec![Token::Eof]);
    }

    // Keywords match regardless of ASCII case.
    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    // Unquoted identifiers are stored ASCII-lowercased; `_` and digits are
    // allowed after the first character.
    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    // Double-quoted identifiers keep their case; `""` inside stands for `"`.
    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    // Plain digits lex as `Integer`; a `.` or an exponent makes a `Float`,
    // including the leading-dot form `.5`.
    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    // The lexer never folds a sign into a number: `-42` is two tokens.
    #[test]
    fn negative_number_is_minus_then_integer() {
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    // `''` inside a string literal stands for a single quote.
    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    // Both `<>` and `!=` map to `NotEq`.
    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    // Single-byte punctuation tokens.
    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    // `-- ...` runs to the end of the line and produces no tokens.
    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    // `/* ... */` produces no tokens.
    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    // The error position points at the opening quote.
    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    // A block comment without `*/` is an error, not silently skipped input.
    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    // A control character (BEL) starts no token.
    #[test]
    fn unknown_char_errors() {
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    // `@` not followed by session-variable characters is the `At` token.
    #[test]
    fn at_alone_lexes_as_punctuation() {
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    // A `.` not followed by a digit is `Dot`, not the start of a number.
    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    // `[` and `]` have their own token kinds.
    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    // `<->` is one token; the incomplete `<-` falls back to `Lt` + `Minus`.
    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `ORDER`, `BY` and `LIMIT` are keywords, not identifiers.
    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    // `<#>` is one token.
    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `<=>` is one token and must not shadow the two-byte `<=`.
    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `::` is one token; the type name after it is an ordinary identifier.
    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    // A single `:` is accepted and produces `Colon`.
    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    // `:=` is one token.
    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert!(matches!(toks[1], Token::ColonEq));
    }

    // In an `E'...'` literal `\\` decodes to one backslash, so the `x` that
    // follows is plain text rather than the start of a hex escape.
    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    // `\n`, `\t`, `\'` and `\\` inside an `E'...'` literal.
    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]);
    }

    // `\xHH` takes at most two hex digits: `\x41` is `A`, and the `B` after
    // it is literal text.
    #[test]
    fn pg_escape_string_hex_byte() {
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    // The prefix may be a lowercase `e`.
    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    // The doubled-quote escape still works inside an `E'...'` literal.
    #[test]
    fn pg_escape_string_doubled_quote() {
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}