1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`].
///
/// Keywords are recognised case-insensitively. Any other bare word becomes
/// [`Token::Ident`] with its text ASCII-lowercased.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords.
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    /// Bare (unquoted) word that is not a keyword; stored ASCII-lowercased.
    Ident(String),
    /// Identifier written as `"…"` or `` `…` ``; case is kept and a doubled
    /// quote character stands for one literal quote.
    QuotedIdent(String),
    /// `@name` or `@@name`; the stored text includes the leading `@` sigil(s).
    SessionVar(String),

    /// Digits-only literal that parses as `i64`.
    Integer(i64),
    /// Numeric literal containing a `.` and/or an exponent.
    Float(f64),
    /// Body of a `'…'`, `E'…'`, `$$…$$` or `$tag$…$tag$` literal, already
    /// unescaped.
    String(String),

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<<`
    InetContainedBy,
    /// `<<=`
    InetContainedByEq,
    /// `>>`
    InetContains,
    /// `>>=`
    InetContainsEq,
    /// `&&`
    InetOverlap,

    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.` when not followed by a digit (a leading-dot number lexes as `Float`).
    Dot,
    /// A lone `@` that is followed by neither `>` nor a session-variable name.
    At,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when not immediately followed by a letter or `_`.
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:` on its own.
    Colon,
    /// `||`
    Concat,
    /// `|` on its own.
    Pipe,
    /// `&` on its own.
    Amp,
    /// `~`
    Tilde,
    // More keywords.
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; `N` is in `1..=65535`.
    Placeholder(u16),

    // More keywords.
    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; `tokenize` appends exactly one as the last token.
    Eof,
}
203
/// The reason [`tokenize`] rejected its input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start a token where it was found.
    UnknownChar(char),
    /// A `'…'`, `E'…'` or dollar-quoted string with no closing delimiter.
    UnterminatedString,
    /// A `"…"` or `` `…` `` identifier with no closing quote.
    UnterminatedQuotedIdent,
    /// A `/* …` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A numeric literal or `$N` placeholder that is malformed or out of
    /// range; carries the offending source text.
    BadNumber(String),
}
212
/// A lexing failure: what went wrong plus where in the input it happened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset into the input that the error refers to (for the
    /// "unterminated" kinds this is where the construct opened).
    pub pos: usize,
}
218
219impl fmt::Display for LexError {
220 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
221 match &self.kind {
222 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
223 LexErrorKind::UnterminatedString => {
224 write!(f, "unterminated string literal at byte {}", self.pos)
225 }
226 LexErrorKind::UnterminatedQuotedIdent => {
227 write!(f, "unterminated quoted identifier at byte {}", self.pos)
228 }
229 LexErrorKind::UnterminatedBlockComment => {
230 write!(f, "unterminated /* */ comment at byte {}", self.pos)
231 }
232 LexErrorKind::BadNumber(s) => {
233 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
234 }
235 }
236 }
237}
238
239#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
242 let bytes = input.as_bytes();
243 let mut i = 0usize;
244 let mut out = Vec::new();
245
246 while i < bytes.len() {
247 let b = bytes[i];
248 match b {
249 b' ' | b'\t' | b'\n' | b'\r' => {
250 i += 1;
251 }
252 b'-' if peek_eq(bytes, i + 1, b'-') => {
253 i += 2;
254 while i < bytes.len() && bytes[i] != b'\n' {
255 i += 1;
256 }
257 }
258 b'/' if peek_eq(bytes, i + 1, b'*') => {
259 let start = i;
260 if peek_eq(bytes, i + 2, b'!') {
270 let mut j = i + 3;
271 while j < bytes.len() && bytes[j].is_ascii_digit() {
274 j += 1;
275 }
276 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
277 j += 1;
278 }
279 i = j;
280 continue;
281 }
282 i += 2;
283 let mut closed = false;
284 while i + 1 < bytes.len() {
285 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
286 i += 2;
287 closed = true;
288 break;
289 }
290 i += 1;
291 }
292 if !closed {
293 return Err(LexError {
294 kind: LexErrorKind::UnterminatedBlockComment,
295 pos: start,
296 });
297 }
298 }
299 b'*' if peek_eq(bytes, i + 1, b'/') => {
304 i += 2;
305 }
306 b'\'' => {
307 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
308 out.push(tok);
309 i += consumed;
310 }
311 b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
319 let (tok, consumed) = lex_escape_string(input, i + 1)?;
320 out.push(tok);
321 i += 1 + consumed;
322 }
323 b'"' => {
324 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
325 out.push(tok);
326 i += consumed;
327 }
328 b'`' => {
332 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
333 out.push(tok);
334 i += consumed;
335 }
336 b if b.is_ascii_alphabetic() || b == b'_' => {
337 let start = i;
338 i += 1;
339 while i < bytes.len() {
340 let c = bytes[i];
341 if c.is_ascii_alphanumeric() || c == b'_' {
342 i += 1;
343 } else {
344 break;
345 }
346 }
347 let raw = &input[start..i];
348 out.push(keyword_or_ident_raw(raw));
352 }
353 b if b.is_ascii_digit() => {
354 let (tok, consumed) =
355 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
356 out.push(tok);
357 i += consumed;
358 }
359 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
360 let (tok, consumed) =
361 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
362 out.push(tok);
363 i += consumed;
364 }
365 b'+' => single(&mut out, Token::Plus, &mut i),
366 b'-' => {
367 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
370 out.push(Token::JsonGetText);
371 i += 3;
372 } else if peek_eq(bytes, i + 1, b'>') {
373 out.push(Token::JsonGet);
374 i += 2;
375 } else {
376 single(&mut out, Token::Minus, &mut i);
377 }
378 }
379 b'#' => {
381 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
382 out.push(Token::JsonGetPathText);
383 i += 3;
384 } else if peek_eq(bytes, i + 1, b'>') {
385 out.push(Token::JsonGetPath);
386 i += 2;
387 } else {
388 return Err(LexError {
389 kind: LexErrorKind::UnknownChar('#'),
390 pos: i,
391 });
392 }
393 }
394 b'@' => {
403 if peek_eq(bytes, i + 1, b'>') {
404 out.push(Token::JsonContains);
405 i += 2;
406 } else if peek_eq(bytes, i + 1, b'@')
407 && !is_session_var_ident_start(bytes.get(i + 2).copied())
408 {
409 out.push(Token::TsMatch);
412 i += 2;
413 } else {
414 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
419 i + 2
420 } else {
421 i + 1
422 };
423 let mut end = prefix_end;
424 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
425 end += 1;
426 }
427 if end == prefix_end {
428 out.push(Token::At);
437 i = prefix_end;
438 continue;
439 }
440 out.push(Token::SessionVar(input[i..end].to_string()));
441 i = end;
442 }
443 }
444 b'*' => single(&mut out, Token::Star, &mut i),
445 b'/' => single(&mut out, Token::Slash, &mut i),
446 b'(' => single(&mut out, Token::LParen, &mut i),
447 b')' => single(&mut out, Token::RParen, &mut i),
448 b'[' => single(&mut out, Token::LBracket, &mut i),
449 b']' => single(&mut out, Token::RBracket, &mut i),
450 b',' => single(&mut out, Token::Comma, &mut i),
451 b';' => single(&mut out, Token::Semicolon, &mut i),
452 b'.' => single(&mut out, Token::Dot, &mut i),
453 b'=' => single(&mut out, Token::Eq, &mut i),
454 b'<' => {
455 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
456 out.push(Token::CosineDistance);
457 i += 3;
458 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
459 out.push(Token::InnerProduct);
460 i += 3;
461 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
462 out.push(Token::L2Distance);
463 i += 3;
464 } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
465 out.push(Token::InetContainedByEq);
467 i += 3;
468 } else if peek_eq(bytes, i + 1, b'<') {
469 out.push(Token::InetContainedBy);
471 i += 2;
472 } else if peek_eq(bytes, i + 1, b'=') {
473 out.push(Token::LtEq);
474 i += 2;
475 } else if peek_eq(bytes, i + 1, b'>') {
476 out.push(Token::NotEq);
477 i += 2;
478 } else {
479 out.push(Token::Lt);
480 i += 1;
481 }
482 }
483 b':' if peek_eq(bytes, i + 1, b':') => {
484 out.push(Token::DoubleColon);
485 i += 2;
486 }
487 b':' if peek_eq(bytes, i + 1, b'=') => {
488 out.push(Token::ColonEq);
490 i += 2;
491 }
492 b':' => {
493 out.push(Token::Colon);
497 i += 1;
498 }
499 b'|' if peek_eq(bytes, i + 1, b'|') => {
500 out.push(Token::Concat);
501 i += 2;
502 }
503 b'|' => {
506 single(&mut out, Token::Pipe, &mut i);
507 }
508 b'~' => {
509 single(&mut out, Token::Tilde, &mut i);
510 }
511 b'>' => {
512 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
513 out.push(Token::InetContainsEq);
515 i += 3;
516 } else if peek_eq(bytes, i + 1, b'>') {
517 out.push(Token::InetContains);
519 i += 2;
520 } else if peek_eq(bytes, i + 1, b'=') {
521 out.push(Token::GtEq);
522 i += 2;
523 } else {
524 out.push(Token::Gt);
525 i += 1;
526 }
527 }
528 b'&' if peek_eq(bytes, i + 1, b'&') => {
529 out.push(Token::InetOverlap);
531 i += 2;
532 }
533 b'&' => {
534 single(&mut out, Token::Amp, &mut i);
535 }
536 b'!' if peek_eq(bytes, i + 1, b'=') => {
537 out.push(Token::NotEq);
538 i += 2;
539 }
540 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
548 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
550 let body = match end {
551 Some(e) => &input[i + 2..e],
552 None => {
553 return Err(LexError {
554 kind: LexErrorKind::UnterminatedString,
555 pos: i,
556 });
557 }
558 };
559 out.push(Token::String(body.to_string()));
560 i = end.unwrap() + 2;
561 }
562 b'$' if i + 1 < bytes.len()
563 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
564 {
565 let mut j = i + 1;
568 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
569 j += 1;
570 }
571 if j >= bytes.len() || bytes[j] != b'$' {
572 let ch = input[i..].chars().next().unwrap_or('?');
575 return Err(LexError {
576 kind: LexErrorKind::UnknownChar(ch),
577 pos: i,
578 });
579 }
580 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
581 let end = find_dollar_tag_end(bytes, j + 1, &close);
582 let body = match end {
583 Some(e) => &input[j + 1..e],
584 None => {
585 return Err(LexError {
586 kind: LexErrorKind::UnterminatedString,
587 pos: i,
588 });
589 }
590 };
591 out.push(Token::String(body.to_string()));
592 i = end.unwrap() + close.len();
593 }
594 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
598 let mut j = i + 1;
599 let mut n: u32 = 0;
600 while j < bytes.len() && bytes[j].is_ascii_digit() {
601 n = n
602 .saturating_mul(10)
603 .saturating_add(u32::from(bytes[j] - b'0'));
604 j += 1;
605 }
606 if n == 0 || n > u32::from(u16::MAX) {
607 return Err(LexError {
608 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
609 pos: i,
610 });
611 }
612 #[allow(clippy::cast_possible_truncation)]
613 out.push(Token::Placeholder(n as u16));
614 i = j;
615 }
616 _ => {
617 let ch = input[i..].chars().next().unwrap_or('?');
618 return Err(LexError {
619 kind: LexErrorKind::UnknownChar(ch),
620 pos: i,
621 });
622 }
623 }
624 }
625 out.push(Token::Eof);
626 Ok(out)
627}
628
/// Reports whether `bytes` has a byte at index `i` and that byte is `target`.
///
/// An out-of-range index yields `false` rather than a panic.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
632
/// Returns `true` when `b` holds a byte that may begin a session-variable
/// name: an ASCII letter or `_`. `None` (end of input) is never a start.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(c) => c.is_ascii_alphabetic(),
        None => false,
    }
}
640
/// Returns `true` when `b` may appear inside a session-variable name: an
/// ASCII letter or digit, `_`, `.` or `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
648
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`.
///
/// Returns the index where the match starts, or `None` when `tag` is empty,
/// `from` lies past the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so an empty tag is rejected up front.
    if tag.is_empty() {
        return None;
    }
    bytes
        .get(from..)?
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
664
/// Applies `pred` to the byte at index `i`, returning `false` when `i` is
/// out of range.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
668
669fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
670 out.push(tok);
671 *i += 1;
672}
673
674fn keyword_or_ident_raw(raw: &str) -> Token {
684 let b = raw.as_bytes();
685 let tok = match b.len() {
686 2 => kw_len2(b),
687 3 => kw_len3(b),
688 4 => kw_len4(b),
689 5 => kw_len5(b),
690 6 => kw_len6(b),
691 7 => kw_len7(b),
692 8 => kw_len8(b),
693 9 => kw_len9(b),
694 10 => kw_len10(b),
695 11 => kw_len11(b),
696 12 => kw_len12(b),
697 _ => None,
698 };
699 match tok {
700 Some(t) => t,
701 None => Token::Ident(raw.to_ascii_lowercase()),
703 }
704}
705
/// Compares `input` with `lower` ignoring ASCII case on the `input` side only.
///
/// `lower` is compared as-is, so it must already be lowercase for a match to
/// be possible. Slices of different length never match.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
723
724#[inline]
725fn kw_len2(b: &[u8]) -> Option<Token> {
726 if eq_ci(b, b"as") {
728 return Some(Token::As);
729 }
730 if eq_ci(b, b"by") {
731 return Some(Token::By);
732 }
733 if eq_ci(b, b"in") {
734 return Some(Token::In);
735 }
736 if eq_ci(b, b"is") {
737 return Some(Token::Is);
738 }
739 if eq_ci(b, b"on") {
740 return Some(Token::On);
741 }
742 if eq_ci(b, b"or") {
743 return Some(Token::Or);
744 }
745 if eq_ci(b, b"to") {
746 return Some(Token::To);
747 }
748 None
749}
750
751#[inline]
752fn kw_len3(b: &[u8]) -> Option<Token> {
753 if eq_ci(b, b"for") {
755 return Some(Token::For);
756 }
757 if eq_ci(b, b"all") {
758 return Some(Token::All);
759 }
760 if eq_ci(b, b"and") {
761 return Some(Token::And);
762 }
763 if eq_ci(b, b"asc") {
764 return Some(Token::Asc);
765 }
766 if eq_ci(b, b"not") {
767 return Some(Token::Not);
768 }
769 None
770}
771
772#[inline]
773fn kw_len4(b: &[u8]) -> Option<Token> {
774 if eq_ci(b, b"from") {
776 return Some(Token::From);
777 }
778 if eq_ci(b, b"drop") {
779 return Some(Token::Drop);
780 }
781 if eq_ci(b, b"null") {
782 return Some(Token::Null);
783 }
784 if eq_ci(b, b"true") {
785 return Some(Token::True);
786 }
787 if eq_ci(b, b"into") {
788 return Some(Token::Into);
789 }
790 if eq_ci(b, b"like") {
791 return Some(Token::Like);
792 }
793 if eq_ci(b, b"join") {
794 return Some(Token::Join);
795 }
796 if eq_ci(b, b"left") {
797 return Some(Token::Left);
798 }
799 if eq_ci(b, b"show") {
800 return Some(Token::Show);
801 }
802 if eq_ci(b, b"desc") {
803 return Some(Token::Desc);
804 }
805 None
806}
807
808#[inline]
809fn kw_len5(b: &[u8]) -> Option<Token> {
810 if eq_ci(b, b"false") {
813 return Some(Token::False);
814 }
815 if eq_ci(b, b"where") {
816 return Some(Token::Where);
817 }
818 if eq_ci(b, b"table") {
819 return Some(Token::Table);
820 }
821 if eq_ci(b, b"index") {
822 return Some(Token::Index);
823 }
824 if eq_ci(b, b"begin") {
825 return Some(Token::Begin);
826 }
827 if eq_ci(b, b"order") {
828 return Some(Token::Order);
829 }
830 if eq_ci(b, b"limit") {
831 return Some(Token::Limit);
832 }
833 if eq_ci(b, b"group") {
834 return Some(Token::Group);
835 }
836 if eq_ci(b, b"union") {
837 return Some(Token::Union);
838 }
839 if eq_ci(b, b"inner") {
840 return Some(Token::Inner);
841 }
842 if eq_ci(b, b"cross") {
843 return Some(Token::Cross);
844 }
845 if eq_ci(b, b"outer") {
846 return Some(Token::Outer);
847 }
848 None
849}
850
851#[inline]
852fn kw_len6(b: &[u8]) -> Option<Token> {
853 if eq_ci(b, b"select") {
855 return Some(Token::Select);
856 }
857 if eq_ci(b, b"tables") {
858 return Some(Token::Tables);
859 }
860 if eq_ci(b, b"except") {
861 return Some(Token::Except);
862 }
863 if eq_ci(b, b"create") {
864 return Some(Token::Create);
865 }
866 if eq_ci(b, b"insert") {
867 return Some(Token::Insert);
868 }
869 if eq_ci(b, b"values") {
870 return Some(Token::Values);
871 }
872 if eq_ci(b, b"commit") {
873 return Some(Token::Commit);
874 }
875 if eq_ci(b, b"having") {
876 return Some(Token::Having);
877 }
878 if eq_ci(b, b"offset") {
879 return Some(Token::Offset);
880 }
881 None
882}
883
884#[inline]
885fn kw_len7(b: &[u8]) -> Option<Token> {
886 if eq_ci(b, b"between") {
888 return Some(Token::Between);
889 }
890 if eq_ci(b, b"default") {
891 return Some(Token::Default);
892 }
893 if eq_ci(b, b"release") {
894 return Some(Token::Release);
895 }
896 if eq_ci(b, b"extract") {
897 return Some(Token::Extract);
898 }
899 None
900}
901
902#[inline]
903fn kw_len8(b: &[u8]) -> Option<Token> {
904 if eq_ci(b, b"rollback") {
906 return Some(Token::Rollback);
907 }
908 if eq_ci(b, b"distinct") {
909 return Some(Token::Distinct);
910 }
911 if eq_ci(b, b"interval") {
912 return Some(Token::Interval);
913 }
914 None
915}
916
917#[inline]
918fn kw_len9(b: &[u8]) -> Option<Token> {
919 if eq_ci(b, b"savepoint") {
921 return Some(Token::Savepoint);
922 }
923 None
924}
925
926#[inline]
927fn kw_len10(b: &[u8]) -> Option<Token> {
928 if eq_ci(b, b"connection") {
930 return Some(Token::Connection);
931 }
932 None
933}
934
935#[inline]
936fn kw_len11(b: &[u8]) -> Option<Token> {
937 if eq_ci(b, b"publication") {
939 return Some(Token::Publication);
940 }
941 None
942}
943
944#[inline]
945fn kw_len12(b: &[u8]) -> Option<Token> {
946 if eq_ci(b, b"subscription") {
948 return Some(Token::Subscription);
949 }
950 None
951}
952
953fn lex_quoted(
960 input: &str,
961 start: usize,
962 quote: u8,
963 is_ident: bool,
964) -> Result<(Token, usize), LexError> {
965 let bytes = input.as_bytes();
966 let mut i = start + 1;
967 let mut s = String::new();
968 loop {
969 if i >= bytes.len() {
970 return Err(LexError {
971 kind: if is_ident {
972 LexErrorKind::UnterminatedQuotedIdent
973 } else {
974 LexErrorKind::UnterminatedString
975 },
976 pos: start,
977 });
978 }
979 if bytes[i] == quote {
980 if peek_eq(bytes, i + 1, quote) {
981 s.push(quote as char);
982 i += 2;
983 } else {
984 i += 1;
985 break;
986 }
987 } else {
988 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
989 s.push(ch);
990 i += ch.len_utf8();
991 }
992 }
993 let tok = if is_ident {
994 Token::QuotedIdent(s)
995 } else {
996 Token::String(s)
997 };
998 Ok((tok, i - start))
999}
1000
1001fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1017 let bytes = input.as_bytes();
1018 debug_assert_eq!(bytes[start], b'\'');
1019 let mut i = start + 1;
1020 let mut s = String::new();
1021 loop {
1022 if i >= bytes.len() {
1023 return Err(LexError {
1024 kind: LexErrorKind::UnterminatedString,
1025 pos: start,
1026 });
1027 }
1028 let b = bytes[i];
1029 if b == b'\'' {
1030 if peek_eq(bytes, i + 1, b'\'') {
1031 s.push('\'');
1032 i += 2;
1033 continue;
1034 }
1035 i += 1;
1036 break;
1037 }
1038 if b == b'\\' && i + 1 < bytes.len() {
1039 let n = bytes[i + 1];
1040 match n {
1041 b'\\' => {
1042 s.push('\\');
1043 i += 2;
1044 }
1045 b'\'' => {
1046 s.push('\'');
1047 i += 2;
1048 }
1049 b'"' => {
1050 s.push('"');
1051 i += 2;
1052 }
1053 b'n' => {
1054 s.push('\n');
1055 i += 2;
1056 }
1057 b'r' => {
1058 s.push('\r');
1059 i += 2;
1060 }
1061 b't' => {
1062 s.push('\t');
1063 i += 2;
1064 }
1065 b'b' => {
1066 s.push('\u{0008}');
1067 i += 2;
1068 }
1069 b'f' => {
1070 s.push('\u{000C}');
1071 i += 2;
1072 }
1073 b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1074 s.push('\0');
1075 i += 2;
1076 }
1077 b'x' => {
1078 let h1 = bytes.get(i + 2).copied();
1080 let h2 = bytes.get(i + 3).copied();
1081 let n1 = h1.and_then(hex_digit_value);
1082 let n2 = h2.and_then(hex_digit_value);
1083 match (n1, n2) {
1084 (Some(a), Some(b2)) => {
1085 s.push((((a << 4) | b2) as u8) as char);
1086 i += 4;
1087 }
1088 (Some(a), _) => {
1089 s.push((a as u8) as char);
1090 i += 3;
1091 }
1092 _ => {
1093 s.push('x');
1095 i += 2;
1096 }
1097 }
1098 }
1099 d if d.is_ascii_digit() && d < b'8' => {
1100 let mut value: u32 = u32::from(d - b'0');
1102 let mut take = 2;
1103 while take < 4 {
1104 let next = bytes.get(i + take).copied();
1105 match next {
1106 Some(c) if c.is_ascii_digit() && c < b'8' => {
1107 value = (value << 3) | u32::from(c - b'0');
1108 take += 1;
1109 }
1110 _ => break,
1111 }
1112 }
1113 if let Some(c) = char::from_u32(value) {
1114 s.push(c);
1115 } else {
1116 s.push((value & 0xFF) as u8 as char);
1118 }
1119 i += take;
1120 }
1121 other => {
1122 s.push(other as char);
1126 i += 2;
1127 }
1128 }
1129 } else {
1130 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1131 s.push(ch);
1132 i += ch.len_utf8();
1133 }
1134 }
1135 Ok((Token::String(s), i - start))
1136}
1137
/// Returns the numeric value (0–15) of an ASCII hex digit in either case, or
/// `None` when `b` is not a hex digit.
fn hex_digit_value(b: u8) -> Option<u32> {
    char::from(b).to_digit(16)
}
1146
1147fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1148 let bytes = s.as_bytes();
1149 let mut i = 0usize;
1150 let mut is_float = false;
1151
1152 while i < bytes.len() && bytes[i].is_ascii_digit() {
1153 i += 1;
1154 }
1155 if i < bytes.len() && bytes[i] == b'.' {
1156 is_float = true;
1157 i += 1;
1158 while i < bytes.len() && bytes[i].is_ascii_digit() {
1159 i += 1;
1160 }
1161 }
1162 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1163 is_float = true;
1164 i += 1;
1165 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1166 i += 1;
1167 }
1168 let exp_start = i;
1169 while i < bytes.len() && bytes[i].is_ascii_digit() {
1170 i += 1;
1171 }
1172 if exp_start == i {
1173 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1174 }
1175 }
1176
1177 let lit = &s[..i];
1178 if is_float {
1179 lit.parse::<f64>()
1180 .map(|v| (Token::Float(v), i))
1181 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1182 } else {
1183 lit.parse::<i64>()
1184 .map(|v| (Token::Integer(v), i))
1185 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1186 }
1187}
1188
/// Unit tests that pin the token stream `tokenize` produces for
/// representative inputs, plus the error kinds for malformed input.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    // The stream always ends with `Eof`, even for empty input.
    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex(" \t\n "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    // Non-keyword words are folded to lowercase.
    #[test]
    fn identifiers_lowercase_ascii() {
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    // `""` inside a quoted identifier is one literal `"`.
    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    // A `.` or an exponent makes the literal a float; `.5` needs no leading digit.
    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    // The lexer never folds a sign into the number.
    #[test]
    fn negative_number_is_minus_then_integer() {
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    // `''` inside a string literal is one literal `'`.
    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    // `<>` and `!=` both lex as `NotEq`.
    #[test]
    fn all_comparison_and_arithmetic_operators() {
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    // `--` runs to the end of the line; lexing resumes after the newline.
    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    // The error position is the opening quote.
    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    // BEL (0x07) starts no token.
    #[test]
    fn unknown_char_errors() {
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    // `@` followed by something that is not a variable name is its own token.
    #[test]
    fn at_alone_lexes_as_punctuation() {
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    // A `.` between identifiers is `Dot`, not the start of a number.
    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    // `<->` is one token; without the trailing `>` it falls back to `<` then `-`.
    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `<=>` is one token and must not shadow plain `<=`.
    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    // `INT` is not a keyword here, so it comes back as a lowercased identifier.
    #[test]
    fn double_colon_cast_token() {
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert!(matches!(toks[1], Token::ColonEq));
    }

    // `\\` decodes to one backslash; the following `x…` is then plain text,
    // not a hex escape.
    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]);
    }

    // `\x41` takes two hex digits, so the `B` after it is literal.
    #[test]
    fn pg_escape_string_hex_byte() {
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    // `''` still means a literal quote inside an escape string.
    #[test]
    fn pg_escape_string_doubled_quote() {
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}