1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`].
///
/// Keywords are recognised ASCII case-insensitively; any other bare word
/// becomes [`Token::Ident`]. The stream returned by [`tokenize`] always ends
/// with [`Token::Eof`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords (matched case-insensitively by the `kw_len*` helpers) ----
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    /// Bare identifier; the text is stored ASCII-lowercased.
    Ident(String),
    /// Identifier written as `"..."` or `` `...` ``; case is preserved and a
    /// doubled delimiter inside the quotes yields one literal delimiter.
    QuotedIdent(String),
    /// `@name` or `@@name`; the stored text includes the leading `@` / `@@`.
    /// Name characters are ASCII alphanumerics, `_`, `.` and `$`.
    SessionVar(String),

    /// Digit-only numeric literal that parses as an `i64`.
    Integer(i64),
    /// Numeric literal containing a `.` and/or an exponent part.
    Float(f64),
    /// String literal body: `'...'`, `E'...'` / `e'...'` (backslash escapes
    /// decoded), `$$...$$` or `$tag$...$tag$`.
    String(String),

    // ---- Arithmetic and comparison operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<<`
    InetContainedBy,
    /// `<<=`
    InetContainedByEq,
    /// `>>`
    InetContains,
    /// `>>=`
    InetContainsEq,
    /// `&&`
    InetOverlap,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.` (when not followed by a digit; `.5` lexes as a number instead)
    Dot,
    /// A lone `@` that starts neither `@>`, `@@` nor a session variable.
    At,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when not immediately followed by an ASCII letter or `_`
    /// (otherwise it is the prefix of a [`Token::SessionVar`]).
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:` (when not part of `::` or `:=`)
    Colon,
    /// `||`
    Concat,

    // ---- More keywords ----
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional placeholder; `N` must be in `1..=65535`.
    Placeholder(u16),

    // ---- More keywords ----
    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; [`tokenize`] appends exactly one as the last token.
    Eof,
}
197
/// The category of failure reported by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that does not begin any token.
    UnknownChar(char),
    /// A string literal whose closing delimiter was never found.
    UnterminatedString,
    /// A quoted identifier whose closing delimiter was never found.
    UnterminatedQuotedIdent,
    /// A `/*` comment with no matching `*/`.
    UnterminatedBlockComment,
    /// A numeric literal (or placeholder number) that could not be accepted;
    /// carries the offending text.
    BadNumber(String),
}

/// A lexing failure together with the byte offset it was reported at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset into the lexed input.
    pub pos: usize,
}

impl fmt::Display for LexError {
    /// Renders a one-line, human-readable message ending in the byte offset.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &self.kind {
            LexErrorKind::BadNumber(s) => {
                write!(f, "invalid number literal {s:?} at byte {}", self.pos)
            }
            LexErrorKind::UnterminatedBlockComment => {
                write!(f, "unterminated /* */ comment at byte {}", self.pos)
            }
            LexErrorKind::UnterminatedQuotedIdent => {
                write!(f, "unterminated quoted identifier at byte {}", self.pos)
            }
            LexErrorKind::UnterminatedString => {
                write!(f, "unterminated string literal at byte {}", self.pos)
            }
            LexErrorKind::UnknownChar(c) => {
                write!(f, "unknown char {c:?} at byte {}", self.pos)
            }
        }
    }
}
232
233#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
236 let bytes = input.as_bytes();
237 let mut i = 0usize;
238 let mut out = Vec::new();
239
240 while i < bytes.len() {
241 let b = bytes[i];
242 match b {
243 b' ' | b'\t' | b'\n' | b'\r' => {
244 i += 1;
245 }
246 b'-' if peek_eq(bytes, i + 1, b'-') => {
247 i += 2;
248 while i < bytes.len() && bytes[i] != b'\n' {
249 i += 1;
250 }
251 }
252 b'/' if peek_eq(bytes, i + 1, b'*') => {
253 let start = i;
254 if peek_eq(bytes, i + 2, b'!') {
264 let mut j = i + 3;
265 while j < bytes.len() && bytes[j].is_ascii_digit() {
268 j += 1;
269 }
270 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
271 j += 1;
272 }
273 i = j;
274 continue;
275 }
276 i += 2;
277 let mut closed = false;
278 while i + 1 < bytes.len() {
279 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
280 i += 2;
281 closed = true;
282 break;
283 }
284 i += 1;
285 }
286 if !closed {
287 return Err(LexError {
288 kind: LexErrorKind::UnterminatedBlockComment,
289 pos: start,
290 });
291 }
292 }
293 b'*' if peek_eq(bytes, i + 1, b'/') => {
298 i += 2;
299 }
300 b'\'' => {
301 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
302 out.push(tok);
303 i += consumed;
304 }
305 b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
313 let (tok, consumed) = lex_escape_string(input, i + 1)?;
314 out.push(tok);
315 i += 1 + consumed;
316 }
317 b'"' => {
318 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
319 out.push(tok);
320 i += consumed;
321 }
322 b'`' => {
326 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
327 out.push(tok);
328 i += consumed;
329 }
330 b if b.is_ascii_alphabetic() || b == b'_' => {
331 let start = i;
332 i += 1;
333 while i < bytes.len() {
334 let c = bytes[i];
335 if c.is_ascii_alphanumeric() || c == b'_' {
336 i += 1;
337 } else {
338 break;
339 }
340 }
341 let raw = &input[start..i];
342 out.push(keyword_or_ident_raw(raw));
346 }
347 b if b.is_ascii_digit() => {
348 let (tok, consumed) =
349 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
350 out.push(tok);
351 i += consumed;
352 }
353 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
354 let (tok, consumed) =
355 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
356 out.push(tok);
357 i += consumed;
358 }
359 b'+' => single(&mut out, Token::Plus, &mut i),
360 b'-' => {
361 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
364 out.push(Token::JsonGetText);
365 i += 3;
366 } else if peek_eq(bytes, i + 1, b'>') {
367 out.push(Token::JsonGet);
368 i += 2;
369 } else {
370 single(&mut out, Token::Minus, &mut i);
371 }
372 }
373 b'#' => {
375 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
376 out.push(Token::JsonGetPathText);
377 i += 3;
378 } else if peek_eq(bytes, i + 1, b'>') {
379 out.push(Token::JsonGetPath);
380 i += 2;
381 } else {
382 return Err(LexError {
383 kind: LexErrorKind::UnknownChar('#'),
384 pos: i,
385 });
386 }
387 }
388 b'@' => {
397 if peek_eq(bytes, i + 1, b'>') {
398 out.push(Token::JsonContains);
399 i += 2;
400 } else if peek_eq(bytes, i + 1, b'@')
401 && !is_session_var_ident_start(bytes.get(i + 2).copied())
402 {
403 out.push(Token::TsMatch);
406 i += 2;
407 } else {
408 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
413 i + 2
414 } else {
415 i + 1
416 };
417 let mut end = prefix_end;
418 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
419 end += 1;
420 }
421 if end == prefix_end {
422 out.push(Token::At);
431 i = prefix_end;
432 continue;
433 }
434 out.push(Token::SessionVar(input[i..end].to_string()));
435 i = end;
436 }
437 }
438 b'*' => single(&mut out, Token::Star, &mut i),
439 b'/' => single(&mut out, Token::Slash, &mut i),
440 b'(' => single(&mut out, Token::LParen, &mut i),
441 b')' => single(&mut out, Token::RParen, &mut i),
442 b'[' => single(&mut out, Token::LBracket, &mut i),
443 b']' => single(&mut out, Token::RBracket, &mut i),
444 b',' => single(&mut out, Token::Comma, &mut i),
445 b';' => single(&mut out, Token::Semicolon, &mut i),
446 b'.' => single(&mut out, Token::Dot, &mut i),
447 b'=' => single(&mut out, Token::Eq, &mut i),
448 b'<' => {
449 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
450 out.push(Token::CosineDistance);
451 i += 3;
452 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
453 out.push(Token::InnerProduct);
454 i += 3;
455 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
456 out.push(Token::L2Distance);
457 i += 3;
458 } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
459 out.push(Token::InetContainedByEq);
461 i += 3;
462 } else if peek_eq(bytes, i + 1, b'<') {
463 out.push(Token::InetContainedBy);
465 i += 2;
466 } else if peek_eq(bytes, i + 1, b'=') {
467 out.push(Token::LtEq);
468 i += 2;
469 } else if peek_eq(bytes, i + 1, b'>') {
470 out.push(Token::NotEq);
471 i += 2;
472 } else {
473 out.push(Token::Lt);
474 i += 1;
475 }
476 }
477 b':' if peek_eq(bytes, i + 1, b':') => {
478 out.push(Token::DoubleColon);
479 i += 2;
480 }
481 b':' if peek_eq(bytes, i + 1, b'=') => {
482 out.push(Token::ColonEq);
484 i += 2;
485 }
486 b':' => {
487 out.push(Token::Colon);
491 i += 1;
492 }
493 b'|' if peek_eq(bytes, i + 1, b'|') => {
494 out.push(Token::Concat);
495 i += 2;
496 }
497 b'>' => {
498 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
499 out.push(Token::InetContainsEq);
501 i += 3;
502 } else if peek_eq(bytes, i + 1, b'>') {
503 out.push(Token::InetContains);
505 i += 2;
506 } else if peek_eq(bytes, i + 1, b'=') {
507 out.push(Token::GtEq);
508 i += 2;
509 } else {
510 out.push(Token::Gt);
511 i += 1;
512 }
513 }
514 b'&' if peek_eq(bytes, i + 1, b'&') => {
515 out.push(Token::InetOverlap);
517 i += 2;
518 }
519 b'!' if peek_eq(bytes, i + 1, b'=') => {
520 out.push(Token::NotEq);
521 i += 2;
522 }
523 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
531 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
533 let body = match end {
534 Some(e) => &input[i + 2..e],
535 None => {
536 return Err(LexError {
537 kind: LexErrorKind::UnterminatedString,
538 pos: i,
539 });
540 }
541 };
542 out.push(Token::String(body.to_string()));
543 i = end.unwrap() + 2;
544 }
545 b'$' if i + 1 < bytes.len()
546 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
547 {
548 let mut j = i + 1;
551 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
552 j += 1;
553 }
554 if j >= bytes.len() || bytes[j] != b'$' {
555 let ch = input[i..].chars().next().unwrap_or('?');
558 return Err(LexError {
559 kind: LexErrorKind::UnknownChar(ch),
560 pos: i,
561 });
562 }
563 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
564 let end = find_dollar_tag_end(bytes, j + 1, &close);
565 let body = match end {
566 Some(e) => &input[j + 1..e],
567 None => {
568 return Err(LexError {
569 kind: LexErrorKind::UnterminatedString,
570 pos: i,
571 });
572 }
573 };
574 out.push(Token::String(body.to_string()));
575 i = end.unwrap() + close.len();
576 }
577 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
581 let mut j = i + 1;
582 let mut n: u32 = 0;
583 while j < bytes.len() && bytes[j].is_ascii_digit() {
584 n = n
585 .saturating_mul(10)
586 .saturating_add(u32::from(bytes[j] - b'0'));
587 j += 1;
588 }
589 if n == 0 || n > u32::from(u16::MAX) {
590 return Err(LexError {
591 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
592 pos: i,
593 });
594 }
595 #[allow(clippy::cast_possible_truncation)]
596 out.push(Token::Placeholder(n as u16));
597 i = j;
598 }
599 _ => {
600 let ch = input[i..].chars().next().unwrap_or('?');
601 return Err(LexError {
602 kind: LexErrorKind::UnknownChar(ch),
603 pos: i,
604 });
605 }
606 }
607 }
608 out.push(Token::Eof);
609 Ok(out)
610}
611
/// Returns `true` when index `i` is in range and `bytes[i]` equals `target`.
/// Out-of-range indices yield `false` rather than panicking.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    matches!(bytes.get(i), Some(&found) if found == target)
}
615
/// Returns `true` when `b` holds a byte that is an ASCII letter or `_`.
/// `None` (no byte available) yields `false`.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    b.is_some_and(|c| c == b'_' || c.is_ascii_alphabetic())
}
623
/// Returns `true` for bytes allowed inside a session-variable name:
/// ASCII letters and digits plus `_`, `.` and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b'$') || b.is_ascii_alphanumeric()
}
631
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`.
///
/// Returns the absolute start index of the match, or `None` when `tag` is
/// empty, `from` lies beyond the end of `bytes`, or no match exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty-tag case must be rejected first.
    if tag.is_empty() {
        return None;
    }
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
647
/// Returns `pred(&bytes[i])` when index `i` is in range, `false` otherwise.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
651
652fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
653 out.push(tok);
654 *i += 1;
655}
656
657fn keyword_or_ident_raw(raw: &str) -> Token {
667 let b = raw.as_bytes();
668 let tok = match b.len() {
669 2 => kw_len2(b),
670 3 => kw_len3(b),
671 4 => kw_len4(b),
672 5 => kw_len5(b),
673 6 => kw_len6(b),
674 7 => kw_len7(b),
675 8 => kw_len8(b),
676 9 => kw_len9(b),
677 10 => kw_len10(b),
678 11 => kw_len11(b),
679 12 => kw_len12(b),
680 _ => None,
681 };
682 match tok {
683 Some(t) => t,
684 None => Token::Ident(raw.to_ascii_lowercase()),
686 }
687}
688
/// ASCII case-insensitive comparison of `input` against `lower`.
///
/// Only `input` is case-folded, so `lower` must already be lowercase for a
/// match to be possible. Slices of different lengths never compare equal.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
706
707#[inline]
708fn kw_len2(b: &[u8]) -> Option<Token> {
709 if eq_ci(b, b"as") {
711 return Some(Token::As);
712 }
713 if eq_ci(b, b"by") {
714 return Some(Token::By);
715 }
716 if eq_ci(b, b"in") {
717 return Some(Token::In);
718 }
719 if eq_ci(b, b"is") {
720 return Some(Token::Is);
721 }
722 if eq_ci(b, b"on") {
723 return Some(Token::On);
724 }
725 if eq_ci(b, b"or") {
726 return Some(Token::Or);
727 }
728 if eq_ci(b, b"to") {
729 return Some(Token::To);
730 }
731 None
732}
733
734#[inline]
735fn kw_len3(b: &[u8]) -> Option<Token> {
736 if eq_ci(b, b"for") {
738 return Some(Token::For);
739 }
740 if eq_ci(b, b"all") {
741 return Some(Token::All);
742 }
743 if eq_ci(b, b"and") {
744 return Some(Token::And);
745 }
746 if eq_ci(b, b"asc") {
747 return Some(Token::Asc);
748 }
749 if eq_ci(b, b"not") {
750 return Some(Token::Not);
751 }
752 None
753}
754
755#[inline]
756fn kw_len4(b: &[u8]) -> Option<Token> {
757 if eq_ci(b, b"from") {
759 return Some(Token::From);
760 }
761 if eq_ci(b, b"drop") {
762 return Some(Token::Drop);
763 }
764 if eq_ci(b, b"null") {
765 return Some(Token::Null);
766 }
767 if eq_ci(b, b"true") {
768 return Some(Token::True);
769 }
770 if eq_ci(b, b"into") {
771 return Some(Token::Into);
772 }
773 if eq_ci(b, b"like") {
774 return Some(Token::Like);
775 }
776 if eq_ci(b, b"join") {
777 return Some(Token::Join);
778 }
779 if eq_ci(b, b"left") {
780 return Some(Token::Left);
781 }
782 if eq_ci(b, b"show") {
783 return Some(Token::Show);
784 }
785 if eq_ci(b, b"desc") {
786 return Some(Token::Desc);
787 }
788 None
789}
790
791#[inline]
792fn kw_len5(b: &[u8]) -> Option<Token> {
793 if eq_ci(b, b"false") {
796 return Some(Token::False);
797 }
798 if eq_ci(b, b"where") {
799 return Some(Token::Where);
800 }
801 if eq_ci(b, b"table") {
802 return Some(Token::Table);
803 }
804 if eq_ci(b, b"index") {
805 return Some(Token::Index);
806 }
807 if eq_ci(b, b"begin") {
808 return Some(Token::Begin);
809 }
810 if eq_ci(b, b"order") {
811 return Some(Token::Order);
812 }
813 if eq_ci(b, b"limit") {
814 return Some(Token::Limit);
815 }
816 if eq_ci(b, b"group") {
817 return Some(Token::Group);
818 }
819 if eq_ci(b, b"union") {
820 return Some(Token::Union);
821 }
822 if eq_ci(b, b"inner") {
823 return Some(Token::Inner);
824 }
825 if eq_ci(b, b"cross") {
826 return Some(Token::Cross);
827 }
828 if eq_ci(b, b"outer") {
829 return Some(Token::Outer);
830 }
831 None
832}
833
834#[inline]
835fn kw_len6(b: &[u8]) -> Option<Token> {
836 if eq_ci(b, b"select") {
838 return Some(Token::Select);
839 }
840 if eq_ci(b, b"tables") {
841 return Some(Token::Tables);
842 }
843 if eq_ci(b, b"except") {
844 return Some(Token::Except);
845 }
846 if eq_ci(b, b"create") {
847 return Some(Token::Create);
848 }
849 if eq_ci(b, b"insert") {
850 return Some(Token::Insert);
851 }
852 if eq_ci(b, b"values") {
853 return Some(Token::Values);
854 }
855 if eq_ci(b, b"commit") {
856 return Some(Token::Commit);
857 }
858 if eq_ci(b, b"having") {
859 return Some(Token::Having);
860 }
861 if eq_ci(b, b"offset") {
862 return Some(Token::Offset);
863 }
864 None
865}
866
867#[inline]
868fn kw_len7(b: &[u8]) -> Option<Token> {
869 if eq_ci(b, b"between") {
871 return Some(Token::Between);
872 }
873 if eq_ci(b, b"default") {
874 return Some(Token::Default);
875 }
876 if eq_ci(b, b"release") {
877 return Some(Token::Release);
878 }
879 if eq_ci(b, b"extract") {
880 return Some(Token::Extract);
881 }
882 None
883}
884
885#[inline]
886fn kw_len8(b: &[u8]) -> Option<Token> {
887 if eq_ci(b, b"rollback") {
889 return Some(Token::Rollback);
890 }
891 if eq_ci(b, b"distinct") {
892 return Some(Token::Distinct);
893 }
894 if eq_ci(b, b"interval") {
895 return Some(Token::Interval);
896 }
897 None
898}
899
900#[inline]
901fn kw_len9(b: &[u8]) -> Option<Token> {
902 if eq_ci(b, b"savepoint") {
904 return Some(Token::Savepoint);
905 }
906 None
907}
908
909#[inline]
910fn kw_len10(b: &[u8]) -> Option<Token> {
911 if eq_ci(b, b"connection") {
913 return Some(Token::Connection);
914 }
915 None
916}
917
918#[inline]
919fn kw_len11(b: &[u8]) -> Option<Token> {
920 if eq_ci(b, b"publication") {
922 return Some(Token::Publication);
923 }
924 None
925}
926
927#[inline]
928fn kw_len12(b: &[u8]) -> Option<Token> {
929 if eq_ci(b, b"subscription") {
931 return Some(Token::Subscription);
932 }
933 None
934}
935
936fn lex_quoted(
943 input: &str,
944 start: usize,
945 quote: u8,
946 is_ident: bool,
947) -> Result<(Token, usize), LexError> {
948 let bytes = input.as_bytes();
949 let mut i = start + 1;
950 let mut s = String::new();
951 loop {
952 if i >= bytes.len() {
953 return Err(LexError {
954 kind: if is_ident {
955 LexErrorKind::UnterminatedQuotedIdent
956 } else {
957 LexErrorKind::UnterminatedString
958 },
959 pos: start,
960 });
961 }
962 if bytes[i] == quote {
963 if peek_eq(bytes, i + 1, quote) {
964 s.push(quote as char);
965 i += 2;
966 } else {
967 i += 1;
968 break;
969 }
970 } else {
971 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
972 s.push(ch);
973 i += ch.len_utf8();
974 }
975 }
976 let tok = if is_ident {
977 Token::QuotedIdent(s)
978 } else {
979 Token::String(s)
980 };
981 Ok((tok, i - start))
982}
983
984fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1000 let bytes = input.as_bytes();
1001 debug_assert_eq!(bytes[start], b'\'');
1002 let mut i = start + 1;
1003 let mut s = String::new();
1004 loop {
1005 if i >= bytes.len() {
1006 return Err(LexError {
1007 kind: LexErrorKind::UnterminatedString,
1008 pos: start,
1009 });
1010 }
1011 let b = bytes[i];
1012 if b == b'\'' {
1013 if peek_eq(bytes, i + 1, b'\'') {
1014 s.push('\'');
1015 i += 2;
1016 continue;
1017 }
1018 i += 1;
1019 break;
1020 }
1021 if b == b'\\' && i + 1 < bytes.len() {
1022 let n = bytes[i + 1];
1023 match n {
1024 b'\\' => { s.push('\\'); i += 2; }
1025 b'\'' => { s.push('\''); i += 2; }
1026 b'"' => { s.push('"'); i += 2; }
1027 b'n' => { s.push('\n'); i += 2; }
1028 b'r' => { s.push('\r'); i += 2; }
1029 b't' => { s.push('\t'); i += 2; }
1030 b'b' => { s.push('\u{0008}'); i += 2; }
1031 b'f' => { s.push('\u{000C}'); i += 2; }
1032 b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1033 s.push('\0');
1034 i += 2;
1035 }
1036 b'x' => {
1037 let h1 = bytes.get(i + 2).copied();
1039 let h2 = bytes.get(i + 3).copied();
1040 let n1 = h1.and_then(hex_digit_value);
1041 let n2 = h2.and_then(hex_digit_value);
1042 match (n1, n2) {
1043 (Some(a), Some(b2)) => {
1044 s.push((((a << 4) | b2) as u8) as char);
1045 i += 4;
1046 }
1047 (Some(a), _) => {
1048 s.push((a as u8) as char);
1049 i += 3;
1050 }
1051 _ => {
1052 s.push('x');
1054 i += 2;
1055 }
1056 }
1057 }
1058 d if d.is_ascii_digit() && d < b'8' => {
1059 let mut value: u32 = u32::from(d - b'0');
1061 let mut take = 2;
1062 while take < 4 {
1063 let next = bytes.get(i + take).copied();
1064 match next {
1065 Some(c) if c.is_ascii_digit() && c < b'8' => {
1066 value = (value << 3) | u32::from(c - b'0');
1067 take += 1;
1068 }
1069 _ => break,
1070 }
1071 }
1072 if let Some(c) = char::from_u32(value) {
1073 s.push(c);
1074 } else {
1075 s.push((value & 0xFF) as u8 as char);
1077 }
1078 i += take;
1079 }
1080 other => {
1081 s.push(other as char);
1085 i += 2;
1086 }
1087 }
1088 } else {
1089 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1090 s.push(ch);
1091 i += ch.len_utf8();
1092 }
1093 }
1094 Ok((Token::String(s), i - start))
1095}
1096
/// Returns the numeric value (0-15) of an ASCII hexadecimal digit, accepting
/// both upper- and lower-case letters, or `None` for any other byte.
fn hex_digit_value(b: u8) -> Option<u32> {
    char::from(b).to_digit(16)
}
1105
1106fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1107 let bytes = s.as_bytes();
1108 let mut i = 0usize;
1109 let mut is_float = false;
1110
1111 while i < bytes.len() && bytes[i].is_ascii_digit() {
1112 i += 1;
1113 }
1114 if i < bytes.len() && bytes[i] == b'.' {
1115 is_float = true;
1116 i += 1;
1117 while i < bytes.len() && bytes[i].is_ascii_digit() {
1118 i += 1;
1119 }
1120 }
1121 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1122 is_float = true;
1123 i += 1;
1124 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1125 i += 1;
1126 }
1127 let exp_start = i;
1128 while i < bytes.len() && bytes[i].is_ascii_digit() {
1129 i += 1;
1130 }
1131 if exp_start == i {
1132 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1133 }
1134 }
1135
1136 let lit = &s[..i];
1137 if is_float {
1138 lit.parse::<f64>()
1139 .map(|v| (Token::Float(v), i))
1140 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1141 } else {
1142 lit.parse::<i64>()
1143 .map(|v| (Token::Integer(v), i))
1144 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1145 }
1146}
1147
// Unit tests for the lexer. Each test feeds a small input to `tokenize` and
// checks either the exact token stream or the reported error.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Tokenizes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    #[test]
    fn empty_yields_only_eof() {
        assert_eq!(lex(""), vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        assert_eq!(lex(" \t\n "), vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        assert_eq!(
            lex("SELECT select Select"),
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        // Bare identifiers are folded to ASCII lowercase.
        assert_eq!(
            lex("hello WORLD _x x1"),
            vec![
                Token::Ident("hello".into()),
                Token::Ident("world".into()),
                Token::Ident("_x".into()),
                Token::Ident("x1".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        // `""` inside a quoted identifier stands for one literal `"`.
        assert_eq!(
            lex(r#""User Name" "a""b""#),
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        assert_eq!(
            lex("0 42 1.5 .5 1e10 2.5e-3"),
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // The sign is a separate token; the lexer never produces negative literals.
        assert_eq!(
            lex("-42"),
            vec![Token::Minus, Token::Integer(42), Token::Eof]
        );
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        assert_eq!(
            lex("'hello' 'it''s'"),
            vec![
                Token::String("hello".into()),
                Token::String("it's".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        // Both `<>` and `!=` map to `NotEq`.
        assert_eq!(
            lex("= <> != < <= > >= + - * /"),
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        assert_eq!(
            lex("( ) , ; ."),
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        assert_eq!(
            lex("SELECT -- trailing junk\nFROM"),
            vec![Token::Select, Token::From, Token::Eof]
        );
    }

    #[test]
    fn block_comment_skipped() {
        assert_eq!(
            lex("SELECT /* skipped */ 1"),
            vec![Token::Select, Token::Integer(1), Token::Eof]
        );
    }

    #[test]
    fn unterminated_string_errors() {
        // The error position is the offset of the opening quote.
        let err = tokenize("'oops").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedBlockComment));
    }

    #[test]
    fn unknown_char_errors() {
        // BEL (0x07) starts no token, so it reaches the lexer's catch-all arm.
        let err = tokenize("\x07").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // `@` followed by a quote cannot start a session variable, so it is `At`.
        assert_eq!(
            lex("'u'@'h'"),
            vec![
                Token::String("u".into()),
                Token::At,
                Token::String("h".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn dot_in_qualified_column() {
        assert_eq!(
            lex("t.col"),
            vec![
                Token::Ident("t".into()),
                Token::Dot,
                Token::Ident("col".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        assert_eq!(
            lex("[ ]"),
            vec![Token::LBracket, Token::RBracket, Token::Eof]
        );
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        assert_eq!(
            lex("a <-> b"),
            vec![
                Token::Ident("a".into()),
                Token::L2Distance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // Without the trailing `>` the input falls back to `<` then `-`.
        assert_eq!(
            lex("a <- b"),
            vec![
                Token::Ident("a".into()),
                Token::Lt,
                Token::Minus,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        assert_eq!(
            lex("ORDER BY LIMIT"),
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        assert_eq!(
            lex("a <#> b"),
            vec![
                Token::Ident("a".into()),
                Token::InnerProduct,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        assert_eq!(
            lex("a <=> b"),
            vec![
                Token::Ident("a".into()),
                Token::CosineDistance,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
        // `<=` on its own must still lex as the comparison operator.
        assert_eq!(
            lex("a <= b"),
            vec![
                Token::Ident("a".into()),
                Token::LtEq,
                Token::Ident("b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        // `INT` is not a keyword, so it comes back as a lowercased identifier.
        assert_eq!(
            lex("x::INT"),
            vec![
                Token::Ident("x".into()),
                Token::DoubleColon,
                Token::Ident("int".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks[0], Token::Colon);
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert!(matches!(toks[1], Token::ColonEq));
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        // `\\` decodes to one backslash, so the following `xdeadbeef` is plain
        // text rather than a `\x` hex escape.
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String(r"\xdeadbeef".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(
            toks,
            vec![Token::String("a\nb\tc'd\\e".into()), Token::Eof]
        );
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        // `\x41` -> 'A', literal 'B', `\x42` -> 'B'.
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("ABB".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![Token::String("hi\n".into()), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        // `''` still means one quote inside an escape string.
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![Token::String("it's ok".into()), Token::Eof]);
    }
}