1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A single lexical token produced by [`tokenize`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords. The lexer matches these case-insensitively.
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    /// Unquoted identifier, stored ASCII-lowercased.
    Ident(String),
    /// Identifier written between double quotes or backticks; case is kept
    /// and a doubled delimiter stands for one literal delimiter.
    QuotedIdent(String),
    /// `@name` / `@@name` reference; the stored text includes the leading `@`s.
    SessionVar(String),

    /// Integer literal that fits in `i64`.
    Integer(i64),
    /// Literal containing a fractional part and/or an exponent.
    Float(f64),
    /// Decoded body of a `'…'`, `E'…'`, `$$…$$` or `$tag$…$tag$` string.
    String(String),

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `<<`
    InetContainedBy,
    /// `<<=`
    InetContainedByEq,
    /// `>>`
    InetContains,
    /// `>>=`
    InetContainsEq,
    /// `&&`
    InetOverlap,

    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.`
    Dot,
    /// A lone `@` that is not followed by a variable name, `@` or `>`.
    At,
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@` when not followed by a letter or `_`.
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// A single `:` not followed by `:` or `=`.
    Colon,
    /// `||`
    Concat,
    // More keywords (also matched case-insensitively).
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,
    /// `$N` positional parameter; the lexer only accepts `1..=65535`.
    Placeholder(u16),

    // Further keywords.
    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; `tokenize` always appends exactly one.
    Eof,
}
197
/// The reason lexing stopped.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start (or continue) any token at this point.
    UnknownChar(char),
    /// A quoted or dollar-quoted string reached end of input before its
    /// closing delimiter.
    UnterminatedString,
    /// A quoted identifier reached end of input before its closing delimiter.
    UnterminatedQuotedIdent,
    /// A `/* … */` comment was never closed.
    UnterminatedBlockComment,
    /// A numeric literal or `$N` placeholder that could not be converted;
    /// carries the offending text.
    BadNumber(String),
}
206
/// A lexing failure together with where it happened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// What went wrong.
    pub kind: LexErrorKind,
    /// Byte offset into the lexed input associated with the error
    /// (generally where the offending token begins).
    pub pos: usize,
}
212
213impl fmt::Display for LexError {
214 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
215 match &self.kind {
216 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
217 LexErrorKind::UnterminatedString => {
218 write!(f, "unterminated string literal at byte {}", self.pos)
219 }
220 LexErrorKind::UnterminatedQuotedIdent => {
221 write!(f, "unterminated quoted identifier at byte {}", self.pos)
222 }
223 LexErrorKind::UnterminatedBlockComment => {
224 write!(f, "unterminated /* */ comment at byte {}", self.pos)
225 }
226 LexErrorKind::BadNumber(s) => {
227 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
228 }
229 }
230 }
231}
232
233#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
236 let bytes = input.as_bytes();
237 let mut i = 0usize;
238 let mut out = Vec::new();
239
240 while i < bytes.len() {
241 let b = bytes[i];
242 match b {
243 b' ' | b'\t' | b'\n' | b'\r' => {
244 i += 1;
245 }
246 b'-' if peek_eq(bytes, i + 1, b'-') => {
247 i += 2;
248 while i < bytes.len() && bytes[i] != b'\n' {
249 i += 1;
250 }
251 }
252 b'/' if peek_eq(bytes, i + 1, b'*') => {
253 let start = i;
254 if peek_eq(bytes, i + 2, b'!') {
264 let mut j = i + 3;
265 while j < bytes.len() && bytes[j].is_ascii_digit() {
268 j += 1;
269 }
270 if j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
271 j += 1;
272 }
273 i = j;
274 continue;
275 }
276 i += 2;
277 let mut closed = false;
278 while i + 1 < bytes.len() {
279 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
280 i += 2;
281 closed = true;
282 break;
283 }
284 i += 1;
285 }
286 if !closed {
287 return Err(LexError {
288 kind: LexErrorKind::UnterminatedBlockComment,
289 pos: start,
290 });
291 }
292 }
293 b'*' if peek_eq(bytes, i + 1, b'/') => {
298 i += 2;
299 }
300 b'\'' => {
301 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
302 out.push(tok);
303 i += consumed;
304 }
305 b'E' | b'e' if peek_eq(bytes, i + 1, b'\'') => {
313 let (tok, consumed) = lex_escape_string(input, i + 1)?;
314 out.push(tok);
315 i += 1 + consumed;
316 }
317 b'"' => {
318 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
319 out.push(tok);
320 i += consumed;
321 }
322 b'`' => {
326 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
327 out.push(tok);
328 i += consumed;
329 }
330 b if b.is_ascii_alphabetic() || b == b'_' => {
331 let start = i;
332 i += 1;
333 while i < bytes.len() {
334 let c = bytes[i];
335 if c.is_ascii_alphanumeric() || c == b'_' {
336 i += 1;
337 } else {
338 break;
339 }
340 }
341 let raw = &input[start..i];
342 out.push(keyword_or_ident_raw(raw));
346 }
347 b if b.is_ascii_digit() => {
348 let (tok, consumed) =
349 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
350 out.push(tok);
351 i += consumed;
352 }
353 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
354 let (tok, consumed) =
355 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
356 out.push(tok);
357 i += consumed;
358 }
359 b'+' => single(&mut out, Token::Plus, &mut i),
360 b'-' => {
361 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
364 out.push(Token::JsonGetText);
365 i += 3;
366 } else if peek_eq(bytes, i + 1, b'>') {
367 out.push(Token::JsonGet);
368 i += 2;
369 } else {
370 single(&mut out, Token::Minus, &mut i);
371 }
372 }
373 b'#' => {
375 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
376 out.push(Token::JsonGetPathText);
377 i += 3;
378 } else if peek_eq(bytes, i + 1, b'>') {
379 out.push(Token::JsonGetPath);
380 i += 2;
381 } else {
382 return Err(LexError {
383 kind: LexErrorKind::UnknownChar('#'),
384 pos: i,
385 });
386 }
387 }
388 b'@' => {
397 if peek_eq(bytes, i + 1, b'>') {
398 out.push(Token::JsonContains);
399 i += 2;
400 } else if peek_eq(bytes, i + 1, b'@')
401 && !is_session_var_ident_start(bytes.get(i + 2).copied())
402 {
403 out.push(Token::TsMatch);
406 i += 2;
407 } else {
408 let prefix_end = if peek_eq(bytes, i + 1, b'@') {
413 i + 2
414 } else {
415 i + 1
416 };
417 let mut end = prefix_end;
418 while end < bytes.len() && is_session_var_ident_continue(bytes[end]) {
419 end += 1;
420 }
421 if end == prefix_end {
422 out.push(Token::At);
431 i = prefix_end;
432 continue;
433 }
434 out.push(Token::SessionVar(input[i..end].to_string()));
435 i = end;
436 }
437 }
438 b'*' => single(&mut out, Token::Star, &mut i),
439 b'/' => single(&mut out, Token::Slash, &mut i),
440 b'(' => single(&mut out, Token::LParen, &mut i),
441 b')' => single(&mut out, Token::RParen, &mut i),
442 b'[' => single(&mut out, Token::LBracket, &mut i),
443 b']' => single(&mut out, Token::RBracket, &mut i),
444 b',' => single(&mut out, Token::Comma, &mut i),
445 b';' => single(&mut out, Token::Semicolon, &mut i),
446 b'.' => single(&mut out, Token::Dot, &mut i),
447 b'=' => single(&mut out, Token::Eq, &mut i),
448 b'<' => {
449 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
450 out.push(Token::CosineDistance);
451 i += 3;
452 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
453 out.push(Token::InnerProduct);
454 i += 3;
455 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
456 out.push(Token::L2Distance);
457 i += 3;
458 } else if peek_eq(bytes, i + 1, b'<') && peek_eq(bytes, i + 2, b'=') {
459 out.push(Token::InetContainedByEq);
461 i += 3;
462 } else if peek_eq(bytes, i + 1, b'<') {
463 out.push(Token::InetContainedBy);
465 i += 2;
466 } else if peek_eq(bytes, i + 1, b'=') {
467 out.push(Token::LtEq);
468 i += 2;
469 } else if peek_eq(bytes, i + 1, b'>') {
470 out.push(Token::NotEq);
471 i += 2;
472 } else {
473 out.push(Token::Lt);
474 i += 1;
475 }
476 }
477 b':' if peek_eq(bytes, i + 1, b':') => {
478 out.push(Token::DoubleColon);
479 i += 2;
480 }
481 b':' if peek_eq(bytes, i + 1, b'=') => {
482 out.push(Token::ColonEq);
484 i += 2;
485 }
486 b':' => {
487 out.push(Token::Colon);
491 i += 1;
492 }
493 b'|' if peek_eq(bytes, i + 1, b'|') => {
494 out.push(Token::Concat);
495 i += 2;
496 }
497 b'>' => {
498 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'=') {
499 out.push(Token::InetContainsEq);
501 i += 3;
502 } else if peek_eq(bytes, i + 1, b'>') {
503 out.push(Token::InetContains);
505 i += 2;
506 } else if peek_eq(bytes, i + 1, b'=') {
507 out.push(Token::GtEq);
508 i += 2;
509 } else {
510 out.push(Token::Gt);
511 i += 1;
512 }
513 }
514 b'&' if peek_eq(bytes, i + 1, b'&') => {
515 out.push(Token::InetOverlap);
517 i += 2;
518 }
519 b'!' if peek_eq(bytes, i + 1, b'=') => {
520 out.push(Token::NotEq);
521 i += 2;
522 }
523 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
531 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
533 let body = match end {
534 Some(e) => &input[i + 2..e],
535 None => {
536 return Err(LexError {
537 kind: LexErrorKind::UnterminatedString,
538 pos: i,
539 });
540 }
541 };
542 out.push(Token::String(body.to_string()));
543 i = end.unwrap() + 2;
544 }
545 b'$' if i + 1 < bytes.len()
546 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
547 {
548 let mut j = i + 1;
551 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
552 j += 1;
553 }
554 if j >= bytes.len() || bytes[j] != b'$' {
555 let ch = input[i..].chars().next().unwrap_or('?');
558 return Err(LexError {
559 kind: LexErrorKind::UnknownChar(ch),
560 pos: i,
561 });
562 }
563 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
564 let end = find_dollar_tag_end(bytes, j + 1, &close);
565 let body = match end {
566 Some(e) => &input[j + 1..e],
567 None => {
568 return Err(LexError {
569 kind: LexErrorKind::UnterminatedString,
570 pos: i,
571 });
572 }
573 };
574 out.push(Token::String(body.to_string()));
575 i = end.unwrap() + close.len();
576 }
577 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
581 let mut j = i + 1;
582 let mut n: u32 = 0;
583 while j < bytes.len() && bytes[j].is_ascii_digit() {
584 n = n
585 .saturating_mul(10)
586 .saturating_add(u32::from(bytes[j] - b'0'));
587 j += 1;
588 }
589 if n == 0 || n > u32::from(u16::MAX) {
590 return Err(LexError {
591 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
592 pos: i,
593 });
594 }
595 #[allow(clippy::cast_possible_truncation)]
596 out.push(Token::Placeholder(n as u16));
597 i = j;
598 }
599 _ => {
600 let ch = input[i..].chars().next().unwrap_or('?');
601 return Err(LexError {
602 kind: LexErrorKind::UnknownChar(ch),
603 pos: i,
604 });
605 }
606 }
607 }
608 out.push(Token::Eof);
609 Ok(out)
610}
611
/// Returns `true` when `bytes` has an element at index `i` and it equals
/// `target`; an out-of-range index simply yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
615
/// Whether `b` is a byte that may begin a session-variable name: an ASCII
/// letter or `_`. `None` (end of input) is never a valid start.
fn is_session_var_ident_start(b: Option<u8>) -> bool {
    match b {
        Some(b'_') => true,
        Some(byte) => byte.is_ascii_alphabetic(),
        None => false,
    }
}
623
/// Whether `b` may appear after the first character of a session-variable
/// name: ASCII letters and digits plus `_`, `.` and `$`.
fn is_session_var_ident_continue(b: u8) -> bool {
    matches!(
        b,
        b'_' | b'.' | b'$' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z'
    )
}
631
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`
/// and returns the index where that occurrence starts.
///
/// Returns `None` when `tag` is empty, when `from` lies past the end of
/// `bytes`, or when no occurrence exists.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    if tag.is_empty() {
        // `windows(0)` would panic, and an empty delimiter is meaningless.
        return None;
    }
    let tail = bytes.get(from..)?;
    tail.windows(tag.len())
        .position(|candidate| candidate == tag)
        .map(|offset| from + offset)
}
647
/// Applies `pred` to the byte at index `i`, or returns `false` when `i` is
/// out of range.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
651
652fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
653 out.push(tok);
654 *i += 1;
655}
656
657fn keyword_or_ident_raw(raw: &str) -> Token {
667 let b = raw.as_bytes();
668 let tok = match b.len() {
669 2 => kw_len2(b),
670 3 => kw_len3(b),
671 4 => kw_len4(b),
672 5 => kw_len5(b),
673 6 => kw_len6(b),
674 7 => kw_len7(b),
675 8 => kw_len8(b),
676 9 => kw_len9(b),
677 10 => kw_len10(b),
678 11 => kw_len11(b),
679 12 => kw_len12(b),
680 _ => None,
681 };
682 match tok {
683 Some(t) => t,
684 None => Token::Ident(raw.to_ascii_lowercase()),
686 }
687}
688
/// Compares `input` with `lower` ignoring the ASCII case of `input` only.
///
/// `lower` is expected to be lowercase already: its bytes are compared
/// as-is, so an uppercase byte in `lower` never matches.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
706
707#[inline]
708fn kw_len2(b: &[u8]) -> Option<Token> {
709 if eq_ci(b, b"as") {
711 return Some(Token::As);
712 }
713 if eq_ci(b, b"by") {
714 return Some(Token::By);
715 }
716 if eq_ci(b, b"in") {
717 return Some(Token::In);
718 }
719 if eq_ci(b, b"is") {
720 return Some(Token::Is);
721 }
722 if eq_ci(b, b"on") {
723 return Some(Token::On);
724 }
725 if eq_ci(b, b"or") {
726 return Some(Token::Or);
727 }
728 if eq_ci(b, b"to") {
729 return Some(Token::To);
730 }
731 None
732}
733
734#[inline]
735fn kw_len3(b: &[u8]) -> Option<Token> {
736 if eq_ci(b, b"for") {
738 return Some(Token::For);
739 }
740 if eq_ci(b, b"all") {
741 return Some(Token::All);
742 }
743 if eq_ci(b, b"and") {
744 return Some(Token::And);
745 }
746 if eq_ci(b, b"asc") {
747 return Some(Token::Asc);
748 }
749 if eq_ci(b, b"not") {
750 return Some(Token::Not);
751 }
752 None
753}
754
755#[inline]
756fn kw_len4(b: &[u8]) -> Option<Token> {
757 if eq_ci(b, b"from") {
759 return Some(Token::From);
760 }
761 if eq_ci(b, b"drop") {
762 return Some(Token::Drop);
763 }
764 if eq_ci(b, b"null") {
765 return Some(Token::Null);
766 }
767 if eq_ci(b, b"true") {
768 return Some(Token::True);
769 }
770 if eq_ci(b, b"into") {
771 return Some(Token::Into);
772 }
773 if eq_ci(b, b"like") {
774 return Some(Token::Like);
775 }
776 if eq_ci(b, b"join") {
777 return Some(Token::Join);
778 }
779 if eq_ci(b, b"left") {
780 return Some(Token::Left);
781 }
782 if eq_ci(b, b"show") {
783 return Some(Token::Show);
784 }
785 if eq_ci(b, b"desc") {
786 return Some(Token::Desc);
787 }
788 None
789}
790
791#[inline]
792fn kw_len5(b: &[u8]) -> Option<Token> {
793 if eq_ci(b, b"false") {
796 return Some(Token::False);
797 }
798 if eq_ci(b, b"where") {
799 return Some(Token::Where);
800 }
801 if eq_ci(b, b"table") {
802 return Some(Token::Table);
803 }
804 if eq_ci(b, b"index") {
805 return Some(Token::Index);
806 }
807 if eq_ci(b, b"begin") {
808 return Some(Token::Begin);
809 }
810 if eq_ci(b, b"order") {
811 return Some(Token::Order);
812 }
813 if eq_ci(b, b"limit") {
814 return Some(Token::Limit);
815 }
816 if eq_ci(b, b"group") {
817 return Some(Token::Group);
818 }
819 if eq_ci(b, b"union") {
820 return Some(Token::Union);
821 }
822 if eq_ci(b, b"inner") {
823 return Some(Token::Inner);
824 }
825 if eq_ci(b, b"cross") {
826 return Some(Token::Cross);
827 }
828 if eq_ci(b, b"outer") {
829 return Some(Token::Outer);
830 }
831 None
832}
833
834#[inline]
835fn kw_len6(b: &[u8]) -> Option<Token> {
836 if eq_ci(b, b"select") {
838 return Some(Token::Select);
839 }
840 if eq_ci(b, b"tables") {
841 return Some(Token::Tables);
842 }
843 if eq_ci(b, b"except") {
844 return Some(Token::Except);
845 }
846 if eq_ci(b, b"create") {
847 return Some(Token::Create);
848 }
849 if eq_ci(b, b"insert") {
850 return Some(Token::Insert);
851 }
852 if eq_ci(b, b"values") {
853 return Some(Token::Values);
854 }
855 if eq_ci(b, b"commit") {
856 return Some(Token::Commit);
857 }
858 if eq_ci(b, b"having") {
859 return Some(Token::Having);
860 }
861 if eq_ci(b, b"offset") {
862 return Some(Token::Offset);
863 }
864 None
865}
866
867#[inline]
868fn kw_len7(b: &[u8]) -> Option<Token> {
869 if eq_ci(b, b"between") {
871 return Some(Token::Between);
872 }
873 if eq_ci(b, b"default") {
874 return Some(Token::Default);
875 }
876 if eq_ci(b, b"release") {
877 return Some(Token::Release);
878 }
879 if eq_ci(b, b"extract") {
880 return Some(Token::Extract);
881 }
882 None
883}
884
885#[inline]
886fn kw_len8(b: &[u8]) -> Option<Token> {
887 if eq_ci(b, b"rollback") {
889 return Some(Token::Rollback);
890 }
891 if eq_ci(b, b"distinct") {
892 return Some(Token::Distinct);
893 }
894 if eq_ci(b, b"interval") {
895 return Some(Token::Interval);
896 }
897 None
898}
899
900#[inline]
901fn kw_len9(b: &[u8]) -> Option<Token> {
902 if eq_ci(b, b"savepoint") {
904 return Some(Token::Savepoint);
905 }
906 None
907}
908
909#[inline]
910fn kw_len10(b: &[u8]) -> Option<Token> {
911 if eq_ci(b, b"connection") {
913 return Some(Token::Connection);
914 }
915 None
916}
917
918#[inline]
919fn kw_len11(b: &[u8]) -> Option<Token> {
920 if eq_ci(b, b"publication") {
922 return Some(Token::Publication);
923 }
924 None
925}
926
927#[inline]
928fn kw_len12(b: &[u8]) -> Option<Token> {
929 if eq_ci(b, b"subscription") {
931 return Some(Token::Subscription);
932 }
933 None
934}
935
936fn lex_quoted(
943 input: &str,
944 start: usize,
945 quote: u8,
946 is_ident: bool,
947) -> Result<(Token, usize), LexError> {
948 let bytes = input.as_bytes();
949 let mut i = start + 1;
950 let mut s = String::new();
951 loop {
952 if i >= bytes.len() {
953 return Err(LexError {
954 kind: if is_ident {
955 LexErrorKind::UnterminatedQuotedIdent
956 } else {
957 LexErrorKind::UnterminatedString
958 },
959 pos: start,
960 });
961 }
962 if bytes[i] == quote {
963 if peek_eq(bytes, i + 1, quote) {
964 s.push(quote as char);
965 i += 2;
966 } else {
967 i += 1;
968 break;
969 }
970 } else {
971 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
972 s.push(ch);
973 i += ch.len_utf8();
974 }
975 }
976 let tok = if is_ident {
977 Token::QuotedIdent(s)
978 } else {
979 Token::String(s)
980 };
981 Ok((tok, i - start))
982}
983
984fn lex_escape_string(input: &str, start: usize) -> Result<(Token, usize), LexError> {
1000 let bytes = input.as_bytes();
1001 debug_assert_eq!(bytes[start], b'\'');
1002 let mut i = start + 1;
1003 let mut s = String::new();
1004 loop {
1005 if i >= bytes.len() {
1006 return Err(LexError {
1007 kind: LexErrorKind::UnterminatedString,
1008 pos: start,
1009 });
1010 }
1011 let b = bytes[i];
1012 if b == b'\'' {
1013 if peek_eq(bytes, i + 1, b'\'') {
1014 s.push('\'');
1015 i += 2;
1016 continue;
1017 }
1018 i += 1;
1019 break;
1020 }
1021 if b == b'\\' && i + 1 < bytes.len() {
1022 let n = bytes[i + 1];
1023 match n {
1024 b'\\' => {
1025 s.push('\\');
1026 i += 2;
1027 }
1028 b'\'' => {
1029 s.push('\'');
1030 i += 2;
1031 }
1032 b'"' => {
1033 s.push('"');
1034 i += 2;
1035 }
1036 b'n' => {
1037 s.push('\n');
1038 i += 2;
1039 }
1040 b'r' => {
1041 s.push('\r');
1042 i += 2;
1043 }
1044 b't' => {
1045 s.push('\t');
1046 i += 2;
1047 }
1048 b'b' => {
1049 s.push('\u{0008}');
1050 i += 2;
1051 }
1052 b'f' => {
1053 s.push('\u{000C}');
1054 i += 2;
1055 }
1056 b'0' if i + 2 >= bytes.len() || !bytes[i + 2].is_ascii_digit() => {
1057 s.push('\0');
1058 i += 2;
1059 }
1060 b'x' => {
1061 let h1 = bytes.get(i + 2).copied();
1063 let h2 = bytes.get(i + 3).copied();
1064 let n1 = h1.and_then(hex_digit_value);
1065 let n2 = h2.and_then(hex_digit_value);
1066 match (n1, n2) {
1067 (Some(a), Some(b2)) => {
1068 s.push((((a << 4) | b2) as u8) as char);
1069 i += 4;
1070 }
1071 (Some(a), _) => {
1072 s.push((a as u8) as char);
1073 i += 3;
1074 }
1075 _ => {
1076 s.push('x');
1078 i += 2;
1079 }
1080 }
1081 }
1082 d if d.is_ascii_digit() && d < b'8' => {
1083 let mut value: u32 = u32::from(d - b'0');
1085 let mut take = 2;
1086 while take < 4 {
1087 let next = bytes.get(i + take).copied();
1088 match next {
1089 Some(c) if c.is_ascii_digit() && c < b'8' => {
1090 value = (value << 3) | u32::from(c - b'0');
1091 take += 1;
1092 }
1093 _ => break,
1094 }
1095 }
1096 if let Some(c) = char::from_u32(value) {
1097 s.push(c);
1098 } else {
1099 s.push((value & 0xFF) as u8 as char);
1101 }
1102 i += take;
1103 }
1104 other => {
1105 s.push(other as char);
1109 i += 2;
1110 }
1111 }
1112 } else {
1113 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
1114 s.push(ch);
1115 i += ch.len_utf8();
1116 }
1117 }
1118 Ok((Token::String(s), i - start))
1119}
1120
/// Returns the numeric value (0–15) of an ASCII hexadecimal digit of either
/// case, or `None` for any other byte.
fn hex_digit_value(b: u8) -> Option<u32> {
    char::from(b).to_digit(16)
}
1129
1130fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
1131 let bytes = s.as_bytes();
1132 let mut i = 0usize;
1133 let mut is_float = false;
1134
1135 while i < bytes.len() && bytes[i].is_ascii_digit() {
1136 i += 1;
1137 }
1138 if i < bytes.len() && bytes[i] == b'.' {
1139 is_float = true;
1140 i += 1;
1141 while i < bytes.len() && bytes[i].is_ascii_digit() {
1142 i += 1;
1143 }
1144 }
1145 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
1146 is_float = true;
1147 i += 1;
1148 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
1149 i += 1;
1150 }
1151 let exp_start = i;
1152 while i < bytes.len() && bytes[i].is_ascii_digit() {
1153 i += 1;
1154 }
1155 if exp_start == i {
1156 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
1157 }
1158 }
1159
1160 let lit = &s[..i];
1161 if is_float {
1162 lit.parse::<f64>()
1163 .map(|v| (Token::Float(v), i))
1164 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1165 } else {
1166 lit.parse::<i64>()
1167 .map(|v| (Token::Integer(v), i))
1168 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
1169 }
1170}
1171
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Lexes `s`, panicking if the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    /// Lexes `s` and returns the error it is expected to produce.
    fn lex_err(s: &str) -> LexError {
        tokenize(s).unwrap_err()
    }

    /// Shorthand for an unquoted identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Shorthand for a string-literal token.
    fn text(body: &str) -> Token {
        Token::String(body.into())
    }

    #[test]
    fn empty_yields_only_eof() {
        let toks = lex("");
        assert_eq!(toks, vec![Token::Eof]);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        let toks = lex(" \t\n ");
        assert_eq!(toks, vec![Token::Eof]);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        let toks = lex("SELECT select Select");
        assert_eq!(
            toks,
            vec![Token::Select, Token::Select, Token::Select, Token::Eof]
        );
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        let toks = lex("hello WORLD _x x1");
        assert_eq!(
            toks,
            vec![
                ident("hello"),
                ident("world"),
                ident("_x"),
                ident("x1"),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        let toks = lex(r#""User Name" "a""b""#);
        assert_eq!(
            toks,
            vec![
                Token::QuotedIdent("User Name".into()),
                Token::QuotedIdent("a\"b".into()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn integer_and_float_literals() {
        let toks = lex("0 42 1.5 .5 1e10 2.5e-3");
        assert_eq!(
            toks,
            vec![
                Token::Integer(0),
                Token::Integer(42),
                Token::Float(1.5),
                Token::Float(0.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        // The sign is a separate token; the lexer never folds it in.
        let toks = lex("-42");
        assert_eq!(toks, vec![Token::Minus, Token::Integer(42), Token::Eof]);
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        let toks = lex("'hello' 'it''s'");
        assert_eq!(toks, vec![text("hello"), text("it's"), Token::Eof]);
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        let toks = lex("= <> != < <= > >= + - * /");
        assert_eq!(
            toks,
            vec![
                Token::Eq,
                Token::NotEq,
                Token::NotEq,
                Token::Lt,
                Token::LtEq,
                Token::Gt,
                Token::GtEq,
                Token::Plus,
                Token::Minus,
                Token::Star,
                Token::Slash,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn punctuation() {
        let toks = lex("( ) , ; .");
        assert_eq!(
            toks,
            vec![
                Token::LParen,
                Token::RParen,
                Token::Comma,
                Token::Semicolon,
                Token::Dot,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn line_comment_skipped() {
        let toks = lex("SELECT -- trailing junk\nFROM");
        assert_eq!(toks, vec![Token::Select, Token::From, Token::Eof]);
    }

    #[test]
    fn block_comment_skipped() {
        let toks = lex("SELECT /* skipped */ 1");
        assert_eq!(toks, vec![Token::Select, Token::Integer(1), Token::Eof]);
    }

    #[test]
    fn unterminated_string_errors() {
        let err = lex_err("'oops");
        assert_eq!(err.kind, LexErrorKind::UnterminatedString);
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = lex_err("/* never closed");
        assert_eq!(err.kind, LexErrorKind::UnterminatedBlockComment);
    }

    #[test]
    fn unknown_char_errors() {
        let err = lex_err("\x07");
        assert!(matches!(err.kind, LexErrorKind::UnknownChar(_)));
    }

    #[test]
    fn at_alone_lexes_as_punctuation() {
        // An `@` followed by a quote is neither a variable nor an operator.
        let toks = lex("'u'@'h'");
        assert_eq!(toks, vec![text("u"), Token::At, text("h"), Token::Eof]);
    }

    #[test]
    fn dot_in_qualified_column() {
        let toks = lex("t.col");
        assert_eq!(toks, vec![ident("t"), Token::Dot, ident("col"), Token::Eof]);
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        let toks = lex("[ ]");
        assert_eq!(toks, vec![Token::LBracket, Token::RBracket, Token::Eof]);
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        let full = lex("a <-> b");
        assert_eq!(
            full,
            vec![ident("a"), Token::L2Distance, ident("b"), Token::Eof]
        );
        // Without the trailing `>` the pieces lex separately.
        let partial = lex("a <- b");
        assert_eq!(
            partial,
            vec![ident("a"), Token::Lt, Token::Minus, ident("b"), Token::Eof]
        );
    }

    #[test]
    fn order_by_limit_are_keywords() {
        let toks = lex("ORDER BY LIMIT");
        assert_eq!(
            toks,
            vec![Token::Order, Token::By, Token::Limit, Token::Eof]
        );
    }

    #[test]
    fn inner_product_operator_3char() {
        let toks = lex("a <#> b");
        assert_eq!(
            toks,
            vec![ident("a"), Token::InnerProduct, ident("b"), Token::Eof]
        );
    }

    #[test]
    fn cosine_distance_operator_3char() {
        let full = lex("a <=> b");
        assert_eq!(
            full,
            vec![ident("a"), Token::CosineDistance, ident("b"), Token::Eof]
        );
        // The two-character prefix is still plain `<=`.
        let partial = lex("a <= b");
        assert_eq!(
            partial,
            vec![ident("a"), Token::LtEq, ident("b"), Token::Eof]
        );
    }

    #[test]
    fn double_colon_cast_token() {
        let toks = lex("x::INT");
        assert_eq!(
            toks,
            vec![ident("x"), Token::DoubleColon, ident("int"), Token::Eof]
        );
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert!(matches!(toks.first(), Some(Token::Colon)));
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert_eq!(toks[1], Token::ColonEq);
    }

    #[test]
    fn pg_escape_string_double_backslash_decodes_to_single() {
        let toks = tokenize(r"E'\\xdeadbeef'").expect("E-string lexes");
        assert_eq!(toks, vec![text(r"\xdeadbeef"), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_supports_basic_escapes() {
        let toks = tokenize(r"E'a\nb\tc\'d\\e'").expect("E-string lexes");
        assert_eq!(toks, vec![text("a\nb\tc'd\\e"), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_hex_byte() {
        let toks = tokenize(r"E'\x41B\x42'").expect("E-string lexes");
        assert_eq!(toks, vec![text("ABB"), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_lowercase_e_prefix() {
        let toks = tokenize(r"e'hi\n'").expect("e-string lexes");
        assert_eq!(toks, vec![text("hi\n"), Token::Eof]);
    }

    #[test]
    fn pg_escape_string_doubled_quote() {
        let toks = tokenize(r"E'it''s ok'").expect("E-string lexes");
        assert_eq!(toks, vec![text("it's ok"), Token::Eof]);
    }
}