1use alloc::string::{String, ToString};
12use alloc::vec::Vec;
13use core::fmt;
14
/// A lexical token produced by [`tokenize`].
///
/// Keyword variants are matched case-insensitively (ASCII) and carry no
/// payload. Operator variants are documented with the exact source spelling
/// the lexer maps onto them.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords ----
    Select,
    From,
    Where,
    As,
    Null,
    True,
    False,
    And,
    Or,
    Not,
    Create,
    Table,
    Insert,
    Into,
    Values,
    Index,
    On,
    Begin,
    Commit,
    Rollback,
    Order,
    By,
    Limit,

    // ---- Identifiers and literals ----
    /// Unquoted identifier, stored ASCII-lowercased.
    Ident(String),
    /// Identifier written inside `"…"` or `` `…` ``; case is preserved and a
    /// doubled delimiter stands for one literal delimiter character.
    QuotedIdent(String),
    /// Integer literal that fits in `i64`.
    Integer(i64),
    /// Numeric literal containing a `.` and/or an exponent.
    Float(f64),
    /// Body of a `'…'` literal (with `''` collapsed to `'`) or of a
    /// dollar-quoted literal (`$$…$$`, `$tag$…$tag$`).
    String(String),

    // ---- Arithmetic and comparison operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `<>` or `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `.`
    Dot,

    // ---- Multi-character operators ----
    /// `->`
    JsonGet,
    /// `->>`
    JsonGetText,
    /// `#>`
    JsonGetPath,
    /// `#>>`
    JsonGetPathText,
    /// `@>`
    JsonContains,
    /// `@@`
    TsMatch,
    /// `<->`
    L2Distance,
    /// `<#>`
    InnerProduct,
    /// `<=>`
    CosineDistance,
    /// `::`
    DoubleColon,
    /// `:=`
    ColonEq,
    /// `:` when followed by neither `:` nor `=`.
    Colon,
    /// `||`
    Concat,

    // ---- More keywords ----
    Is,
    Between,
    In,
    Like,
    Group,
    Distinct,
    Union,
    All,
    Join,
    Inner,
    Left,
    Cross,
    Outer,
    Default,
    Savepoint,
    Release,
    To,
    Having,
    Show,
    Extract,
    Offset,
    Asc,
    Desc,
    Interval,

    /// Positional parameter `$N`; the lexer only accepts `N` in `1..=65535`.
    Placeholder(u16),

    Drop,
    For,
    Tables,
    Except,
    Publication,
    Subscription,
    Connection,

    /// End-of-input marker; [`tokenize`] always appends it as the last token.
    Eof,
}
164
/// The reason the lexer rejected its input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexErrorKind {
    /// A character that cannot start any token.
    UnknownChar(char),
    /// A `'…'` or dollar-quoted literal with no closing delimiter.
    UnterminatedString,
    /// A `"…"` or `` `…` `` identifier with no closing delimiter.
    UnterminatedQuotedIdent,
    /// A `/* … */` comment with no closing `*/`.
    UnterminatedBlockComment,
    /// A malformed or out-of-range numeric literal or `$N` placeholder; the
    /// payload is the offending source text.
    BadNumber(String),
}
173
174#[derive(Debug, Clone, PartialEq, Eq)]
175pub struct LexError {
176 pub kind: LexErrorKind,
177 pub pos: usize,
178}
179
180impl fmt::Display for LexError {
181 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182 match &self.kind {
183 LexErrorKind::UnknownChar(c) => write!(f, "unknown char {c:?} at byte {}", self.pos),
184 LexErrorKind::UnterminatedString => {
185 write!(f, "unterminated string literal at byte {}", self.pos)
186 }
187 LexErrorKind::UnterminatedQuotedIdent => {
188 write!(f, "unterminated quoted identifier at byte {}", self.pos)
189 }
190 LexErrorKind::UnterminatedBlockComment => {
191 write!(f, "unterminated /* */ comment at byte {}", self.pos)
192 }
193 LexErrorKind::BadNumber(s) => {
194 write!(f, "invalid number literal {s:?} at byte {}", self.pos)
195 }
196 }
197 }
198}
199
200#[allow(clippy::too_many_lines)] pub fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
203 let bytes = input.as_bytes();
204 let mut i = 0usize;
205 let mut out = Vec::new();
206
207 while i < bytes.len() {
208 let b = bytes[i];
209 match b {
210 b' ' | b'\t' | b'\n' | b'\r' => {
211 i += 1;
212 }
213 b'-' if peek_eq(bytes, i + 1, b'-') => {
214 i += 2;
215 while i < bytes.len() && bytes[i] != b'\n' {
216 i += 1;
217 }
218 }
219 b'/' if peek_eq(bytes, i + 1, b'*') => {
220 let start = i;
221 i += 2;
222 let mut closed = false;
223 while i + 1 < bytes.len() {
224 if bytes[i] == b'*' && bytes[i + 1] == b'/' {
225 i += 2;
226 closed = true;
227 break;
228 }
229 i += 1;
230 }
231 if !closed {
232 return Err(LexError {
233 kind: LexErrorKind::UnterminatedBlockComment,
234 pos: start,
235 });
236 }
237 }
238 b'\'' => {
239 let (tok, consumed) = lex_quoted(input, i, b'\'', false)?;
240 out.push(tok);
241 i += consumed;
242 }
243 b'"' => {
244 let (tok, consumed) = lex_quoted(input, i, b'"', true)?;
245 out.push(tok);
246 i += consumed;
247 }
248 b'`' => {
252 let (tok, consumed) = lex_quoted(input, i, b'`', true)?;
253 out.push(tok);
254 i += consumed;
255 }
256 b if b.is_ascii_alphabetic() || b == b'_' => {
257 let start = i;
258 i += 1;
259 while i < bytes.len() {
260 let c = bytes[i];
261 if c.is_ascii_alphanumeric() || c == b'_' {
262 i += 1;
263 } else {
264 break;
265 }
266 }
267 let raw = &input[start..i];
268 out.push(keyword_or_ident_raw(raw));
272 }
273 b if b.is_ascii_digit() => {
274 let (tok, consumed) =
275 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
276 out.push(tok);
277 i += consumed;
278 }
279 b'.' if peek_pred(bytes, i + 1, u8::is_ascii_digit) => {
280 let (tok, consumed) =
281 lex_number(&input[i..]).map_err(|kind| LexError { kind, pos: i })?;
282 out.push(tok);
283 i += consumed;
284 }
285 b'+' => single(&mut out, Token::Plus, &mut i),
286 b'-' => {
287 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
290 out.push(Token::JsonGetText);
291 i += 3;
292 } else if peek_eq(bytes, i + 1, b'>') {
293 out.push(Token::JsonGet);
294 i += 2;
295 } else {
296 single(&mut out, Token::Minus, &mut i);
297 }
298 }
299 b'#' => {
301 if peek_eq(bytes, i + 1, b'>') && peek_eq(bytes, i + 2, b'>') {
302 out.push(Token::JsonGetPathText);
303 i += 3;
304 } else if peek_eq(bytes, i + 1, b'>') {
305 out.push(Token::JsonGetPath);
306 i += 2;
307 } else {
308 return Err(LexError {
309 kind: LexErrorKind::UnknownChar('#'),
310 pos: i,
311 });
312 }
313 }
314 b'@' => {
317 if peek_eq(bytes, i + 1, b'>') {
318 out.push(Token::JsonContains);
319 i += 2;
320 } else if peek_eq(bytes, i + 1, b'@') {
321 out.push(Token::TsMatch);
322 i += 2;
323 } else {
324 return Err(LexError {
325 kind: LexErrorKind::UnknownChar('@'),
326 pos: i,
327 });
328 }
329 }
330 b'*' => single(&mut out, Token::Star, &mut i),
331 b'/' => single(&mut out, Token::Slash, &mut i),
332 b'(' => single(&mut out, Token::LParen, &mut i),
333 b')' => single(&mut out, Token::RParen, &mut i),
334 b'[' => single(&mut out, Token::LBracket, &mut i),
335 b']' => single(&mut out, Token::RBracket, &mut i),
336 b',' => single(&mut out, Token::Comma, &mut i),
337 b';' => single(&mut out, Token::Semicolon, &mut i),
338 b'.' => single(&mut out, Token::Dot, &mut i),
339 b'=' => single(&mut out, Token::Eq, &mut i),
340 b'<' => {
341 if peek_eq(bytes, i + 1, b'=') && peek_eq(bytes, i + 2, b'>') {
342 out.push(Token::CosineDistance);
343 i += 3;
344 } else if peek_eq(bytes, i + 1, b'#') && peek_eq(bytes, i + 2, b'>') {
345 out.push(Token::InnerProduct);
346 i += 3;
347 } else if peek_eq(bytes, i + 1, b'-') && peek_eq(bytes, i + 2, b'>') {
348 out.push(Token::L2Distance);
349 i += 3;
350 } else if peek_eq(bytes, i + 1, b'=') {
351 out.push(Token::LtEq);
352 i += 2;
353 } else if peek_eq(bytes, i + 1, b'>') {
354 out.push(Token::NotEq);
355 i += 2;
356 } else {
357 out.push(Token::Lt);
358 i += 1;
359 }
360 }
361 b':' if peek_eq(bytes, i + 1, b':') => {
362 out.push(Token::DoubleColon);
363 i += 2;
364 }
365 b':' if peek_eq(bytes, i + 1, b'=') => {
366 out.push(Token::ColonEq);
368 i += 2;
369 }
370 b':' => {
371 out.push(Token::Colon);
375 i += 1;
376 }
377 b'|' if peek_eq(bytes, i + 1, b'|') => {
378 out.push(Token::Concat);
379 i += 2;
380 }
381 b'>' => {
382 if peek_eq(bytes, i + 1, b'=') {
383 out.push(Token::GtEq);
384 i += 2;
385 } else {
386 out.push(Token::Gt);
387 i += 1;
388 }
389 }
390 b'!' if peek_eq(bytes, i + 1, b'=') => {
391 out.push(Token::NotEq);
392 i += 2;
393 }
394 b'$' if i + 1 < bytes.len() && bytes[i + 1] == b'$' => {
402 let end = find_dollar_tag_end(bytes, i + 2, b"$$");
404 let body = match end {
405 Some(e) => &input[i + 2..e],
406 None => {
407 return Err(LexError {
408 kind: LexErrorKind::UnterminatedString,
409 pos: i,
410 });
411 }
412 };
413 out.push(Token::String(body.to_string()));
414 i = end.unwrap() + 2;
415 }
416 b'$' if i + 1 < bytes.len()
417 && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_') =>
418 {
419 let mut j = i + 1;
422 while j < bytes.len() && (bytes[j].is_ascii_alphanumeric() || bytes[j] == b'_') {
423 j += 1;
424 }
425 if j >= bytes.len() || bytes[j] != b'$' {
426 let ch = input[i..].chars().next().unwrap_or('?');
429 return Err(LexError {
430 kind: LexErrorKind::UnknownChar(ch),
431 pos: i,
432 });
433 }
434 let close: alloc::vec::Vec<u8> = bytes[i..=j].to_vec();
435 let end = find_dollar_tag_end(bytes, j + 1, &close);
436 let body = match end {
437 Some(e) => &input[j + 1..e],
438 None => {
439 return Err(LexError {
440 kind: LexErrorKind::UnterminatedString,
441 pos: i,
442 });
443 }
444 };
445 out.push(Token::String(body.to_string()));
446 i = end.unwrap() + close.len();
447 }
448 b'$' if i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit() => {
452 let mut j = i + 1;
453 let mut n: u32 = 0;
454 while j < bytes.len() && bytes[j].is_ascii_digit() {
455 n = n
456 .saturating_mul(10)
457 .saturating_add(u32::from(bytes[j] - b'0'));
458 j += 1;
459 }
460 if n == 0 || n > u32::from(u16::MAX) {
461 return Err(LexError {
462 kind: LexErrorKind::BadNumber(input[i..j].to_string()),
463 pos: i,
464 });
465 }
466 #[allow(clippy::cast_possible_truncation)]
467 out.push(Token::Placeholder(n as u16));
468 i = j;
469 }
470 _ => {
471 let ch = input[i..].chars().next().unwrap_or('?');
472 return Err(LexError {
473 kind: LexErrorKind::UnknownChar(ch),
474 pos: i,
475 });
476 }
477 }
478 }
479 out.push(Token::Eof);
480 Ok(out)
481}
482
/// Reports whether `bytes[i]` exists and equals `target`; an out-of-range
/// index yields `false`.
fn peek_eq(bytes: &[u8], i: usize, target: u8) -> bool {
    match bytes.get(i) {
        Some(&found) => found == target,
        None => false,
    }
}
486
/// Finds the first occurrence of `tag` in `bytes` at or after index `from`
/// and returns the index where it starts.
///
/// Returns `None` when `tag` is empty, when `from` lies past the end of
/// `bytes`, or when `tag` does not occur.
fn find_dollar_tag_end(bytes: &[u8], from: usize, tag: &[u8]) -> Option<usize> {
    if tag.is_empty() {
        return None;
    }
    let haystack = bytes.get(from..)?;
    haystack
        .windows(tag.len())
        .position(|window| window == tag)
        .map(|offset| from + offset)
}
502
/// Reports whether `bytes[i]` exists and satisfies `pred`; an out-of-range
/// index yields `false` without calling `pred`.
fn peek_pred<F: Fn(&u8) -> bool>(bytes: &[u8], i: usize, pred: F) -> bool {
    match bytes.get(i) {
        Some(byte) => pred(byte),
        None => false,
    }
}
506
507fn single(out: &mut Vec<Token>, tok: Token, i: &mut usize) {
508 out.push(tok);
509 *i += 1;
510}
511
512fn keyword_or_ident_raw(raw: &str) -> Token {
522 let b = raw.as_bytes();
523 let tok = match b.len() {
524 2 => kw_len2(b),
525 3 => kw_len3(b),
526 4 => kw_len4(b),
527 5 => kw_len5(b),
528 6 => kw_len6(b),
529 7 => kw_len7(b),
530 8 => kw_len8(b),
531 9 => kw_len9(b),
532 10 => kw_len10(b),
533 11 => kw_len11(b),
534 12 => kw_len12(b),
535 _ => None,
536 };
537 match tok {
538 Some(t) => t,
539 None => Token::Ident(raw.to_ascii_lowercase()),
541 }
542}
543
/// Compares `input` with `lower` ignoring ASCII case in `input` only.
///
/// Only `input` is lowercased, so `lower` must already be lowercase for a
/// match to be possible.
#[inline]
fn eq_ci(input: &[u8], lower: &[u8]) -> bool {
    input.len() == lower.len()
        && input
            .iter()
            .zip(lower)
            .all(|(have, want)| have.to_ascii_lowercase() == *want)
}
561
562#[inline]
563fn kw_len2(b: &[u8]) -> Option<Token> {
564 if eq_ci(b, b"as") {
566 return Some(Token::As);
567 }
568 if eq_ci(b, b"by") {
569 return Some(Token::By);
570 }
571 if eq_ci(b, b"in") {
572 return Some(Token::In);
573 }
574 if eq_ci(b, b"is") {
575 return Some(Token::Is);
576 }
577 if eq_ci(b, b"on") {
578 return Some(Token::On);
579 }
580 if eq_ci(b, b"or") {
581 return Some(Token::Or);
582 }
583 if eq_ci(b, b"to") {
584 return Some(Token::To);
585 }
586 None
587}
588
589#[inline]
590fn kw_len3(b: &[u8]) -> Option<Token> {
591 if eq_ci(b, b"for") {
593 return Some(Token::For);
594 }
595 if eq_ci(b, b"all") {
596 return Some(Token::All);
597 }
598 if eq_ci(b, b"and") {
599 return Some(Token::And);
600 }
601 if eq_ci(b, b"asc") {
602 return Some(Token::Asc);
603 }
604 if eq_ci(b, b"not") {
605 return Some(Token::Not);
606 }
607 None
608}
609
610#[inline]
611fn kw_len4(b: &[u8]) -> Option<Token> {
612 if eq_ci(b, b"from") {
614 return Some(Token::From);
615 }
616 if eq_ci(b, b"drop") {
617 return Some(Token::Drop);
618 }
619 if eq_ci(b, b"null") {
620 return Some(Token::Null);
621 }
622 if eq_ci(b, b"true") {
623 return Some(Token::True);
624 }
625 if eq_ci(b, b"into") {
626 return Some(Token::Into);
627 }
628 if eq_ci(b, b"like") {
629 return Some(Token::Like);
630 }
631 if eq_ci(b, b"join") {
632 return Some(Token::Join);
633 }
634 if eq_ci(b, b"left") {
635 return Some(Token::Left);
636 }
637 if eq_ci(b, b"show") {
638 return Some(Token::Show);
639 }
640 if eq_ci(b, b"desc") {
641 return Some(Token::Desc);
642 }
643 None
644}
645
646#[inline]
647fn kw_len5(b: &[u8]) -> Option<Token> {
648 if eq_ci(b, b"false") {
651 return Some(Token::False);
652 }
653 if eq_ci(b, b"where") {
654 return Some(Token::Where);
655 }
656 if eq_ci(b, b"table") {
657 return Some(Token::Table);
658 }
659 if eq_ci(b, b"index") {
660 return Some(Token::Index);
661 }
662 if eq_ci(b, b"begin") {
663 return Some(Token::Begin);
664 }
665 if eq_ci(b, b"order") {
666 return Some(Token::Order);
667 }
668 if eq_ci(b, b"limit") {
669 return Some(Token::Limit);
670 }
671 if eq_ci(b, b"group") {
672 return Some(Token::Group);
673 }
674 if eq_ci(b, b"union") {
675 return Some(Token::Union);
676 }
677 if eq_ci(b, b"inner") {
678 return Some(Token::Inner);
679 }
680 if eq_ci(b, b"cross") {
681 return Some(Token::Cross);
682 }
683 if eq_ci(b, b"outer") {
684 return Some(Token::Outer);
685 }
686 None
687}
688
689#[inline]
690fn kw_len6(b: &[u8]) -> Option<Token> {
691 if eq_ci(b, b"select") {
693 return Some(Token::Select);
694 }
695 if eq_ci(b, b"tables") {
696 return Some(Token::Tables);
697 }
698 if eq_ci(b, b"except") {
699 return Some(Token::Except);
700 }
701 if eq_ci(b, b"create") {
702 return Some(Token::Create);
703 }
704 if eq_ci(b, b"insert") {
705 return Some(Token::Insert);
706 }
707 if eq_ci(b, b"values") {
708 return Some(Token::Values);
709 }
710 if eq_ci(b, b"commit") {
711 return Some(Token::Commit);
712 }
713 if eq_ci(b, b"having") {
714 return Some(Token::Having);
715 }
716 if eq_ci(b, b"offset") {
717 return Some(Token::Offset);
718 }
719 None
720}
721
722#[inline]
723fn kw_len7(b: &[u8]) -> Option<Token> {
724 if eq_ci(b, b"between") {
726 return Some(Token::Between);
727 }
728 if eq_ci(b, b"default") {
729 return Some(Token::Default);
730 }
731 if eq_ci(b, b"release") {
732 return Some(Token::Release);
733 }
734 if eq_ci(b, b"extract") {
735 return Some(Token::Extract);
736 }
737 None
738}
739
740#[inline]
741fn kw_len8(b: &[u8]) -> Option<Token> {
742 if eq_ci(b, b"rollback") {
744 return Some(Token::Rollback);
745 }
746 if eq_ci(b, b"distinct") {
747 return Some(Token::Distinct);
748 }
749 if eq_ci(b, b"interval") {
750 return Some(Token::Interval);
751 }
752 None
753}
754
755#[inline]
756fn kw_len9(b: &[u8]) -> Option<Token> {
757 if eq_ci(b, b"savepoint") {
759 return Some(Token::Savepoint);
760 }
761 None
762}
763
764#[inline]
765fn kw_len10(b: &[u8]) -> Option<Token> {
766 if eq_ci(b, b"connection") {
768 return Some(Token::Connection);
769 }
770 None
771}
772
773#[inline]
774fn kw_len11(b: &[u8]) -> Option<Token> {
775 if eq_ci(b, b"publication") {
777 return Some(Token::Publication);
778 }
779 None
780}
781
782#[inline]
783fn kw_len12(b: &[u8]) -> Option<Token> {
784 if eq_ci(b, b"subscription") {
786 return Some(Token::Subscription);
787 }
788 None
789}
790
791fn lex_quoted(
798 input: &str,
799 start: usize,
800 quote: u8,
801 is_ident: bool,
802) -> Result<(Token, usize), LexError> {
803 let bytes = input.as_bytes();
804 let mut i = start + 1;
805 let mut s = String::new();
806 loop {
807 if i >= bytes.len() {
808 return Err(LexError {
809 kind: if is_ident {
810 LexErrorKind::UnterminatedQuotedIdent
811 } else {
812 LexErrorKind::UnterminatedString
813 },
814 pos: start,
815 });
816 }
817 if bytes[i] == quote {
818 if peek_eq(bytes, i + 1, quote) {
819 s.push(quote as char);
820 i += 2;
821 } else {
822 i += 1;
823 break;
824 }
825 } else {
826 let ch = input[i..].chars().next().expect("non-empty UTF-8 boundary");
827 s.push(ch);
828 i += ch.len_utf8();
829 }
830 }
831 let tok = if is_ident {
832 Token::QuotedIdent(s)
833 } else {
834 Token::String(s)
835 };
836 Ok((tok, i - start))
837}
838
839fn lex_number(s: &str) -> Result<(Token, usize), LexErrorKind> {
840 let bytes = s.as_bytes();
841 let mut i = 0usize;
842 let mut is_float = false;
843
844 while i < bytes.len() && bytes[i].is_ascii_digit() {
845 i += 1;
846 }
847 if i < bytes.len() && bytes[i] == b'.' {
848 is_float = true;
849 i += 1;
850 while i < bytes.len() && bytes[i].is_ascii_digit() {
851 i += 1;
852 }
853 }
854 if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
855 is_float = true;
856 i += 1;
857 if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
858 i += 1;
859 }
860 let exp_start = i;
861 while i < bytes.len() && bytes[i].is_ascii_digit() {
862 i += 1;
863 }
864 if exp_start == i {
865 return Err(LexErrorKind::BadNumber(s[..i].to_string()));
866 }
867 }
868
869 let lit = &s[..i];
870 if is_float {
871 lit.parse::<f64>()
872 .map(|v| (Token::Float(v), i))
873 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
874 } else {
875 lit.parse::<i64>()
876 .map(|v| (Token::Integer(v), i))
877 .map_err(|_| LexErrorKind::BadNumber(lit.to_string()))
878 }
879}
880
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Lexes `s`, panicking when the lexer reports an error.
    fn lex(s: &str) -> Vec<Token> {
        tokenize(s).expect("lex ok")
    }

    /// Shorthand for an unquoted-identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    #[test]
    fn empty_yields_only_eof() {
        let expected = vec![Token::Eof];
        assert_eq!(lex(""), expected);
    }

    #[test]
    fn whitespace_only_yields_only_eof() {
        let expected = vec![Token::Eof];
        assert_eq!(lex(" \t\n "), expected);
    }

    #[test]
    fn keywords_are_case_insensitive() {
        let expected = vec![Token::Select, Token::Select, Token::Select, Token::Eof];
        assert_eq!(lex("SELECT select Select"), expected);
    }

    #[test]
    fn identifiers_lowercase_ascii() {
        let expected = vec![
            ident("hello"),
            ident("world"),
            ident("_x"),
            ident("x1"),
            Token::Eof,
        ];
        assert_eq!(lex("hello WORLD _x x1"), expected);
    }

    #[test]
    fn quoted_identifier_keeps_case_and_handles_embedded_quote() {
        let expected = vec![
            Token::QuotedIdent("User Name".into()),
            Token::QuotedIdent("a\"b".into()),
            Token::Eof,
        ];
        assert_eq!(lex(r#""User Name" "a""b""#), expected);
    }

    #[test]
    fn integer_and_float_literals() {
        let expected = vec![
            Token::Integer(0),
            Token::Integer(42),
            Token::Float(1.5),
            Token::Float(0.5),
            Token::Float(1e10),
            Token::Float(2.5e-3),
            Token::Eof,
        ];
        assert_eq!(lex("0 42 1.5 .5 1e10 2.5e-3"), expected);
    }

    #[test]
    fn negative_number_is_minus_then_integer() {
        let expected = vec![Token::Minus, Token::Integer(42), Token::Eof];
        assert_eq!(lex("-42"), expected);
    }

    #[test]
    fn string_literal_doubled_quote_escape() {
        let expected = vec![
            Token::String("hello".into()),
            Token::String("it's".into()),
            Token::Eof,
        ];
        assert_eq!(lex("'hello' 'it''s'"), expected);
    }

    #[test]
    fn all_comparison_and_arithmetic_operators() {
        let expected = vec![
            Token::Eq,
            Token::NotEq,
            Token::NotEq,
            Token::Lt,
            Token::LtEq,
            Token::Gt,
            Token::GtEq,
            Token::Plus,
            Token::Minus,
            Token::Star,
            Token::Slash,
            Token::Eof,
        ];
        assert_eq!(lex("= <> != < <= > >= + - * /"), expected);
    }

    #[test]
    fn punctuation() {
        let expected = vec![
            Token::LParen,
            Token::RParen,
            Token::Comma,
            Token::Semicolon,
            Token::Dot,
            Token::Eof,
        ];
        assert_eq!(lex("( ) , ; ."), expected);
    }

    #[test]
    fn line_comment_skipped() {
        let expected = vec![Token::Select, Token::From, Token::Eof];
        assert_eq!(lex("SELECT -- trailing junk\nFROM"), expected);
    }

    #[test]
    fn block_comment_skipped() {
        let expected = vec![Token::Select, Token::Integer(1), Token::Eof];
        assert_eq!(lex("SELECT /* skipped */ 1"), expected);
    }

    #[test]
    fn unterminated_string_errors() {
        let err = tokenize("'oops").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedString);
        assert_eq!(err.pos, 0);
    }

    #[test]
    fn unterminated_block_comment_errors() {
        let err = tokenize("/* never closed").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnterminatedBlockComment);
    }

    #[test]
    fn unknown_char_errors() {
        let err = tokenize("@").unwrap_err();
        assert_eq!(err.kind, LexErrorKind::UnknownChar('@'));
    }

    #[test]
    fn dot_in_qualified_column() {
        let expected = vec![ident("t"), Token::Dot, ident("col"), Token::Eof];
        assert_eq!(lex("t.col"), expected);
    }

    #[test]
    fn brackets_are_distinct_tokens() {
        let expected = vec![Token::LBracket, Token::RBracket, Token::Eof];
        assert_eq!(lex("[ ]"), expected);
    }

    #[test]
    fn l2_distance_is_three_char_token() {
        let fused = vec![ident("a"), Token::L2Distance, ident("b"), Token::Eof];
        assert_eq!(lex("a <-> b"), fused);

        // Without the trailing `>` the pieces lex separately.
        let split = vec![ident("a"), Token::Lt, Token::Minus, ident("b"), Token::Eof];
        assert_eq!(lex("a <- b"), split);
    }

    #[test]
    fn order_by_limit_are_keywords() {
        let expected = vec![Token::Order, Token::By, Token::Limit, Token::Eof];
        assert_eq!(lex("ORDER BY LIMIT"), expected);
    }

    #[test]
    fn inner_product_operator_3char() {
        let expected = vec![ident("a"), Token::InnerProduct, ident("b"), Token::Eof];
        assert_eq!(lex("a <#> b"), expected);
    }

    #[test]
    fn cosine_distance_operator_3char() {
        let fused = vec![ident("a"), Token::CosineDistance, ident("b"), Token::Eof];
        assert_eq!(lex("a <=> b"), fused);

        // Without the trailing `>` this is the ordinary `<=` comparison.
        let plain = vec![ident("a"), Token::LtEq, ident("b"), Token::Eof];
        assert_eq!(lex("a <= b"), plain);
    }

    #[test]
    fn double_colon_cast_token() {
        let expected = vec![ident("x"), Token::DoubleColon, ident("int"), Token::Eof];
        assert_eq!(lex("x::INT"), expected);
    }

    #[test]
    fn lone_single_colon_lexes_as_colon_token() {
        let toks = tokenize(":x").expect("colon now lexes");
        assert_eq!(toks.first(), Some(&Token::Colon));
    }

    #[test]
    fn colon_eq_lexes_as_assignment() {
        let toks = tokenize("x := 1").expect("colon-eq lexes");
        assert_eq!(toks.get(1), Some(&Token::ColonEq));
    }
}