1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// A single lexical token produced by [`Lexer`].
///
/// Keyword variants are produced by `Lexer::scan_identifier`, which matches the
/// upper-cased spelling of a word; every other word becomes [`Token::Ident`].
/// The `Display` impl renders keywords in upper case and operators as their
/// symbol.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords, in the first group listed in this file.
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // Keywords whose names relate to vector / hybrid search.
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    /// The single-letter keyword `K`.
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    /// The keyword `L2`.
    L2,
    Cosine,
    /// Lexed from either `INNER_PRODUCT` or `INNERPRODUCT`.
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // Keywords whose names relate to data modification and schema definition.
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // Keywords naming entity kinds.
    Node,
    Edge,
    Document,
    Kv,

    // Keywords whose names relate to time series and queues.
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // Keywords whose names relate to graph algorithms and text search.
    Neighborhood,
    /// Lexed from either `SHORTEST_PATH` or `SHORTESTPATH`.
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    /// Lexed from either `MAX_ITERATIONS` or `MAXITERATIONS`.
    MaxIterations,
    /// Lexed from either `MAX_LENGTH` or `MAXLENGTH`.
    MaxLength,
    Mode,
    Clustering,
    /// Lexed from either `TOPOLOGICAL_SORT` or `TOPOLOGICALSORT`.
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    /// Lexed from either `MIN_SCORE` or `MINSCORE`.
    MinScore,

    // Transaction-control keywords.
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // Maintenance keywords.
    Vacuum,
    Analyze,

    // Schema / sequence keywords.
    Schema,
    Sequence,
    Increment,

    // COPY-related keywords.
    Copy,
    Header,
    Delimiter,

    // View keywords.
    View,
    Materialized,
    Refresh,

    // Partitioning keywords.
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // Policy / row-level security keywords.
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // Foreign-data keywords.
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // Sessionization keywords.
    Sessionize,
    Gap,

    // Window-frame keywords.
    Over,
    Rows,
    Preceding,
    Following,
    Unbounded,
    Current,

    /// Quoted string literal; holds the unescaped contents without the quotes.
    String(String),
    /// Integer literal (a digit run with no fraction or exponent).
    Integer(i64),
    /// Floating-point literal (has a fractional part and/or an exponent).
    Float(f64),
    /// Raw text of a `{ ... }` JSON object literal, braces included.
    JsonLiteral(String),

    /// Any word that is not a recognised keyword, with its original spelling.
    Ident(String),

    // Operators, punctuation and end-of-input, in declaration order:
    //   Eq `=`, Ne `<>` (also lexed from `!=`), Lt `<`, Le `<=`, Gt `>`, Ge `>=`,
    //   Plus `+`, Minus `-`, Star `*`, Slash `/`, Percent `%`,
    //   LParen `(`, RParen `)`, LBracket `[`, RBracket `]`, LBrace `{`, RBrace `}`,
    //   Comma `,`, Dot `.`, Colon `:`, Semi `;`, Dollar `$`, Question `?`,
    //   FatArrow `=>`, Arrow `->`, ArrowLeft `<-`, Dash `-`, DotDot `..`,
    //   Pipe `|`, DoublePipe `||`, Eof (end of input, displayed as `EOF`).
    // NOTE(review): the scanner in this file emits `Dash` for a bare `-` and never
    // constructs `Minus`; presumably `Minus` is produced elsewhere — confirm.
    Eq, Ne, Lt, Le, Gt, Ge, Plus, Minus, Star, Slash, Percent, LParen, RParen, LBracket, RBracket, LBrace, RBrace, Comma, Dot, Colon, Semi, Dollar, Question, FatArrow, Arrow, ArrowLeft, Dash, DotDot, Pipe, DoublePipe, Eof,
}
291
292impl fmt::Display for Token {
293 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
294 match self {
295 Token::Select => write!(f, "SELECT"),
296 Token::From => write!(f, "FROM"),
297 Token::Where => write!(f, "WHERE"),
298 Token::And => write!(f, "AND"),
299 Token::Or => write!(f, "OR"),
300 Token::Not => write!(f, "NOT"),
301 Token::Match => write!(f, "MATCH"),
302 Token::Return => write!(f, "RETURN"),
303 Token::Join => write!(f, "JOIN"),
304 Token::Graph => write!(f, "GRAPH"),
305 Token::Path => write!(f, "PATH"),
306 Token::To => write!(f, "TO"),
307 Token::Via => write!(f, "VIA"),
308 Token::On => write!(f, "ON"),
309 Token::As => write!(f, "AS"),
310 Token::Is => write!(f, "IS"),
311 Token::Null => write!(f, "NULL"),
312 Token::Between => write!(f, "BETWEEN"),
313 Token::Like => write!(f, "LIKE"),
314 Token::In => write!(f, "IN"),
315 Token::Order => write!(f, "ORDER"),
316 Token::By => write!(f, "BY"),
317 Token::Asc => write!(f, "ASC"),
318 Token::Desc => write!(f, "DESC"),
319 Token::Nulls => write!(f, "NULLS"),
320 Token::First => write!(f, "FIRST"),
321 Token::Last => write!(f, "LAST"),
322 Token::Limit => write!(f, "LIMIT"),
323 Token::Offset => write!(f, "OFFSET"),
324 Token::Inner => write!(f, "INNER"),
325 Token::Left => write!(f, "LEFT"),
326 Token::Right => write!(f, "RIGHT"),
327 Token::Outer => write!(f, "OUTER"),
328 Token::Full => write!(f, "FULL"),
329 Token::Cross => write!(f, "CROSS"),
330 Token::Starts => write!(f, "STARTS"),
331 Token::Ends => write!(f, "ENDS"),
332 Token::With => write!(f, "WITH"),
333 Token::Contains => write!(f, "CONTAINS"),
334 Token::True => write!(f, "TRUE"),
335 Token::False => write!(f, "FALSE"),
336 Token::Enrich => write!(f, "ENRICH"),
337 Token::Group => write!(f, "GROUP"),
338 Token::Count => write!(f, "COUNT"),
339 Token::Sum => write!(f, "SUM"),
340 Token::Avg => write!(f, "AVG"),
341 Token::Min => write!(f, "MIN"),
342 Token::Max => write!(f, "MAX"),
343 Token::Distinct => write!(f, "DISTINCT"),
344 Token::Vector => write!(f, "VECTOR"),
345 Token::Search => write!(f, "SEARCH"),
346 Token::Similar => write!(f, "SIMILAR"),
347 Token::Collection => write!(f, "COLLECTION"),
348 Token::Metric => write!(f, "METRIC"),
349 Token::Threshold => write!(f, "THRESHOLD"),
350 Token::K => write!(f, "K"),
351 Token::Hybrid => write!(f, "HYBRID"),
352 Token::Fusion => write!(f, "FUSION"),
353 Token::Rerank => write!(f, "RERANK"),
354 Token::Rrf => write!(f, "RRF"),
355 Token::Intersection => write!(f, "INTERSECTION"),
356 Token::Union => write!(f, "UNION"),
357 Token::Recursive => write!(f, "RECURSIVE"),
358 Token::All => write!(f, "ALL"),
359 Token::Weight => write!(f, "WEIGHT"),
360 Token::L2 => write!(f, "L2"),
361 Token::Cosine => write!(f, "COSINE"),
362 Token::InnerProduct => write!(f, "INNER_PRODUCT"),
363 Token::Include => write!(f, "INCLUDE"),
364 Token::Metadata => write!(f, "METADATA"),
365 Token::Vectors => write!(f, "VECTORS"),
366 Token::Explain => write!(f, "EXPLAIN"),
367 Token::For => write!(f, "FOR"),
368 Token::Format => write!(f, "FORMAT"),
369 Token::Json => write!(f, "JSON"),
370 Token::Insert => write!(f, "INSERT"),
371 Token::Into => write!(f, "INTO"),
372 Token::Values => write!(f, "VALUES"),
373 Token::Update => write!(f, "UPDATE"),
374 Token::Set => write!(f, "SET"),
375 Token::Delete => write!(f, "DELETE"),
376 Token::Truncate => write!(f, "TRUNCATE"),
377 Token::Create => write!(f, "CREATE"),
378 Token::Table => write!(f, "TABLE"),
379 Token::Drop => write!(f, "DROP"),
380 Token::Alter => write!(f, "ALTER"),
381 Token::Add => write!(f, "ADD"),
382 Token::Column => write!(f, "COLUMN"),
383 Token::Primary => write!(f, "PRIMARY"),
384 Token::Key => write!(f, "KEY"),
385 Token::Default => write!(f, "DEFAULT"),
386 Token::Compress => write!(f, "COMPRESS"),
387 Token::Index => write!(f, "INDEX"),
388 Token::Unique => write!(f, "UNIQUE"),
389 Token::If => write!(f, "IF"),
390 Token::Exists => write!(f, "EXISTS"),
391 Token::Returning => write!(f, "RETURNING"),
392 Token::Cascade => write!(f, "CASCADE"),
393 Token::Rename => write!(f, "RENAME"),
394 Token::Using => write!(f, "USING"),
395 Token::Node => write!(f, "NODE"),
396 Token::Edge => write!(f, "EDGE"),
397 Token::Document => write!(f, "DOCUMENT"),
398 Token::Kv => write!(f, "KV"),
399 Token::Timeseries => write!(f, "TIMESERIES"),
400 Token::Retention => write!(f, "RETENTION"),
401 Token::Queue => write!(f, "QUEUE"),
402 Token::Tree => write!(f, "TREE"),
403 Token::Push => write!(f, "PUSH"),
404 Token::Pop => write!(f, "POP"),
405 Token::Peek => write!(f, "PEEK"),
406 Token::Purge => write!(f, "PURGE"),
407 Token::Ack => write!(f, "ACK"),
408 Token::Nack => write!(f, "NACK"),
409 Token::Priority => write!(f, "PRIORITY"),
410 Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
411 Token::ShortestPath => write!(f, "SHORTEST_PATH"),
412 Token::Centrality => write!(f, "CENTRALITY"),
413 Token::Community => write!(f, "COMMUNITY"),
414 Token::Components => write!(f, "COMPONENTS"),
415 Token::Cycles => write!(f, "CYCLES"),
416 Token::Traverse => write!(f, "TRAVERSE"),
417 Token::Depth => write!(f, "DEPTH"),
418 Token::Direction => write!(f, "DIRECTION"),
419 Token::Algorithm => write!(f, "ALGORITHM"),
420 Token::Strategy => write!(f, "STRATEGY"),
421 Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
422 Token::MaxLength => write!(f, "MAX_LENGTH"),
423 Token::Mode => write!(f, "MODE"),
424 Token::Clustering => write!(f, "CLUSTERING"),
425 Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
426 Token::Properties => write!(f, "PROPERTIES"),
427 Token::Text => write!(f, "TEXT"),
428 Token::Fuzzy => write!(f, "FUZZY"),
429 Token::MinScore => write!(f, "MIN_SCORE"),
430 Token::Begin => write!(f, "BEGIN"),
431 Token::Commit => write!(f, "COMMIT"),
432 Token::Rollback => write!(f, "ROLLBACK"),
433 Token::Savepoint => write!(f, "SAVEPOINT"),
434 Token::Release => write!(f, "RELEASE"),
435 Token::Start => write!(f, "START"),
436 Token::Transaction => write!(f, "TRANSACTION"),
437 Token::Work => write!(f, "WORK"),
438 Token::Vacuum => write!(f, "VACUUM"),
439 Token::Analyze => write!(f, "ANALYZE"),
440 Token::Schema => write!(f, "SCHEMA"),
441 Token::Sequence => write!(f, "SEQUENCE"),
442 Token::Increment => write!(f, "INCREMENT"),
443 Token::Copy => write!(f, "COPY"),
444 Token::Header => write!(f, "HEADER"),
445 Token::Delimiter => write!(f, "DELIMITER"),
446 Token::View => write!(f, "VIEW"),
447 Token::Materialized => write!(f, "MATERIALIZED"),
448 Token::Refresh => write!(f, "REFRESH"),
449 Token::Partition => write!(f, "PARTITION"),
450 Token::Range => write!(f, "RANGE"),
451 Token::List => write!(f, "LIST"),
452 Token::Hash => write!(f, "HASH"),
453 Token::Attach => write!(f, "ATTACH"),
454 Token::Detach => write!(f, "DETACH"),
455 Token::Of => write!(f, "OF"),
456 Token::Policy => write!(f, "POLICY"),
457 Token::Enable => write!(f, "ENABLE"),
458 Token::Disable => write!(f, "DISABLE"),
459 Token::Security => write!(f, "SECURITY"),
460 Token::Row => write!(f, "ROW"),
461 Token::Level => write!(f, "LEVEL"),
462 Token::Foreign => write!(f, "FOREIGN"),
463 Token::Server => write!(f, "SERVER"),
464 Token::Wrapper => write!(f, "WRAPPER"),
465 Token::Options => write!(f, "OPTIONS"),
466 Token::Data => write!(f, "DATA"),
467 Token::Sessionize => write!(f, "SESSIONIZE"),
468 Token::Gap => write!(f, "GAP"),
469 Token::Over => write!(f, "OVER"),
470 Token::Rows => write!(f, "ROWS"),
471 Token::Preceding => write!(f, "PRECEDING"),
472 Token::Following => write!(f, "FOLLOWING"),
473 Token::Unbounded => write!(f, "UNBOUNDED"),
474 Token::Current => write!(f, "CURRENT"),
475 Token::String(s) => write!(f, "'{}'", s),
476 Token::Integer(n) => write!(f, "{}", n),
477 Token::Float(n) => write!(f, "{}", n),
478 Token::JsonLiteral(s) => write!(f, "{}", s),
479 Token::Ident(s) => write!(f, "{}", s),
480 Token::Eq => write!(f, "="),
481 Token::Ne => write!(f, "<>"),
482 Token::Lt => write!(f, "<"),
483 Token::Le => write!(f, "<="),
484 Token::Gt => write!(f, ">"),
485 Token::Ge => write!(f, ">="),
486 Token::Plus => write!(f, "+"),
487 Token::Minus => write!(f, "-"),
488 Token::Star => write!(f, "*"),
489 Token::Slash => write!(f, "/"),
490 Token::Percent => write!(f, "%"),
491 Token::LParen => write!(f, "("),
492 Token::RParen => write!(f, ")"),
493 Token::LBracket => write!(f, "["),
494 Token::RBracket => write!(f, "]"),
495 Token::LBrace => write!(f, "{{"),
496 Token::RBrace => write!(f, "}}"),
497 Token::Comma => write!(f, ","),
498 Token::Dot => write!(f, "."),
499 Token::Colon => write!(f, ":"),
500 Token::Semi => write!(f, ";"),
501 Token::Dollar => write!(f, "$"),
502 Token::Question => write!(f, "?"),
503 Token::FatArrow => write!(f, "=>"),
504 Token::Arrow => write!(f, "->"),
505 Token::ArrowLeft => write!(f, "<-"),
506 Token::Dash => write!(f, "-"),
507 Token::DotDot => write!(f, ".."),
508 Token::Pipe => write!(f, "|"),
509 Token::DoublePipe => write!(f, "||"),
510 Token::Eof => write!(f, "EOF"),
511 }
512 }
513}
514
/// A location in the source text.
///
/// The lexer in this file starts counting at line 1, column 1, offset 0; it
/// advances `column` by one per character and `offset` by the character's
/// UTF-8 length. `Position::default()` is all zeros.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number.
    pub line: u32,
    /// Column within the line, counted in characters.
    pub column: u32,
    /// Byte offset from the start of the input.
    pub offset: u32,
}

impl Position {
    /// Builds a position from its three components.
    pub fn new(line: u32, column: u32, offset: u32) -> Self {
        Position {
            line,
            column,
            offset,
        }
    }
}

impl fmt::Display for Position {
    /// Formats as `line:column`; the byte offset is not shown.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Position { line, column, .. } = *self;
        write!(f, "{}:{}", line, column)
    }
}
542
/// A [`Token`] together with the source range it was read from.
#[derive(Debug, Clone)]
pub struct Spanned {
    /// The token itself.
    pub token: Token,
    /// Position of the token's first character.
    pub start: Position,
    /// Position the lexer reported after scanning the token, i.e. just past
    /// its last character.
    pub end: Position,
}

impl Spanned {
    /// Bundles a token with its start and end positions.
    pub fn new(token: Token, start: Position, end: Position) -> Self {
        Self { token, start, end }
    }
}
560
/// Error produced while tokenizing.
#[derive(Debug, Clone)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Source position associated with the error.
    pub position: Position,
    /// Set when the error was caused by exceeding a configured limit;
    /// `None` for ordinary lexical errors.
    pub limit_hit: Option<LexerLimitHit>,
}
573
/// Structured description of a limit that the lexer refused to exceed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// An identifier grew past the configured maximum number of characters.
    IdentifierTooLong {
        /// Name of the limit that was hit (the lexer passes
        /// `"max_identifier_chars"`).
        limit_name: &'static str,
        /// The configured limit value, not the length of the offending input.
        value: usize,
    },
}
583
584impl LexerError {
585 pub fn new(message: impl Into<String>, position: Position) -> Self {
587 Self {
588 message: message.into(),
589 position,
590 limit_hit: None,
591 }
592 }
593
594 pub(crate) fn with_limit(
596 message: impl Into<String>,
597 position: Position,
598 limit_hit: LexerLimitHit,
599 ) -> Self {
600 Self {
601 message: message.into(),
602 position,
603 limit_hit: Some(limit_hit),
604 }
605 }
606}
607
608impl fmt::Display for LexerError {
609 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
610 write!(f, "Lexer error at {}: {}", self.position, self.message)
611 }
612}
613
// Marker impl so `LexerError` can be used wherever `std::error::Error` is
// expected; all methods keep their default implementations.
impl std::error::Error for LexerError {}
615
/// Maximum size, in bytes, of a `{ ... }` JSON object literal (16 MiB).
/// `Lexer::scan_json_literal` returns an error once a literal grows past this.
pub const JSON_LITERAL_MAX_BYTES: usize = 16 * 1024 * 1024;
623
/// Streaming tokenizer over a borrowed input string.
pub struct Lexer<'a> {
    /// The complete input; sliced by byte offset for comment look-ahead and
    /// for extracting raw JSON literals.
    input: &'a str,
    /// Character iterator over `input` with one character of look-ahead.
    chars: Peekable<Chars<'a>>,
    /// Line of the next character to be read (starts at 1).
    line: u32,
    /// Column of the next character to be read (starts at 1).
    column: u32,
    /// Byte offset of the next character to be read (starts at 0).
    offset: u32,
    /// One-token look-ahead buffer filled by `peek_token`.
    peeked: Option<Spanned>,
    /// A single character pushed back by `unget`, with the position it was
    /// originally read at; takes priority over `chars`.
    putback: Option<(char, Position)>,
    /// Maximum number of characters allowed in one identifier.
    max_identifier_chars: usize,
}
642
643impl<'a> Lexer<'a> {
644 pub fn new(input: &'a str) -> Self {
646 Self::with_limits(input, crate::limits::ParserLimits::default())
647 }
648
649 pub fn with_limits(input: &'a str, limits: crate::limits::ParserLimits) -> Self {
651 Self {
652 input,
653 chars: input.chars().peekable(),
654 line: 1,
655 column: 1,
656 offset: 0,
657 peeked: None,
658 putback: None,
659 max_identifier_chars: limits.max_identifier_chars,
660 }
661 }
662
    /// Returns the maximum number of characters this lexer accepts in a single
    /// identifier.
    // Not referenced within this file, hence the lint allowance.
    #[allow(dead_code)]
    pub(crate) fn max_identifier_chars(&self) -> usize {
        self.max_identifier_chars
    }
675
676 fn position(&self) -> Position {
678 Position::new(self.line, self.column, self.offset)
679 }
680
681 fn unget(&mut self, ch: char, pos: Position) {
683 self.putback = Some((ch, pos));
684 }
685
686 fn advance(&mut self) -> Option<char> {
688 if let Some((ch, pos)) = self.putback.take() {
690 self.line = pos.line;
692 self.column = pos.column + 1;
693 self.offset = pos.offset + ch.len_utf8() as u32;
694 return Some(ch);
695 }
696
697 let ch = self.chars.next()?;
698 self.offset += ch.len_utf8() as u32;
699 if ch == '\n' {
700 self.line += 1;
701 self.column = 1;
702 } else {
703 self.column += 1;
704 }
705 Some(ch)
706 }
707
708 fn peek(&mut self) -> Option<char> {
710 if let Some((ch, _)) = &self.putback {
712 return Some(*ch);
713 }
714 self.chars.peek().copied()
715 }
716
717 #[allow(dead_code)]
723 fn skip_whitespace(&mut self) {
724 while let Some(ch) = self.peek() {
725 if ch.is_whitespace() {
726 self.advance();
727 } else if ch == '-' {
728 let pos = self.position();
730 self.advance();
731 if self.peek() == Some('-') {
732 self.advance();
734 while let Some(c) = self.peek() {
735 if c == '\n' {
736 break;
737 }
738 self.advance();
739 }
740 } else {
741 self.line = pos.line;
744 self.column = pos.column;
745 self.offset = pos.offset;
746 break;
749 }
750 } else {
751 break;
752 }
753 }
754 }
755
756 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
758 if self.peeked.is_none() {
759 self.peeked = Some(self.next_token_internal()?);
760 }
761 Ok(self.peeked.as_ref().unwrap())
762 }
763
764 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
766 if let Some(tok) = self.peeked.take() {
767 return Ok(tok);
768 }
769 self.next_token_internal()
770 }
771
772 fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
774 self.skip_whitespace_simple();
775
776 let start = self.position();
777
778 let ch = match self.peek() {
779 Some(c) => c,
780 None => {
781 return Ok(Spanned::new(Token::Eof, start, start));
782 }
783 };
784
785 let token = match ch {
787 '\'' | '"' => self.scan_string()?,
789
790 '0'..='9' => self.scan_number()?,
792
793 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
795
796 '=' => {
798 self.advance();
799 if self.peek() == Some('>') {
802 self.advance();
803 Token::FatArrow
804 } else {
805 Token::Eq
806 }
807 }
808 '<' => self.scan_less_than()?,
809 '>' => self.scan_greater_than()?,
810 '!' => {
811 self.advance();
812 if self.peek() == Some('=') {
813 self.advance();
814 Token::Ne
815 } else {
816 return Err(LexerError::new("Expected '=' after '!'", start));
817 }
818 }
819 '+' => {
820 self.advance();
821 Token::Plus
822 }
823 '-' => self.scan_minus()?,
824 '*' => {
825 self.advance();
826 Token::Star
827 }
828 '/' => {
829 self.advance();
830 Token::Slash
831 }
832 '%' => {
833 self.advance();
834 Token::Percent
835 }
836 '(' => {
837 self.advance();
838 Token::LParen
839 }
840 ')' => {
841 self.advance();
842 Token::RParen
843 }
844 '[' => {
845 self.advance();
846 Token::LBracket
847 }
848 ']' => {
849 self.advance();
850 Token::RBracket
851 }
852 '{' => {
853 if self.looks_like_json_object_start() {
860 return self.scan_json_literal(start);
861 }
862 self.advance();
863 Token::LBrace
864 }
865 '}' => {
866 self.advance();
867 Token::RBrace
868 }
869 ',' => {
870 self.advance();
871 Token::Comma
872 }
873 '.' => self.scan_dot()?,
874 ':' => {
875 self.advance();
876 Token::Colon
877 }
878 ';' => {
879 self.advance();
880 Token::Semi
881 }
882 '$' => {
883 self.advance();
884 Token::Dollar
885 }
886 '?' => {
887 self.advance();
888 Token::Question
889 }
890 '|' => {
891 self.advance();
892 if self.peek() == Some('|') {
893 self.advance();
894 Token::DoublePipe
895 } else {
896 Token::Pipe
897 }
898 }
899 _ => {
900 return Err(LexerError::new(
901 format!("Unexpected character: '{}'", ch),
902 start,
903 ));
904 }
905 };
906
907 let end = self.position();
908 Ok(Spanned::new(token, start, end))
909 }
910
911 fn skip_whitespace_simple(&mut self) {
913 while let Some(ch) = self.peek() {
914 if ch.is_whitespace() {
915 self.advance();
916 } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
917 self.advance();
918 self.advance();
919 while let Some(c) = self.peek() {
920 if c == '\n' {
921 break;
922 }
923 self.advance();
924 }
925 } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
926 self.advance();
927 self.advance();
928 while let Some(c) = self.peek() {
929 self.advance();
930 if c == '*' && self.peek() == Some('/') {
931 self.advance();
932 break;
933 }
934 }
935 } else {
936 break;
937 }
938 }
939 }
940
941 fn scan_string(&mut self) -> Result<Token, LexerError> {
943 let quote = self.advance().unwrap(); let start = self.position();
945 let mut value = String::new();
946
947 loop {
948 match self.peek() {
949 None => {
950 return Err(LexerError::new("Unterminated string", start));
951 }
952 Some(c) if c == quote => {
953 self.advance();
954 if self.peek() == Some(quote) {
956 self.advance();
957 value.push(quote);
958 } else {
959 break;
960 }
961 }
962 Some('\\') => {
963 self.advance();
964 match self.peek() {
965 Some('n') => {
966 self.advance();
967 value.push('\n');
968 }
969 Some('r') => {
970 self.advance();
971 value.push('\r');
972 }
973 Some('t') => {
974 self.advance();
975 value.push('\t');
976 }
977 Some('\\') => {
978 self.advance();
979 value.push('\\');
980 }
981 Some(c) if c == quote => {
982 self.advance();
983 value.push(quote);
984 }
985 Some(c) => {
986 value.push('\\');
988 value.push(c);
989 self.advance();
990 }
991 None => {
992 return Err(LexerError::new("Unterminated string", start));
993 }
994 }
995 }
996 Some(c) => {
997 self.advance();
998 value.push(c);
999 }
1000 }
1001 }
1002
1003 Ok(Token::String(value))
1004 }
1005
1006 fn scan_number(&mut self) -> Result<Token, LexerError> {
1008 let mut value = String::new();
1009 let mut is_float = false;
1010
1011 while let Some(ch) = self.peek() {
1013 if ch.is_ascii_digit() {
1014 value.push(ch);
1015 self.advance();
1016 } else {
1017 break;
1018 }
1019 }
1020
1021 if self.peek() == Some('.') {
1023 let dot_pos = self.position();
1025 self.advance(); if self.peek() == Some('.') {
1028 self.unget('.', dot_pos);
1030 } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
1032 is_float = true;
1033 value.push('.');
1034 while let Some(ch) = self.peek() {
1035 if ch.is_ascii_digit() {
1036 value.push(ch);
1037 self.advance();
1038 } else {
1039 break;
1040 }
1041 }
1042 } else {
1043 self.unget('.', dot_pos);
1045 }
1046 }
1047
1048 if self.peek() == Some('e') || self.peek() == Some('E') {
1050 is_float = true;
1051 value.push(self.advance().unwrap());
1052
1053 if self.peek() == Some('+') || self.peek() == Some('-') {
1054 value.push(self.advance().unwrap());
1055 }
1056
1057 while let Some(ch) = self.peek() {
1058 if ch.is_ascii_digit() {
1059 value.push(ch);
1060 self.advance();
1061 } else {
1062 break;
1063 }
1064 }
1065 }
1066
1067 if is_float {
1068 match value.parse::<f64>() {
1069 Ok(n) => Ok(Token::Float(n)),
1070 Err(_) => Err(LexerError::new(
1071 format!("Invalid float: {}", value),
1072 self.position(),
1073 )),
1074 }
1075 } else {
1076 match value.parse::<i64>() {
1077 Ok(n) => Ok(Token::Integer(n)),
1078 Err(_) => Err(LexerError::new(
1079 format!("Invalid integer: {}", value),
1080 self.position(),
1081 )),
1082 }
1083 }
1084 }
1085
1086 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1088 let start_pos = self.position();
1089 let mut value = String::new();
1090 let max = self.max_identifier_chars;
1091
1092 while let Some(ch) = self.peek() {
1093 if ch.is_alphanumeric() || ch == '_' {
1094 if value.chars().count() >= max {
1095 return Err(LexerError::with_limit(
1099 format!(
1100 "identifier exceeds maximum length (max_identifier_chars = {})",
1101 max
1102 ),
1103 start_pos,
1104 LexerLimitHit::IdentifierTooLong {
1105 limit_name: "max_identifier_chars",
1106 value: max,
1107 },
1108 ));
1109 }
1110 value.push(ch);
1111 self.advance();
1112 } else {
1113 break;
1114 }
1115 }
1116
1117 let token = match value.to_uppercase().as_str() {
1119 "SELECT" => Token::Select,
1120 "FROM" => Token::From,
1121 "WHERE" => Token::Where,
1122 "AND" => Token::And,
1123 "OR" => Token::Or,
1124 "NOT" => Token::Not,
1125 "MATCH" => Token::Match,
1126 "RETURN" => Token::Return,
1127 "JOIN" => Token::Join,
1128 "GRAPH" => Token::Graph,
1129 "PATH" => Token::Path,
1130 "TO" => Token::To,
1131 "VIA" => Token::Via,
1132 "ON" => Token::On,
1133 "AS" => Token::As,
1134 "IS" => Token::Is,
1135 "NULL" => Token::Null,
1136 "BETWEEN" => Token::Between,
1137 "LIKE" => Token::Like,
1138 "IN" => Token::In,
1139 "ORDER" => Token::Order,
1140 "BY" => Token::By,
1141 "ASC" => Token::Asc,
1142 "DESC" => Token::Desc,
1143 "NULLS" => Token::Nulls,
1144 "FIRST" => Token::First,
1145 "LAST" => Token::Last,
1146 "LIMIT" => Token::Limit,
1147 "OFFSET" => Token::Offset,
1148 "INNER" => Token::Inner,
1149 "LEFT" => Token::Left,
1150 "RIGHT" => Token::Right,
1151 "OUTER" => Token::Outer,
1152 "FULL" => Token::Full,
1153 "CROSS" => Token::Cross,
1154 "STARTS" => Token::Starts,
1155 "ENDS" => Token::Ends,
1156 "WITH" => Token::With,
1157 "CONTAINS" => Token::Contains,
1158 "TRUE" => Token::True,
1159 "FALSE" => Token::False,
1160 "ENRICH" => Token::Enrich,
1161 "GROUP" => Token::Group,
1162 "COUNT" => Token::Count,
1163 "SUM" => Token::Sum,
1164 "AVG" => Token::Avg,
1165 "MIN" => Token::Min,
1166 "MAX" => Token::Max,
1167 "DISTINCT" => Token::Distinct,
1168 "VECTOR" => Token::Vector,
1169 "SEARCH" => Token::Search,
1170 "SIMILAR" => Token::Similar,
1171 "COLLECTION" => Token::Collection,
1172 "METRIC" => Token::Metric,
1173 "THRESHOLD" => Token::Threshold,
1174 "K" => Token::K,
1175 "HYBRID" => Token::Hybrid,
1176 "FUSION" => Token::Fusion,
1177 "RERANK" => Token::Rerank,
1178 "RRF" => Token::Rrf,
1179 "INTERSECTION" => Token::Intersection,
1180 "UNION" => Token::Union,
1181 "RECURSIVE" => Token::Recursive,
1182 "ALL" => Token::All,
1183 "WEIGHT" => Token::Weight,
1184 "L2" => Token::L2,
1185 "COSINE" => Token::Cosine,
1186 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1187 "INCLUDE" => Token::Include,
1188 "METADATA" => Token::Metadata,
1189 "VECTORS" => Token::Vectors,
1190 "EXPLAIN" => Token::Explain,
1191 "FOR" => Token::For,
1192 "FORMAT" => Token::Format,
1193 "JSON" => Token::Json,
1194 "INSERT" => Token::Insert,
1195 "INTO" => Token::Into,
1196 "VALUES" => Token::Values,
1197 "UPDATE" => Token::Update,
1198 "SET" => Token::Set,
1199 "DELETE" => Token::Delete,
1200 "TRUNCATE" => Token::Truncate,
1201 "CREATE" => Token::Create,
1202 "TABLE" => Token::Table,
1203 "DROP" => Token::Drop,
1204 "ALTER" => Token::Alter,
1205 "ADD" => Token::Add,
1206 "COLUMN" => Token::Column,
1207 "PRIMARY" => Token::Primary,
1208 "KEY" => Token::Key,
1209 "DEFAULT" => Token::Default,
1210 "COMPRESS" => Token::Compress,
1211 "INDEX" => Token::Index,
1212 "UNIQUE" => Token::Unique,
1213 "IF" => Token::If,
1214 "EXISTS" => Token::Exists,
1215 "RETURNING" => Token::Returning,
1216 "CASCADE" => Token::Cascade,
1217 "RENAME" => Token::Rename,
1218 "USING" => Token::Using,
1219 "NODE" => Token::Node,
1220 "EDGE" => Token::Edge,
1221 "DOCUMENT" => Token::Document,
1222 "KV" => Token::Kv,
1223 "TIMESERIES" => Token::Timeseries,
1224 "RETENTION" => Token::Retention,
1225 "QUEUE" => Token::Queue,
1226 "TREE" => Token::Tree,
1227 "PUSH" => Token::Push,
1228 "POP" => Token::Pop,
1229 "PEEK" => Token::Peek,
1230 "PURGE" => Token::Purge,
1231 "ACK" => Token::Ack,
1232 "NACK" => Token::Nack,
1233 "PRIORITY" => Token::Priority,
1234 "LPUSH" => Token::Ident("LPUSH".to_string()),
1235 "RPUSH" => Token::Ident("RPUSH".to_string()),
1236 "LPOP" => Token::Ident("LPOP".to_string()),
1237 "RPOP" => Token::Ident("RPOP".to_string()),
1238 "NEIGHBORHOOD" => Token::Neighborhood,
1239 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1240 "CENTRALITY" => Token::Centrality,
1241 "COMMUNITY" => Token::Community,
1242 "COMPONENTS" => Token::Components,
1243 "CYCLES" => Token::Cycles,
1244 "TRAVERSE" => Token::Traverse,
1245 "DEPTH" => Token::Depth,
1246 "DIRECTION" => Token::Direction,
1247 "ALGORITHM" => Token::Algorithm,
1248 "STRATEGY" => Token::Strategy,
1249 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1250 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1251 "MODE" => Token::Mode,
1252 "CLUSTERING" => Token::Clustering,
1253 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1254 "PROPERTIES" => Token::Properties,
1255 "TEXT" => Token::Text,
1256 "FUZZY" => Token::Fuzzy,
1257 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1258 "BEGIN" => Token::Begin,
1259 "COMMIT" => Token::Commit,
1260 "ROLLBACK" => Token::Rollback,
1261 "SAVEPOINT" => Token::Savepoint,
1262 "RELEASE" => Token::Release,
1263 "START" => Token::Start,
1264 "TRANSACTION" => Token::Transaction,
1265 "WORK" => Token::Work,
1266 "VACUUM" => Token::Vacuum,
1267 "ANALYZE" => Token::Analyze,
1268 "SCHEMA" => Token::Schema,
1269 "SEQUENCE" => Token::Sequence,
1270 "INCREMENT" => Token::Increment,
1271 "COPY" => Token::Copy,
1272 "HEADER" => Token::Header,
1273 "DELIMITER" => Token::Delimiter,
1274 "VIEW" => Token::View,
1275 "MATERIALIZED" => Token::Materialized,
1276 "REFRESH" => Token::Refresh,
1277 "PARTITION" => Token::Partition,
1278 "RANGE" => Token::Range,
1279 "LIST" => Token::List,
1280 "HASH" => Token::Hash,
1281 "ATTACH" => Token::Attach,
1282 "DETACH" => Token::Detach,
1283 "OF" => Token::Of,
1284 "POLICY" => Token::Policy,
1285 "ENABLE" => Token::Enable,
1286 "DISABLE" => Token::Disable,
1287 "SECURITY" => Token::Security,
1288 "ROW" => Token::Row,
1289 "LEVEL" => Token::Level,
1290 "FOREIGN" => Token::Foreign,
1291 "SERVER" => Token::Server,
1292 "WRAPPER" => Token::Wrapper,
1293 "OPTIONS" => Token::Options,
1294 "DATA" => Token::Data,
1295 "SESSIONIZE" => Token::Sessionize,
1296 "GAP" => Token::Gap,
1297 "OVER" => Token::Over,
1298 "ROWS" => Token::Rows,
1299 "PRECEDING" => Token::Preceding,
1300 "FOLLOWING" => Token::Following,
1301 "UNBOUNDED" => Token::Unbounded,
1302 "CURRENT" => Token::Current,
1303 _ => Token::Ident(value),
1304 };
1305
1306 Ok(token)
1307 }
1308
1309 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1311 self.advance(); match self.peek() {
1313 Some('=') => {
1314 self.advance();
1315 Ok(Token::Le)
1316 }
1317 Some('>') => {
1318 self.advance();
1319 Ok(Token::Ne)
1320 }
1321 Some('-') => {
1322 self.advance();
1323 Ok(Token::ArrowLeft)
1324 }
1325 _ => Ok(Token::Lt),
1326 }
1327 }
1328
1329 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1331 self.advance(); if self.peek() == Some('=') {
1333 self.advance();
1334 Ok(Token::Ge)
1335 } else {
1336 Ok(Token::Gt)
1337 }
1338 }
1339
1340 fn scan_minus(&mut self) -> Result<Token, LexerError> {
1342 self.advance(); match self.peek() {
1344 Some('>') => {
1345 self.advance();
1346 Ok(Token::Arrow)
1347 }
1348 Some('-') => {
1349 self.advance();
1351 while let Some(c) = self.peek() {
1352 if c == '\n' {
1353 break;
1354 }
1355 self.advance();
1356 }
1357 self.skip_whitespace_simple();
1359 if self.peek().is_none() {
1360 Ok(Token::Eof)
1361 } else {
1362 let next = self.next_token_internal()?;
1363 Ok(next.token)
1364 }
1365 }
1366 _ => Ok(Token::Dash),
1367 }
1368 }
1369
1370 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1372 self.advance(); if self.peek() == Some('.') {
1374 self.advance();
1375 Ok(Token::DotDot)
1376 } else {
1377 Ok(Token::Dot)
1378 }
1379 }
1380
1381 fn looks_like_json_object_start(&self) -> bool {
1386 let bytes = self.input.as_bytes();
1387 let mut i = self.offset as usize;
1388 debug_assert!(bytes.get(i) == Some(&b'{'));
1390 i += 1;
1391 while i < bytes.len() {
1392 match bytes[i] {
1393 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1394 b'"' | b'}' => return true,
1395 _ => return false,
1396 }
1397 }
1398 false
1399 }
1400
1401 fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1418 let start_offset = self.offset as usize;
1419 self.advance();
1421 let mut depth: u32 = 1;
1422 let mut in_string = false;
1423 let mut escape = false;
1424 loop {
1425 let ch = match self.peek() {
1426 Some(c) => c,
1427 None => {
1428 return Err(LexerError::new(
1429 format!(
1430 "unterminated JSON object literal (started at offset {})",
1431 start.offset
1432 ),
1433 self.position(),
1434 ));
1435 }
1436 };
1437
1438 let scanned_bytes = self.offset as usize - start_offset;
1440 if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1441 return Err(LexerError::new(
1442 format!(
1443 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1444 JSON_LITERAL_MAX_BYTES
1445 ),
1446 start,
1447 ));
1448 }
1449
1450 self.advance();
1451
1452 if escape {
1453 escape = false;
1454 continue;
1455 }
1456
1457 if in_string {
1458 match ch {
1459 '\\' => escape = true,
1460 '"' => in_string = false,
1461 _ => {}
1462 }
1463 continue;
1464 }
1465
1466 match ch {
1467 '"' => in_string = true,
1468 '{' => depth += 1,
1469 '}' => {
1470 depth -= 1;
1471 if depth == 0 {
1472 let end = self.position();
1473 let end_offset = self.offset as usize;
1474 if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1476 return Err(LexerError::new(
1477 format!(
1478 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1479 JSON_LITERAL_MAX_BYTES
1480 ),
1481 start,
1482 ));
1483 }
1484 let raw = self.input[start_offset..end_offset].to_string();
1485 return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1486 }
1487 }
1488 _ => {}
1489 }
1490 }
1491 }
1492
1493 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1495 let mut tokens = Vec::new();
1496 loop {
1497 let tok = self.next_token()?;
1498 let is_eof = tok.token == Token::Eof;
1499 tokens.push(tok);
1500 if is_eof {
1501 break;
1502 }
1503 }
1504 Ok(tokens)
1505 }
1506}
1507
1508#[cfg(test)]
1513mod tests {
1514 use super::*;
1515
1516 fn tokenize(input: &str) -> Vec<Token> {
1517 let mut lexer = Lexer::new(input);
1518 lexer
1519 .tokenize()
1520 .unwrap()
1521 .into_iter()
1522 .map(|s| s.token)
1523 .collect()
1524 }
1525
/// Upper-case core keywords each lex to their dedicated token.
#[test]
fn test_keywords() {
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Or,
        Token::Not,
        Token::Eof,
    ];
    assert_eq!(tokenize("SELECT FROM WHERE AND OR NOT"), expected);
}
1542
/// Non-keyword words, including ones with underscores, lex as identifiers.
#[test]
fn test_identifiers() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        ident("hosts"),
        ident("users"),
        ident("ip_address"),
        Token::Eof,
    ];
    assert_eq!(tokenize("hosts users ip_address"), expected);
}
1556
/// Integers, decimals and exponent forms lex to `Integer` / `Float` tokens.
#[test]
fn test_numbers() {
    let expected = vec![
        Token::Integer(42),
        Token::Float(2.5),
        Token::Float(1e10),
        Token::Float(2.5e-3),
        Token::Eof,
    ];
    assert_eq!(tokenize("42 2.5 1e10 2.5e-3"), expected);
}
1571
/// Single- and double-quoted strings lex to `String`; a doubled single quote
/// inside a single-quoted string yields one literal quote.
#[test]
fn test_strings() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello"), text("world"), text("it's"), Token::Eof];
    assert_eq!(tokenize("'hello' \"world\" 'it''s'"), expected);
}
1585
/// Comparison and arithmetic operators; both `<>` and `!=` produce `Ne`, and a
/// standalone `-` is expected as `Dash`.
#[test]
fn test_operators() {
    let expected = vec![
        Token::Eq,
        Token::Ne,
        Token::Lt,
        Token::Le,
        Token::Gt,
        Token::Ge,
        Token::Ne,
        Token::Plus,
        Token::Dash,
        Token::Star,
        Token::Slash,
        Token::Eof,
    ];
    assert_eq!(tokenize("= <> < <= > >= != + - * /"), expected);
}
1607
/// Punctuation tokens. The braces wrap a bare identifier here and are expected
/// to come out as separate `LBrace` / `RBrace` tokens.
#[test]
fn test_delimiters() {
    let expected = vec![
        Token::LParen,
        Token::RParen,
        Token::LBracket,
        Token::RBracket,
        Token::LBrace,
        Token::Ident("a".into()),
        Token::RBrace,
        Token::Comma,
        Token::Dot,
        Token::Colon,
        Token::Semi,
        Token::Eof,
    ];
    assert_eq!(tokenize("( ) [ ] { a } , . : ;"), expected);
}
1633
/// An empty brace pair (with inner whitespace) is captured as one JSON literal.
#[test]
fn test_json_literal_empty_object() {
    let expected = vec![Token::JsonLiteral("{ }".into()), Token::Eof];
    assert_eq!(tokenize("{ }"), expected);
}
1639
/// A flat one-key object is returned verbatim as a JSON literal token.
#[test]
fn test_json_literal_simple() {
    let raw = r#"{"a":1}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1648
/// Nested objects/arrays and a `}` inside a string stay within one literal.
#[test]
fn test_json_literal_nested() {
    let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1655
/// A backslash-escaped quote does not end the string, so the `}` that follows
/// it inside the string must not close the literal.
#[test]
fn test_json_literal_escaped_quote_in_string() {
    let raw = r#"{"path":"O\"Brien}"}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1663
/// Hitting end of input with an open brace reports an "unterminated" error.
#[test]
fn test_json_literal_unbalanced_eof() {
    let err = Lexer::new(r#"{"a":1"#)
        .tokenize()
        .expect_err("expected unterminated error");
    let mentions_unterminated = err.message.contains("unterminated JSON object literal");
    assert!(mentions_unterminated, "got: {}", err.message);
}
1674
/// A brace followed by an unquoted key is not swallowed as a JSON literal: the
/// stream must start with `LBrace` and still terminate with `Eof`.
#[test]
fn test_json_literal_property_bag_compatible() {
    let tokens = tokenize("{name: 'value'}");
    let first = &tokens[0];
    let last = tokens.last().unwrap();
    assert_eq!(*first, Token::LBrace);
    assert_eq!(*last, Token::Eof);
}
1683
/// Graph-pattern punctuation: both arrows, a lone dash and the range operator.
#[test]
fn test_graph_syntax() {
    let expected = vec![
        Token::Arrow,
        Token::ArrowLeft,
        Token::Dash,
        Token::DotDot,
        Token::Eof,
    ];
    assert_eq!(tokenize("-> <- - .."), expected);
}
1698
/// End-to-end token stream for a simple relational SELECT with a filter and
/// a LIMIT.
#[test]
fn test_table_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::Comma,
        ident("hostname"),
        Token::From,
        ident("hosts"),
        Token::Where,
        ident("os"),
        Token::Eq,
        Token::String("Linux".into()),
        Token::Limit,
        Token::Integer(10),
        Token::Eof,
    ];
    let tokens = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
    assert_eq!(tokens, expected);
}
1721
/// End-to-end token stream for a MATCH pattern with labelled nodes and a typed
/// edge, followed by a RETURN list.
#[test]
fn test_graph_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Match,
        Token::LParen,
        ident("h"),
        Token::Colon,
        ident("Host"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_SERVICE"),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("s"),
        Token::Colon,
        ident("Service"),
        Token::RParen,
        Token::Return,
        ident("h"),
        Token::Comma,
        ident("s"),
        Token::Eof,
    ];
    let tokens = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
    assert_eq!(tokens, expected);
}
1753
/// End-to-end token stream for a table-to-graph JOIN with a dotted ON clause.
#[test]
fn test_join_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::From,
        ident("hosts"),
        ident("h"),
        Token::Join,
        Token::Graph,
        Token::LParen,
        ident("h"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_VULN"),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("v"),
        Token::RParen,
        Token::On,
        ident("h"),
        Token::Dot,
        ident("ip"),
        Token::Eq,
        ident("v"),
        Token::Dot,
        ident("id"),
        Token::Eof,
    ];
    let tokens = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
    assert_eq!(tokens, expected);
}
1789
/// End-to-end token stream for a PATH query with function-style endpoints and
/// a VIA edge filter.
#[test]
fn test_path_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        Token::Path,
        Token::From,
        ident("host"),
        Token::LParen,
        text("192.168.1.1"),
        Token::RParen,
        Token::To,
        ident("host"),
        Token::LParen,
        text("10.0.0.1"),
        Token::RParen,
        Token::Via,
        Token::LBracket,
        Token::Colon,
        ident("AUTH"),
        Token::RBracket,
        Token::Eof,
    ];
    let tokens = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
    assert_eq!(tokens, expected);
}
1816
/// In `*1..5` the range operator must split the two integers rather than
/// being read as part of a float.
#[test]
fn test_variable_length_pattern() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::LParen,
        ident("a"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Star,
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(5),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("b"),
        Token::RParen,
        Token::Eof,
    ];
    assert_eq!(tokenize("(a)-[*1..5]->(b)"), expected);
}
1841
/// Keywords are recognised in lower, upper and mixed case.
#[test]
fn test_case_insensitive_keywords() {
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Eof,
    ];
    assert_eq!(tokenize("select FROM Where AND"), expected);
}
1856
/// A `--` comment is dropped up to the newline; lexing resumes on the next line.
#[test]
fn test_comments() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::From,
        ident("hosts"),
        Token::Eof,
    ];
    assert_eq!(
        tokenize("SELECT -- this is a comment\nip FROM hosts"),
        expected
    );
}
1871
/// Backslash escapes `\n` and `\t` inside quoted strings are decoded to the
/// corresponding control characters.
#[test]
fn test_escaped_strings() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello\nworld"), text("tab\there"), Token::Eof];
    assert_eq!(tokenize(r"'hello\nworld' 'tab\there'"), expected);
}
1884
/// Table-driven check that each spelling, lexed on its own, yields exactly the
/// paired token followed by `Eof`. Covers underscore/no-underscore aliases
/// (e.g. `INNER_PRODUCT` / `INNERPRODUCT`) and words expected to stay plain
/// identifiers (`LPUSH`, `RPUSH`, `LPOP`, `RPOP`, `plain_ident`).
#[test]
fn test_keyword_matrix_and_alias_spellings() {
    let spellings = [
        ("SELECT", Token::Select),
        ("FROM", Token::From),
        ("WHERE", Token::Where),
        ("AND", Token::And),
        ("OR", Token::Or),
        ("NOT", Token::Not),
        ("MATCH", Token::Match),
        ("RETURN", Token::Return),
        ("JOIN", Token::Join),
        ("GRAPH", Token::Graph),
        ("PATH", Token::Path),
        ("TO", Token::To),
        ("VIA", Token::Via),
        ("ON", Token::On),
        ("AS", Token::As),
        ("IS", Token::Is),
        ("NULL", Token::Null),
        ("BETWEEN", Token::Between),
        ("LIKE", Token::Like),
        ("IN", Token::In),
        ("ORDER", Token::Order),
        ("BY", Token::By),
        ("ASC", Token::Asc),
        ("DESC", Token::Desc),
        ("NULLS", Token::Nulls),
        ("FIRST", Token::First),
        ("LAST", Token::Last),
        ("LIMIT", Token::Limit),
        ("OFFSET", Token::Offset),
        ("INNER", Token::Inner),
        ("LEFT", Token::Left),
        ("RIGHT", Token::Right),
        ("OUTER", Token::Outer),
        ("FULL", Token::Full),
        ("CROSS", Token::Cross),
        ("STARTS", Token::Starts),
        ("ENDS", Token::Ends),
        ("WITH", Token::With),
        ("CONTAINS", Token::Contains),
        ("TRUE", Token::True),
        ("FALSE", Token::False),
        ("ENRICH", Token::Enrich),
        ("GROUP", Token::Group),
        ("COUNT", Token::Count),
        ("SUM", Token::Sum),
        ("AVG", Token::Avg),
        ("MIN", Token::Min),
        ("MAX", Token::Max),
        ("DISTINCT", Token::Distinct),
        ("VECTOR", Token::Vector),
        ("SEARCH", Token::Search),
        ("SIMILAR", Token::Similar),
        ("COLLECTION", Token::Collection),
        ("METRIC", Token::Metric),
        ("THRESHOLD", Token::Threshold),
        ("K", Token::K),
        ("HYBRID", Token::Hybrid),
        ("FUSION", Token::Fusion),
        ("RERANK", Token::Rerank),
        ("RRF", Token::Rrf),
        ("INTERSECTION", Token::Intersection),
        ("UNION", Token::Union),
        ("RECURSIVE", Token::Recursive),
        ("ALL", Token::All),
        ("WEIGHT", Token::Weight),
        ("L2", Token::L2),
        ("COSINE", Token::Cosine),
        ("INNER_PRODUCT", Token::InnerProduct),
        ("INNERPRODUCT", Token::InnerProduct),
        ("INCLUDE", Token::Include),
        ("METADATA", Token::Metadata),
        ("VECTORS", Token::Vectors),
        ("EXPLAIN", Token::Explain),
        ("FOR", Token::For),
        ("FORMAT", Token::Format),
        ("JSON", Token::Json),
        ("INSERT", Token::Insert),
        ("INTO", Token::Into),
        ("VALUES", Token::Values),
        ("UPDATE", Token::Update),
        ("SET", Token::Set),
        ("DELETE", Token::Delete),
        ("TRUNCATE", Token::Truncate),
        ("CREATE", Token::Create),
        ("TABLE", Token::Table),
        ("DROP", Token::Drop),
        ("ALTER", Token::Alter),
        ("ADD", Token::Add),
        ("COLUMN", Token::Column),
        ("PRIMARY", Token::Primary),
        ("KEY", Token::Key),
        ("DEFAULT", Token::Default),
        ("COMPRESS", Token::Compress),
        ("INDEX", Token::Index),
        ("UNIQUE", Token::Unique),
        ("IF", Token::If),
        ("EXISTS", Token::Exists),
        ("RETURNING", Token::Returning),
        ("CASCADE", Token::Cascade),
        ("RENAME", Token::Rename),
        ("USING", Token::Using),
        ("NODE", Token::Node),
        ("EDGE", Token::Edge),
        ("DOCUMENT", Token::Document),
        ("KV", Token::Kv),
        ("TIMESERIES", Token::Timeseries),
        ("RETENTION", Token::Retention),
        ("QUEUE", Token::Queue),
        ("TREE", Token::Tree),
        ("PUSH", Token::Push),
        ("POP", Token::Pop),
        ("PEEK", Token::Peek),
        ("PURGE", Token::Purge),
        ("ACK", Token::Ack),
        ("NACK", Token::Nack),
        ("PRIORITY", Token::Priority),
        ("LPUSH", Token::Ident("LPUSH".into())),
        ("RPUSH", Token::Ident("RPUSH".into())),
        ("LPOP", Token::Ident("LPOP".into())),
        ("RPOP", Token::Ident("RPOP".into())),
        ("NEIGHBORHOOD", Token::Neighborhood),
        ("SHORTEST_PATH", Token::ShortestPath),
        ("SHORTESTPATH", Token::ShortestPath),
        ("CENTRALITY", Token::Centrality),
        ("COMMUNITY", Token::Community),
        ("COMPONENTS", Token::Components),
        ("CYCLES", Token::Cycles),
        ("TRAVERSE", Token::Traverse),
        ("DEPTH", Token::Depth),
        ("DIRECTION", Token::Direction),
        ("ALGORITHM", Token::Algorithm),
        ("STRATEGY", Token::Strategy),
        ("MAX_ITERATIONS", Token::MaxIterations),
        ("MAXITERATIONS", Token::MaxIterations),
        ("MAX_LENGTH", Token::MaxLength),
        ("MAXLENGTH", Token::MaxLength),
        ("MODE", Token::Mode),
        ("CLUSTERING", Token::Clustering),
        ("TOPOLOGICAL_SORT", Token::TopologicalSort),
        ("TOPOLOGICALSORT", Token::TopologicalSort),
        ("PROPERTIES", Token::Properties),
        ("TEXT", Token::Text),
        ("FUZZY", Token::Fuzzy),
        ("MIN_SCORE", Token::MinScore),
        ("MINSCORE", Token::MinScore),
        ("BEGIN", Token::Begin),
        ("COMMIT", Token::Commit),
        ("ROLLBACK", Token::Rollback),
        ("SAVEPOINT", Token::Savepoint),
        ("RELEASE", Token::Release),
        ("START", Token::Start),
        ("TRANSACTION", Token::Transaction),
        ("WORK", Token::Work),
        ("VACUUM", Token::Vacuum),
        ("ANALYZE", Token::Analyze),
        ("SCHEMA", Token::Schema),
        ("SEQUENCE", Token::Sequence),
        ("INCREMENT", Token::Increment),
        ("COPY", Token::Copy),
        ("HEADER", Token::Header),
        ("DELIMITER", Token::Delimiter),
        ("VIEW", Token::View),
        ("MATERIALIZED", Token::Materialized),
        ("REFRESH", Token::Refresh),
        ("PARTITION", Token::Partition),
        ("RANGE", Token::Range),
        ("LIST", Token::List),
        ("HASH", Token::Hash),
        ("ATTACH", Token::Attach),
        ("DETACH", Token::Detach),
        ("OF", Token::Of),
        ("POLICY", Token::Policy),
        ("ENABLE", Token::Enable),
        ("DISABLE", Token::Disable),
        ("SECURITY", Token::Security),
        ("ROW", Token::Row),
        ("LEVEL", Token::Level),
        ("FOREIGN", Token::Foreign),
        ("SERVER", Token::Server),
        ("WRAPPER", Token::Wrapper),
        ("OPTIONS", Token::Options),
        ("DATA", Token::Data),
        ("plain_ident", Token::Ident("plain_ident".into())),
    ];

    for (input, keyword_token) in spellings {
        let lexed = tokenize(input);
        let wanted = vec![keyword_token, Token::Eof];
        // The spelling is attached so a failure names the offending row.
        assert_eq!(lexed, wanted, "{input}");
    }
}
2078
/// Table-driven check of the `Display` rendering for each listed token:
/// keywords print in upper case (aliases in their underscore form), strings
/// are single-quoted, and `Minus` and `Dash` both print as `-`.
#[test]
fn test_display_all_token_variants() {
    let renderings = [
        (Token::Select, "SELECT"),
        (Token::From, "FROM"),
        (Token::Where, "WHERE"),
        (Token::And, "AND"),
        (Token::Or, "OR"),
        (Token::Not, "NOT"),
        (Token::Match, "MATCH"),
        (Token::Return, "RETURN"),
        (Token::Join, "JOIN"),
        (Token::Graph, "GRAPH"),
        (Token::Path, "PATH"),
        (Token::To, "TO"),
        (Token::Via, "VIA"),
        (Token::On, "ON"),
        (Token::As, "AS"),
        (Token::Is, "IS"),
        (Token::Null, "NULL"),
        (Token::Between, "BETWEEN"),
        (Token::Like, "LIKE"),
        (Token::In, "IN"),
        (Token::Order, "ORDER"),
        (Token::By, "BY"),
        (Token::Asc, "ASC"),
        (Token::Desc, "DESC"),
        (Token::Nulls, "NULLS"),
        (Token::First, "FIRST"),
        (Token::Last, "LAST"),
        (Token::Limit, "LIMIT"),
        (Token::Offset, "OFFSET"),
        (Token::Inner, "INNER"),
        (Token::Left, "LEFT"),
        (Token::Right, "RIGHT"),
        (Token::Outer, "OUTER"),
        (Token::Full, "FULL"),
        (Token::Cross, "CROSS"),
        (Token::Starts, "STARTS"),
        (Token::Ends, "ENDS"),
        (Token::With, "WITH"),
        (Token::Contains, "CONTAINS"),
        (Token::True, "TRUE"),
        (Token::False, "FALSE"),
        (Token::Enrich, "ENRICH"),
        (Token::Group, "GROUP"),
        (Token::Count, "COUNT"),
        (Token::Sum, "SUM"),
        (Token::Avg, "AVG"),
        (Token::Min, "MIN"),
        (Token::Max, "MAX"),
        (Token::Distinct, "DISTINCT"),
        (Token::Vector, "VECTOR"),
        (Token::Search, "SEARCH"),
        (Token::Similar, "SIMILAR"),
        (Token::Collection, "COLLECTION"),
        (Token::Metric, "METRIC"),
        (Token::Threshold, "THRESHOLD"),
        (Token::K, "K"),
        (Token::Hybrid, "HYBRID"),
        (Token::Fusion, "FUSION"),
        (Token::Rerank, "RERANK"),
        (Token::Rrf, "RRF"),
        (Token::Intersection, "INTERSECTION"),
        (Token::Union, "UNION"),
        (Token::Recursive, "RECURSIVE"),
        (Token::All, "ALL"),
        (Token::Weight, "WEIGHT"),
        (Token::L2, "L2"),
        (Token::Cosine, "COSINE"),
        (Token::InnerProduct, "INNER_PRODUCT"),
        (Token::Include, "INCLUDE"),
        (Token::Metadata, "METADATA"),
        (Token::Vectors, "VECTORS"),
        (Token::Explain, "EXPLAIN"),
        (Token::For, "FOR"),
        (Token::Format, "FORMAT"),
        (Token::Json, "JSON"),
        (Token::Insert, "INSERT"),
        (Token::Into, "INTO"),
        (Token::Values, "VALUES"),
        (Token::Update, "UPDATE"),
        (Token::Set, "SET"),
        (Token::Delete, "DELETE"),
        (Token::Truncate, "TRUNCATE"),
        (Token::Create, "CREATE"),
        (Token::Table, "TABLE"),
        (Token::Drop, "DROP"),
        (Token::Alter, "ALTER"),
        (Token::Add, "ADD"),
        (Token::Column, "COLUMN"),
        (Token::Primary, "PRIMARY"),
        (Token::Key, "KEY"),
        (Token::Default, "DEFAULT"),
        (Token::Compress, "COMPRESS"),
        (Token::Index, "INDEX"),
        (Token::Unique, "UNIQUE"),
        (Token::If, "IF"),
        (Token::Exists, "EXISTS"),
        (Token::Returning, "RETURNING"),
        (Token::Cascade, "CASCADE"),
        (Token::Rename, "RENAME"),
        (Token::Using, "USING"),
        (Token::Node, "NODE"),
        (Token::Edge, "EDGE"),
        (Token::Document, "DOCUMENT"),
        (Token::Kv, "KV"),
        (Token::Timeseries, "TIMESERIES"),
        (Token::Retention, "RETENTION"),
        (Token::Queue, "QUEUE"),
        (Token::Tree, "TREE"),
        (Token::Push, "PUSH"),
        (Token::Pop, "POP"),
        (Token::Peek, "PEEK"),
        (Token::Purge, "PURGE"),
        (Token::Ack, "ACK"),
        (Token::Nack, "NACK"),
        (Token::Priority, "PRIORITY"),
        (Token::Neighborhood, "NEIGHBORHOOD"),
        (Token::ShortestPath, "SHORTEST_PATH"),
        (Token::Centrality, "CENTRALITY"),
        (Token::Community, "COMMUNITY"),
        (Token::Components, "COMPONENTS"),
        (Token::Cycles, "CYCLES"),
        (Token::Traverse, "TRAVERSE"),
        (Token::Depth, "DEPTH"),
        (Token::Direction, "DIRECTION"),
        (Token::Algorithm, "ALGORITHM"),
        (Token::Strategy, "STRATEGY"),
        (Token::MaxIterations, "MAX_ITERATIONS"),
        (Token::MaxLength, "MAX_LENGTH"),
        (Token::Mode, "MODE"),
        (Token::Clustering, "CLUSTERING"),
        (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
        (Token::Properties, "PROPERTIES"),
        (Token::Text, "TEXT"),
        (Token::Fuzzy, "FUZZY"),
        (Token::MinScore, "MIN_SCORE"),
        (Token::Begin, "BEGIN"),
        (Token::Commit, "COMMIT"),
        (Token::Rollback, "ROLLBACK"),
        (Token::Savepoint, "SAVEPOINT"),
        (Token::Release, "RELEASE"),
        (Token::Start, "START"),
        (Token::Transaction, "TRANSACTION"),
        (Token::Work, "WORK"),
        (Token::Vacuum, "VACUUM"),
        (Token::Analyze, "ANALYZE"),
        (Token::Schema, "SCHEMA"),
        (Token::Sequence, "SEQUENCE"),
        (Token::Increment, "INCREMENT"),
        (Token::Copy, "COPY"),
        (Token::Header, "HEADER"),
        (Token::Delimiter, "DELIMITER"),
        (Token::View, "VIEW"),
        (Token::Materialized, "MATERIALIZED"),
        (Token::Refresh, "REFRESH"),
        (Token::Partition, "PARTITION"),
        (Token::Range, "RANGE"),
        (Token::List, "LIST"),
        (Token::Hash, "HASH"),
        (Token::Attach, "ATTACH"),
        (Token::Detach, "DETACH"),
        (Token::Of, "OF"),
        (Token::Policy, "POLICY"),
        (Token::Enable, "ENABLE"),
        (Token::Disable, "DISABLE"),
        (Token::Security, "SECURITY"),
        (Token::Row, "ROW"),
        (Token::Level, "LEVEL"),
        (Token::Foreign, "FOREIGN"),
        (Token::Server, "SERVER"),
        (Token::Wrapper, "WRAPPER"),
        (Token::Options, "OPTIONS"),
        (Token::Data, "DATA"),
        (Token::String("x".into()), "'x'"),
        (Token::Integer(7), "7"),
        (Token::Float(1.5), "1.5"),
        (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
        (Token::Ident("id".into()), "id"),
        (Token::Eq, "="),
        (Token::Ne, "<>"),
        (Token::Lt, "<"),
        (Token::Le, "<="),
        (Token::Gt, ">"),
        (Token::Ge, ">="),
        (Token::Plus, "+"),
        (Token::Minus, "-"),
        (Token::Star, "*"),
        (Token::Slash, "/"),
        (Token::Percent, "%"),
        (Token::LParen, "("),
        (Token::RParen, ")"),
        (Token::LBracket, "["),
        (Token::RBracket, "]"),
        (Token::LBrace, "{"),
        (Token::RBrace, "}"),
        (Token::Comma, ","),
        (Token::Dot, "."),
        (Token::Colon, ":"),
        (Token::Semi, ";"),
        (Token::Dollar, "$"),
        (Token::FatArrow, "=>"),
        (Token::Arrow, "->"),
        (Token::ArrowLeft, "<-"),
        (Token::Dash, "-"),
        (Token::DotDot, ".."),
        (Token::Pipe, "|"),
        (Token::DoublePipe, "||"),
        (Token::Eof, "EOF"),
    ];

    for (token, rendered) in renderings {
        let shown = token.to_string();
        assert_eq!(shown, rendered);
    }
}
2295
/// `=>` must come out as a single `FatArrow`, while a lone `=` remains `Eq`.
#[test]
fn fat_arrow_lexes_distinctly_from_eq() {
    let named_argument = vec![
        Token::Ident("resolution".into()),
        Token::FatArrow,
        Token::Float(0.5),
        Token::Eof,
    ];
    assert_eq!(tokenize("resolution => 0.5"), named_argument);

    let plain_equality = vec![
        Token::Ident("x".into()),
        Token::Eq,
        Token::Integer(1),
        Token::Eof,
    ];
    assert_eq!(tokenize("x = 1"), plain_equality);
}
2318
/// Covers the recognised escapes (`\n`, `\r`, `\t`, `\\`, `\'`, `\"`), an
/// unrecognised escape (`\z`) that is expected to keep its backslash, and two
/// unterminated inputs that must both report "Unterminated string".
#[test]
fn test_string_escape_and_error_matrix() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        text("line\nrow"),
        text("carriage\rreturn"),
        text("tab\tstop"),
        text("slash\\"),
        text("quote'"),
        text("dq\""),
        text(r"raw\z"),
        Token::Eof,
    ];
    let tokens = tokenize(
        r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
    );
    assert_eq!(tokens, expected);

    // First: no closing quote at all. Second: input ends right after a backslash.
    for broken in ["'unterminated", r"'bad\"] {
        let err = Lexer::new(broken).next_token().unwrap_err();
        assert!(err.message.contains("Unterminated string"));
    }
}
2352
/// Grab-bag coverage of remaining lexer paths: less common operators, number
/// edge cases (`123.abc`, `1..2`, `1e+2`), a block comment, `peek_token`
/// not consuming input, the bare `!` error, and the identifier length limit.
#[test]
fn test_operator_comment_peek_limit_and_tokenize_paths() {
    let expected = vec![
        Token::Ne,
        Token::Percent,
        Token::Semi,
        Token::Dollar,
        Token::DoublePipe,
        Token::Pipe,
        Token::Integer(123),
        Token::Dot,
        Token::Ident("abc".into()),
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(2),
        Token::Float(1e2),
        Token::ArrowLeft,
        Token::Arrow,
        Token::Select,
        Token::Eof,
    ];
    assert_eq!(
        tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT"),
        expected
    );

    // Peeking must leave the token in place for the following `next_token`.
    let mut peeking = Lexer::new("SELECT FROM");
    let peeked = peeking.peek_token().unwrap().token;
    assert_eq!(peeked, Token::Select);
    let first = peeking.next_token().unwrap().token;
    assert_eq!(first, Token::Select);
    let second = peeking.next_token().unwrap().token;
    assert_eq!(second, Token::From);

    // A `!` that is not followed by `=` is rejected.
    let bang_error = Lexer::new("!").next_token().unwrap_err();
    assert!(bang_error.message.contains("Expected '=' after '!'"));

    // A four-character identifier against a three-character limit.
    let limits = crate::limits::ParserLimits {
        max_identifier_chars: 3,
        ..crate::limits::ParserLimits::default()
    };
    let mut limited = Lexer::with_limits("abcd", limits);
    assert_eq!(limited.max_identifier_chars(), 3);
    let limit_error = limited.next_token().unwrap_err();
    let hit_identifier_limit = matches!(
        limit_error.limit_hit,
        Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
    );
    assert!(hit_identifier_limit);
}
2403}