1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// A lexical token produced by the lexer in this file.
///
/// Keyword variants carry no payload; the identifier scanner maps source text
/// to them case-insensitively, and `Display` prints their canonical upper-case
/// spelling. Literal and identifier variants carry the scanned value.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- General query keywords, boolean literals and aggregates ----
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // ---- Vector / hybrid search keywords ----
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    /// Spelled `INNER_PRODUCT`; the scanner also accepts `INNERPRODUCT`.
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // ---- Data-modification, schema-definition and EXPLAIN keywords ----
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // ---- Entity-kind keywords ----
    Node,
    Edge,
    Document,
    Kv,

    // ---- Time-series, queue and tree keywords ----
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // ---- Graph-command and text-search option keywords ----
    Neighborhood,
    /// Spelled `SHORTEST_PATH`; the scanner also accepts `SHORTESTPATH`.
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    /// Spelled `MAX_ITERATIONS`; the scanner also accepts `MAXITERATIONS`.
    MaxIterations,
    /// Spelled `MAX_LENGTH`; the scanner also accepts `MAXLENGTH`.
    MaxLength,
    Mode,
    Clustering,
    /// Spelled `TOPOLOGICAL_SORT`; the scanner also accepts `TOPOLOGICALSORT`.
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    /// Spelled `MIN_SCORE`; the scanner also accepts `MINSCORE`.
    MinScore,

    // ---- Transaction-control keywords ----
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // ---- Maintenance keywords ----
    Vacuum,
    Analyze,

    // ---- Schema / sequence keywords ----
    Schema,
    Sequence,
    Increment,

    // ---- COPY keywords ----
    Copy,
    Header,
    Delimiter,

    // ---- View keywords ----
    View,
    Materialized,
    Refresh,

    // ---- Partitioning keywords ----
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // ---- Policy / row-level-security keywords ----
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // ---- Foreign-data keywords ----
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // ---- Sessionization keywords ----
    Sessionize,
    Gap,

    // ---- Window-frame keywords ----
    Over,
    Rows,
    Preceding,
    Following,
    Unbounded,
    Current,

    // ---- Literals ----
    /// Quoted string contents with the surrounding quotes removed and
    /// escapes / doubled quotes already resolved by the string scanner.
    String(String),
    /// Integer literal parsed as `i64`.
    Integer(i64),
    /// Literal with a fractional part and/or exponent, parsed as `f64`.
    Float(f64),
    /// Raw source text of a `{ ... }` object, copied verbatim including the
    /// outer braces; the lexer does not parse or validate the JSON.
    JsonLiteral(String),

    /// Any identifier that did not match a keyword.
    Ident(String),

    // ---- Operators, punctuation and end-of-input ----
    // `Ne` is produced for both `<>` and `!=`. A lone `-` is lexed as `Dash`;
    // `Minus` prints the same text but is not produced by the scanner code
    // in this file.
    Eq, Ne, Lt, Le, Gt, Ge, Plus, Minus, Star, Slash, Percent, LParen, RParen, LBracket, RBracket, LBrace, RBrace, Comma, Dot, Colon, Semi, Dollar, Question, FatArrow, Arrow, ArrowLeft, Dash, DotDot, Pipe, DoublePipe, Eof,
}
291
292impl fmt::Display for Token {
293 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
294 match self {
295 Token::Select => write!(f, "SELECT"),
296 Token::From => write!(f, "FROM"),
297 Token::Where => write!(f, "WHERE"),
298 Token::And => write!(f, "AND"),
299 Token::Or => write!(f, "OR"),
300 Token::Not => write!(f, "NOT"),
301 Token::Match => write!(f, "MATCH"),
302 Token::Return => write!(f, "RETURN"),
303 Token::Join => write!(f, "JOIN"),
304 Token::Graph => write!(f, "GRAPH"),
305 Token::Path => write!(f, "PATH"),
306 Token::To => write!(f, "TO"),
307 Token::Via => write!(f, "VIA"),
308 Token::On => write!(f, "ON"),
309 Token::As => write!(f, "AS"),
310 Token::Is => write!(f, "IS"),
311 Token::Null => write!(f, "NULL"),
312 Token::Between => write!(f, "BETWEEN"),
313 Token::Like => write!(f, "LIKE"),
314 Token::In => write!(f, "IN"),
315 Token::Order => write!(f, "ORDER"),
316 Token::By => write!(f, "BY"),
317 Token::Asc => write!(f, "ASC"),
318 Token::Desc => write!(f, "DESC"),
319 Token::Nulls => write!(f, "NULLS"),
320 Token::First => write!(f, "FIRST"),
321 Token::Last => write!(f, "LAST"),
322 Token::Limit => write!(f, "LIMIT"),
323 Token::Offset => write!(f, "OFFSET"),
324 Token::Inner => write!(f, "INNER"),
325 Token::Left => write!(f, "LEFT"),
326 Token::Right => write!(f, "RIGHT"),
327 Token::Outer => write!(f, "OUTER"),
328 Token::Full => write!(f, "FULL"),
329 Token::Cross => write!(f, "CROSS"),
330 Token::Starts => write!(f, "STARTS"),
331 Token::Ends => write!(f, "ENDS"),
332 Token::With => write!(f, "WITH"),
333 Token::Contains => write!(f, "CONTAINS"),
334 Token::True => write!(f, "TRUE"),
335 Token::False => write!(f, "FALSE"),
336 Token::Enrich => write!(f, "ENRICH"),
337 Token::Group => write!(f, "GROUP"),
338 Token::Count => write!(f, "COUNT"),
339 Token::Sum => write!(f, "SUM"),
340 Token::Avg => write!(f, "AVG"),
341 Token::Min => write!(f, "MIN"),
342 Token::Max => write!(f, "MAX"),
343 Token::Distinct => write!(f, "DISTINCT"),
344 Token::Vector => write!(f, "VECTOR"),
345 Token::Search => write!(f, "SEARCH"),
346 Token::Similar => write!(f, "SIMILAR"),
347 Token::Collection => write!(f, "COLLECTION"),
348 Token::Metric => write!(f, "METRIC"),
349 Token::Threshold => write!(f, "THRESHOLD"),
350 Token::K => write!(f, "K"),
351 Token::Hybrid => write!(f, "HYBRID"),
352 Token::Fusion => write!(f, "FUSION"),
353 Token::Rerank => write!(f, "RERANK"),
354 Token::Rrf => write!(f, "RRF"),
355 Token::Intersection => write!(f, "INTERSECTION"),
356 Token::Union => write!(f, "UNION"),
357 Token::Recursive => write!(f, "RECURSIVE"),
358 Token::All => write!(f, "ALL"),
359 Token::Weight => write!(f, "WEIGHT"),
360 Token::L2 => write!(f, "L2"),
361 Token::Cosine => write!(f, "COSINE"),
362 Token::InnerProduct => write!(f, "INNER_PRODUCT"),
363 Token::Include => write!(f, "INCLUDE"),
364 Token::Metadata => write!(f, "METADATA"),
365 Token::Vectors => write!(f, "VECTORS"),
366 Token::Explain => write!(f, "EXPLAIN"),
367 Token::For => write!(f, "FOR"),
368 Token::Format => write!(f, "FORMAT"),
369 Token::Json => write!(f, "JSON"),
370 Token::Insert => write!(f, "INSERT"),
371 Token::Into => write!(f, "INTO"),
372 Token::Values => write!(f, "VALUES"),
373 Token::Update => write!(f, "UPDATE"),
374 Token::Set => write!(f, "SET"),
375 Token::Delete => write!(f, "DELETE"),
376 Token::Truncate => write!(f, "TRUNCATE"),
377 Token::Create => write!(f, "CREATE"),
378 Token::Table => write!(f, "TABLE"),
379 Token::Drop => write!(f, "DROP"),
380 Token::Alter => write!(f, "ALTER"),
381 Token::Add => write!(f, "ADD"),
382 Token::Column => write!(f, "COLUMN"),
383 Token::Primary => write!(f, "PRIMARY"),
384 Token::Key => write!(f, "KEY"),
385 Token::Default => write!(f, "DEFAULT"),
386 Token::Compress => write!(f, "COMPRESS"),
387 Token::Index => write!(f, "INDEX"),
388 Token::Unique => write!(f, "UNIQUE"),
389 Token::If => write!(f, "IF"),
390 Token::Exists => write!(f, "EXISTS"),
391 Token::Returning => write!(f, "RETURNING"),
392 Token::Cascade => write!(f, "CASCADE"),
393 Token::Rename => write!(f, "RENAME"),
394 Token::Using => write!(f, "USING"),
395 Token::Node => write!(f, "NODE"),
396 Token::Edge => write!(f, "EDGE"),
397 Token::Document => write!(f, "DOCUMENT"),
398 Token::Kv => write!(f, "KV"),
399 Token::Timeseries => write!(f, "TIMESERIES"),
400 Token::Retention => write!(f, "RETENTION"),
401 Token::Queue => write!(f, "QUEUE"),
402 Token::Tree => write!(f, "TREE"),
403 Token::Push => write!(f, "PUSH"),
404 Token::Pop => write!(f, "POP"),
405 Token::Peek => write!(f, "PEEK"),
406 Token::Purge => write!(f, "PURGE"),
407 Token::Ack => write!(f, "ACK"),
408 Token::Nack => write!(f, "NACK"),
409 Token::Priority => write!(f, "PRIORITY"),
410 Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
411 Token::ShortestPath => write!(f, "SHORTEST_PATH"),
412 Token::Centrality => write!(f, "CENTRALITY"),
413 Token::Community => write!(f, "COMMUNITY"),
414 Token::Components => write!(f, "COMPONENTS"),
415 Token::Cycles => write!(f, "CYCLES"),
416 Token::Traverse => write!(f, "TRAVERSE"),
417 Token::Depth => write!(f, "DEPTH"),
418 Token::Direction => write!(f, "DIRECTION"),
419 Token::Algorithm => write!(f, "ALGORITHM"),
420 Token::Strategy => write!(f, "STRATEGY"),
421 Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
422 Token::MaxLength => write!(f, "MAX_LENGTH"),
423 Token::Mode => write!(f, "MODE"),
424 Token::Clustering => write!(f, "CLUSTERING"),
425 Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
426 Token::Properties => write!(f, "PROPERTIES"),
427 Token::Text => write!(f, "TEXT"),
428 Token::Fuzzy => write!(f, "FUZZY"),
429 Token::MinScore => write!(f, "MIN_SCORE"),
430 Token::Begin => write!(f, "BEGIN"),
431 Token::Commit => write!(f, "COMMIT"),
432 Token::Rollback => write!(f, "ROLLBACK"),
433 Token::Savepoint => write!(f, "SAVEPOINT"),
434 Token::Release => write!(f, "RELEASE"),
435 Token::Start => write!(f, "START"),
436 Token::Transaction => write!(f, "TRANSACTION"),
437 Token::Work => write!(f, "WORK"),
438 Token::Vacuum => write!(f, "VACUUM"),
439 Token::Analyze => write!(f, "ANALYZE"),
440 Token::Schema => write!(f, "SCHEMA"),
441 Token::Sequence => write!(f, "SEQUENCE"),
442 Token::Increment => write!(f, "INCREMENT"),
443 Token::Copy => write!(f, "COPY"),
444 Token::Header => write!(f, "HEADER"),
445 Token::Delimiter => write!(f, "DELIMITER"),
446 Token::View => write!(f, "VIEW"),
447 Token::Materialized => write!(f, "MATERIALIZED"),
448 Token::Refresh => write!(f, "REFRESH"),
449 Token::Partition => write!(f, "PARTITION"),
450 Token::Range => write!(f, "RANGE"),
451 Token::List => write!(f, "LIST"),
452 Token::Hash => write!(f, "HASH"),
453 Token::Attach => write!(f, "ATTACH"),
454 Token::Detach => write!(f, "DETACH"),
455 Token::Of => write!(f, "OF"),
456 Token::Policy => write!(f, "POLICY"),
457 Token::Enable => write!(f, "ENABLE"),
458 Token::Disable => write!(f, "DISABLE"),
459 Token::Security => write!(f, "SECURITY"),
460 Token::Row => write!(f, "ROW"),
461 Token::Level => write!(f, "LEVEL"),
462 Token::Foreign => write!(f, "FOREIGN"),
463 Token::Server => write!(f, "SERVER"),
464 Token::Wrapper => write!(f, "WRAPPER"),
465 Token::Options => write!(f, "OPTIONS"),
466 Token::Data => write!(f, "DATA"),
467 Token::Sessionize => write!(f, "SESSIONIZE"),
468 Token::Gap => write!(f, "GAP"),
469 Token::Over => write!(f, "OVER"),
470 Token::Rows => write!(f, "ROWS"),
471 Token::Preceding => write!(f, "PRECEDING"),
472 Token::Following => write!(f, "FOLLOWING"),
473 Token::Unbounded => write!(f, "UNBOUNDED"),
474 Token::Current => write!(f, "CURRENT"),
475 Token::String(s) => write!(f, "'{}'", s),
476 Token::Integer(n) => write!(f, "{}", n),
477 Token::Float(n) => write!(f, "{}", n),
478 Token::JsonLiteral(s) => write!(f, "{}", s),
479 Token::Ident(s) => write!(f, "{}", s),
480 Token::Eq => write!(f, "="),
481 Token::Ne => write!(f, "<>"),
482 Token::Lt => write!(f, "<"),
483 Token::Le => write!(f, "<="),
484 Token::Gt => write!(f, ">"),
485 Token::Ge => write!(f, ">="),
486 Token::Plus => write!(f, "+"),
487 Token::Minus => write!(f, "-"),
488 Token::Star => write!(f, "*"),
489 Token::Slash => write!(f, "/"),
490 Token::Percent => write!(f, "%"),
491 Token::LParen => write!(f, "("),
492 Token::RParen => write!(f, ")"),
493 Token::LBracket => write!(f, "["),
494 Token::RBracket => write!(f, "]"),
495 Token::LBrace => write!(f, "{{"),
496 Token::RBrace => write!(f, "}}"),
497 Token::Comma => write!(f, ","),
498 Token::Dot => write!(f, "."),
499 Token::Colon => write!(f, ":"),
500 Token::Semi => write!(f, ";"),
501 Token::Dollar => write!(f, "$"),
502 Token::Question => write!(f, "?"),
503 Token::FatArrow => write!(f, "=>"),
504 Token::Arrow => write!(f, "->"),
505 Token::ArrowLeft => write!(f, "<-"),
506 Token::Dash => write!(f, "-"),
507 Token::DotDot => write!(f, ".."),
508 Token::Pipe => write!(f, "|"),
509 Token::DoublePipe => write!(f, "||"),
510 Token::Eof => write!(f, "EOF"),
511 }
512 }
513}
514
/// A location in the lexer's input.
///
/// All three fields are plain counters supplied by whoever builds the value;
/// `Default` yields `0` for each of them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number.
    pub line: u32,
    /// Column number within the line.
    pub column: u32,
    /// Offset from the start of the input.
    pub offset: u32,
}

impl Position {
    /// Builds a position from its three components.
    pub fn new(line: u32, column: u32, offset: u32) -> Self {
        Position { line, column, offset }
    }
}

impl fmt::Display for Position {
    /// Formats as `line:column`; the offset is not shown.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Position { line, column, .. } = *self;
        write!(f, "{}:{}", line, column)
    }
}
542
543#[derive(Debug, Clone)]
545pub struct Spanned {
546 pub token: Token,
548 pub start: Position,
550 pub end: Position,
552}
553
554impl Spanned {
555 pub fn new(token: Token, start: Position, end: Position) -> Self {
557 Self { token, start, end }
558 }
559}
560
/// Error returned when the lexer cannot produce a token.
#[derive(Debug, Clone)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Source position associated with the error.
    pub position: Position,
    /// `Some` when the error was raised because a configured lexer limit was
    /// exceeded (see `LexerError::with_limit`); `None` for ordinary errors.
    pub limit_hit: Option<LexerLimitHit>,
}
573
/// Structured description of which lexer limit was exceeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// An identifier grew past the configured maximum number of characters.
    IdentifierTooLong {
        /// Name of the limit that was exceeded (the identifier scanner
        /// reports `"max_identifier_chars"`).
        limit_name: &'static str,
        /// The configured limit value. The identifier scanner reports the
        /// limit itself here, not the length of the offending identifier.
        value: usize,
    },
}
583
584impl LexerError {
585 pub fn new(message: impl Into<String>, position: Position) -> Self {
587 Self {
588 message: message.into(),
589 position,
590 limit_hit: None,
591 }
592 }
593
594 pub(crate) fn with_limit(
596 message: impl Into<String>,
597 position: Position,
598 limit_hit: LexerLimitHit,
599 ) -> Self {
600 Self {
601 message: message.into(),
602 position,
603 limit_hit: Some(limit_hit),
604 }
605 }
606}
607
608impl fmt::Display for LexerError {
609 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
610 write!(f, "Lexer error at {}: {}", self.position, self.message)
611 }
612}
613
// Marker impl with no overridden methods, so `source()` keeps its default
// and reports no underlying cause.
impl std::error::Error for LexerError {}
615
/// Maximum size, in bytes of source text, of a single JSON object literal
/// (16 MiB). `Lexer::scan_json_literal` returns an error for longer literals.
pub const JSON_LITERAL_MAX_BYTES: usize = 16 * 1024 * 1024;
623
/// Streaming lexer over a borrowed source string.
///
/// Tokens are pulled one at a time with `next_token` / `peek_token`, or all
/// at once with `tokenize`.
pub struct Lexer<'a> {
    /// The complete source text; sliced by byte offset for multi-character
    /// lookahead and for copying raw JSON literals.
    input: &'a str,
    /// Character cursor over `input`.
    chars: Peekable<Chars<'a>>,
    /// Current line; starts at 1 and is bumped on every `'\n'` consumed.
    line: u32,
    /// Current column; starts at 1 and resets to 1 after a `'\n'`.
    column: u32,
    /// Current byte offset into `input`; starts at 0.
    offset: u32,
    /// Token buffered by `peek_token`, handed out by the next `next_token`.
    peeked: Option<Spanned>,
    /// A single character pushed back by `unget`, with the position it was
    /// originally read at; consumed before `chars` by `peek` / `advance`.
    putback: Option<(char, Position)>,
    /// Maximum number of characters allowed in one identifier.
    max_identifier_chars: usize,
}
642
643impl<'a> Lexer<'a> {
644 pub fn new(input: &'a str) -> Self {
646 Self::with_limits(
647 input,
648 crate::storage::query::parser::ParserLimits::default(),
649 )
650 }
651
652 pub fn with_limits(
654 input: &'a str,
655 limits: crate::storage::query::parser::ParserLimits,
656 ) -> Self {
657 Self {
658 input,
659 chars: input.chars().peekable(),
660 line: 1,
661 column: 1,
662 offset: 0,
663 peeked: None,
664 putback: None,
665 max_identifier_chars: limits.max_identifier_chars,
666 }
667 }
668
    /// Returns the maximum identifier length (in characters) this lexer was
    /// configured with.
    pub(crate) fn max_identifier_chars(&self) -> usize {
        self.max_identifier_chars
    }
675
676 fn position(&self) -> Position {
678 Position::new(self.line, self.column, self.offset)
679 }
680
681 fn unget(&mut self, ch: char, pos: Position) {
683 self.putback = Some((ch, pos));
684 }
685
686 fn advance(&mut self) -> Option<char> {
688 if let Some((ch, pos)) = self.putback.take() {
690 self.line = pos.line;
692 self.column = pos.column + 1;
693 self.offset = pos.offset + ch.len_utf8() as u32;
694 return Some(ch);
695 }
696
697 let ch = self.chars.next()?;
698 self.offset += ch.len_utf8() as u32;
699 if ch == '\n' {
700 self.line += 1;
701 self.column = 1;
702 } else {
703 self.column += 1;
704 }
705 Some(ch)
706 }
707
708 fn peek(&mut self) -> Option<char> {
710 if let Some((ch, _)) = &self.putback {
712 return Some(*ch);
713 }
714 self.chars.peek().copied()
715 }
716
717 fn skip_whitespace(&mut self) {
719 while let Some(ch) = self.peek() {
720 if ch.is_whitespace() {
721 self.advance();
722 } else if ch == '-' {
723 let pos = self.position();
725 self.advance();
726 if self.peek() == Some('-') {
727 self.advance();
729 while let Some(c) = self.peek() {
730 if c == '\n' {
731 break;
732 }
733 self.advance();
734 }
735 } else {
736 self.line = pos.line;
739 self.column = pos.column;
740 self.offset = pos.offset;
741 break;
744 }
745 } else {
746 break;
747 }
748 }
749 }
750
751 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
753 if self.peeked.is_none() {
754 self.peeked = Some(self.next_token_internal()?);
755 }
756 Ok(self.peeked.as_ref().unwrap())
757 }
758
759 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
761 if let Some(tok) = self.peeked.take() {
762 return Ok(tok);
763 }
764 self.next_token_internal()
765 }
766
767 fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
769 self.skip_whitespace_simple();
770
771 let start = self.position();
772
773 let ch = match self.peek() {
774 Some(c) => c,
775 None => {
776 return Ok(Spanned::new(Token::Eof, start, start));
777 }
778 };
779
780 let token = match ch {
782 '\'' | '"' => self.scan_string()?,
784
785 '0'..='9' => self.scan_number()?,
787
788 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
790
791 '=' => {
793 self.advance();
794 if self.peek() == Some('>') {
797 self.advance();
798 Token::FatArrow
799 } else {
800 Token::Eq
801 }
802 }
803 '<' => self.scan_less_than()?,
804 '>' => self.scan_greater_than()?,
805 '!' => {
806 self.advance();
807 if self.peek() == Some('=') {
808 self.advance();
809 Token::Ne
810 } else {
811 return Err(LexerError::new("Expected '=' after '!'", start));
812 }
813 }
814 '+' => {
815 self.advance();
816 Token::Plus
817 }
818 '-' => self.scan_minus()?,
819 '*' => {
820 self.advance();
821 Token::Star
822 }
823 '/' => {
824 self.advance();
825 Token::Slash
826 }
827 '%' => {
828 self.advance();
829 Token::Percent
830 }
831 '(' => {
832 self.advance();
833 Token::LParen
834 }
835 ')' => {
836 self.advance();
837 Token::RParen
838 }
839 '[' => {
840 self.advance();
841 Token::LBracket
842 }
843 ']' => {
844 self.advance();
845 Token::RBracket
846 }
847 '{' => {
848 if self.looks_like_json_object_start() {
855 return self.scan_json_literal(start);
856 }
857 self.advance();
858 Token::LBrace
859 }
860 '}' => {
861 self.advance();
862 Token::RBrace
863 }
864 ',' => {
865 self.advance();
866 Token::Comma
867 }
868 '.' => self.scan_dot()?,
869 ':' => {
870 self.advance();
871 Token::Colon
872 }
873 ';' => {
874 self.advance();
875 Token::Semi
876 }
877 '$' => {
878 self.advance();
879 Token::Dollar
880 }
881 '?' => {
882 self.advance();
883 Token::Question
884 }
885 '|' => {
886 self.advance();
887 if self.peek() == Some('|') {
888 self.advance();
889 Token::DoublePipe
890 } else {
891 Token::Pipe
892 }
893 }
894 _ => {
895 return Err(LexerError::new(
896 format!("Unexpected character: '{}'", ch),
897 start,
898 ));
899 }
900 };
901
902 let end = self.position();
903 Ok(Spanned::new(token, start, end))
904 }
905
906 fn skip_whitespace_simple(&mut self) {
908 while let Some(ch) = self.peek() {
909 if ch.is_whitespace() {
910 self.advance();
911 } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
912 self.advance();
913 self.advance();
914 while let Some(c) = self.peek() {
915 if c == '\n' {
916 break;
917 }
918 self.advance();
919 }
920 } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
921 self.advance();
922 self.advance();
923 while let Some(c) = self.peek() {
924 self.advance();
925 if c == '*' && self.peek() == Some('/') {
926 self.advance();
927 break;
928 }
929 }
930 } else {
931 break;
932 }
933 }
934 }
935
936 fn scan_string(&mut self) -> Result<Token, LexerError> {
938 let quote = self.advance().unwrap(); let start = self.position();
940 let mut value = String::new();
941
942 loop {
943 match self.peek() {
944 None => {
945 return Err(LexerError::new("Unterminated string", start));
946 }
947 Some(c) if c == quote => {
948 self.advance();
949 if self.peek() == Some(quote) {
951 self.advance();
952 value.push(quote);
953 } else {
954 break;
955 }
956 }
957 Some('\\') => {
958 self.advance();
959 match self.peek() {
960 Some('n') => {
961 self.advance();
962 value.push('\n');
963 }
964 Some('r') => {
965 self.advance();
966 value.push('\r');
967 }
968 Some('t') => {
969 self.advance();
970 value.push('\t');
971 }
972 Some('\\') => {
973 self.advance();
974 value.push('\\');
975 }
976 Some(c) if c == quote => {
977 self.advance();
978 value.push(quote);
979 }
980 Some(c) => {
981 value.push('\\');
983 value.push(c);
984 self.advance();
985 }
986 None => {
987 return Err(LexerError::new("Unterminated string", start));
988 }
989 }
990 }
991 Some(c) => {
992 self.advance();
993 value.push(c);
994 }
995 }
996 }
997
998 Ok(Token::String(value))
999 }
1000
1001 fn scan_number(&mut self) -> Result<Token, LexerError> {
1003 let mut value = String::new();
1004 let mut is_float = false;
1005
1006 while let Some(ch) = self.peek() {
1008 if ch.is_ascii_digit() {
1009 value.push(ch);
1010 self.advance();
1011 } else {
1012 break;
1013 }
1014 }
1015
1016 if self.peek() == Some('.') {
1018 let dot_pos = self.position();
1020 self.advance(); if self.peek() == Some('.') {
1023 self.unget('.', dot_pos);
1025 } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
1027 is_float = true;
1028 value.push('.');
1029 while let Some(ch) = self.peek() {
1030 if ch.is_ascii_digit() {
1031 value.push(ch);
1032 self.advance();
1033 } else {
1034 break;
1035 }
1036 }
1037 } else {
1038 self.unget('.', dot_pos);
1040 }
1041 }
1042
1043 if self.peek() == Some('e') || self.peek() == Some('E') {
1045 is_float = true;
1046 value.push(self.advance().unwrap());
1047
1048 if self.peek() == Some('+') || self.peek() == Some('-') {
1049 value.push(self.advance().unwrap());
1050 }
1051
1052 while let Some(ch) = self.peek() {
1053 if ch.is_ascii_digit() {
1054 value.push(ch);
1055 self.advance();
1056 } else {
1057 break;
1058 }
1059 }
1060 }
1061
1062 if is_float {
1063 match value.parse::<f64>() {
1064 Ok(n) => Ok(Token::Float(n)),
1065 Err(_) => Err(LexerError::new(
1066 format!("Invalid float: {}", value),
1067 self.position(),
1068 )),
1069 }
1070 } else {
1071 match value.parse::<i64>() {
1072 Ok(n) => Ok(Token::Integer(n)),
1073 Err(_) => Err(LexerError::new(
1074 format!("Invalid integer: {}", value),
1075 self.position(),
1076 )),
1077 }
1078 }
1079 }
1080
1081 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1083 let start_pos = self.position();
1084 let mut value = String::new();
1085 let max = self.max_identifier_chars;
1086
1087 while let Some(ch) = self.peek() {
1088 if ch.is_alphanumeric() || ch == '_' {
1089 if value.chars().count() >= max {
1090 return Err(LexerError::with_limit(
1094 format!(
1095 "identifier exceeds maximum length (max_identifier_chars = {})",
1096 max
1097 ),
1098 start_pos,
1099 LexerLimitHit::IdentifierTooLong {
1100 limit_name: "max_identifier_chars",
1101 value: max,
1102 },
1103 ));
1104 }
1105 value.push(ch);
1106 self.advance();
1107 } else {
1108 break;
1109 }
1110 }
1111
1112 let token = match value.to_uppercase().as_str() {
1114 "SELECT" => Token::Select,
1115 "FROM" => Token::From,
1116 "WHERE" => Token::Where,
1117 "AND" => Token::And,
1118 "OR" => Token::Or,
1119 "NOT" => Token::Not,
1120 "MATCH" => Token::Match,
1121 "RETURN" => Token::Return,
1122 "JOIN" => Token::Join,
1123 "GRAPH" => Token::Graph,
1124 "PATH" => Token::Path,
1125 "TO" => Token::To,
1126 "VIA" => Token::Via,
1127 "ON" => Token::On,
1128 "AS" => Token::As,
1129 "IS" => Token::Is,
1130 "NULL" => Token::Null,
1131 "BETWEEN" => Token::Between,
1132 "LIKE" => Token::Like,
1133 "IN" => Token::In,
1134 "ORDER" => Token::Order,
1135 "BY" => Token::By,
1136 "ASC" => Token::Asc,
1137 "DESC" => Token::Desc,
1138 "NULLS" => Token::Nulls,
1139 "FIRST" => Token::First,
1140 "LAST" => Token::Last,
1141 "LIMIT" => Token::Limit,
1142 "OFFSET" => Token::Offset,
1143 "INNER" => Token::Inner,
1144 "LEFT" => Token::Left,
1145 "RIGHT" => Token::Right,
1146 "OUTER" => Token::Outer,
1147 "FULL" => Token::Full,
1148 "CROSS" => Token::Cross,
1149 "STARTS" => Token::Starts,
1150 "ENDS" => Token::Ends,
1151 "WITH" => Token::With,
1152 "CONTAINS" => Token::Contains,
1153 "TRUE" => Token::True,
1154 "FALSE" => Token::False,
1155 "ENRICH" => Token::Enrich,
1156 "GROUP" => Token::Group,
1157 "COUNT" => Token::Count,
1158 "SUM" => Token::Sum,
1159 "AVG" => Token::Avg,
1160 "MIN" => Token::Min,
1161 "MAX" => Token::Max,
1162 "DISTINCT" => Token::Distinct,
1163 "VECTOR" => Token::Vector,
1164 "SEARCH" => Token::Search,
1165 "SIMILAR" => Token::Similar,
1166 "COLLECTION" => Token::Collection,
1167 "METRIC" => Token::Metric,
1168 "THRESHOLD" => Token::Threshold,
1169 "K" => Token::K,
1170 "HYBRID" => Token::Hybrid,
1171 "FUSION" => Token::Fusion,
1172 "RERANK" => Token::Rerank,
1173 "RRF" => Token::Rrf,
1174 "INTERSECTION" => Token::Intersection,
1175 "UNION" => Token::Union,
1176 "RECURSIVE" => Token::Recursive,
1177 "ALL" => Token::All,
1178 "WEIGHT" => Token::Weight,
1179 "L2" => Token::L2,
1180 "COSINE" => Token::Cosine,
1181 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1182 "INCLUDE" => Token::Include,
1183 "METADATA" => Token::Metadata,
1184 "VECTORS" => Token::Vectors,
1185 "EXPLAIN" => Token::Explain,
1186 "FOR" => Token::For,
1187 "FORMAT" => Token::Format,
1188 "JSON" => Token::Json,
1189 "INSERT" => Token::Insert,
1190 "INTO" => Token::Into,
1191 "VALUES" => Token::Values,
1192 "UPDATE" => Token::Update,
1193 "SET" => Token::Set,
1194 "DELETE" => Token::Delete,
1195 "TRUNCATE" => Token::Truncate,
1196 "CREATE" => Token::Create,
1197 "TABLE" => Token::Table,
1198 "DROP" => Token::Drop,
1199 "ALTER" => Token::Alter,
1200 "ADD" => Token::Add,
1201 "COLUMN" => Token::Column,
1202 "PRIMARY" => Token::Primary,
1203 "KEY" => Token::Key,
1204 "DEFAULT" => Token::Default,
1205 "COMPRESS" => Token::Compress,
1206 "INDEX" => Token::Index,
1207 "UNIQUE" => Token::Unique,
1208 "IF" => Token::If,
1209 "EXISTS" => Token::Exists,
1210 "RETURNING" => Token::Returning,
1211 "CASCADE" => Token::Cascade,
1212 "RENAME" => Token::Rename,
1213 "USING" => Token::Using,
1214 "NODE" => Token::Node,
1215 "EDGE" => Token::Edge,
1216 "DOCUMENT" => Token::Document,
1217 "KV" => Token::Kv,
1218 "TIMESERIES" => Token::Timeseries,
1219 "RETENTION" => Token::Retention,
1220 "QUEUE" => Token::Queue,
1221 "TREE" => Token::Tree,
1222 "PUSH" => Token::Push,
1223 "POP" => Token::Pop,
1224 "PEEK" => Token::Peek,
1225 "PURGE" => Token::Purge,
1226 "ACK" => Token::Ack,
1227 "NACK" => Token::Nack,
1228 "PRIORITY" => Token::Priority,
1229 "LPUSH" => Token::Ident("LPUSH".to_string()),
1230 "RPUSH" => Token::Ident("RPUSH".to_string()),
1231 "LPOP" => Token::Ident("LPOP".to_string()),
1232 "RPOP" => Token::Ident("RPOP".to_string()),
1233 "NEIGHBORHOOD" => Token::Neighborhood,
1234 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1235 "CENTRALITY" => Token::Centrality,
1236 "COMMUNITY" => Token::Community,
1237 "COMPONENTS" => Token::Components,
1238 "CYCLES" => Token::Cycles,
1239 "TRAVERSE" => Token::Traverse,
1240 "DEPTH" => Token::Depth,
1241 "DIRECTION" => Token::Direction,
1242 "ALGORITHM" => Token::Algorithm,
1243 "STRATEGY" => Token::Strategy,
1244 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1245 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1246 "MODE" => Token::Mode,
1247 "CLUSTERING" => Token::Clustering,
1248 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1249 "PROPERTIES" => Token::Properties,
1250 "TEXT" => Token::Text,
1251 "FUZZY" => Token::Fuzzy,
1252 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1253 "BEGIN" => Token::Begin,
1254 "COMMIT" => Token::Commit,
1255 "ROLLBACK" => Token::Rollback,
1256 "SAVEPOINT" => Token::Savepoint,
1257 "RELEASE" => Token::Release,
1258 "START" => Token::Start,
1259 "TRANSACTION" => Token::Transaction,
1260 "WORK" => Token::Work,
1261 "VACUUM" => Token::Vacuum,
1262 "ANALYZE" => Token::Analyze,
1263 "SCHEMA" => Token::Schema,
1264 "SEQUENCE" => Token::Sequence,
1265 "INCREMENT" => Token::Increment,
1266 "COPY" => Token::Copy,
1267 "HEADER" => Token::Header,
1268 "DELIMITER" => Token::Delimiter,
1269 "VIEW" => Token::View,
1270 "MATERIALIZED" => Token::Materialized,
1271 "REFRESH" => Token::Refresh,
1272 "PARTITION" => Token::Partition,
1273 "RANGE" => Token::Range,
1274 "LIST" => Token::List,
1275 "HASH" => Token::Hash,
1276 "ATTACH" => Token::Attach,
1277 "DETACH" => Token::Detach,
1278 "OF" => Token::Of,
1279 "POLICY" => Token::Policy,
1280 "ENABLE" => Token::Enable,
1281 "DISABLE" => Token::Disable,
1282 "SECURITY" => Token::Security,
1283 "ROW" => Token::Row,
1284 "LEVEL" => Token::Level,
1285 "FOREIGN" => Token::Foreign,
1286 "SERVER" => Token::Server,
1287 "WRAPPER" => Token::Wrapper,
1288 "OPTIONS" => Token::Options,
1289 "DATA" => Token::Data,
1290 "SESSIONIZE" => Token::Sessionize,
1291 "GAP" => Token::Gap,
1292 "OVER" => Token::Over,
1293 "ROWS" => Token::Rows,
1294 "PRECEDING" => Token::Preceding,
1295 "FOLLOWING" => Token::Following,
1296 "UNBOUNDED" => Token::Unbounded,
1297 "CURRENT" => Token::Current,
1298 _ => Token::Ident(value),
1299 };
1300
1301 Ok(token)
1302 }
1303
1304 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1306 self.advance(); match self.peek() {
1308 Some('=') => {
1309 self.advance();
1310 Ok(Token::Le)
1311 }
1312 Some('>') => {
1313 self.advance();
1314 Ok(Token::Ne)
1315 }
1316 Some('-') => {
1317 self.advance();
1318 Ok(Token::ArrowLeft)
1319 }
1320 _ => Ok(Token::Lt),
1321 }
1322 }
1323
1324 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1326 self.advance(); if self.peek() == Some('=') {
1328 self.advance();
1329 Ok(Token::Ge)
1330 } else {
1331 Ok(Token::Gt)
1332 }
1333 }
1334
1335 fn scan_minus(&mut self) -> Result<Token, LexerError> {
1337 self.advance(); match self.peek() {
1339 Some('>') => {
1340 self.advance();
1341 Ok(Token::Arrow)
1342 }
1343 Some('-') => {
1344 self.advance();
1346 while let Some(c) = self.peek() {
1347 if c == '\n' {
1348 break;
1349 }
1350 self.advance();
1351 }
1352 self.skip_whitespace_simple();
1354 if self.peek().is_none() {
1355 Ok(Token::Eof)
1356 } else {
1357 let next = self.next_token_internal()?;
1358 Ok(next.token)
1359 }
1360 }
1361 _ => Ok(Token::Dash),
1362 }
1363 }
1364
1365 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1367 self.advance(); if self.peek() == Some('.') {
1369 self.advance();
1370 Ok(Token::DotDot)
1371 } else {
1372 Ok(Token::Dot)
1373 }
1374 }
1375
1376 fn looks_like_json_object_start(&self) -> bool {
1381 let bytes = self.input.as_bytes();
1382 let mut i = self.offset as usize;
1383 debug_assert!(bytes.get(i) == Some(&b'{'));
1385 i += 1;
1386 while i < bytes.len() {
1387 match bytes[i] {
1388 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1389 b'"' | b'}' => return true,
1390 _ => return false,
1391 }
1392 }
1393 false
1394 }
1395
1396 fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1413 let start_offset = self.offset as usize;
1414 self.advance();
1416 let mut depth: u32 = 1;
1417 let mut in_string = false;
1418 let mut escape = false;
1419 loop {
1420 let ch = match self.peek() {
1421 Some(c) => c,
1422 None => {
1423 return Err(LexerError::new(
1424 format!(
1425 "unterminated JSON object literal (started at offset {})",
1426 start.offset
1427 ),
1428 self.position(),
1429 ));
1430 }
1431 };
1432
1433 let scanned_bytes = self.offset as usize - start_offset;
1435 if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1436 return Err(LexerError::new(
1437 format!(
1438 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1439 JSON_LITERAL_MAX_BYTES
1440 ),
1441 start,
1442 ));
1443 }
1444
1445 self.advance();
1446
1447 if escape {
1448 escape = false;
1449 continue;
1450 }
1451
1452 if in_string {
1453 match ch {
1454 '\\' => escape = true,
1455 '"' => in_string = false,
1456 _ => {}
1457 }
1458 continue;
1459 }
1460
1461 match ch {
1462 '"' => in_string = true,
1463 '{' => depth += 1,
1464 '}' => {
1465 depth -= 1;
1466 if depth == 0 {
1467 let end = self.position();
1468 let end_offset = self.offset as usize;
1469 if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1471 return Err(LexerError::new(
1472 format!(
1473 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1474 JSON_LITERAL_MAX_BYTES
1475 ),
1476 start,
1477 ));
1478 }
1479 let raw = self.input[start_offset..end_offset].to_string();
1480 return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1481 }
1482 }
1483 _ => {}
1484 }
1485 }
1486 }
1487
1488 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1490 let mut tokens = Vec::new();
1491 loop {
1492 let tok = self.next_token()?;
1493 let is_eof = tok.token == Token::Eof;
1494 tokens.push(tok);
1495 if is_eof {
1496 break;
1497 }
1498 }
1499 Ok(tokens)
1500 }
1501}
1502
1503#[cfg(test)]
1508mod tests {
1509 use super::*;
1510
1511 fn tokenize(input: &str) -> Vec<Token> {
1512 let mut lexer = Lexer::new(input);
1513 lexer
1514 .tokenize()
1515 .unwrap()
1516 .into_iter()
1517 .map(|s| s.token)
1518 .collect()
1519 }
1520
1521 #[test]
1522 fn test_keywords() {
1523 let tokens = tokenize("SELECT FROM WHERE AND OR NOT");
1524 assert_eq!(
1525 tokens,
1526 vec![
1527 Token::Select,
1528 Token::From,
1529 Token::Where,
1530 Token::And,
1531 Token::Or,
1532 Token::Not,
1533 Token::Eof
1534 ]
1535 );
1536 }
1537
1538 #[test]
1539 fn test_identifiers() {
1540 let tokens = tokenize("hosts users ip_address");
1541 assert_eq!(
1542 tokens,
1543 vec![
1544 Token::Ident("hosts".into()),
1545 Token::Ident("users".into()),
1546 Token::Ident("ip_address".into()),
1547 Token::Eof
1548 ]
1549 );
1550 }
1551
1552 #[test]
1553 fn test_numbers() {
1554 let tokens = tokenize("42 2.5 1e10 2.5e-3");
1555 assert_eq!(
1556 tokens,
1557 vec![
1558 Token::Integer(42),
1559 Token::Float(2.5),
1560 Token::Float(1e10),
1561 Token::Float(2.5e-3),
1562 Token::Eof
1563 ]
1564 );
1565 }
1566
1567 #[test]
1568 fn test_strings() {
1569 let tokens = tokenize("'hello' \"world\" 'it''s'");
1570 assert_eq!(
1571 tokens,
1572 vec![
1573 Token::String("hello".into()),
1574 Token::String("world".into()),
1575 Token::String("it's".into()),
1576 Token::Eof
1577 ]
1578 );
1579 }
1580
1581 #[test]
1582 fn test_operators() {
1583 let tokens = tokenize("= <> < <= > >= != + - * /");
1584 assert_eq!(
1585 tokens,
1586 vec![
1587 Token::Eq,
1588 Token::Ne,
1589 Token::Lt,
1590 Token::Le,
1591 Token::Gt,
1592 Token::Ge,
1593 Token::Ne,
1594 Token::Plus,
1595 Token::Dash,
1596 Token::Star,
1597 Token::Slash,
1598 Token::Eof
1599 ]
1600 );
1601 }
1602
1603 #[test]
1604 fn test_delimiters() {
1605 let tokens = tokenize("( ) [ ] { a } , . : ;");
1610 assert_eq!(
1611 tokens,
1612 vec![
1613 Token::LParen,
1614 Token::RParen,
1615 Token::LBracket,
1616 Token::RBracket,
1617 Token::LBrace,
1618 Token::Ident("a".into()),
1619 Token::RBrace,
1620 Token::Comma,
1621 Token::Dot,
1622 Token::Colon,
1623 Token::Semi,
1624 Token::Eof
1625 ]
1626 );
1627 }
1628
1629 #[test]
1630 fn test_json_literal_empty_object() {
1631 let tokens = tokenize("{ }");
1632 assert_eq!(tokens, vec![Token::JsonLiteral("{ }".into()), Token::Eof]);
1633 }
1634
1635 #[test]
1636 fn test_json_literal_simple() {
1637 let tokens = tokenize(r#"{"a":1}"#);
1638 assert_eq!(
1639 tokens,
1640 vec![Token::JsonLiteral(r#"{"a":1}"#.into()), Token::Eof]
1641 );
1642 }
1643
1644 #[test]
1645 fn test_json_literal_nested() {
1646 let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
1647 let tokens = tokenize(raw);
1648 assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1649 }
1650
1651 #[test]
1652 fn test_json_literal_escaped_quote_in_string() {
1653 let raw = r#"{"path":"O\"Brien}"}"#;
1655 let tokens = tokenize(raw);
1656 assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1657 }
1658
1659 #[test]
1660 fn test_json_literal_unbalanced_eof() {
1661 let mut lexer = Lexer::new(r#"{"a":1"#);
1662 let err = lexer.tokenize().expect_err("expected unterminated error");
1663 assert!(
1664 err.message.contains("unterminated JSON object literal"),
1665 "got: {}",
1666 err.message
1667 );
1668 }
1669
1670 #[test]
1671 fn test_json_literal_property_bag_compatible() {
1672 let tokens = tokenize("{name: 'value'}");
1675 assert_eq!(tokens[0], Token::LBrace);
1676 assert_eq!(*tokens.last().unwrap(), Token::Eof);
1677 }
1678
1679 #[test]
1680 fn test_graph_syntax() {
1681 let tokens = tokenize("-> <- - ..");
1682 assert_eq!(
1683 tokens,
1684 vec![
1685 Token::Arrow,
1686 Token::ArrowLeft,
1687 Token::Dash,
1688 Token::DotDot,
1689 Token::Eof
1690 ]
1691 );
1692 }
1693
1694 #[test]
1695 fn test_table_query() {
1696 let tokens = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
1697 assert_eq!(
1698 tokens,
1699 vec![
1700 Token::Select,
1701 Token::Ident("ip".into()),
1702 Token::Comma,
1703 Token::Ident("hostname".into()),
1704 Token::From,
1705 Token::Ident("hosts".into()),
1706 Token::Where,
1707 Token::Ident("os".into()),
1708 Token::Eq,
1709 Token::String("Linux".into()),
1710 Token::Limit,
1711 Token::Integer(10),
1712 Token::Eof
1713 ]
1714 );
1715 }
1716
1717 #[test]
1718 fn test_graph_query() {
1719 let tokens = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
1720 assert_eq!(
1721 tokens,
1722 vec![
1723 Token::Match,
1724 Token::LParen,
1725 Token::Ident("h".into()),
1726 Token::Colon,
1727 Token::Ident("Host".into()),
1728 Token::RParen,
1729 Token::Dash,
1730 Token::LBracket,
1731 Token::Colon,
1732 Token::Ident("HAS_SERVICE".into()),
1733 Token::RBracket,
1734 Token::Arrow,
1735 Token::LParen,
1736 Token::Ident("s".into()),
1737 Token::Colon,
1738 Token::Ident("Service".into()),
1739 Token::RParen,
1740 Token::Return,
1741 Token::Ident("h".into()),
1742 Token::Comma,
1743 Token::Ident("s".into()),
1744 Token::Eof
1745 ]
1746 );
1747 }
1748
1749 #[test]
1750 fn test_join_query() {
1751 let tokens = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
1752 assert_eq!(
1753 tokens,
1754 vec![
1755 Token::From,
1756 Token::Ident("hosts".into()),
1757 Token::Ident("h".into()),
1758 Token::Join,
1759 Token::Graph,
1760 Token::LParen,
1761 Token::Ident("h".into()),
1762 Token::RParen,
1763 Token::Dash,
1764 Token::LBracket,
1765 Token::Colon,
1766 Token::Ident("HAS_VULN".into()),
1767 Token::RBracket,
1768 Token::Arrow,
1769 Token::LParen,
1770 Token::Ident("v".into()),
1771 Token::RParen,
1772 Token::On,
1773 Token::Ident("h".into()),
1774 Token::Dot,
1775 Token::Ident("ip".into()),
1776 Token::Eq,
1777 Token::Ident("v".into()),
1778 Token::Dot,
1779 Token::Ident("id".into()),
1780 Token::Eof
1781 ]
1782 );
1783 }
1784
1785 #[test]
1786 fn test_path_query() {
1787 let tokens = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
1788 assert_eq!(
1789 tokens,
1790 vec![
1791 Token::Path,
1792 Token::From,
1793 Token::Ident("host".into()),
1794 Token::LParen,
1795 Token::String("192.168.1.1".into()),
1796 Token::RParen,
1797 Token::To,
1798 Token::Ident("host".into()),
1799 Token::LParen,
1800 Token::String("10.0.0.1".into()),
1801 Token::RParen,
1802 Token::Via,
1803 Token::LBracket,
1804 Token::Colon,
1805 Token::Ident("AUTH".into()),
1806 Token::RBracket,
1807 Token::Eof
1808 ]
1809 );
1810 }
1811
1812 #[test]
1813 fn test_variable_length_pattern() {
1814 let tokens = tokenize("(a)-[*1..5]->(b)");
1815 assert_eq!(
1816 tokens,
1817 vec![
1818 Token::LParen,
1819 Token::Ident("a".into()),
1820 Token::RParen,
1821 Token::Dash,
1822 Token::LBracket,
1823 Token::Star,
1824 Token::Integer(1),
1825 Token::DotDot,
1826 Token::Integer(5),
1827 Token::RBracket,
1828 Token::Arrow,
1829 Token::LParen,
1830 Token::Ident("b".into()),
1831 Token::RParen,
1832 Token::Eof
1833 ]
1834 );
1835 }
1836
1837 #[test]
1838 fn test_case_insensitive_keywords() {
1839 let tokens = tokenize("select FROM Where AND");
1840 assert_eq!(
1841 tokens,
1842 vec![
1843 Token::Select,
1844 Token::From,
1845 Token::Where,
1846 Token::And,
1847 Token::Eof
1848 ]
1849 );
1850 }
1851
1852 #[test]
1853 fn test_comments() {
1854 let tokens = tokenize("SELECT -- this is a comment\nip FROM hosts");
1855 assert_eq!(
1856 tokens,
1857 vec![
1858 Token::Select,
1859 Token::Ident("ip".into()),
1860 Token::From,
1861 Token::Ident("hosts".into()),
1862 Token::Eof
1863 ]
1864 );
1865 }
1866
1867 #[test]
1868 fn test_escaped_strings() {
1869 let tokens = tokenize(r"'hello\nworld' 'tab\there'");
1870 assert_eq!(
1871 tokens,
1872 vec![
1873 Token::String("hello\nworld".into()),
1874 Token::String("tab\there".into()),
1875 Token::Eof
1876 ]
1877 );
1878 }
1879
1880 #[test]
1881 fn test_keyword_matrix_and_alias_spellings() {
1882 let cases = [
1883 ("SELECT", Token::Select),
1884 ("FROM", Token::From),
1885 ("WHERE", Token::Where),
1886 ("AND", Token::And),
1887 ("OR", Token::Or),
1888 ("NOT", Token::Not),
1889 ("MATCH", Token::Match),
1890 ("RETURN", Token::Return),
1891 ("JOIN", Token::Join),
1892 ("GRAPH", Token::Graph),
1893 ("PATH", Token::Path),
1894 ("TO", Token::To),
1895 ("VIA", Token::Via),
1896 ("ON", Token::On),
1897 ("AS", Token::As),
1898 ("IS", Token::Is),
1899 ("NULL", Token::Null),
1900 ("BETWEEN", Token::Between),
1901 ("LIKE", Token::Like),
1902 ("IN", Token::In),
1903 ("ORDER", Token::Order),
1904 ("BY", Token::By),
1905 ("ASC", Token::Asc),
1906 ("DESC", Token::Desc),
1907 ("NULLS", Token::Nulls),
1908 ("FIRST", Token::First),
1909 ("LAST", Token::Last),
1910 ("LIMIT", Token::Limit),
1911 ("OFFSET", Token::Offset),
1912 ("INNER", Token::Inner),
1913 ("LEFT", Token::Left),
1914 ("RIGHT", Token::Right),
1915 ("OUTER", Token::Outer),
1916 ("FULL", Token::Full),
1917 ("CROSS", Token::Cross),
1918 ("STARTS", Token::Starts),
1919 ("ENDS", Token::Ends),
1920 ("WITH", Token::With),
1921 ("CONTAINS", Token::Contains),
1922 ("TRUE", Token::True),
1923 ("FALSE", Token::False),
1924 ("ENRICH", Token::Enrich),
1925 ("GROUP", Token::Group),
1926 ("COUNT", Token::Count),
1927 ("SUM", Token::Sum),
1928 ("AVG", Token::Avg),
1929 ("MIN", Token::Min),
1930 ("MAX", Token::Max),
1931 ("DISTINCT", Token::Distinct),
1932 ("VECTOR", Token::Vector),
1933 ("SEARCH", Token::Search),
1934 ("SIMILAR", Token::Similar),
1935 ("COLLECTION", Token::Collection),
1936 ("METRIC", Token::Metric),
1937 ("THRESHOLD", Token::Threshold),
1938 ("K", Token::K),
1939 ("HYBRID", Token::Hybrid),
1940 ("FUSION", Token::Fusion),
1941 ("RERANK", Token::Rerank),
1942 ("RRF", Token::Rrf),
1943 ("INTERSECTION", Token::Intersection),
1944 ("UNION", Token::Union),
1945 ("RECURSIVE", Token::Recursive),
1946 ("ALL", Token::All),
1947 ("WEIGHT", Token::Weight),
1948 ("L2", Token::L2),
1949 ("COSINE", Token::Cosine),
1950 ("INNER_PRODUCT", Token::InnerProduct),
1951 ("INNERPRODUCT", Token::InnerProduct),
1952 ("INCLUDE", Token::Include),
1953 ("METADATA", Token::Metadata),
1954 ("VECTORS", Token::Vectors),
1955 ("EXPLAIN", Token::Explain),
1956 ("FOR", Token::For),
1957 ("FORMAT", Token::Format),
1958 ("JSON", Token::Json),
1959 ("INSERT", Token::Insert),
1960 ("INTO", Token::Into),
1961 ("VALUES", Token::Values),
1962 ("UPDATE", Token::Update),
1963 ("SET", Token::Set),
1964 ("DELETE", Token::Delete),
1965 ("TRUNCATE", Token::Truncate),
1966 ("CREATE", Token::Create),
1967 ("TABLE", Token::Table),
1968 ("DROP", Token::Drop),
1969 ("ALTER", Token::Alter),
1970 ("ADD", Token::Add),
1971 ("COLUMN", Token::Column),
1972 ("PRIMARY", Token::Primary),
1973 ("KEY", Token::Key),
1974 ("DEFAULT", Token::Default),
1975 ("COMPRESS", Token::Compress),
1976 ("INDEX", Token::Index),
1977 ("UNIQUE", Token::Unique),
1978 ("IF", Token::If),
1979 ("EXISTS", Token::Exists),
1980 ("RETURNING", Token::Returning),
1981 ("CASCADE", Token::Cascade),
1982 ("RENAME", Token::Rename),
1983 ("USING", Token::Using),
1984 ("NODE", Token::Node),
1985 ("EDGE", Token::Edge),
1986 ("DOCUMENT", Token::Document),
1987 ("KV", Token::Kv),
1988 ("TIMESERIES", Token::Timeseries),
1989 ("RETENTION", Token::Retention),
1990 ("QUEUE", Token::Queue),
1991 ("TREE", Token::Tree),
1992 ("PUSH", Token::Push),
1993 ("POP", Token::Pop),
1994 ("PEEK", Token::Peek),
1995 ("PURGE", Token::Purge),
1996 ("ACK", Token::Ack),
1997 ("NACK", Token::Nack),
1998 ("PRIORITY", Token::Priority),
1999 ("LPUSH", Token::Ident("LPUSH".into())),
2000 ("RPUSH", Token::Ident("RPUSH".into())),
2001 ("LPOP", Token::Ident("LPOP".into())),
2002 ("RPOP", Token::Ident("RPOP".into())),
2003 ("NEIGHBORHOOD", Token::Neighborhood),
2004 ("SHORTEST_PATH", Token::ShortestPath),
2005 ("SHORTESTPATH", Token::ShortestPath),
2006 ("CENTRALITY", Token::Centrality),
2007 ("COMMUNITY", Token::Community),
2008 ("COMPONENTS", Token::Components),
2009 ("CYCLES", Token::Cycles),
2010 ("TRAVERSE", Token::Traverse),
2011 ("DEPTH", Token::Depth),
2012 ("DIRECTION", Token::Direction),
2013 ("ALGORITHM", Token::Algorithm),
2014 ("STRATEGY", Token::Strategy),
2015 ("MAX_ITERATIONS", Token::MaxIterations),
2016 ("MAXITERATIONS", Token::MaxIterations),
2017 ("MAX_LENGTH", Token::MaxLength),
2018 ("MAXLENGTH", Token::MaxLength),
2019 ("MODE", Token::Mode),
2020 ("CLUSTERING", Token::Clustering),
2021 ("TOPOLOGICAL_SORT", Token::TopologicalSort),
2022 ("TOPOLOGICALSORT", Token::TopologicalSort),
2023 ("PROPERTIES", Token::Properties),
2024 ("TEXT", Token::Text),
2025 ("FUZZY", Token::Fuzzy),
2026 ("MIN_SCORE", Token::MinScore),
2027 ("MINSCORE", Token::MinScore),
2028 ("BEGIN", Token::Begin),
2029 ("COMMIT", Token::Commit),
2030 ("ROLLBACK", Token::Rollback),
2031 ("SAVEPOINT", Token::Savepoint),
2032 ("RELEASE", Token::Release),
2033 ("START", Token::Start),
2034 ("TRANSACTION", Token::Transaction),
2035 ("WORK", Token::Work),
2036 ("VACUUM", Token::Vacuum),
2037 ("ANALYZE", Token::Analyze),
2038 ("SCHEMA", Token::Schema),
2039 ("SEQUENCE", Token::Sequence),
2040 ("INCREMENT", Token::Increment),
2041 ("COPY", Token::Copy),
2042 ("HEADER", Token::Header),
2043 ("DELIMITER", Token::Delimiter),
2044 ("VIEW", Token::View),
2045 ("MATERIALIZED", Token::Materialized),
2046 ("REFRESH", Token::Refresh),
2047 ("PARTITION", Token::Partition),
2048 ("RANGE", Token::Range),
2049 ("LIST", Token::List),
2050 ("HASH", Token::Hash),
2051 ("ATTACH", Token::Attach),
2052 ("DETACH", Token::Detach),
2053 ("OF", Token::Of),
2054 ("POLICY", Token::Policy),
2055 ("ENABLE", Token::Enable),
2056 ("DISABLE", Token::Disable),
2057 ("SECURITY", Token::Security),
2058 ("ROW", Token::Row),
2059 ("LEVEL", Token::Level),
2060 ("FOREIGN", Token::Foreign),
2061 ("SERVER", Token::Server),
2062 ("WRAPPER", Token::Wrapper),
2063 ("OPTIONS", Token::Options),
2064 ("DATA", Token::Data),
2065 ("plain_ident", Token::Ident("plain_ident".into())),
2066 ];
2067
2068 for (input, expected) in cases {
2069 let tokens = tokenize(input);
2070 assert_eq!(tokens, vec![expected, Token::Eof], "{input}");
2071 }
2072 }
2073
2074 #[test]
2075 fn test_display_all_token_variants() {
2076 let cases = [
2077 (Token::Select, "SELECT"),
2078 (Token::From, "FROM"),
2079 (Token::Where, "WHERE"),
2080 (Token::And, "AND"),
2081 (Token::Or, "OR"),
2082 (Token::Not, "NOT"),
2083 (Token::Match, "MATCH"),
2084 (Token::Return, "RETURN"),
2085 (Token::Join, "JOIN"),
2086 (Token::Graph, "GRAPH"),
2087 (Token::Path, "PATH"),
2088 (Token::To, "TO"),
2089 (Token::Via, "VIA"),
2090 (Token::On, "ON"),
2091 (Token::As, "AS"),
2092 (Token::Is, "IS"),
2093 (Token::Null, "NULL"),
2094 (Token::Between, "BETWEEN"),
2095 (Token::Like, "LIKE"),
2096 (Token::In, "IN"),
2097 (Token::Order, "ORDER"),
2098 (Token::By, "BY"),
2099 (Token::Asc, "ASC"),
2100 (Token::Desc, "DESC"),
2101 (Token::Nulls, "NULLS"),
2102 (Token::First, "FIRST"),
2103 (Token::Last, "LAST"),
2104 (Token::Limit, "LIMIT"),
2105 (Token::Offset, "OFFSET"),
2106 (Token::Inner, "INNER"),
2107 (Token::Left, "LEFT"),
2108 (Token::Right, "RIGHT"),
2109 (Token::Outer, "OUTER"),
2110 (Token::Full, "FULL"),
2111 (Token::Cross, "CROSS"),
2112 (Token::Starts, "STARTS"),
2113 (Token::Ends, "ENDS"),
2114 (Token::With, "WITH"),
2115 (Token::Contains, "CONTAINS"),
2116 (Token::True, "TRUE"),
2117 (Token::False, "FALSE"),
2118 (Token::Enrich, "ENRICH"),
2119 (Token::Group, "GROUP"),
2120 (Token::Count, "COUNT"),
2121 (Token::Sum, "SUM"),
2122 (Token::Avg, "AVG"),
2123 (Token::Min, "MIN"),
2124 (Token::Max, "MAX"),
2125 (Token::Distinct, "DISTINCT"),
2126 (Token::Vector, "VECTOR"),
2127 (Token::Search, "SEARCH"),
2128 (Token::Similar, "SIMILAR"),
2129 (Token::Collection, "COLLECTION"),
2130 (Token::Metric, "METRIC"),
2131 (Token::Threshold, "THRESHOLD"),
2132 (Token::K, "K"),
2133 (Token::Hybrid, "HYBRID"),
2134 (Token::Fusion, "FUSION"),
2135 (Token::Rerank, "RERANK"),
2136 (Token::Rrf, "RRF"),
2137 (Token::Intersection, "INTERSECTION"),
2138 (Token::Union, "UNION"),
2139 (Token::Recursive, "RECURSIVE"),
2140 (Token::All, "ALL"),
2141 (Token::Weight, "WEIGHT"),
2142 (Token::L2, "L2"),
2143 (Token::Cosine, "COSINE"),
2144 (Token::InnerProduct, "INNER_PRODUCT"),
2145 (Token::Include, "INCLUDE"),
2146 (Token::Metadata, "METADATA"),
2147 (Token::Vectors, "VECTORS"),
2148 (Token::Explain, "EXPLAIN"),
2149 (Token::For, "FOR"),
2150 (Token::Format, "FORMAT"),
2151 (Token::Json, "JSON"),
2152 (Token::Insert, "INSERT"),
2153 (Token::Into, "INTO"),
2154 (Token::Values, "VALUES"),
2155 (Token::Update, "UPDATE"),
2156 (Token::Set, "SET"),
2157 (Token::Delete, "DELETE"),
2158 (Token::Truncate, "TRUNCATE"),
2159 (Token::Create, "CREATE"),
2160 (Token::Table, "TABLE"),
2161 (Token::Drop, "DROP"),
2162 (Token::Alter, "ALTER"),
2163 (Token::Add, "ADD"),
2164 (Token::Column, "COLUMN"),
2165 (Token::Primary, "PRIMARY"),
2166 (Token::Key, "KEY"),
2167 (Token::Default, "DEFAULT"),
2168 (Token::Compress, "COMPRESS"),
2169 (Token::Index, "INDEX"),
2170 (Token::Unique, "UNIQUE"),
2171 (Token::If, "IF"),
2172 (Token::Exists, "EXISTS"),
2173 (Token::Returning, "RETURNING"),
2174 (Token::Cascade, "CASCADE"),
2175 (Token::Rename, "RENAME"),
2176 (Token::Using, "USING"),
2177 (Token::Node, "NODE"),
2178 (Token::Edge, "EDGE"),
2179 (Token::Document, "DOCUMENT"),
2180 (Token::Kv, "KV"),
2181 (Token::Timeseries, "TIMESERIES"),
2182 (Token::Retention, "RETENTION"),
2183 (Token::Queue, "QUEUE"),
2184 (Token::Tree, "TREE"),
2185 (Token::Push, "PUSH"),
2186 (Token::Pop, "POP"),
2187 (Token::Peek, "PEEK"),
2188 (Token::Purge, "PURGE"),
2189 (Token::Ack, "ACK"),
2190 (Token::Nack, "NACK"),
2191 (Token::Priority, "PRIORITY"),
2192 (Token::Neighborhood, "NEIGHBORHOOD"),
2193 (Token::ShortestPath, "SHORTEST_PATH"),
2194 (Token::Centrality, "CENTRALITY"),
2195 (Token::Community, "COMMUNITY"),
2196 (Token::Components, "COMPONENTS"),
2197 (Token::Cycles, "CYCLES"),
2198 (Token::Traverse, "TRAVERSE"),
2199 (Token::Depth, "DEPTH"),
2200 (Token::Direction, "DIRECTION"),
2201 (Token::Algorithm, "ALGORITHM"),
2202 (Token::Strategy, "STRATEGY"),
2203 (Token::MaxIterations, "MAX_ITERATIONS"),
2204 (Token::MaxLength, "MAX_LENGTH"),
2205 (Token::Mode, "MODE"),
2206 (Token::Clustering, "CLUSTERING"),
2207 (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
2208 (Token::Properties, "PROPERTIES"),
2209 (Token::Text, "TEXT"),
2210 (Token::Fuzzy, "FUZZY"),
2211 (Token::MinScore, "MIN_SCORE"),
2212 (Token::Begin, "BEGIN"),
2213 (Token::Commit, "COMMIT"),
2214 (Token::Rollback, "ROLLBACK"),
2215 (Token::Savepoint, "SAVEPOINT"),
2216 (Token::Release, "RELEASE"),
2217 (Token::Start, "START"),
2218 (Token::Transaction, "TRANSACTION"),
2219 (Token::Work, "WORK"),
2220 (Token::Vacuum, "VACUUM"),
2221 (Token::Analyze, "ANALYZE"),
2222 (Token::Schema, "SCHEMA"),
2223 (Token::Sequence, "SEQUENCE"),
2224 (Token::Increment, "INCREMENT"),
2225 (Token::Copy, "COPY"),
2226 (Token::Header, "HEADER"),
2227 (Token::Delimiter, "DELIMITER"),
2228 (Token::View, "VIEW"),
2229 (Token::Materialized, "MATERIALIZED"),
2230 (Token::Refresh, "REFRESH"),
2231 (Token::Partition, "PARTITION"),
2232 (Token::Range, "RANGE"),
2233 (Token::List, "LIST"),
2234 (Token::Hash, "HASH"),
2235 (Token::Attach, "ATTACH"),
2236 (Token::Detach, "DETACH"),
2237 (Token::Of, "OF"),
2238 (Token::Policy, "POLICY"),
2239 (Token::Enable, "ENABLE"),
2240 (Token::Disable, "DISABLE"),
2241 (Token::Security, "SECURITY"),
2242 (Token::Row, "ROW"),
2243 (Token::Level, "LEVEL"),
2244 (Token::Foreign, "FOREIGN"),
2245 (Token::Server, "SERVER"),
2246 (Token::Wrapper, "WRAPPER"),
2247 (Token::Options, "OPTIONS"),
2248 (Token::Data, "DATA"),
2249 (Token::String("x".into()), "'x'"),
2250 (Token::Integer(7), "7"),
2251 (Token::Float(1.5), "1.5"),
2252 (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
2253 (Token::Ident("id".into()), "id"),
2254 (Token::Eq, "="),
2255 (Token::Ne, "<>"),
2256 (Token::Lt, "<"),
2257 (Token::Le, "<="),
2258 (Token::Gt, ">"),
2259 (Token::Ge, ">="),
2260 (Token::Plus, "+"),
2261 (Token::Minus, "-"),
2262 (Token::Star, "*"),
2263 (Token::Slash, "/"),
2264 (Token::Percent, "%"),
2265 (Token::LParen, "("),
2266 (Token::RParen, ")"),
2267 (Token::LBracket, "["),
2268 (Token::RBracket, "]"),
2269 (Token::LBrace, "{"),
2270 (Token::RBrace, "}"),
2271 (Token::Comma, ","),
2272 (Token::Dot, "."),
2273 (Token::Colon, ":"),
2274 (Token::Semi, ";"),
2275 (Token::Dollar, "$"),
2276 (Token::FatArrow, "=>"),
2277 (Token::Arrow, "->"),
2278 (Token::ArrowLeft, "<-"),
2279 (Token::Dash, "-"),
2280 (Token::DotDot, ".."),
2281 (Token::Pipe, "|"),
2282 (Token::DoublePipe, "||"),
2283 (Token::Eof, "EOF"),
2284 ];
2285
2286 for (token, expected) in cases {
2287 assert_eq!(token.to_string(), expected);
2288 }
2289 }
2290
2291 #[test]
2292 fn fat_arrow_lexes_distinctly_from_eq() {
2293 assert_eq!(
2295 tokenize("resolution => 0.5"),
2296 vec![
2297 Token::Ident("resolution".into()),
2298 Token::FatArrow,
2299 Token::Float(0.5),
2300 Token::Eof,
2301 ]
2302 );
2303 assert_eq!(
2304 tokenize("x = 1"),
2305 vec![
2306 Token::Ident("x".into()),
2307 Token::Eq,
2308 Token::Integer(1),
2309 Token::Eof,
2310 ]
2311 );
2312 }
2313
2314 #[test]
2315 fn test_string_escape_and_error_matrix() {
2316 let tokens = tokenize(
2317 r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
2318 );
2319 assert_eq!(
2320 tokens,
2321 vec![
2322 Token::String("line\nrow".into()),
2323 Token::String("carriage\rreturn".into()),
2324 Token::String("tab\tstop".into()),
2325 Token::String("slash\\".into()),
2326 Token::String("quote'".into()),
2327 Token::String("dq\"".into()),
2328 Token::String(r"raw\z".into()),
2329 Token::Eof
2330 ]
2331 );
2332
2333 let mut lexer = Lexer::new("'unterminated");
2334 assert!(lexer
2335 .next_token()
2336 .unwrap_err()
2337 .message
2338 .contains("Unterminated string"));
2339
2340 let mut lexer = Lexer::new(r"'bad\");
2341 assert!(lexer
2342 .next_token()
2343 .unwrap_err()
2344 .message
2345 .contains("Unterminated string"));
2346 }
2347
2348 #[test]
2349 fn test_operator_comment_peek_limit_and_tokenize_paths() {
2350 let tokens = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
2351 assert_eq!(
2352 tokens,
2353 vec![
2354 Token::Ne,
2355 Token::Percent,
2356 Token::Semi,
2357 Token::Dollar,
2358 Token::DoublePipe,
2359 Token::Pipe,
2360 Token::Integer(123),
2361 Token::Dot,
2362 Token::Ident("abc".into()),
2363 Token::Integer(1),
2364 Token::DotDot,
2365 Token::Integer(2),
2366 Token::Float(1e2),
2367 Token::ArrowLeft,
2368 Token::Arrow,
2369 Token::Select,
2370 Token::Eof,
2371 ]
2372 );
2373
2374 let mut lexer = Lexer::new("SELECT FROM");
2375 assert_eq!(lexer.peek_token().unwrap().token, Token::Select);
2376 assert_eq!(lexer.next_token().unwrap().token, Token::Select);
2377 assert_eq!(lexer.next_token().unwrap().token, Token::From);
2378
2379 let mut lexer = Lexer::new("!");
2380 assert!(lexer
2381 .next_token()
2382 .unwrap_err()
2383 .message
2384 .contains("Expected '=' after '!'"));
2385
2386 let limits = crate::storage::query::parser::ParserLimits {
2387 max_identifier_chars: 3,
2388 ..crate::storage::query::parser::ParserLimits::default()
2389 };
2390 let mut lexer = Lexer::with_limits("abcd", limits);
2391 assert_eq!(lexer.max_identifier_chars(), 3);
2392 let err = lexer.next_token().unwrap_err();
2393 assert!(matches!(
2394 err.limit_hit,
2395 Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
2396 ));
2397 }
2398}