1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// Every kind of token the query lexer can hand to the parser.
///
/// Unit variants are keywords, operators and punctuation; the tuple variants
/// (`String`, `Integer`, `Float`, `JsonLiteral`, `Ident`) carry the lexed
/// payload. The textual spelling of each variant is given by the `Display`
/// implementation for this type.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Query keywords ----
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // ---- Vector / hybrid search keywords ----
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // ---- DML / DDL keywords ----
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // ---- Entity kind keywords ----
    Node,
    Edge,
    Document,
    Kv,

    // ---- Timeseries / queue keywords ----
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // ---- Graph / text-search command keywords ----
    Neighborhood,
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    MaxIterations,
    MaxLength,
    Mode,
    Clustering,
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    MinScore,

    // ---- Transaction keywords ----
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // ---- Maintenance keywords ----
    Vacuum,
    Analyze,

    // ---- Schema / sequence keywords ----
    Schema,
    Sequence,
    Increment,

    // ---- COPY keywords ----
    Copy,
    Header,
    Delimiter,

    // ---- View keywords ----
    View,
    Materialized,
    Refresh,

    // ---- Partitioning keywords ----
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // ---- Row-level security keywords ----
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // ---- Foreign data keywords ----
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // ---- Sessionization keywords ----
    Sessionize,
    Gap,

    // ---- Window frame keywords ----
    Over,
    Rows,
    Preceding,
    Following,
    Unbounded,
    Current,

    // ---- Literals ----
    /// Quoted string literal with escapes already resolved.
    String(String),
    /// Integer literal.
    Integer(i64),
    /// Floating-point literal.
    Float(f64),
    /// Raw source text of a `{ ... }` JSON object literal.
    JsonLiteral(String),

    /// Any word that is not a recognised keyword.
    Ident(String),

    // ---- Operators and punctuation ----
    Eq,
    Ne,
    Lt,
    Le,
    Gt,
    Ge,
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Colon,
    Semi,
    Dollar,
    Question,
    Arrow,
    ArrowLeft,
    Dash,
    DotDot,
    Pipe,
    DoublePipe,
    /// End of input.
    Eof,
}
288
289impl fmt::Display for Token {
290 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
291 match self {
292 Token::Select => write!(f, "SELECT"),
293 Token::From => write!(f, "FROM"),
294 Token::Where => write!(f, "WHERE"),
295 Token::And => write!(f, "AND"),
296 Token::Or => write!(f, "OR"),
297 Token::Not => write!(f, "NOT"),
298 Token::Match => write!(f, "MATCH"),
299 Token::Return => write!(f, "RETURN"),
300 Token::Join => write!(f, "JOIN"),
301 Token::Graph => write!(f, "GRAPH"),
302 Token::Path => write!(f, "PATH"),
303 Token::To => write!(f, "TO"),
304 Token::Via => write!(f, "VIA"),
305 Token::On => write!(f, "ON"),
306 Token::As => write!(f, "AS"),
307 Token::Is => write!(f, "IS"),
308 Token::Null => write!(f, "NULL"),
309 Token::Between => write!(f, "BETWEEN"),
310 Token::Like => write!(f, "LIKE"),
311 Token::In => write!(f, "IN"),
312 Token::Order => write!(f, "ORDER"),
313 Token::By => write!(f, "BY"),
314 Token::Asc => write!(f, "ASC"),
315 Token::Desc => write!(f, "DESC"),
316 Token::Nulls => write!(f, "NULLS"),
317 Token::First => write!(f, "FIRST"),
318 Token::Last => write!(f, "LAST"),
319 Token::Limit => write!(f, "LIMIT"),
320 Token::Offset => write!(f, "OFFSET"),
321 Token::Inner => write!(f, "INNER"),
322 Token::Left => write!(f, "LEFT"),
323 Token::Right => write!(f, "RIGHT"),
324 Token::Outer => write!(f, "OUTER"),
325 Token::Full => write!(f, "FULL"),
326 Token::Cross => write!(f, "CROSS"),
327 Token::Starts => write!(f, "STARTS"),
328 Token::Ends => write!(f, "ENDS"),
329 Token::With => write!(f, "WITH"),
330 Token::Contains => write!(f, "CONTAINS"),
331 Token::True => write!(f, "TRUE"),
332 Token::False => write!(f, "FALSE"),
333 Token::Enrich => write!(f, "ENRICH"),
334 Token::Group => write!(f, "GROUP"),
335 Token::Count => write!(f, "COUNT"),
336 Token::Sum => write!(f, "SUM"),
337 Token::Avg => write!(f, "AVG"),
338 Token::Min => write!(f, "MIN"),
339 Token::Max => write!(f, "MAX"),
340 Token::Distinct => write!(f, "DISTINCT"),
341 Token::Vector => write!(f, "VECTOR"),
342 Token::Search => write!(f, "SEARCH"),
343 Token::Similar => write!(f, "SIMILAR"),
344 Token::Collection => write!(f, "COLLECTION"),
345 Token::Metric => write!(f, "METRIC"),
346 Token::Threshold => write!(f, "THRESHOLD"),
347 Token::K => write!(f, "K"),
348 Token::Hybrid => write!(f, "HYBRID"),
349 Token::Fusion => write!(f, "FUSION"),
350 Token::Rerank => write!(f, "RERANK"),
351 Token::Rrf => write!(f, "RRF"),
352 Token::Intersection => write!(f, "INTERSECTION"),
353 Token::Union => write!(f, "UNION"),
354 Token::Recursive => write!(f, "RECURSIVE"),
355 Token::All => write!(f, "ALL"),
356 Token::Weight => write!(f, "WEIGHT"),
357 Token::L2 => write!(f, "L2"),
358 Token::Cosine => write!(f, "COSINE"),
359 Token::InnerProduct => write!(f, "INNER_PRODUCT"),
360 Token::Include => write!(f, "INCLUDE"),
361 Token::Metadata => write!(f, "METADATA"),
362 Token::Vectors => write!(f, "VECTORS"),
363 Token::Explain => write!(f, "EXPLAIN"),
364 Token::For => write!(f, "FOR"),
365 Token::Format => write!(f, "FORMAT"),
366 Token::Json => write!(f, "JSON"),
367 Token::Insert => write!(f, "INSERT"),
368 Token::Into => write!(f, "INTO"),
369 Token::Values => write!(f, "VALUES"),
370 Token::Update => write!(f, "UPDATE"),
371 Token::Set => write!(f, "SET"),
372 Token::Delete => write!(f, "DELETE"),
373 Token::Truncate => write!(f, "TRUNCATE"),
374 Token::Create => write!(f, "CREATE"),
375 Token::Table => write!(f, "TABLE"),
376 Token::Drop => write!(f, "DROP"),
377 Token::Alter => write!(f, "ALTER"),
378 Token::Add => write!(f, "ADD"),
379 Token::Column => write!(f, "COLUMN"),
380 Token::Primary => write!(f, "PRIMARY"),
381 Token::Key => write!(f, "KEY"),
382 Token::Default => write!(f, "DEFAULT"),
383 Token::Compress => write!(f, "COMPRESS"),
384 Token::Index => write!(f, "INDEX"),
385 Token::Unique => write!(f, "UNIQUE"),
386 Token::If => write!(f, "IF"),
387 Token::Exists => write!(f, "EXISTS"),
388 Token::Returning => write!(f, "RETURNING"),
389 Token::Cascade => write!(f, "CASCADE"),
390 Token::Rename => write!(f, "RENAME"),
391 Token::Using => write!(f, "USING"),
392 Token::Node => write!(f, "NODE"),
393 Token::Edge => write!(f, "EDGE"),
394 Token::Document => write!(f, "DOCUMENT"),
395 Token::Kv => write!(f, "KV"),
396 Token::Timeseries => write!(f, "TIMESERIES"),
397 Token::Retention => write!(f, "RETENTION"),
398 Token::Queue => write!(f, "QUEUE"),
399 Token::Tree => write!(f, "TREE"),
400 Token::Push => write!(f, "PUSH"),
401 Token::Pop => write!(f, "POP"),
402 Token::Peek => write!(f, "PEEK"),
403 Token::Purge => write!(f, "PURGE"),
404 Token::Ack => write!(f, "ACK"),
405 Token::Nack => write!(f, "NACK"),
406 Token::Priority => write!(f, "PRIORITY"),
407 Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
408 Token::ShortestPath => write!(f, "SHORTEST_PATH"),
409 Token::Centrality => write!(f, "CENTRALITY"),
410 Token::Community => write!(f, "COMMUNITY"),
411 Token::Components => write!(f, "COMPONENTS"),
412 Token::Cycles => write!(f, "CYCLES"),
413 Token::Traverse => write!(f, "TRAVERSE"),
414 Token::Depth => write!(f, "DEPTH"),
415 Token::Direction => write!(f, "DIRECTION"),
416 Token::Algorithm => write!(f, "ALGORITHM"),
417 Token::Strategy => write!(f, "STRATEGY"),
418 Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
419 Token::MaxLength => write!(f, "MAX_LENGTH"),
420 Token::Mode => write!(f, "MODE"),
421 Token::Clustering => write!(f, "CLUSTERING"),
422 Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
423 Token::Properties => write!(f, "PROPERTIES"),
424 Token::Text => write!(f, "TEXT"),
425 Token::Fuzzy => write!(f, "FUZZY"),
426 Token::MinScore => write!(f, "MIN_SCORE"),
427 Token::Begin => write!(f, "BEGIN"),
428 Token::Commit => write!(f, "COMMIT"),
429 Token::Rollback => write!(f, "ROLLBACK"),
430 Token::Savepoint => write!(f, "SAVEPOINT"),
431 Token::Release => write!(f, "RELEASE"),
432 Token::Start => write!(f, "START"),
433 Token::Transaction => write!(f, "TRANSACTION"),
434 Token::Work => write!(f, "WORK"),
435 Token::Vacuum => write!(f, "VACUUM"),
436 Token::Analyze => write!(f, "ANALYZE"),
437 Token::Schema => write!(f, "SCHEMA"),
438 Token::Sequence => write!(f, "SEQUENCE"),
439 Token::Increment => write!(f, "INCREMENT"),
440 Token::Copy => write!(f, "COPY"),
441 Token::Header => write!(f, "HEADER"),
442 Token::Delimiter => write!(f, "DELIMITER"),
443 Token::View => write!(f, "VIEW"),
444 Token::Materialized => write!(f, "MATERIALIZED"),
445 Token::Refresh => write!(f, "REFRESH"),
446 Token::Partition => write!(f, "PARTITION"),
447 Token::Range => write!(f, "RANGE"),
448 Token::List => write!(f, "LIST"),
449 Token::Hash => write!(f, "HASH"),
450 Token::Attach => write!(f, "ATTACH"),
451 Token::Detach => write!(f, "DETACH"),
452 Token::Of => write!(f, "OF"),
453 Token::Policy => write!(f, "POLICY"),
454 Token::Enable => write!(f, "ENABLE"),
455 Token::Disable => write!(f, "DISABLE"),
456 Token::Security => write!(f, "SECURITY"),
457 Token::Row => write!(f, "ROW"),
458 Token::Level => write!(f, "LEVEL"),
459 Token::Foreign => write!(f, "FOREIGN"),
460 Token::Server => write!(f, "SERVER"),
461 Token::Wrapper => write!(f, "WRAPPER"),
462 Token::Options => write!(f, "OPTIONS"),
463 Token::Data => write!(f, "DATA"),
464 Token::Sessionize => write!(f, "SESSIONIZE"),
465 Token::Gap => write!(f, "GAP"),
466 Token::Over => write!(f, "OVER"),
467 Token::Rows => write!(f, "ROWS"),
468 Token::Preceding => write!(f, "PRECEDING"),
469 Token::Following => write!(f, "FOLLOWING"),
470 Token::Unbounded => write!(f, "UNBOUNDED"),
471 Token::Current => write!(f, "CURRENT"),
472 Token::String(s) => write!(f, "'{}'", s),
473 Token::Integer(n) => write!(f, "{}", n),
474 Token::Float(n) => write!(f, "{}", n),
475 Token::JsonLiteral(s) => write!(f, "{}", s),
476 Token::Ident(s) => write!(f, "{}", s),
477 Token::Eq => write!(f, "="),
478 Token::Ne => write!(f, "<>"),
479 Token::Lt => write!(f, "<"),
480 Token::Le => write!(f, "<="),
481 Token::Gt => write!(f, ">"),
482 Token::Ge => write!(f, ">="),
483 Token::Plus => write!(f, "+"),
484 Token::Minus => write!(f, "-"),
485 Token::Star => write!(f, "*"),
486 Token::Slash => write!(f, "/"),
487 Token::Percent => write!(f, "%"),
488 Token::LParen => write!(f, "("),
489 Token::RParen => write!(f, ")"),
490 Token::LBracket => write!(f, "["),
491 Token::RBracket => write!(f, "]"),
492 Token::LBrace => write!(f, "{{"),
493 Token::RBrace => write!(f, "}}"),
494 Token::Comma => write!(f, ","),
495 Token::Dot => write!(f, "."),
496 Token::Colon => write!(f, ":"),
497 Token::Semi => write!(f, ";"),
498 Token::Dollar => write!(f, "$"),
499 Token::Question => write!(f, "?"),
500 Token::Arrow => write!(f, "->"),
501 Token::ArrowLeft => write!(f, "<-"),
502 Token::Dash => write!(f, "-"),
503 Token::DotDot => write!(f, ".."),
504 Token::Pipe => write!(f, "|"),
505 Token::DoublePipe => write!(f, "||"),
506 Token::Eof => write!(f, "EOF"),
507 }
508 }
509}
510
/// A location inside the lexer input.
///
/// The lexer starts at line 1, column 1, offset 0 and advances `offset` by the
/// UTF-8 length of every consumed character. `Position::default()` is all
/// zeroes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number.
    pub line: u32,
    /// Column number within the line.
    pub column: u32,
    /// Offset from the start of the input.
    pub offset: u32,
}

impl Position {
    /// Builds a position from its three coordinates.
    pub fn new(line: u32, column: u32, offset: u32) -> Self {
        Position {
            line,
            column,
            offset,
        }
    }
}

impl fmt::Display for Position {
    /// Renders as `line:column`; the offset is not shown.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Position { line, column, .. } = self;
        write!(f, "{}:{}", line, column)
    }
}
538
539#[derive(Debug, Clone)]
541pub struct Spanned {
542 pub token: Token,
544 pub start: Position,
546 pub end: Position,
548}
549
550impl Spanned {
551 pub fn new(token: Token, start: Position, end: Position) -> Self {
553 Self { token, start, end }
554 }
555}
556
557#[derive(Debug, Clone)]
559pub struct LexerError {
560 pub message: String,
562 pub position: Position,
564 pub limit_hit: Option<LexerLimitHit>,
568}
569
/// Structured description of a lexer limit that was exceeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// An identifier grew past the configured maximum length.
    IdentifierTooLong {
        /// Name of the limit that was hit.
        limit_name: &'static str,
        /// Numeric value reported alongside the limit name.
        value: usize,
    },
}
579
580impl LexerError {
581 pub fn new(message: impl Into<String>, position: Position) -> Self {
583 Self {
584 message: message.into(),
585 position,
586 limit_hit: None,
587 }
588 }
589
590 pub(crate) fn with_limit(
592 message: impl Into<String>,
593 position: Position,
594 limit_hit: LexerLimitHit,
595 ) -> Self {
596 Self {
597 message: message.into(),
598 position,
599 limit_hit: Some(limit_hit),
600 }
601 }
602}
603
604impl fmt::Display for LexerError {
605 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
606 write!(f, "Lexer error at {}: {}", self.position, self.message)
607 }
608}
609
610impl std::error::Error for LexerError {}
611
/// Maximum size, in bytes of source text, of a single `{ ... }` JSON object
/// literal accepted by the lexer (16 MiB).
pub const JSON_LITERAL_MAX_BYTES: usize = 16 << 20;
619
620pub struct Lexer<'a> {
622 input: &'a str,
625 chars: Peekable<Chars<'a>>,
627 line: u32,
629 column: u32,
630 offset: u32,
631 peeked: Option<Spanned>,
633 putback: Option<(char, Position)>,
635 max_identifier_chars: usize,
637}
638
639impl<'a> Lexer<'a> {
640 pub fn new(input: &'a str) -> Self {
642 Self::with_limits(
643 input,
644 crate::storage::query::parser::ParserLimits::default(),
645 )
646 }
647
648 pub fn with_limits(
650 input: &'a str,
651 limits: crate::storage::query::parser::ParserLimits,
652 ) -> Self {
653 Self {
654 input,
655 chars: input.chars().peekable(),
656 line: 1,
657 column: 1,
658 offset: 0,
659 peeked: None,
660 putback: None,
661 max_identifier_chars: limits.max_identifier_chars,
662 }
663 }
664
665 pub(crate) fn max_identifier_chars(&self) -> usize {
669 self.max_identifier_chars
670 }
671
672 fn position(&self) -> Position {
674 Position::new(self.line, self.column, self.offset)
675 }
676
677 fn unget(&mut self, ch: char, pos: Position) {
679 self.putback = Some((ch, pos));
680 }
681
682 fn advance(&mut self) -> Option<char> {
684 if let Some((ch, pos)) = self.putback.take() {
686 self.line = pos.line;
688 self.column = pos.column + 1;
689 self.offset = pos.offset + ch.len_utf8() as u32;
690 return Some(ch);
691 }
692
693 let ch = self.chars.next()?;
694 self.offset += ch.len_utf8() as u32;
695 if ch == '\n' {
696 self.line += 1;
697 self.column = 1;
698 } else {
699 self.column += 1;
700 }
701 Some(ch)
702 }
703
704 fn peek(&mut self) -> Option<char> {
706 if let Some((ch, _)) = &self.putback {
708 return Some(*ch);
709 }
710 self.chars.peek().copied()
711 }
712
713 fn skip_whitespace(&mut self) {
715 while let Some(ch) = self.peek() {
716 if ch.is_whitespace() {
717 self.advance();
718 } else if ch == '-' {
719 let pos = self.position();
721 self.advance();
722 if self.peek() == Some('-') {
723 self.advance();
725 while let Some(c) = self.peek() {
726 if c == '\n' {
727 break;
728 }
729 self.advance();
730 }
731 } else {
732 self.line = pos.line;
735 self.column = pos.column;
736 self.offset = pos.offset;
737 break;
740 }
741 } else {
742 break;
743 }
744 }
745 }
746
747 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
749 if self.peeked.is_none() {
750 self.peeked = Some(self.next_token_internal()?);
751 }
752 Ok(self.peeked.as_ref().unwrap())
753 }
754
755 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
757 if let Some(tok) = self.peeked.take() {
758 return Ok(tok);
759 }
760 self.next_token_internal()
761 }
762
763 fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
765 self.skip_whitespace_simple();
766
767 let start = self.position();
768
769 let ch = match self.peek() {
770 Some(c) => c,
771 None => {
772 return Ok(Spanned::new(Token::Eof, start, start));
773 }
774 };
775
776 let token = match ch {
778 '\'' | '"' => self.scan_string()?,
780
781 '0'..='9' => self.scan_number()?,
783
784 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
786
787 '=' => {
789 self.advance();
790 Token::Eq
791 }
792 '<' => self.scan_less_than()?,
793 '>' => self.scan_greater_than()?,
794 '!' => {
795 self.advance();
796 if self.peek() == Some('=') {
797 self.advance();
798 Token::Ne
799 } else {
800 return Err(LexerError::new("Expected '=' after '!'", start));
801 }
802 }
803 '+' => {
804 self.advance();
805 Token::Plus
806 }
807 '-' => self.scan_minus()?,
808 '*' => {
809 self.advance();
810 Token::Star
811 }
812 '/' => {
813 self.advance();
814 Token::Slash
815 }
816 '%' => {
817 self.advance();
818 Token::Percent
819 }
820 '(' => {
821 self.advance();
822 Token::LParen
823 }
824 ')' => {
825 self.advance();
826 Token::RParen
827 }
828 '[' => {
829 self.advance();
830 Token::LBracket
831 }
832 ']' => {
833 self.advance();
834 Token::RBracket
835 }
836 '{' => {
837 if self.looks_like_json_object_start() {
844 return self.scan_json_literal(start);
845 }
846 self.advance();
847 Token::LBrace
848 }
849 '}' => {
850 self.advance();
851 Token::RBrace
852 }
853 ',' => {
854 self.advance();
855 Token::Comma
856 }
857 '.' => self.scan_dot()?,
858 ':' => {
859 self.advance();
860 Token::Colon
861 }
862 ';' => {
863 self.advance();
864 Token::Semi
865 }
866 '$' => {
867 self.advance();
868 Token::Dollar
869 }
870 '?' => {
871 self.advance();
872 Token::Question
873 }
874 '|' => {
875 self.advance();
876 if self.peek() == Some('|') {
877 self.advance();
878 Token::DoublePipe
879 } else {
880 Token::Pipe
881 }
882 }
883 _ => {
884 return Err(LexerError::new(
885 format!("Unexpected character: '{}'", ch),
886 start,
887 ));
888 }
889 };
890
891 let end = self.position();
892 Ok(Spanned::new(token, start, end))
893 }
894
895 fn skip_whitespace_simple(&mut self) {
897 while let Some(ch) = self.peek() {
898 if ch.is_whitespace() {
899 self.advance();
900 } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
901 self.advance();
902 self.advance();
903 while let Some(c) = self.peek() {
904 if c == '\n' {
905 break;
906 }
907 self.advance();
908 }
909 } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
910 self.advance();
911 self.advance();
912 while let Some(c) = self.peek() {
913 self.advance();
914 if c == '*' && self.peek() == Some('/') {
915 self.advance();
916 break;
917 }
918 }
919 } else {
920 break;
921 }
922 }
923 }
924
925 fn scan_string(&mut self) -> Result<Token, LexerError> {
927 let quote = self.advance().unwrap(); let start = self.position();
929 let mut value = String::new();
930
931 loop {
932 match self.peek() {
933 None => {
934 return Err(LexerError::new("Unterminated string", start));
935 }
936 Some(c) if c == quote => {
937 self.advance();
938 if self.peek() == Some(quote) {
940 self.advance();
941 value.push(quote);
942 } else {
943 break;
944 }
945 }
946 Some('\\') => {
947 self.advance();
948 match self.peek() {
949 Some('n') => {
950 self.advance();
951 value.push('\n');
952 }
953 Some('r') => {
954 self.advance();
955 value.push('\r');
956 }
957 Some('t') => {
958 self.advance();
959 value.push('\t');
960 }
961 Some('\\') => {
962 self.advance();
963 value.push('\\');
964 }
965 Some(c) if c == quote => {
966 self.advance();
967 value.push(quote);
968 }
969 Some(c) => {
970 value.push('\\');
972 value.push(c);
973 self.advance();
974 }
975 None => {
976 return Err(LexerError::new("Unterminated string", start));
977 }
978 }
979 }
980 Some(c) => {
981 self.advance();
982 value.push(c);
983 }
984 }
985 }
986
987 Ok(Token::String(value))
988 }
989
990 fn scan_number(&mut self) -> Result<Token, LexerError> {
992 let mut value = String::new();
993 let mut is_float = false;
994
995 while let Some(ch) = self.peek() {
997 if ch.is_ascii_digit() {
998 value.push(ch);
999 self.advance();
1000 } else {
1001 break;
1002 }
1003 }
1004
1005 if self.peek() == Some('.') {
1007 let dot_pos = self.position();
1009 self.advance(); if self.peek() == Some('.') {
1012 self.unget('.', dot_pos);
1014 } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
1016 is_float = true;
1017 value.push('.');
1018 while let Some(ch) = self.peek() {
1019 if ch.is_ascii_digit() {
1020 value.push(ch);
1021 self.advance();
1022 } else {
1023 break;
1024 }
1025 }
1026 } else {
1027 self.unget('.', dot_pos);
1029 }
1030 }
1031
1032 if self.peek() == Some('e') || self.peek() == Some('E') {
1034 is_float = true;
1035 value.push(self.advance().unwrap());
1036
1037 if self.peek() == Some('+') || self.peek() == Some('-') {
1038 value.push(self.advance().unwrap());
1039 }
1040
1041 while let Some(ch) = self.peek() {
1042 if ch.is_ascii_digit() {
1043 value.push(ch);
1044 self.advance();
1045 } else {
1046 break;
1047 }
1048 }
1049 }
1050
1051 if is_float {
1052 match value.parse::<f64>() {
1053 Ok(n) => Ok(Token::Float(n)),
1054 Err(_) => Err(LexerError::new(
1055 format!("Invalid float: {}", value),
1056 self.position(),
1057 )),
1058 }
1059 } else {
1060 match value.parse::<i64>() {
1061 Ok(n) => Ok(Token::Integer(n)),
1062 Err(_) => Err(LexerError::new(
1063 format!("Invalid integer: {}", value),
1064 self.position(),
1065 )),
1066 }
1067 }
1068 }
1069
1070 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1072 let start_pos = self.position();
1073 let mut value = String::new();
1074 let max = self.max_identifier_chars;
1075
1076 while let Some(ch) = self.peek() {
1077 if ch.is_alphanumeric() || ch == '_' {
1078 if value.chars().count() >= max {
1079 return Err(LexerError::with_limit(
1083 format!(
1084 "identifier exceeds maximum length (max_identifier_chars = {})",
1085 max
1086 ),
1087 start_pos,
1088 LexerLimitHit::IdentifierTooLong {
1089 limit_name: "max_identifier_chars",
1090 value: max,
1091 },
1092 ));
1093 }
1094 value.push(ch);
1095 self.advance();
1096 } else {
1097 break;
1098 }
1099 }
1100
1101 let token = match value.to_uppercase().as_str() {
1103 "SELECT" => Token::Select,
1104 "FROM" => Token::From,
1105 "WHERE" => Token::Where,
1106 "AND" => Token::And,
1107 "OR" => Token::Or,
1108 "NOT" => Token::Not,
1109 "MATCH" => Token::Match,
1110 "RETURN" => Token::Return,
1111 "JOIN" => Token::Join,
1112 "GRAPH" => Token::Graph,
1113 "PATH" => Token::Path,
1114 "TO" => Token::To,
1115 "VIA" => Token::Via,
1116 "ON" => Token::On,
1117 "AS" => Token::As,
1118 "IS" => Token::Is,
1119 "NULL" => Token::Null,
1120 "BETWEEN" => Token::Between,
1121 "LIKE" => Token::Like,
1122 "IN" => Token::In,
1123 "ORDER" => Token::Order,
1124 "BY" => Token::By,
1125 "ASC" => Token::Asc,
1126 "DESC" => Token::Desc,
1127 "NULLS" => Token::Nulls,
1128 "FIRST" => Token::First,
1129 "LAST" => Token::Last,
1130 "LIMIT" => Token::Limit,
1131 "OFFSET" => Token::Offset,
1132 "INNER" => Token::Inner,
1133 "LEFT" => Token::Left,
1134 "RIGHT" => Token::Right,
1135 "OUTER" => Token::Outer,
1136 "FULL" => Token::Full,
1137 "CROSS" => Token::Cross,
1138 "STARTS" => Token::Starts,
1139 "ENDS" => Token::Ends,
1140 "WITH" => Token::With,
1141 "CONTAINS" => Token::Contains,
1142 "TRUE" => Token::True,
1143 "FALSE" => Token::False,
1144 "ENRICH" => Token::Enrich,
1145 "GROUP" => Token::Group,
1146 "COUNT" => Token::Count,
1147 "SUM" => Token::Sum,
1148 "AVG" => Token::Avg,
1149 "MIN" => Token::Min,
1150 "MAX" => Token::Max,
1151 "DISTINCT" => Token::Distinct,
1152 "VECTOR" => Token::Vector,
1153 "SEARCH" => Token::Search,
1154 "SIMILAR" => Token::Similar,
1155 "COLLECTION" => Token::Collection,
1156 "METRIC" => Token::Metric,
1157 "THRESHOLD" => Token::Threshold,
1158 "K" => Token::K,
1159 "HYBRID" => Token::Hybrid,
1160 "FUSION" => Token::Fusion,
1161 "RERANK" => Token::Rerank,
1162 "RRF" => Token::Rrf,
1163 "INTERSECTION" => Token::Intersection,
1164 "UNION" => Token::Union,
1165 "RECURSIVE" => Token::Recursive,
1166 "ALL" => Token::All,
1167 "WEIGHT" => Token::Weight,
1168 "L2" => Token::L2,
1169 "COSINE" => Token::Cosine,
1170 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1171 "INCLUDE" => Token::Include,
1172 "METADATA" => Token::Metadata,
1173 "VECTORS" => Token::Vectors,
1174 "EXPLAIN" => Token::Explain,
1175 "FOR" => Token::For,
1176 "FORMAT" => Token::Format,
1177 "JSON" => Token::Json,
1178 "INSERT" => Token::Insert,
1179 "INTO" => Token::Into,
1180 "VALUES" => Token::Values,
1181 "UPDATE" => Token::Update,
1182 "SET" => Token::Set,
1183 "DELETE" => Token::Delete,
1184 "TRUNCATE" => Token::Truncate,
1185 "CREATE" => Token::Create,
1186 "TABLE" => Token::Table,
1187 "DROP" => Token::Drop,
1188 "ALTER" => Token::Alter,
1189 "ADD" => Token::Add,
1190 "COLUMN" => Token::Column,
1191 "PRIMARY" => Token::Primary,
1192 "KEY" => Token::Key,
1193 "DEFAULT" => Token::Default,
1194 "COMPRESS" => Token::Compress,
1195 "INDEX" => Token::Index,
1196 "UNIQUE" => Token::Unique,
1197 "IF" => Token::If,
1198 "EXISTS" => Token::Exists,
1199 "RETURNING" => Token::Returning,
1200 "CASCADE" => Token::Cascade,
1201 "RENAME" => Token::Rename,
1202 "USING" => Token::Using,
1203 "NODE" => Token::Node,
1204 "EDGE" => Token::Edge,
1205 "DOCUMENT" => Token::Document,
1206 "KV" => Token::Kv,
1207 "TIMESERIES" => Token::Timeseries,
1208 "RETENTION" => Token::Retention,
1209 "QUEUE" => Token::Queue,
1210 "TREE" => Token::Tree,
1211 "PUSH" => Token::Push,
1212 "POP" => Token::Pop,
1213 "PEEK" => Token::Peek,
1214 "PURGE" => Token::Purge,
1215 "ACK" => Token::Ack,
1216 "NACK" => Token::Nack,
1217 "PRIORITY" => Token::Priority,
1218 "LPUSH" => Token::Ident("LPUSH".to_string()),
1219 "RPUSH" => Token::Ident("RPUSH".to_string()),
1220 "LPOP" => Token::Ident("LPOP".to_string()),
1221 "RPOP" => Token::Ident("RPOP".to_string()),
1222 "NEIGHBORHOOD" => Token::Neighborhood,
1223 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1224 "CENTRALITY" => Token::Centrality,
1225 "COMMUNITY" => Token::Community,
1226 "COMPONENTS" => Token::Components,
1227 "CYCLES" => Token::Cycles,
1228 "TRAVERSE" => Token::Traverse,
1229 "DEPTH" => Token::Depth,
1230 "DIRECTION" => Token::Direction,
1231 "ALGORITHM" => Token::Algorithm,
1232 "STRATEGY" => Token::Strategy,
1233 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1234 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1235 "MODE" => Token::Mode,
1236 "CLUSTERING" => Token::Clustering,
1237 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1238 "PROPERTIES" => Token::Properties,
1239 "TEXT" => Token::Text,
1240 "FUZZY" => Token::Fuzzy,
1241 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1242 "BEGIN" => Token::Begin,
1243 "COMMIT" => Token::Commit,
1244 "ROLLBACK" => Token::Rollback,
1245 "SAVEPOINT" => Token::Savepoint,
1246 "RELEASE" => Token::Release,
1247 "START" => Token::Start,
1248 "TRANSACTION" => Token::Transaction,
1249 "WORK" => Token::Work,
1250 "VACUUM" => Token::Vacuum,
1251 "ANALYZE" => Token::Analyze,
1252 "SCHEMA" => Token::Schema,
1253 "SEQUENCE" => Token::Sequence,
1254 "INCREMENT" => Token::Increment,
1255 "COPY" => Token::Copy,
1256 "HEADER" => Token::Header,
1257 "DELIMITER" => Token::Delimiter,
1258 "VIEW" => Token::View,
1259 "MATERIALIZED" => Token::Materialized,
1260 "REFRESH" => Token::Refresh,
1261 "PARTITION" => Token::Partition,
1262 "RANGE" => Token::Range,
1263 "LIST" => Token::List,
1264 "HASH" => Token::Hash,
1265 "ATTACH" => Token::Attach,
1266 "DETACH" => Token::Detach,
1267 "OF" => Token::Of,
1268 "POLICY" => Token::Policy,
1269 "ENABLE" => Token::Enable,
1270 "DISABLE" => Token::Disable,
1271 "SECURITY" => Token::Security,
1272 "ROW" => Token::Row,
1273 "LEVEL" => Token::Level,
1274 "FOREIGN" => Token::Foreign,
1275 "SERVER" => Token::Server,
1276 "WRAPPER" => Token::Wrapper,
1277 "OPTIONS" => Token::Options,
1278 "DATA" => Token::Data,
1279 "SESSIONIZE" => Token::Sessionize,
1280 "GAP" => Token::Gap,
1281 "OVER" => Token::Over,
1282 "ROWS" => Token::Rows,
1283 "PRECEDING" => Token::Preceding,
1284 "FOLLOWING" => Token::Following,
1285 "UNBOUNDED" => Token::Unbounded,
1286 "CURRENT" => Token::Current,
1287 _ => Token::Ident(value),
1288 };
1289
1290 Ok(token)
1291 }
1292
1293 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1295 self.advance(); match self.peek() {
1297 Some('=') => {
1298 self.advance();
1299 Ok(Token::Le)
1300 }
1301 Some('>') => {
1302 self.advance();
1303 Ok(Token::Ne)
1304 }
1305 Some('-') => {
1306 self.advance();
1307 Ok(Token::ArrowLeft)
1308 }
1309 _ => Ok(Token::Lt),
1310 }
1311 }
1312
1313 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1315 self.advance(); if self.peek() == Some('=') {
1317 self.advance();
1318 Ok(Token::Ge)
1319 } else {
1320 Ok(Token::Gt)
1321 }
1322 }
1323
1324 fn scan_minus(&mut self) -> Result<Token, LexerError> {
1326 self.advance(); match self.peek() {
1328 Some('>') => {
1329 self.advance();
1330 Ok(Token::Arrow)
1331 }
1332 Some('-') => {
1333 self.advance();
1335 while let Some(c) = self.peek() {
1336 if c == '\n' {
1337 break;
1338 }
1339 self.advance();
1340 }
1341 self.skip_whitespace_simple();
1343 if self.peek().is_none() {
1344 Ok(Token::Eof)
1345 } else {
1346 let next = self.next_token_internal()?;
1347 Ok(next.token)
1348 }
1349 }
1350 _ => Ok(Token::Dash),
1351 }
1352 }
1353
1354 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1356 self.advance(); if self.peek() == Some('.') {
1358 self.advance();
1359 Ok(Token::DotDot)
1360 } else {
1361 Ok(Token::Dot)
1362 }
1363 }
1364
1365 fn looks_like_json_object_start(&self) -> bool {
1370 let bytes = self.input.as_bytes();
1371 let mut i = self.offset as usize;
1372 debug_assert!(bytes.get(i) == Some(&b'{'));
1374 i += 1;
1375 while i < bytes.len() {
1376 match bytes[i] {
1377 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1378 b'"' | b'}' => return true,
1379 _ => return false,
1380 }
1381 }
1382 false
1383 }
1384
1385 fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1402 let start_offset = self.offset as usize;
1403 self.advance();
1405 let mut depth: u32 = 1;
1406 let mut in_string = false;
1407 let mut escape = false;
1408 loop {
1409 let ch = match self.peek() {
1410 Some(c) => c,
1411 None => {
1412 return Err(LexerError::new(
1413 format!(
1414 "unterminated JSON object literal (started at offset {})",
1415 start.offset
1416 ),
1417 self.position(),
1418 ));
1419 }
1420 };
1421
1422 let scanned_bytes = self.offset as usize - start_offset;
1424 if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1425 return Err(LexerError::new(
1426 format!(
1427 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1428 JSON_LITERAL_MAX_BYTES
1429 ),
1430 start,
1431 ));
1432 }
1433
1434 self.advance();
1435
1436 if escape {
1437 escape = false;
1438 continue;
1439 }
1440
1441 if in_string {
1442 match ch {
1443 '\\' => escape = true,
1444 '"' => in_string = false,
1445 _ => {}
1446 }
1447 continue;
1448 }
1449
1450 match ch {
1451 '"' => in_string = true,
1452 '{' => depth += 1,
1453 '}' => {
1454 depth -= 1;
1455 if depth == 0 {
1456 let end = self.position();
1457 let end_offset = self.offset as usize;
1458 if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1460 return Err(LexerError::new(
1461 format!(
1462 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1463 JSON_LITERAL_MAX_BYTES
1464 ),
1465 start,
1466 ));
1467 }
1468 let raw = self.input[start_offset..end_offset].to_string();
1469 return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1470 }
1471 }
1472 _ => {}
1473 }
1474 }
1475 }
1476
1477 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1479 let mut tokens = Vec::new();
1480 loop {
1481 let tok = self.next_token()?;
1482 let is_eof = tok.token == Token::Eof;
1483 tokens.push(tok);
1484 if is_eof {
1485 break;
1486 }
1487 }
1488 Ok(tokens)
1489 }
1490}
1491
1492#[cfg(test)]
1497mod tests {
1498 use super::*;
1499
1500 fn tokenize(input: &str) -> Vec<Token> {
1501 let mut lexer = Lexer::new(input);
1502 lexer
1503 .tokenize()
1504 .unwrap()
1505 .into_iter()
1506 .map(|s| s.token)
1507 .collect()
1508 }
1509
#[test]
fn test_keywords() {
    // Each upper-case keyword lexes to its own variant; EOF closes the stream.
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Or,
        Token::Not,
        Token::Eof,
    ];
    let actual = tokenize("SELECT FROM WHERE AND OR NOT");
    assert_eq!(actual, expected);
}
1526
#[test]
fn test_identifiers() {
    // Non-keyword words, including one containing an underscore, become `Ident`s.
    let names = ["hosts", "users", "ip_address"];
    let mut expected: Vec<Token> = names
        .iter()
        .map(|name| Token::Ident((*name).into()))
        .collect();
    expected.push(Token::Eof);

    let actual = tokenize("hosts users ip_address");
    assert_eq!(actual, expected);
}
1540
#[test]
fn test_numbers() {
    // Bare digits are integers; a fraction and/or an exponent produces a float.
    let expected = vec![
        Token::Integer(42),
        Token::Float(2.5),
        Token::Float(1e10),
        Token::Float(2.5e-3),
        Token::Eof,
    ];
    let actual = tokenize("42 2.5 1e10 2.5e-3");
    assert_eq!(actual, expected);
}
1555
#[test]
fn test_strings() {
    // Single- and double-quoted strings are both accepted, and a doubled
    // single quote inside a single-quoted string yields one literal quote.
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello"), text("world"), text("it's"), Token::Eof];

    let actual = tokenize("'hello' \"world\" 'it''s'");
    assert_eq!(actual, expected);
}
1569
#[test]
fn test_operators() {
    // `<>` and `!=` both lex to `Ne`; a standalone `-` lexes to `Dash`.
    let expected = vec![
        Token::Eq,
        Token::Ne,
        Token::Lt,
        Token::Le,
        Token::Gt,
        Token::Ge,
        Token::Ne,
        Token::Plus,
        Token::Dash,
        Token::Star,
        Token::Slash,
        Token::Eof,
    ];
    let actual = tokenize("= <> < <= > >= != + - * /");
    assert_eq!(actual, expected);
}
1591
#[test]
fn test_delimiters() {
    // `{ a }` is expected to come out as brace tokens around an identifier,
    // not as a single JSON literal.
    let expected = vec![
        Token::LParen,
        Token::RParen,
        Token::LBracket,
        Token::RBracket,
        Token::LBrace,
        Token::Ident("a".into()),
        Token::RBrace,
        Token::Comma,
        Token::Dot,
        Token::Colon,
        Token::Semi,
        Token::Eof,
    ];
    let actual = tokenize("( ) [ ] { a } , . : ;");
    assert_eq!(actual, expected);
}
1617
#[test]
fn test_json_literal_empty_object() {
    // `{ }` is captured verbatim (inner space included) as one JSON literal.
    let expected = vec![Token::JsonLiteral("{ }".into()), Token::Eof];
    let actual = tokenize("{ }");
    assert_eq!(actual, expected);
}
1623
#[test]
fn test_json_literal_simple() {
    // The literal's payload is the exact source text, braces included.
    let raw = r#"{"a":1}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1632
#[test]
fn test_json_literal_nested() {
    // Nested objects and a `}` inside a string value must not end the literal early.
    let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    let actual = tokenize(raw);
    assert_eq!(actual, expected);
}
1639
#[test]
fn test_json_literal_escaped_quote_in_string() {
    // A backslash-escaped quote keeps the scanner inside the string, so the
    // `}` that follows it is still treated as string content.
    let raw = r#"{"path":"O\"Brien}"}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    let actual = tokenize(raw);
    assert_eq!(actual, expected);
}
1647
#[test]
fn test_json_literal_unbalanced_eof() {
    // Input ends before the closing brace, so lexing must fail with the
    // dedicated "unterminated" message.
    let mut lexer = Lexer::new(r#"{"a":1"#);
    let err = lexer.tokenize().expect_err("expected unterminated error");
    let mentions_unterminated = err.message.contains("unterminated JSON object literal");
    assert!(mentions_unterminated, "got: {}", err.message);
}
1658
#[test]
fn test_json_literal_property_bag_compatible() {
    // An unquoted key means this is not a JSON literal: the stream must open
    // with a plain `LBrace` and still finish with EOF.
    let tokens = tokenize("{name: 'value'}");
    assert_eq!(tokens.first(), Some(&Token::LBrace));
    assert_eq!(tokens.last(), Some(&Token::Eof));
}
1667
#[test]
fn test_graph_syntax() {
    // Arrow forms are single tokens; a lone `-` is `Dash` and `..` is `DotDot`.
    let expected = vec![
        Token::Arrow,
        Token::ArrowLeft,
        Token::Dash,
        Token::DotDot,
        Token::Eof,
    ];
    let actual = tokenize("-> <- - ..");
    assert_eq!(actual, expected);
}
1682
#[test]
fn test_table_query() {
    // A complete tabular SELECT: projection list, source, filter and row limit.
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::Comma,
        ident("hostname"),
        Token::From,
        ident("hosts"),
        Token::Where,
        ident("os"),
        Token::Eq,
        Token::String("Linux".into()),
        Token::Limit,
        Token::Integer(10),
        Token::Eof,
    ];
    let actual = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
    assert_eq!(actual, expected);
}
1705
#[test]
fn test_graph_query() {
    // MATCH pattern with labelled nodes and a typed edge, then a RETURN list.
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Match,
        // (h:Host)
        Token::LParen,
        ident("h"),
        Token::Colon,
        ident("Host"),
        Token::RParen,
        // -[:HAS_SERVICE]->
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_SERVICE"),
        Token::RBracket,
        Token::Arrow,
        // (s:Service)
        Token::LParen,
        ident("s"),
        Token::Colon,
        ident("Service"),
        Token::RParen,
        // RETURN h, s
        Token::Return,
        ident("h"),
        Token::Comma,
        ident("s"),
        Token::Eof,
    ];
    let actual = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
    assert_eq!(actual, expected);
}
1737
#[test]
fn test_join_query() {
    // Table source joined to a graph pattern, with a dotted-column ON condition.
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        // FROM hosts h JOIN GRAPH
        Token::From,
        ident("hosts"),
        ident("h"),
        Token::Join,
        Token::Graph,
        // (h)-[:HAS_VULN]->(v)
        Token::LParen,
        ident("h"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_VULN"),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("v"),
        Token::RParen,
        // ON h.ip = v.id
        Token::On,
        ident("h"),
        Token::Dot,
        ident("ip"),
        Token::Eq,
        ident("v"),
        Token::Dot,
        ident("id"),
        Token::Eof,
    ];
    let actual = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
    assert_eq!(actual, expected);
}
1773
#[test]
fn test_path_query() {
    // PATH query: two `host('...')` endpoints and a VIA edge-type filter.
    let ident = |name: &str| Token::Ident(name.into());
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        Token::Path,
        Token::From,
        ident("host"),
        Token::LParen,
        text("192.168.1.1"),
        Token::RParen,
        Token::To,
        ident("host"),
        Token::LParen,
        text("10.0.0.1"),
        Token::RParen,
        Token::Via,
        Token::LBracket,
        Token::Colon,
        ident("AUTH"),
        Token::RBracket,
        Token::Eof,
    ];
    let actual = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
    assert_eq!(actual, expected);
}
1800
#[test]
fn test_variable_length_pattern() {
    // `*1..5` must split into Star, Integer, DotDot, Integer; in particular
    // `1..5` must not be read as a float.
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::LParen,
        ident("a"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Star,
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(5),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("b"),
        Token::RParen,
        Token::Eof,
    ];
    let actual = tokenize("(a)-[*1..5]->(b)");
    assert_eq!(actual, expected);
}
1825
#[test]
fn test_case_insensitive_keywords() {
    // Lower-case, upper-case and mixed-case spellings all resolve to keywords.
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Eof,
    ];
    let actual = tokenize("select FROM Where AND");
    assert_eq!(actual, expected);
}
1840
#[test]
fn test_comments() {
    // Everything from `--` to the end of the line produces no tokens.
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::From,
        ident("hosts"),
        Token::Eof,
    ];
    let actual = tokenize("SELECT -- this is a comment\nip FROM hosts");
    assert_eq!(actual, expected);
}
1855
#[test]
fn test_escaped_strings() {
    // The input is a raw string, so the lexer sees a literal backslash + letter
    // and must translate `\n` / `\t` into the real control characters.
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello\nworld"), text("tab\there"), Token::Eof];

    let actual = tokenize(r"'hello\nworld' 'tab\there'");
    assert_eq!(actual, expected);
}
1868
#[test]
fn test_keyword_matrix_and_alias_spellings() {
    // Table of (source spelling, expected token). Every spelling is lexed on
    // its own and must produce exactly that token followed by EOF.
    let cases = [
        ("SELECT", Token::Select),
        ("FROM", Token::From),
        ("WHERE", Token::Where),
        ("AND", Token::And),
        ("OR", Token::Or),
        ("NOT", Token::Not),
        ("MATCH", Token::Match),
        ("RETURN", Token::Return),
        ("JOIN", Token::Join),
        ("GRAPH", Token::Graph),
        ("PATH", Token::Path),
        ("TO", Token::To),
        ("VIA", Token::Via),
        ("ON", Token::On),
        ("AS", Token::As),
        ("IS", Token::Is),
        ("NULL", Token::Null),
        ("BETWEEN", Token::Between),
        ("LIKE", Token::Like),
        ("IN", Token::In),
        ("ORDER", Token::Order),
        ("BY", Token::By),
        ("ASC", Token::Asc),
        ("DESC", Token::Desc),
        ("NULLS", Token::Nulls),
        ("FIRST", Token::First),
        ("LAST", Token::Last),
        ("LIMIT", Token::Limit),
        ("OFFSET", Token::Offset),
        ("INNER", Token::Inner),
        ("LEFT", Token::Left),
        ("RIGHT", Token::Right),
        ("OUTER", Token::Outer),
        ("FULL", Token::Full),
        ("CROSS", Token::Cross),
        ("STARTS", Token::Starts),
        ("ENDS", Token::Ends),
        ("WITH", Token::With),
        ("CONTAINS", Token::Contains),
        ("TRUE", Token::True),
        ("FALSE", Token::False),
        ("ENRICH", Token::Enrich),
        ("GROUP", Token::Group),
        ("COUNT", Token::Count),
        ("SUM", Token::Sum),
        ("AVG", Token::Avg),
        ("MIN", Token::Min),
        ("MAX", Token::Max),
        ("DISTINCT", Token::Distinct),
        ("VECTOR", Token::Vector),
        ("SEARCH", Token::Search),
        ("SIMILAR", Token::Similar),
        ("COLLECTION", Token::Collection),
        ("METRIC", Token::Metric),
        ("THRESHOLD", Token::Threshold),
        ("K", Token::K),
        ("HYBRID", Token::Hybrid),
        ("FUSION", Token::Fusion),
        ("RERANK", Token::Rerank),
        ("RRF", Token::Rrf),
        ("INTERSECTION", Token::Intersection),
        ("UNION", Token::Union),
        ("RECURSIVE", Token::Recursive),
        ("ALL", Token::All),
        ("WEIGHT", Token::Weight),
        ("L2", Token::L2),
        ("COSINE", Token::Cosine),
        // Underscore and run-together spellings map to the same token.
        ("INNER_PRODUCT", Token::InnerProduct),
        ("INNERPRODUCT", Token::InnerProduct),
        ("INCLUDE", Token::Include),
        ("METADATA", Token::Metadata),
        ("VECTORS", Token::Vectors),
        ("EXPLAIN", Token::Explain),
        ("FOR", Token::For),
        ("FORMAT", Token::Format),
        ("JSON", Token::Json),
        ("INSERT", Token::Insert),
        ("INTO", Token::Into),
        ("VALUES", Token::Values),
        ("UPDATE", Token::Update),
        ("SET", Token::Set),
        ("DELETE", Token::Delete),
        ("TRUNCATE", Token::Truncate),
        ("CREATE", Token::Create),
        ("TABLE", Token::Table),
        ("DROP", Token::Drop),
        ("ALTER", Token::Alter),
        ("ADD", Token::Add),
        ("COLUMN", Token::Column),
        ("PRIMARY", Token::Primary),
        ("KEY", Token::Key),
        ("DEFAULT", Token::Default),
        ("COMPRESS", Token::Compress),
        ("INDEX", Token::Index),
        ("UNIQUE", Token::Unique),
        ("IF", Token::If),
        ("EXISTS", Token::Exists),
        ("RETURNING", Token::Returning),
        ("CASCADE", Token::Cascade),
        ("RENAME", Token::Rename),
        ("USING", Token::Using),
        ("NODE", Token::Node),
        ("EDGE", Token::Edge),
        ("DOCUMENT", Token::Document),
        ("KV", Token::Kv),
        ("TIMESERIES", Token::Timeseries),
        ("RETENTION", Token::Retention),
        ("QUEUE", Token::Queue),
        ("TREE", Token::Tree),
        ("PUSH", Token::Push),
        ("POP", Token::Pop),
        ("PEEK", Token::Peek),
        ("PURGE", Token::Purge),
        ("ACK", Token::Ack),
        ("NACK", Token::Nack),
        ("PRIORITY", Token::Priority),
        // These four are expected to stay plain identifiers, not keywords.
        ("LPUSH", Token::Ident("LPUSH".into())),
        ("RPUSH", Token::Ident("RPUSH".into())),
        ("LPOP", Token::Ident("LPOP".into())),
        ("RPOP", Token::Ident("RPOP".into())),
        ("NEIGHBORHOOD", Token::Neighborhood),
        ("SHORTEST_PATH", Token::ShortestPath),
        ("SHORTESTPATH", Token::ShortestPath),
        ("CENTRALITY", Token::Centrality),
        ("COMMUNITY", Token::Community),
        ("COMPONENTS", Token::Components),
        ("CYCLES", Token::Cycles),
        ("TRAVERSE", Token::Traverse),
        ("DEPTH", Token::Depth),
        ("DIRECTION", Token::Direction),
        ("ALGORITHM", Token::Algorithm),
        ("STRATEGY", Token::Strategy),
        ("MAX_ITERATIONS", Token::MaxIterations),
        ("MAXITERATIONS", Token::MaxIterations),
        ("MAX_LENGTH", Token::MaxLength),
        ("MAXLENGTH", Token::MaxLength),
        ("MODE", Token::Mode),
        ("CLUSTERING", Token::Clustering),
        ("TOPOLOGICAL_SORT", Token::TopologicalSort),
        ("TOPOLOGICALSORT", Token::TopologicalSort),
        ("PROPERTIES", Token::Properties),
        ("TEXT", Token::Text),
        ("FUZZY", Token::Fuzzy),
        ("MIN_SCORE", Token::MinScore),
        ("MINSCORE", Token::MinScore),
        ("BEGIN", Token::Begin),
        ("COMMIT", Token::Commit),
        ("ROLLBACK", Token::Rollback),
        ("SAVEPOINT", Token::Savepoint),
        ("RELEASE", Token::Release),
        ("START", Token::Start),
        ("TRANSACTION", Token::Transaction),
        ("WORK", Token::Work),
        ("VACUUM", Token::Vacuum),
        ("ANALYZE", Token::Analyze),
        ("SCHEMA", Token::Schema),
        ("SEQUENCE", Token::Sequence),
        ("INCREMENT", Token::Increment),
        ("COPY", Token::Copy),
        ("HEADER", Token::Header),
        ("DELIMITER", Token::Delimiter),
        ("VIEW", Token::View),
        ("MATERIALIZED", Token::Materialized),
        ("REFRESH", Token::Refresh),
        ("PARTITION", Token::Partition),
        ("RANGE", Token::Range),
        ("LIST", Token::List),
        ("HASH", Token::Hash),
        ("ATTACH", Token::Attach),
        ("DETACH", Token::Detach),
        ("OF", Token::Of),
        ("POLICY", Token::Policy),
        ("ENABLE", Token::Enable),
        ("DISABLE", Token::Disable),
        ("SECURITY", Token::Security),
        ("ROW", Token::Row),
        ("LEVEL", Token::Level),
        ("FOREIGN", Token::Foreign),
        ("SERVER", Token::Server),
        ("WRAPPER", Token::Wrapper),
        ("OPTIONS", Token::Options),
        ("DATA", Token::Data),
        // Control case: a word that is not a keyword at all.
        ("plain_ident", Token::Ident("plain_ident".into())),
    ];

    for (spelling, token) in cases {
        let actual = tokenize(spelling);
        let wanted = vec![token, Token::Eof];
        // The spelling is attached so a failure names the offending row.
        assert_eq!(actual, wanted, "{spelling}");
    }
}
2062
#[test]
fn test_display_all_token_variants() {
    // Table of (token, expected `to_string()` output).
    let cases = [
        (Token::Select, "SELECT"),
        (Token::From, "FROM"),
        (Token::Where, "WHERE"),
        (Token::And, "AND"),
        (Token::Or, "OR"),
        (Token::Not, "NOT"),
        (Token::Match, "MATCH"),
        (Token::Return, "RETURN"),
        (Token::Join, "JOIN"),
        (Token::Graph, "GRAPH"),
        (Token::Path, "PATH"),
        (Token::To, "TO"),
        (Token::Via, "VIA"),
        (Token::On, "ON"),
        (Token::As, "AS"),
        (Token::Is, "IS"),
        (Token::Null, "NULL"),
        (Token::Between, "BETWEEN"),
        (Token::Like, "LIKE"),
        (Token::In, "IN"),
        (Token::Order, "ORDER"),
        (Token::By, "BY"),
        (Token::Asc, "ASC"),
        (Token::Desc, "DESC"),
        (Token::Nulls, "NULLS"),
        (Token::First, "FIRST"),
        (Token::Last, "LAST"),
        (Token::Limit, "LIMIT"),
        (Token::Offset, "OFFSET"),
        (Token::Inner, "INNER"),
        (Token::Left, "LEFT"),
        (Token::Right, "RIGHT"),
        (Token::Outer, "OUTER"),
        (Token::Full, "FULL"),
        (Token::Cross, "CROSS"),
        (Token::Starts, "STARTS"),
        (Token::Ends, "ENDS"),
        (Token::With, "WITH"),
        (Token::Contains, "CONTAINS"),
        (Token::True, "TRUE"),
        (Token::False, "FALSE"),
        (Token::Enrich, "ENRICH"),
        (Token::Group, "GROUP"),
        (Token::Count, "COUNT"),
        (Token::Sum, "SUM"),
        (Token::Avg, "AVG"),
        (Token::Min, "MIN"),
        (Token::Max, "MAX"),
        (Token::Distinct, "DISTINCT"),
        (Token::Vector, "VECTOR"),
        (Token::Search, "SEARCH"),
        (Token::Similar, "SIMILAR"),
        (Token::Collection, "COLLECTION"),
        (Token::Metric, "METRIC"),
        (Token::Threshold, "THRESHOLD"),
        (Token::K, "K"),
        (Token::Hybrid, "HYBRID"),
        (Token::Fusion, "FUSION"),
        (Token::Rerank, "RERANK"),
        (Token::Rrf, "RRF"),
        (Token::Intersection, "INTERSECTION"),
        (Token::Union, "UNION"),
        (Token::Recursive, "RECURSIVE"),
        (Token::All, "ALL"),
        (Token::Weight, "WEIGHT"),
        (Token::L2, "L2"),
        (Token::Cosine, "COSINE"),
        // Multi-word variants are expected to render with an underscore.
        (Token::InnerProduct, "INNER_PRODUCT"),
        (Token::Include, "INCLUDE"),
        (Token::Metadata, "METADATA"),
        (Token::Vectors, "VECTORS"),
        (Token::Explain, "EXPLAIN"),
        (Token::For, "FOR"),
        (Token::Format, "FORMAT"),
        (Token::Json, "JSON"),
        (Token::Insert, "INSERT"),
        (Token::Into, "INTO"),
        (Token::Values, "VALUES"),
        (Token::Update, "UPDATE"),
        (Token::Set, "SET"),
        (Token::Delete, "DELETE"),
        (Token::Truncate, "TRUNCATE"),
        (Token::Create, "CREATE"),
        (Token::Table, "TABLE"),
        (Token::Drop, "DROP"),
        (Token::Alter, "ALTER"),
        (Token::Add, "ADD"),
        (Token::Column, "COLUMN"),
        (Token::Primary, "PRIMARY"),
        (Token::Key, "KEY"),
        (Token::Default, "DEFAULT"),
        (Token::Compress, "COMPRESS"),
        (Token::Index, "INDEX"),
        (Token::Unique, "UNIQUE"),
        (Token::If, "IF"),
        (Token::Exists, "EXISTS"),
        (Token::Returning, "RETURNING"),
        (Token::Cascade, "CASCADE"),
        (Token::Rename, "RENAME"),
        (Token::Using, "USING"),
        (Token::Node, "NODE"),
        (Token::Edge, "EDGE"),
        (Token::Document, "DOCUMENT"),
        (Token::Kv, "KV"),
        (Token::Timeseries, "TIMESERIES"),
        (Token::Retention, "RETENTION"),
        (Token::Queue, "QUEUE"),
        (Token::Tree, "TREE"),
        (Token::Push, "PUSH"),
        (Token::Pop, "POP"),
        (Token::Peek, "PEEK"),
        (Token::Purge, "PURGE"),
        (Token::Ack, "ACK"),
        (Token::Nack, "NACK"),
        (Token::Priority, "PRIORITY"),
        (Token::Neighborhood, "NEIGHBORHOOD"),
        (Token::ShortestPath, "SHORTEST_PATH"),
        (Token::Centrality, "CENTRALITY"),
        (Token::Community, "COMMUNITY"),
        (Token::Components, "COMPONENTS"),
        (Token::Cycles, "CYCLES"),
        (Token::Traverse, "TRAVERSE"),
        (Token::Depth, "DEPTH"),
        (Token::Direction, "DIRECTION"),
        (Token::Algorithm, "ALGORITHM"),
        (Token::Strategy, "STRATEGY"),
        (Token::MaxIterations, "MAX_ITERATIONS"),
        (Token::MaxLength, "MAX_LENGTH"),
        (Token::Mode, "MODE"),
        (Token::Clustering, "CLUSTERING"),
        (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
        (Token::Properties, "PROPERTIES"),
        (Token::Text, "TEXT"),
        (Token::Fuzzy, "FUZZY"),
        (Token::MinScore, "MIN_SCORE"),
        (Token::Begin, "BEGIN"),
        (Token::Commit, "COMMIT"),
        (Token::Rollback, "ROLLBACK"),
        (Token::Savepoint, "SAVEPOINT"),
        (Token::Release, "RELEASE"),
        (Token::Start, "START"),
        (Token::Transaction, "TRANSACTION"),
        (Token::Work, "WORK"),
        (Token::Vacuum, "VACUUM"),
        (Token::Analyze, "ANALYZE"),
        (Token::Schema, "SCHEMA"),
        (Token::Sequence, "SEQUENCE"),
        (Token::Increment, "INCREMENT"),
        (Token::Copy, "COPY"),
        (Token::Header, "HEADER"),
        (Token::Delimiter, "DELIMITER"),
        (Token::View, "VIEW"),
        (Token::Materialized, "MATERIALIZED"),
        (Token::Refresh, "REFRESH"),
        (Token::Partition, "PARTITION"),
        (Token::Range, "RANGE"),
        (Token::List, "LIST"),
        (Token::Hash, "HASH"),
        (Token::Attach, "ATTACH"),
        (Token::Detach, "DETACH"),
        (Token::Of, "OF"),
        (Token::Policy, "POLICY"),
        (Token::Enable, "ENABLE"),
        (Token::Disable, "DISABLE"),
        (Token::Security, "SECURITY"),
        (Token::Row, "ROW"),
        (Token::Level, "LEVEL"),
        (Token::Foreign, "FOREIGN"),
        (Token::Server, "SERVER"),
        (Token::Wrapper, "WRAPPER"),
        (Token::Options, "OPTIONS"),
        (Token::Data, "DATA"),
        // Value-carrying variants: strings are expected back in single quotes,
        // the other payloads verbatim.
        (Token::String("x".into()), "'x'"),
        (Token::Integer(7), "7"),
        (Token::Float(1.5), "1.5"),
        (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
        (Token::Ident("id".into()), "id"),
        // Operators and punctuation. `Minus` and `Dash` share the text "-".
        (Token::Eq, "="),
        (Token::Ne, "<>"),
        (Token::Lt, "<"),
        (Token::Le, "<="),
        (Token::Gt, ">"),
        (Token::Ge, ">="),
        (Token::Plus, "+"),
        (Token::Minus, "-"),
        (Token::Star, "*"),
        (Token::Slash, "/"),
        (Token::Percent, "%"),
        (Token::LParen, "("),
        (Token::RParen, ")"),
        (Token::LBracket, "["),
        (Token::RBracket, "]"),
        (Token::LBrace, "{"),
        (Token::RBrace, "}"),
        (Token::Comma, ","),
        (Token::Dot, "."),
        (Token::Colon, ":"),
        (Token::Semi, ";"),
        (Token::Dollar, "$"),
        (Token::Arrow, "->"),
        (Token::ArrowLeft, "<-"),
        (Token::Dash, "-"),
        (Token::DotDot, ".."),
        (Token::Pipe, "|"),
        (Token::DoublePipe, "||"),
        (Token::Eof, "EOF"),
    ];

    for (variant, text) in cases {
        let rendered = variant.to_string();
        assert_eq!(rendered, text);
    }
}
2278
#[test]
fn test_string_escape_and_error_matrix() {
    // Part 1: recognised escapes are translated; an unrecognised one (`\z`)
    // is expected to be kept as backslash + letter.
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        text("line\nrow"),
        text("carriage\rreturn"),
        text("tab\tstop"),
        text("slash\\"),
        text("quote'"),
        text("dq\""),
        text(r"raw\z"),
        Token::Eof,
    ];
    let actual = tokenize(
        r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
    );
    assert_eq!(actual, expected);

    // Part 2: a missing closing quote and a trailing lone backslash must both
    // be reported as an unterminated string.
    for broken in ["'unterminated", r"'bad\"] {
        let mut lexer = Lexer::new(broken);
        let err = lexer.next_token().unwrap_err();
        assert!(err.message.contains("Unterminated string"));
    }
}
2312
#[test]
fn test_operator_comment_peek_limit_and_tokenize_paths() {
    use crate::storage::query::parser::ParserLimits;

    // Mixed operator stream. Expectations encoded below: `123.abc` splits
    // into Integer, Dot, Ident; `1..2` splits around `DotDot`; `1e+2` is a
    // float; the `/* block */` comment yields no token.
    let expected = vec![
        Token::Ne,
        Token::Percent,
        Token::Semi,
        Token::Dollar,
        Token::DoublePipe,
        Token::Pipe,
        Token::Integer(123),
        Token::Dot,
        Token::Ident("abc".into()),
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(2),
        Token::Float(1e2),
        Token::ArrowLeft,
        Token::Arrow,
        Token::Select,
        Token::Eof,
    ];
    let actual = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
    assert_eq!(actual, expected);

    // Peeking must not consume: the same token is returned by the next read.
    let mut lexer = Lexer::new("SELECT FROM");
    assert_eq!(lexer.peek_token().unwrap().token, Token::Select);
    assert_eq!(lexer.next_token().unwrap().token, Token::Select);
    assert_eq!(lexer.next_token().unwrap().token, Token::From);

    // A bare `!` with no following `=` is a lex error.
    let mut lexer = Lexer::new("!");
    let bang_err = lexer.next_token().unwrap_err();
    assert!(bang_err.message.contains("Expected '=' after '!'"));

    // With a 3-character identifier limit, the 4-character `abcd` must be
    // rejected and the error must carry the configured limit.
    let limits = ParserLimits {
        max_identifier_chars: 3,
        ..ParserLimits::default()
    };
    let mut lexer = Lexer::with_limits("abcd", limits);
    assert_eq!(lexer.max_identifier_chars(), 3);
    let err = lexer.next_token().unwrap_err();
    let hit_identifier_limit = matches!(
        err.limit_hit,
        Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
    );
    assert!(hit_identifier_limit);
}
2363}