1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// A single lexical token.
///
/// Keyword and punctuation variants carry no data; literal and identifier
/// variants carry the scanned value.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Query, filtering, ordering, join and aggregate keywords ----
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // ---- Vector search keywords ----
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // ---- DML / DDL keywords ----
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // ---- Entity kind keywords ----
    Node,
    Edge,
    Document,
    Kv,

    // ---- Timeseries / queue keywords ----
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // ---- Graph algorithm and text search keywords ----
    Neighborhood,
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    MaxIterations,
    MaxLength,
    Mode,
    Clustering,
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    MinScore,

    // ---- Transaction keywords ----
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // ---- Maintenance keywords ----
    Vacuum,
    Analyze,

    // ---- Schema / sequence keywords ----
    Schema,
    Sequence,
    Increment,

    // ---- COPY keywords ----
    Copy,
    Header,
    Delimiter,

    // ---- View keywords ----
    View,
    Materialized,
    Refresh,

    // ---- Partitioning keywords ----
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // ---- Policy / row-level security keywords ----
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // ---- Foreign data keywords ----
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // ---- Sessionization keywords ----
    Sessionize,
    Gap,

    // ---- Literals ----
    /// String literal contents (without the surrounding quotes).
    String(String),
    /// Integer literal.
    Integer(i64),
    /// Floating-point literal.
    Float(f64),
    /// Raw text of a JSON object literal.
    JsonLiteral(String),

    /// Identifier that did not match any keyword.
    Ident(String),

    // ---- Operators and punctuation ----
    Eq,
    Ne,
    Lt,
    Le,
    Gt,
    Ge,
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    LParen,
    RParen,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    Comma,
    Dot,
    Colon,
    Semi,
    Dollar,
    Question,
    Arrow,
    ArrowLeft,
    Dash,
    DotDot,
    Pipe,
    DoublePipe,
    /// End of input.
    Eof,
}
278
279impl fmt::Display for Token {
280 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
281 match self {
282 Token::Select => write!(f, "SELECT"),
283 Token::From => write!(f, "FROM"),
284 Token::Where => write!(f, "WHERE"),
285 Token::And => write!(f, "AND"),
286 Token::Or => write!(f, "OR"),
287 Token::Not => write!(f, "NOT"),
288 Token::Match => write!(f, "MATCH"),
289 Token::Return => write!(f, "RETURN"),
290 Token::Join => write!(f, "JOIN"),
291 Token::Graph => write!(f, "GRAPH"),
292 Token::Path => write!(f, "PATH"),
293 Token::To => write!(f, "TO"),
294 Token::Via => write!(f, "VIA"),
295 Token::On => write!(f, "ON"),
296 Token::As => write!(f, "AS"),
297 Token::Is => write!(f, "IS"),
298 Token::Null => write!(f, "NULL"),
299 Token::Between => write!(f, "BETWEEN"),
300 Token::Like => write!(f, "LIKE"),
301 Token::In => write!(f, "IN"),
302 Token::Order => write!(f, "ORDER"),
303 Token::By => write!(f, "BY"),
304 Token::Asc => write!(f, "ASC"),
305 Token::Desc => write!(f, "DESC"),
306 Token::Nulls => write!(f, "NULLS"),
307 Token::First => write!(f, "FIRST"),
308 Token::Last => write!(f, "LAST"),
309 Token::Limit => write!(f, "LIMIT"),
310 Token::Offset => write!(f, "OFFSET"),
311 Token::Inner => write!(f, "INNER"),
312 Token::Left => write!(f, "LEFT"),
313 Token::Right => write!(f, "RIGHT"),
314 Token::Outer => write!(f, "OUTER"),
315 Token::Full => write!(f, "FULL"),
316 Token::Cross => write!(f, "CROSS"),
317 Token::Starts => write!(f, "STARTS"),
318 Token::Ends => write!(f, "ENDS"),
319 Token::With => write!(f, "WITH"),
320 Token::Contains => write!(f, "CONTAINS"),
321 Token::True => write!(f, "TRUE"),
322 Token::False => write!(f, "FALSE"),
323 Token::Enrich => write!(f, "ENRICH"),
324 Token::Group => write!(f, "GROUP"),
325 Token::Count => write!(f, "COUNT"),
326 Token::Sum => write!(f, "SUM"),
327 Token::Avg => write!(f, "AVG"),
328 Token::Min => write!(f, "MIN"),
329 Token::Max => write!(f, "MAX"),
330 Token::Distinct => write!(f, "DISTINCT"),
331 Token::Vector => write!(f, "VECTOR"),
332 Token::Search => write!(f, "SEARCH"),
333 Token::Similar => write!(f, "SIMILAR"),
334 Token::Collection => write!(f, "COLLECTION"),
335 Token::Metric => write!(f, "METRIC"),
336 Token::Threshold => write!(f, "THRESHOLD"),
337 Token::K => write!(f, "K"),
338 Token::Hybrid => write!(f, "HYBRID"),
339 Token::Fusion => write!(f, "FUSION"),
340 Token::Rerank => write!(f, "RERANK"),
341 Token::Rrf => write!(f, "RRF"),
342 Token::Intersection => write!(f, "INTERSECTION"),
343 Token::Union => write!(f, "UNION"),
344 Token::Recursive => write!(f, "RECURSIVE"),
345 Token::All => write!(f, "ALL"),
346 Token::Weight => write!(f, "WEIGHT"),
347 Token::L2 => write!(f, "L2"),
348 Token::Cosine => write!(f, "COSINE"),
349 Token::InnerProduct => write!(f, "INNER_PRODUCT"),
350 Token::Include => write!(f, "INCLUDE"),
351 Token::Metadata => write!(f, "METADATA"),
352 Token::Vectors => write!(f, "VECTORS"),
353 Token::Explain => write!(f, "EXPLAIN"),
354 Token::For => write!(f, "FOR"),
355 Token::Format => write!(f, "FORMAT"),
356 Token::Json => write!(f, "JSON"),
357 Token::Insert => write!(f, "INSERT"),
358 Token::Into => write!(f, "INTO"),
359 Token::Values => write!(f, "VALUES"),
360 Token::Update => write!(f, "UPDATE"),
361 Token::Set => write!(f, "SET"),
362 Token::Delete => write!(f, "DELETE"),
363 Token::Truncate => write!(f, "TRUNCATE"),
364 Token::Create => write!(f, "CREATE"),
365 Token::Table => write!(f, "TABLE"),
366 Token::Drop => write!(f, "DROP"),
367 Token::Alter => write!(f, "ALTER"),
368 Token::Add => write!(f, "ADD"),
369 Token::Column => write!(f, "COLUMN"),
370 Token::Primary => write!(f, "PRIMARY"),
371 Token::Key => write!(f, "KEY"),
372 Token::Default => write!(f, "DEFAULT"),
373 Token::Compress => write!(f, "COMPRESS"),
374 Token::Index => write!(f, "INDEX"),
375 Token::Unique => write!(f, "UNIQUE"),
376 Token::If => write!(f, "IF"),
377 Token::Exists => write!(f, "EXISTS"),
378 Token::Returning => write!(f, "RETURNING"),
379 Token::Cascade => write!(f, "CASCADE"),
380 Token::Rename => write!(f, "RENAME"),
381 Token::Using => write!(f, "USING"),
382 Token::Node => write!(f, "NODE"),
383 Token::Edge => write!(f, "EDGE"),
384 Token::Document => write!(f, "DOCUMENT"),
385 Token::Kv => write!(f, "KV"),
386 Token::Timeseries => write!(f, "TIMESERIES"),
387 Token::Retention => write!(f, "RETENTION"),
388 Token::Queue => write!(f, "QUEUE"),
389 Token::Tree => write!(f, "TREE"),
390 Token::Push => write!(f, "PUSH"),
391 Token::Pop => write!(f, "POP"),
392 Token::Peek => write!(f, "PEEK"),
393 Token::Purge => write!(f, "PURGE"),
394 Token::Ack => write!(f, "ACK"),
395 Token::Nack => write!(f, "NACK"),
396 Token::Priority => write!(f, "PRIORITY"),
397 Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
398 Token::ShortestPath => write!(f, "SHORTEST_PATH"),
399 Token::Centrality => write!(f, "CENTRALITY"),
400 Token::Community => write!(f, "COMMUNITY"),
401 Token::Components => write!(f, "COMPONENTS"),
402 Token::Cycles => write!(f, "CYCLES"),
403 Token::Traverse => write!(f, "TRAVERSE"),
404 Token::Depth => write!(f, "DEPTH"),
405 Token::Direction => write!(f, "DIRECTION"),
406 Token::Algorithm => write!(f, "ALGORITHM"),
407 Token::Strategy => write!(f, "STRATEGY"),
408 Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
409 Token::MaxLength => write!(f, "MAX_LENGTH"),
410 Token::Mode => write!(f, "MODE"),
411 Token::Clustering => write!(f, "CLUSTERING"),
412 Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
413 Token::Properties => write!(f, "PROPERTIES"),
414 Token::Text => write!(f, "TEXT"),
415 Token::Fuzzy => write!(f, "FUZZY"),
416 Token::MinScore => write!(f, "MIN_SCORE"),
417 Token::Begin => write!(f, "BEGIN"),
418 Token::Commit => write!(f, "COMMIT"),
419 Token::Rollback => write!(f, "ROLLBACK"),
420 Token::Savepoint => write!(f, "SAVEPOINT"),
421 Token::Release => write!(f, "RELEASE"),
422 Token::Start => write!(f, "START"),
423 Token::Transaction => write!(f, "TRANSACTION"),
424 Token::Work => write!(f, "WORK"),
425 Token::Vacuum => write!(f, "VACUUM"),
426 Token::Analyze => write!(f, "ANALYZE"),
427 Token::Schema => write!(f, "SCHEMA"),
428 Token::Sequence => write!(f, "SEQUENCE"),
429 Token::Increment => write!(f, "INCREMENT"),
430 Token::Copy => write!(f, "COPY"),
431 Token::Header => write!(f, "HEADER"),
432 Token::Delimiter => write!(f, "DELIMITER"),
433 Token::View => write!(f, "VIEW"),
434 Token::Materialized => write!(f, "MATERIALIZED"),
435 Token::Refresh => write!(f, "REFRESH"),
436 Token::Partition => write!(f, "PARTITION"),
437 Token::Range => write!(f, "RANGE"),
438 Token::List => write!(f, "LIST"),
439 Token::Hash => write!(f, "HASH"),
440 Token::Attach => write!(f, "ATTACH"),
441 Token::Detach => write!(f, "DETACH"),
442 Token::Of => write!(f, "OF"),
443 Token::Policy => write!(f, "POLICY"),
444 Token::Enable => write!(f, "ENABLE"),
445 Token::Disable => write!(f, "DISABLE"),
446 Token::Security => write!(f, "SECURITY"),
447 Token::Row => write!(f, "ROW"),
448 Token::Level => write!(f, "LEVEL"),
449 Token::Foreign => write!(f, "FOREIGN"),
450 Token::Server => write!(f, "SERVER"),
451 Token::Wrapper => write!(f, "WRAPPER"),
452 Token::Options => write!(f, "OPTIONS"),
453 Token::Data => write!(f, "DATA"),
454 Token::Sessionize => write!(f, "SESSIONIZE"),
455 Token::Gap => write!(f, "GAP"),
456 Token::String(s) => write!(f, "'{}'", s),
457 Token::Integer(n) => write!(f, "{}", n),
458 Token::Float(n) => write!(f, "{}", n),
459 Token::JsonLiteral(s) => write!(f, "{}", s),
460 Token::Ident(s) => write!(f, "{}", s),
461 Token::Eq => write!(f, "="),
462 Token::Ne => write!(f, "<>"),
463 Token::Lt => write!(f, "<"),
464 Token::Le => write!(f, "<="),
465 Token::Gt => write!(f, ">"),
466 Token::Ge => write!(f, ">="),
467 Token::Plus => write!(f, "+"),
468 Token::Minus => write!(f, "-"),
469 Token::Star => write!(f, "*"),
470 Token::Slash => write!(f, "/"),
471 Token::Percent => write!(f, "%"),
472 Token::LParen => write!(f, "("),
473 Token::RParen => write!(f, ")"),
474 Token::LBracket => write!(f, "["),
475 Token::RBracket => write!(f, "]"),
476 Token::LBrace => write!(f, "{{"),
477 Token::RBrace => write!(f, "}}"),
478 Token::Comma => write!(f, ","),
479 Token::Dot => write!(f, "."),
480 Token::Colon => write!(f, ":"),
481 Token::Semi => write!(f, ";"),
482 Token::Dollar => write!(f, "$"),
483 Token::Question => write!(f, "?"),
484 Token::Arrow => write!(f, "->"),
485 Token::ArrowLeft => write!(f, "<-"),
486 Token::Dash => write!(f, "-"),
487 Token::DotDot => write!(f, ".."),
488 Token::Pipe => write!(f, "|"),
489 Token::DoublePipe => write!(f, "||"),
490 Token::Eof => write!(f, "EOF"),
491 }
492 }
493}
494
/// A location within the lexer input.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// Line number.
    pub line: u32,
    /// Column number within the line.
    pub column: u32,
    /// Offset from the start of the input.
    pub offset: u32,
}

impl Position {
    /// Builds a position from its three components.
    pub fn new(line: u32, column: u32, offset: u32) -> Self {
        Position { line, column, offset }
    }
}

/// Formats the position as `line:column`; the offset is not shown.
impl fmt::Display for Position {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Position { line, column, .. } = *self;
        write!(f, "{}:{}", line, column)
    }
}
522
523#[derive(Debug, Clone)]
525pub struct Spanned {
526 pub token: Token,
528 pub start: Position,
530 pub end: Position,
532}
533
534impl Spanned {
535 pub fn new(token: Token, start: Position, end: Position) -> Self {
537 Self { token, start, end }
538 }
539}
540
541#[derive(Debug, Clone)]
543pub struct LexerError {
544 pub message: String,
546 pub position: Position,
548 pub limit_hit: Option<LexerLimitHit>,
552}
553
554#[derive(Debug, Clone, PartialEq, Eq)]
556pub enum LexerLimitHit {
557 IdentifierTooLong {
559 limit_name: &'static str,
560 value: usize,
561 },
562}
563
564impl LexerError {
565 pub fn new(message: impl Into<String>, position: Position) -> Self {
567 Self {
568 message: message.into(),
569 position,
570 limit_hit: None,
571 }
572 }
573
574 pub(crate) fn with_limit(
576 message: impl Into<String>,
577 position: Position,
578 limit_hit: LexerLimitHit,
579 ) -> Self {
580 Self {
581 message: message.into(),
582 position,
583 limit_hit: Some(limit_hit),
584 }
585 }
586}
587
588impl fmt::Display for LexerError {
589 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
590 write!(f, "Lexer error at {}: {}", self.position, self.message)
591 }
592}
593
594impl std::error::Error for LexerError {}
595
/// Size limit for JSON literals, in bytes (16 MiB).
pub const JSON_LITERAL_MAX_BYTES: usize = 16 << 20;
603
604pub struct Lexer<'a> {
606 input: &'a str,
609 chars: Peekable<Chars<'a>>,
611 line: u32,
613 column: u32,
614 offset: u32,
615 peeked: Option<Spanned>,
617 putback: Option<(char, Position)>,
619 max_identifier_chars: usize,
621}
622
623impl<'a> Lexer<'a> {
624 pub fn new(input: &'a str) -> Self {
626 Self::with_limits(
627 input,
628 crate::storage::query::parser::ParserLimits::default(),
629 )
630 }
631
632 pub fn with_limits(
634 input: &'a str,
635 limits: crate::storage::query::parser::ParserLimits,
636 ) -> Self {
637 Self {
638 input,
639 chars: input.chars().peekable(),
640 line: 1,
641 column: 1,
642 offset: 0,
643 peeked: None,
644 putback: None,
645 max_identifier_chars: limits.max_identifier_chars,
646 }
647 }
648
649 pub(crate) fn max_identifier_chars(&self) -> usize {
653 self.max_identifier_chars
654 }
655
656 fn position(&self) -> Position {
658 Position::new(self.line, self.column, self.offset)
659 }
660
661 fn unget(&mut self, ch: char, pos: Position) {
663 self.putback = Some((ch, pos));
664 }
665
666 fn advance(&mut self) -> Option<char> {
668 if let Some((ch, pos)) = self.putback.take() {
670 self.line = pos.line;
672 self.column = pos.column + 1;
673 self.offset = pos.offset + ch.len_utf8() as u32;
674 return Some(ch);
675 }
676
677 let ch = self.chars.next()?;
678 self.offset += ch.len_utf8() as u32;
679 if ch == '\n' {
680 self.line += 1;
681 self.column = 1;
682 } else {
683 self.column += 1;
684 }
685 Some(ch)
686 }
687
688 fn peek(&mut self) -> Option<char> {
690 if let Some((ch, _)) = &self.putback {
692 return Some(*ch);
693 }
694 self.chars.peek().copied()
695 }
696
697 fn skip_whitespace(&mut self) {
699 while let Some(ch) = self.peek() {
700 if ch.is_whitespace() {
701 self.advance();
702 } else if ch == '-' {
703 let pos = self.position();
705 self.advance();
706 if self.peek() == Some('-') {
707 self.advance();
709 while let Some(c) = self.peek() {
710 if c == '\n' {
711 break;
712 }
713 self.advance();
714 }
715 } else {
716 self.line = pos.line;
719 self.column = pos.column;
720 self.offset = pos.offset;
721 break;
724 }
725 } else {
726 break;
727 }
728 }
729 }
730
731 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
733 if self.peeked.is_none() {
734 self.peeked = Some(self.next_token_internal()?);
735 }
736 Ok(self.peeked.as_ref().unwrap())
737 }
738
739 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
741 if let Some(tok) = self.peeked.take() {
742 return Ok(tok);
743 }
744 self.next_token_internal()
745 }
746
747 fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
749 self.skip_whitespace_simple();
750
751 let start = self.position();
752
753 let ch = match self.peek() {
754 Some(c) => c,
755 None => {
756 return Ok(Spanned::new(Token::Eof, start, start));
757 }
758 };
759
760 let token = match ch {
762 '\'' | '"' => self.scan_string()?,
764
765 '0'..='9' => self.scan_number()?,
767
768 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,
770
771 '=' => {
773 self.advance();
774 Token::Eq
775 }
776 '<' => self.scan_less_than()?,
777 '>' => self.scan_greater_than()?,
778 '!' => {
779 self.advance();
780 if self.peek() == Some('=') {
781 self.advance();
782 Token::Ne
783 } else {
784 return Err(LexerError::new("Expected '=' after '!'", start));
785 }
786 }
787 '+' => {
788 self.advance();
789 Token::Plus
790 }
791 '-' => self.scan_minus()?,
792 '*' => {
793 self.advance();
794 Token::Star
795 }
796 '/' => {
797 self.advance();
798 Token::Slash
799 }
800 '%' => {
801 self.advance();
802 Token::Percent
803 }
804 '(' => {
805 self.advance();
806 Token::LParen
807 }
808 ')' => {
809 self.advance();
810 Token::RParen
811 }
812 '[' => {
813 self.advance();
814 Token::LBracket
815 }
816 ']' => {
817 self.advance();
818 Token::RBracket
819 }
820 '{' => {
821 if self.looks_like_json_object_start() {
828 return self.scan_json_literal(start);
829 }
830 self.advance();
831 Token::LBrace
832 }
833 '}' => {
834 self.advance();
835 Token::RBrace
836 }
837 ',' => {
838 self.advance();
839 Token::Comma
840 }
841 '.' => self.scan_dot()?,
842 ':' => {
843 self.advance();
844 Token::Colon
845 }
846 ';' => {
847 self.advance();
848 Token::Semi
849 }
850 '$' => {
851 self.advance();
852 Token::Dollar
853 }
854 '?' => {
855 self.advance();
856 Token::Question
857 }
858 '|' => {
859 self.advance();
860 if self.peek() == Some('|') {
861 self.advance();
862 Token::DoublePipe
863 } else {
864 Token::Pipe
865 }
866 }
867 _ => {
868 return Err(LexerError::new(
869 format!("Unexpected character: '{}'", ch),
870 start,
871 ));
872 }
873 };
874
875 let end = self.position();
876 Ok(Spanned::new(token, start, end))
877 }
878
879 fn skip_whitespace_simple(&mut self) {
881 while let Some(ch) = self.peek() {
882 if ch.is_whitespace() {
883 self.advance();
884 } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
885 self.advance();
886 self.advance();
887 while let Some(c) = self.peek() {
888 if c == '\n' {
889 break;
890 }
891 self.advance();
892 }
893 } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
894 self.advance();
895 self.advance();
896 while let Some(c) = self.peek() {
897 self.advance();
898 if c == '*' && self.peek() == Some('/') {
899 self.advance();
900 break;
901 }
902 }
903 } else {
904 break;
905 }
906 }
907 }
908
909 fn scan_string(&mut self) -> Result<Token, LexerError> {
911 let quote = self.advance().unwrap(); let start = self.position();
913 let mut value = String::new();
914
915 loop {
916 match self.peek() {
917 None => {
918 return Err(LexerError::new("Unterminated string", start));
919 }
920 Some(c) if c == quote => {
921 self.advance();
922 if self.peek() == Some(quote) {
924 self.advance();
925 value.push(quote);
926 } else {
927 break;
928 }
929 }
930 Some('\\') => {
931 self.advance();
932 match self.peek() {
933 Some('n') => {
934 self.advance();
935 value.push('\n');
936 }
937 Some('r') => {
938 self.advance();
939 value.push('\r');
940 }
941 Some('t') => {
942 self.advance();
943 value.push('\t');
944 }
945 Some('\\') => {
946 self.advance();
947 value.push('\\');
948 }
949 Some(c) if c == quote => {
950 self.advance();
951 value.push(quote);
952 }
953 Some(c) => {
954 value.push('\\');
956 value.push(c);
957 self.advance();
958 }
959 None => {
960 return Err(LexerError::new("Unterminated string", start));
961 }
962 }
963 }
964 Some(c) => {
965 self.advance();
966 value.push(c);
967 }
968 }
969 }
970
971 Ok(Token::String(value))
972 }
973
974 fn scan_number(&mut self) -> Result<Token, LexerError> {
976 let mut value = String::new();
977 let mut is_float = false;
978
979 while let Some(ch) = self.peek() {
981 if ch.is_ascii_digit() {
982 value.push(ch);
983 self.advance();
984 } else {
985 break;
986 }
987 }
988
989 if self.peek() == Some('.') {
991 let dot_pos = self.position();
993 self.advance(); if self.peek() == Some('.') {
996 self.unget('.', dot_pos);
998 } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
1000 is_float = true;
1001 value.push('.');
1002 while let Some(ch) = self.peek() {
1003 if ch.is_ascii_digit() {
1004 value.push(ch);
1005 self.advance();
1006 } else {
1007 break;
1008 }
1009 }
1010 } else {
1011 self.unget('.', dot_pos);
1013 }
1014 }
1015
1016 if self.peek() == Some('e') || self.peek() == Some('E') {
1018 is_float = true;
1019 value.push(self.advance().unwrap());
1020
1021 if self.peek() == Some('+') || self.peek() == Some('-') {
1022 value.push(self.advance().unwrap());
1023 }
1024
1025 while let Some(ch) = self.peek() {
1026 if ch.is_ascii_digit() {
1027 value.push(ch);
1028 self.advance();
1029 } else {
1030 break;
1031 }
1032 }
1033 }
1034
1035 if is_float {
1036 match value.parse::<f64>() {
1037 Ok(n) => Ok(Token::Float(n)),
1038 Err(_) => Err(LexerError::new(
1039 format!("Invalid float: {}", value),
1040 self.position(),
1041 )),
1042 }
1043 } else {
1044 match value.parse::<i64>() {
1045 Ok(n) => Ok(Token::Integer(n)),
1046 Err(_) => Err(LexerError::new(
1047 format!("Invalid integer: {}", value),
1048 self.position(),
1049 )),
1050 }
1051 }
1052 }
1053
1054 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1056 let start_pos = self.position();
1057 let mut value = String::new();
1058 let max = self.max_identifier_chars;
1059
1060 while let Some(ch) = self.peek() {
1061 if ch.is_alphanumeric() || ch == '_' {
1062 if value.chars().count() >= max {
1063 return Err(LexerError::with_limit(
1067 format!(
1068 "identifier exceeds maximum length (max_identifier_chars = {})",
1069 max
1070 ),
1071 start_pos,
1072 LexerLimitHit::IdentifierTooLong {
1073 limit_name: "max_identifier_chars",
1074 value: max,
1075 },
1076 ));
1077 }
1078 value.push(ch);
1079 self.advance();
1080 } else {
1081 break;
1082 }
1083 }
1084
1085 let token = match value.to_uppercase().as_str() {
1087 "SELECT" => Token::Select,
1088 "FROM" => Token::From,
1089 "WHERE" => Token::Where,
1090 "AND" => Token::And,
1091 "OR" => Token::Or,
1092 "NOT" => Token::Not,
1093 "MATCH" => Token::Match,
1094 "RETURN" => Token::Return,
1095 "JOIN" => Token::Join,
1096 "GRAPH" => Token::Graph,
1097 "PATH" => Token::Path,
1098 "TO" => Token::To,
1099 "VIA" => Token::Via,
1100 "ON" => Token::On,
1101 "AS" => Token::As,
1102 "IS" => Token::Is,
1103 "NULL" => Token::Null,
1104 "BETWEEN" => Token::Between,
1105 "LIKE" => Token::Like,
1106 "IN" => Token::In,
1107 "ORDER" => Token::Order,
1108 "BY" => Token::By,
1109 "ASC" => Token::Asc,
1110 "DESC" => Token::Desc,
1111 "NULLS" => Token::Nulls,
1112 "FIRST" => Token::First,
1113 "LAST" => Token::Last,
1114 "LIMIT" => Token::Limit,
1115 "OFFSET" => Token::Offset,
1116 "INNER" => Token::Inner,
1117 "LEFT" => Token::Left,
1118 "RIGHT" => Token::Right,
1119 "OUTER" => Token::Outer,
1120 "FULL" => Token::Full,
1121 "CROSS" => Token::Cross,
1122 "STARTS" => Token::Starts,
1123 "ENDS" => Token::Ends,
1124 "WITH" => Token::With,
1125 "CONTAINS" => Token::Contains,
1126 "TRUE" => Token::True,
1127 "FALSE" => Token::False,
1128 "ENRICH" => Token::Enrich,
1129 "GROUP" => Token::Group,
1130 "COUNT" => Token::Count,
1131 "SUM" => Token::Sum,
1132 "AVG" => Token::Avg,
1133 "MIN" => Token::Min,
1134 "MAX" => Token::Max,
1135 "DISTINCT" => Token::Distinct,
1136 "VECTOR" => Token::Vector,
1137 "SEARCH" => Token::Search,
1138 "SIMILAR" => Token::Similar,
1139 "COLLECTION" => Token::Collection,
1140 "METRIC" => Token::Metric,
1141 "THRESHOLD" => Token::Threshold,
1142 "K" => Token::K,
1143 "HYBRID" => Token::Hybrid,
1144 "FUSION" => Token::Fusion,
1145 "RERANK" => Token::Rerank,
1146 "RRF" => Token::Rrf,
1147 "INTERSECTION" => Token::Intersection,
1148 "UNION" => Token::Union,
1149 "RECURSIVE" => Token::Recursive,
1150 "ALL" => Token::All,
1151 "WEIGHT" => Token::Weight,
1152 "L2" => Token::L2,
1153 "COSINE" => Token::Cosine,
1154 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1155 "INCLUDE" => Token::Include,
1156 "METADATA" => Token::Metadata,
1157 "VECTORS" => Token::Vectors,
1158 "EXPLAIN" => Token::Explain,
1159 "FOR" => Token::For,
1160 "FORMAT" => Token::Format,
1161 "JSON" => Token::Json,
1162 "INSERT" => Token::Insert,
1163 "INTO" => Token::Into,
1164 "VALUES" => Token::Values,
1165 "UPDATE" => Token::Update,
1166 "SET" => Token::Set,
1167 "DELETE" => Token::Delete,
1168 "TRUNCATE" => Token::Truncate,
1169 "CREATE" => Token::Create,
1170 "TABLE" => Token::Table,
1171 "DROP" => Token::Drop,
1172 "ALTER" => Token::Alter,
1173 "ADD" => Token::Add,
1174 "COLUMN" => Token::Column,
1175 "PRIMARY" => Token::Primary,
1176 "KEY" => Token::Key,
1177 "DEFAULT" => Token::Default,
1178 "COMPRESS" => Token::Compress,
1179 "INDEX" => Token::Index,
1180 "UNIQUE" => Token::Unique,
1181 "IF" => Token::If,
1182 "EXISTS" => Token::Exists,
1183 "RETURNING" => Token::Returning,
1184 "CASCADE" => Token::Cascade,
1185 "RENAME" => Token::Rename,
1186 "USING" => Token::Using,
1187 "NODE" => Token::Node,
1188 "EDGE" => Token::Edge,
1189 "DOCUMENT" => Token::Document,
1190 "KV" => Token::Kv,
1191 "TIMESERIES" => Token::Timeseries,
1192 "RETENTION" => Token::Retention,
1193 "QUEUE" => Token::Queue,
1194 "TREE" => Token::Tree,
1195 "PUSH" => Token::Push,
1196 "POP" => Token::Pop,
1197 "PEEK" => Token::Peek,
1198 "PURGE" => Token::Purge,
1199 "ACK" => Token::Ack,
1200 "NACK" => Token::Nack,
1201 "PRIORITY" => Token::Priority,
1202 "LPUSH" => Token::Ident("LPUSH".to_string()),
1203 "RPUSH" => Token::Ident("RPUSH".to_string()),
1204 "LPOP" => Token::Ident("LPOP".to_string()),
1205 "RPOP" => Token::Ident("RPOP".to_string()),
1206 "NEIGHBORHOOD" => Token::Neighborhood,
1207 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1208 "CENTRALITY" => Token::Centrality,
1209 "COMMUNITY" => Token::Community,
1210 "COMPONENTS" => Token::Components,
1211 "CYCLES" => Token::Cycles,
1212 "TRAVERSE" => Token::Traverse,
1213 "DEPTH" => Token::Depth,
1214 "DIRECTION" => Token::Direction,
1215 "ALGORITHM" => Token::Algorithm,
1216 "STRATEGY" => Token::Strategy,
1217 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1218 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1219 "MODE" => Token::Mode,
1220 "CLUSTERING" => Token::Clustering,
1221 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1222 "PROPERTIES" => Token::Properties,
1223 "TEXT" => Token::Text,
1224 "FUZZY" => Token::Fuzzy,
1225 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1226 "BEGIN" => Token::Begin,
1227 "COMMIT" => Token::Commit,
1228 "ROLLBACK" => Token::Rollback,
1229 "SAVEPOINT" => Token::Savepoint,
1230 "RELEASE" => Token::Release,
1231 "START" => Token::Start,
1232 "TRANSACTION" => Token::Transaction,
1233 "WORK" => Token::Work,
1234 "VACUUM" => Token::Vacuum,
1235 "ANALYZE" => Token::Analyze,
1236 "SCHEMA" => Token::Schema,
1237 "SEQUENCE" => Token::Sequence,
1238 "INCREMENT" => Token::Increment,
1239 "COPY" => Token::Copy,
1240 "HEADER" => Token::Header,
1241 "DELIMITER" => Token::Delimiter,
1242 "VIEW" => Token::View,
1243 "MATERIALIZED" => Token::Materialized,
1244 "REFRESH" => Token::Refresh,
1245 "PARTITION" => Token::Partition,
1246 "RANGE" => Token::Range,
1247 "LIST" => Token::List,
1248 "HASH" => Token::Hash,
1249 "ATTACH" => Token::Attach,
1250 "DETACH" => Token::Detach,
1251 "OF" => Token::Of,
1252 "POLICY" => Token::Policy,
1253 "ENABLE" => Token::Enable,
1254 "DISABLE" => Token::Disable,
1255 "SECURITY" => Token::Security,
1256 "ROW" => Token::Row,
1257 "LEVEL" => Token::Level,
1258 "FOREIGN" => Token::Foreign,
1259 "SERVER" => Token::Server,
1260 "WRAPPER" => Token::Wrapper,
1261 "OPTIONS" => Token::Options,
1262 "DATA" => Token::Data,
1263 "SESSIONIZE" => Token::Sessionize,
1264 "GAP" => Token::Gap,
1265 _ => Token::Ident(value),
1266 };
1267
1268 Ok(token)
1269 }
1270
1271 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1273 self.advance(); match self.peek() {
1275 Some('=') => {
1276 self.advance();
1277 Ok(Token::Le)
1278 }
1279 Some('>') => {
1280 self.advance();
1281 Ok(Token::Ne)
1282 }
1283 Some('-') => {
1284 self.advance();
1285 Ok(Token::ArrowLeft)
1286 }
1287 _ => Ok(Token::Lt),
1288 }
1289 }
1290
1291 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1293 self.advance(); if self.peek() == Some('=') {
1295 self.advance();
1296 Ok(Token::Ge)
1297 } else {
1298 Ok(Token::Gt)
1299 }
1300 }
1301
1302 fn scan_minus(&mut self) -> Result<Token, LexerError> {
1304 self.advance(); match self.peek() {
1306 Some('>') => {
1307 self.advance();
1308 Ok(Token::Arrow)
1309 }
1310 Some('-') => {
1311 self.advance();
1313 while let Some(c) = self.peek() {
1314 if c == '\n' {
1315 break;
1316 }
1317 self.advance();
1318 }
1319 self.skip_whitespace_simple();
1321 if self.peek().is_none() {
1322 Ok(Token::Eof)
1323 } else {
1324 let next = self.next_token_internal()?;
1325 Ok(next.token)
1326 }
1327 }
1328 _ => Ok(Token::Dash),
1329 }
1330 }
1331
1332 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1334 self.advance(); if self.peek() == Some('.') {
1336 self.advance();
1337 Ok(Token::DotDot)
1338 } else {
1339 Ok(Token::Dot)
1340 }
1341 }
1342
1343 fn looks_like_json_object_start(&self) -> bool {
1348 let bytes = self.input.as_bytes();
1349 let mut i = self.offset as usize;
1350 debug_assert!(bytes.get(i) == Some(&b'{'));
1352 i += 1;
1353 while i < bytes.len() {
1354 match bytes[i] {
1355 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1356 b'"' | b'}' => return true,
1357 _ => return false,
1358 }
1359 }
1360 false
1361 }
1362
1363 fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
1380 let start_offset = self.offset as usize;
1381 self.advance();
1383 let mut depth: u32 = 1;
1384 let mut in_string = false;
1385 let mut escape = false;
1386 loop {
1387 let ch = match self.peek() {
1388 Some(c) => c,
1389 None => {
1390 return Err(LexerError::new(
1391 format!(
1392 "unterminated JSON object literal (started at offset {})",
1393 start.offset
1394 ),
1395 self.position(),
1396 ));
1397 }
1398 };
1399
1400 let scanned_bytes = self.offset as usize - start_offset;
1402 if scanned_bytes > JSON_LITERAL_MAX_BYTES {
1403 return Err(LexerError::new(
1404 format!(
1405 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1406 JSON_LITERAL_MAX_BYTES
1407 ),
1408 start,
1409 ));
1410 }
1411
1412 self.advance();
1413
1414 if escape {
1415 escape = false;
1416 continue;
1417 }
1418
1419 if in_string {
1420 match ch {
1421 '\\' => escape = true,
1422 '"' => in_string = false,
1423 _ => {}
1424 }
1425 continue;
1426 }
1427
1428 match ch {
1429 '"' => in_string = true,
1430 '{' => depth += 1,
1431 '}' => {
1432 depth -= 1;
1433 if depth == 0 {
1434 let end = self.position();
1435 let end_offset = self.offset as usize;
1436 if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
1438 return Err(LexerError::new(
1439 format!(
1440 "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
1441 JSON_LITERAL_MAX_BYTES
1442 ),
1443 start,
1444 ));
1445 }
1446 let raw = self.input[start_offset..end_offset].to_string();
1447 return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
1448 }
1449 }
1450 _ => {}
1451 }
1452 }
1453 }
1454
1455 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1457 let mut tokens = Vec::new();
1458 loop {
1459 let tok = self.next_token()?;
1460 let is_eof = tok.token == Token::Eof;
1461 tokens.push(tok);
1462 if is_eof {
1463 break;
1464 }
1465 }
1466 Ok(tokens)
1467 }
1468}
1469
1470#[cfg(test)]
1475mod tests {
1476 use super::*;
1477
1478 fn tokenize(input: &str) -> Vec<Token> {
1479 let mut lexer = Lexer::new(input);
1480 lexer
1481 .tokenize()
1482 .unwrap()
1483 .into_iter()
1484 .map(|s| s.token)
1485 .collect()
1486 }
1487
/// Upper-case keywords map to their dedicated tokens.
#[test]
fn test_keywords() {
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Or,
        Token::Not,
        Token::Eof,
    ];
    assert_eq!(tokenize("SELECT FROM WHERE AND OR NOT"), expected);
}
1504
/// Non-keyword words, including ones with underscores, become identifiers.
#[test]
fn test_identifiers() {
    let expected = vec![
        Token::Ident("hosts".into()),
        Token::Ident("users".into()),
        Token::Ident("ip_address".into()),
        Token::Eof,
    ];
    assert_eq!(tokenize("hosts users ip_address"), expected);
}
1518
/// Integers, decimals and exponent forms lex to `Integer` / `Float` tokens.
#[test]
fn test_numbers() {
    let expected = vec![
        Token::Integer(42),
        Token::Float(2.5),
        Token::Float(1e10),
        Token::Float(2.5e-3),
        Token::Eof,
    ];
    let actual = tokenize("42 2.5 1e10 2.5e-3");
    assert_eq!(actual, expected);
}
1533
/// Single-quoted and double-quoted literals both become `Token::String`, and
/// a doubled single quote inside a single-quoted literal yields one quote.
#[test]
fn test_strings() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello"), text("world"), text("it's"), Token::Eof];
    let actual = tokenize(r#"'hello' "world" 'it''s'"#);
    assert_eq!(actual, expected);
}
1547
/// Comparison and arithmetic operators; both `<>` and `!=` map to `Token::Ne`,
/// and a standalone `-` lexes as `Token::Dash`.
#[test]
fn test_operators() {
    let expected = vec![
        Token::Eq,
        Token::Ne,
        Token::Lt,
        Token::Le,
        Token::Gt,
        Token::Ge,
        Token::Ne,
        Token::Plus,
        Token::Dash,
        Token::Star,
        Token::Slash,
        Token::Eof,
    ];
    let actual = tokenize("= <> < <= > >= != + - * /");
    assert_eq!(actual, expected);
}
1569
/// Punctuation tokens. The braces here wrap a bare identifier and are expected
/// to come out as separate `LBrace` / `RBrace` tokens.
#[test]
fn test_delimiters() {
    let expected = vec![
        Token::LParen,
        Token::RParen,
        Token::LBracket,
        Token::RBracket,
        Token::LBrace,
        Token::Ident("a".into()),
        Token::RBrace,
        Token::Comma,
        Token::Dot,
        Token::Colon,
        Token::Semi,
        Token::Eof,
    ];
    let actual = tokenize("( ) [ ] { a } , . : ;");
    assert_eq!(actual, expected);
}
1595
/// An empty brace pair is captured verbatim (inner space included) as a single
/// `JsonLiteral` token.
#[test]
fn test_json_literal_empty_object() {
    let raw = "{ }";
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1601
/// A one-key JSON object is returned as a `JsonLiteral` holding the raw text.
#[test]
fn test_json_literal_simple() {
    let raw = r#"{"a":1}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    assert_eq!(tokenize(raw), expected);
}
1610
/// Nested objects and arrays, plus a `}` inside a string value, are all kept
/// inside one `JsonLiteral` token.
#[test]
fn test_json_literal_nested() {
    let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    let actual = tokenize(raw);
    assert_eq!(actual, expected);
}
1617
/// A backslash-escaped quote must not end the JSON string, so the `}` that
/// follows it inside the string does not close the literal early.
#[test]
fn test_json_literal_escaped_quote_in_string() {
    let raw = r#"{"path":"O\"Brien}"}"#;
    let expected = vec![Token::JsonLiteral(raw.into()), Token::Eof];
    let actual = tokenize(raw);
    assert_eq!(actual, expected);
}
1625
/// Input that ends before the JSON object is closed must produce an error
/// whose message mentions an unterminated JSON object literal.
#[test]
fn test_json_literal_unbalanced_eof() {
    let err = Lexer::new(r#"{"a":1"#)
        .tokenize()
        .expect_err("expected unterminated error");
    let mentions_unterminated = err.message.contains("unterminated JSON object literal");
    assert!(mentions_unterminated, "got: {}", err.message);
}
1636
/// A brace block with an unquoted key (`{name: 'value'}`) is not swallowed as
/// a JSON literal: the stream starts with `LBrace` and ends with `Eof`.
#[test]
fn test_json_literal_property_bag_compatible() {
    let tokens = tokenize("{name: 'value'}");
    assert_eq!(tokens.first(), Some(&Token::LBrace));
    assert_eq!(tokens.last(), Some(&Token::Eof));
}
1645
/// Graph-pattern punctuation: both arrow directions, a lone dash and `..`.
#[test]
fn test_graph_syntax() {
    let expected = vec![
        Token::Arrow,
        Token::ArrowLeft,
        Token::Dash,
        Token::DotDot,
        Token::Eof,
    ];
    let actual = tokenize("-> <- - ..");
    assert_eq!(actual, expected);
}
1660
/// End-to-end token stream for a simple `SELECT ... WHERE ... LIMIT` query.
#[test]
fn test_table_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::Comma,
        ident("hostname"),
        Token::From,
        ident("hosts"),
        Token::Where,
        ident("os"),
        Token::Eq,
        Token::String("Linux".into()),
        Token::Limit,
        Token::Integer(10),
        Token::Eof,
    ];
    let actual = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
    assert_eq!(actual, expected);
}
1683
/// End-to-end token stream for a `MATCH ... RETURN` graph pattern with labelled
/// nodes and a typed edge.
#[test]
fn test_graph_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Match,
        // (h:Host)
        Token::LParen,
        ident("h"),
        Token::Colon,
        ident("Host"),
        Token::RParen,
        // -[:HAS_SERVICE]->
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_SERVICE"),
        Token::RBracket,
        Token::Arrow,
        // (s:Service)
        Token::LParen,
        ident("s"),
        Token::Colon,
        ident("Service"),
        Token::RParen,
        // RETURN h, s
        Token::Return,
        ident("h"),
        Token::Comma,
        ident("s"),
        Token::Eof,
    ];
    let actual = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
    assert_eq!(actual, expected);
}
1715
/// End-to-end token stream for a table source joined to a graph pattern with
/// an `ON` condition over dotted column references.
#[test]
fn test_join_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        // FROM hosts h JOIN GRAPH
        Token::From,
        ident("hosts"),
        ident("h"),
        Token::Join,
        Token::Graph,
        // (h)
        Token::LParen,
        ident("h"),
        Token::RParen,
        // -[:HAS_VULN]->
        Token::Dash,
        Token::LBracket,
        Token::Colon,
        ident("HAS_VULN"),
        Token::RBracket,
        Token::Arrow,
        // (v)
        Token::LParen,
        ident("v"),
        Token::RParen,
        // ON h.ip = v.id
        Token::On,
        ident("h"),
        Token::Dot,
        ident("ip"),
        Token::Eq,
        ident("v"),
        Token::Dot,
        ident("id"),
        Token::Eof,
    ];
    let actual = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
    assert_eq!(actual, expected);
}
1751
/// End-to-end token stream for a `PATH FROM ... TO ... VIA ...` query; the
/// dotted IP addresses sit inside string literals and stay intact.
#[test]
fn test_path_query() {
    let ident = |name: &str| Token::Ident(name.into());
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        Token::Path,
        // FROM host('192.168.1.1')
        Token::From,
        ident("host"),
        Token::LParen,
        text("192.168.1.1"),
        Token::RParen,
        // TO host('10.0.0.1')
        Token::To,
        ident("host"),
        Token::LParen,
        text("10.0.0.1"),
        Token::RParen,
        // VIA [:AUTH]
        Token::Via,
        Token::LBracket,
        Token::Colon,
        ident("AUTH"),
        Token::RBracket,
        Token::Eof,
    ];
    let actual = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
    assert_eq!(actual, expected);
}
1778
/// A variable-length edge `[*1..5]` lexes as `Star`, `Integer`, `DotDot`,
/// `Integer`; the `1..5` range is not read as a float.
#[test]
fn test_variable_length_pattern() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::LParen,
        ident("a"),
        Token::RParen,
        Token::Dash,
        Token::LBracket,
        Token::Star,
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(5),
        Token::RBracket,
        Token::Arrow,
        Token::LParen,
        ident("b"),
        Token::RParen,
        Token::Eof,
    ];
    let actual = tokenize("(a)-[*1..5]->(b)");
    assert_eq!(actual, expected);
}
1803
/// Keywords are recognised in lower, upper and mixed case alike.
#[test]
fn test_case_insensitive_keywords() {
    let expected = vec![
        Token::Select,
        Token::From,
        Token::Where,
        Token::And,
        Token::Eof,
    ];
    let actual = tokenize("select FROM Where AND");
    assert_eq!(actual, expected);
}
1818
/// A `--` comment runs to the end of its line and yields no tokens.
#[test]
fn test_comments() {
    let ident = |name: &str| Token::Ident(name.into());
    let expected = vec![
        Token::Select,
        ident("ip"),
        Token::From,
        ident("hosts"),
        Token::Eof,
    ];
    let actual = tokenize("SELECT -- this is a comment\nip FROM hosts");
    assert_eq!(actual, expected);
}
1833
/// Backslash escapes `\n` and `\t` inside string literals are decoded into the
/// actual newline and tab characters.
#[test]
fn test_escaped_strings() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![text("hello\nworld"), text("tab\there"), Token::Eof];
    // Raw input: the lexer sees a literal backslash followed by a letter.
    let actual = tokenize(r"'hello\nworld' 'tab\there'");
    assert_eq!(actual, expected);
}
1846
/// Table-driven check that every keyword spelling lexes to its expected token.
///
/// Each row is lexed on its own and must yield exactly `[expected, Eof]`.
/// Several keywords appear under two spellings (with and without an
/// underscore), and `LPUSH` / `RPUSH` / `LPOP` / `RPOP` are expected to stay
/// plain identifiers.
#[test]
fn test_keyword_matrix_and_alias_spellings() {
    let keyword_cases = [
        ("SELECT", Token::Select),
        ("FROM", Token::From),
        ("WHERE", Token::Where),
        ("AND", Token::And),
        ("OR", Token::Or),
        ("NOT", Token::Not),
        ("MATCH", Token::Match),
        ("RETURN", Token::Return),
        ("JOIN", Token::Join),
        ("GRAPH", Token::Graph),
        ("PATH", Token::Path),
        ("TO", Token::To),
        ("VIA", Token::Via),
        ("ON", Token::On),
        ("AS", Token::As),
        ("IS", Token::Is),
        ("NULL", Token::Null),
        ("BETWEEN", Token::Between),
        ("LIKE", Token::Like),
        ("IN", Token::In),
        ("ORDER", Token::Order),
        ("BY", Token::By),
        ("ASC", Token::Asc),
        ("DESC", Token::Desc),
        ("NULLS", Token::Nulls),
        ("FIRST", Token::First),
        ("LAST", Token::Last),
        ("LIMIT", Token::Limit),
        ("OFFSET", Token::Offset),
        ("INNER", Token::Inner),
        ("LEFT", Token::Left),
        ("RIGHT", Token::Right),
        ("OUTER", Token::Outer),
        ("FULL", Token::Full),
        ("CROSS", Token::Cross),
        ("STARTS", Token::Starts),
        ("ENDS", Token::Ends),
        ("WITH", Token::With),
        ("CONTAINS", Token::Contains),
        ("TRUE", Token::True),
        ("FALSE", Token::False),
        ("ENRICH", Token::Enrich),
        ("GROUP", Token::Group),
        ("COUNT", Token::Count),
        ("SUM", Token::Sum),
        ("AVG", Token::Avg),
        ("MIN", Token::Min),
        ("MAX", Token::Max),
        ("DISTINCT", Token::Distinct),
        ("VECTOR", Token::Vector),
        ("SEARCH", Token::Search),
        ("SIMILAR", Token::Similar),
        ("COLLECTION", Token::Collection),
        ("METRIC", Token::Metric),
        ("THRESHOLD", Token::Threshold),
        ("K", Token::K),
        ("HYBRID", Token::Hybrid),
        ("FUSION", Token::Fusion),
        ("RERANK", Token::Rerank),
        ("RRF", Token::Rrf),
        ("INTERSECTION", Token::Intersection),
        ("UNION", Token::Union),
        ("RECURSIVE", Token::Recursive),
        ("ALL", Token::All),
        ("WEIGHT", Token::Weight),
        ("L2", Token::L2),
        ("COSINE", Token::Cosine),
        ("INNER_PRODUCT", Token::InnerProduct),
        ("INNERPRODUCT", Token::InnerProduct),
        ("INCLUDE", Token::Include),
        ("METADATA", Token::Metadata),
        ("VECTORS", Token::Vectors),
        ("EXPLAIN", Token::Explain),
        ("FOR", Token::For),
        ("FORMAT", Token::Format),
        ("JSON", Token::Json),
        ("INSERT", Token::Insert),
        ("INTO", Token::Into),
        ("VALUES", Token::Values),
        ("UPDATE", Token::Update),
        ("SET", Token::Set),
        ("DELETE", Token::Delete),
        ("TRUNCATE", Token::Truncate),
        ("CREATE", Token::Create),
        ("TABLE", Token::Table),
        ("DROP", Token::Drop),
        ("ALTER", Token::Alter),
        ("ADD", Token::Add),
        ("COLUMN", Token::Column),
        ("PRIMARY", Token::Primary),
        ("KEY", Token::Key),
        ("DEFAULT", Token::Default),
        ("COMPRESS", Token::Compress),
        ("INDEX", Token::Index),
        ("UNIQUE", Token::Unique),
        ("IF", Token::If),
        ("EXISTS", Token::Exists),
        ("RETURNING", Token::Returning),
        ("CASCADE", Token::Cascade),
        ("RENAME", Token::Rename),
        ("USING", Token::Using),
        ("NODE", Token::Node),
        ("EDGE", Token::Edge),
        ("DOCUMENT", Token::Document),
        ("KV", Token::Kv),
        ("TIMESERIES", Token::Timeseries),
        ("RETENTION", Token::Retention),
        ("QUEUE", Token::Queue),
        ("TREE", Token::Tree),
        ("PUSH", Token::Push),
        ("POP", Token::Pop),
        ("PEEK", Token::Peek),
        ("PURGE", Token::Purge),
        ("ACK", Token::Ack),
        ("NACK", Token::Nack),
        ("PRIORITY", Token::Priority),
        ("LPUSH", Token::Ident("LPUSH".into())),
        ("RPUSH", Token::Ident("RPUSH".into())),
        ("LPOP", Token::Ident("LPOP".into())),
        ("RPOP", Token::Ident("RPOP".into())),
        ("NEIGHBORHOOD", Token::Neighborhood),
        ("SHORTEST_PATH", Token::ShortestPath),
        ("SHORTESTPATH", Token::ShortestPath),
        ("CENTRALITY", Token::Centrality),
        ("COMMUNITY", Token::Community),
        ("COMPONENTS", Token::Components),
        ("CYCLES", Token::Cycles),
        ("TRAVERSE", Token::Traverse),
        ("DEPTH", Token::Depth),
        ("DIRECTION", Token::Direction),
        ("ALGORITHM", Token::Algorithm),
        ("STRATEGY", Token::Strategy),
        ("MAX_ITERATIONS", Token::MaxIterations),
        ("MAXITERATIONS", Token::MaxIterations),
        ("MAX_LENGTH", Token::MaxLength),
        ("MAXLENGTH", Token::MaxLength),
        ("MODE", Token::Mode),
        ("CLUSTERING", Token::Clustering),
        ("TOPOLOGICAL_SORT", Token::TopologicalSort),
        ("TOPOLOGICALSORT", Token::TopologicalSort),
        ("PROPERTIES", Token::Properties),
        ("TEXT", Token::Text),
        ("FUZZY", Token::Fuzzy),
        ("MIN_SCORE", Token::MinScore),
        ("MINSCORE", Token::MinScore),
        ("BEGIN", Token::Begin),
        ("COMMIT", Token::Commit),
        ("ROLLBACK", Token::Rollback),
        ("SAVEPOINT", Token::Savepoint),
        ("RELEASE", Token::Release),
        ("START", Token::Start),
        ("TRANSACTION", Token::Transaction),
        ("WORK", Token::Work),
        ("VACUUM", Token::Vacuum),
        ("ANALYZE", Token::Analyze),
        ("SCHEMA", Token::Schema),
        ("SEQUENCE", Token::Sequence),
        ("INCREMENT", Token::Increment),
        ("COPY", Token::Copy),
        ("HEADER", Token::Header),
        ("DELIMITER", Token::Delimiter),
        ("VIEW", Token::View),
        ("MATERIALIZED", Token::Materialized),
        ("REFRESH", Token::Refresh),
        ("PARTITION", Token::Partition),
        ("RANGE", Token::Range),
        ("LIST", Token::List),
        ("HASH", Token::Hash),
        ("ATTACH", Token::Attach),
        ("DETACH", Token::Detach),
        ("OF", Token::Of),
        ("POLICY", Token::Policy),
        ("ENABLE", Token::Enable),
        ("DISABLE", Token::Disable),
        ("SECURITY", Token::Security),
        ("ROW", Token::Row),
        ("LEVEL", Token::Level),
        ("FOREIGN", Token::Foreign),
        ("SERVER", Token::Server),
        ("WRAPPER", Token::Wrapper),
        ("OPTIONS", Token::Options),
        ("DATA", Token::Data),
        ("plain_ident", Token::Ident("plain_ident".into())),
    ];

    for (input, keyword_token) in keyword_cases {
        let want = vec![keyword_token, Token::Eof];
        // The failing spelling is reported through the assertion message.
        assert_eq!(tokenize(input), want, "{input}");
    }
}
2040
/// Table-driven check of the `Display` output for token variants.
///
/// Keyword tokens render as their upper-case spelling (underscore form where
/// two spellings exist), string literals are wrapped in single quotes, and
/// `Minus` and `Dash` both render as `-`.
#[test]
fn test_display_all_token_variants() {
    let display_cases = [
        (Token::Select, "SELECT"),
        (Token::From, "FROM"),
        (Token::Where, "WHERE"),
        (Token::And, "AND"),
        (Token::Or, "OR"),
        (Token::Not, "NOT"),
        (Token::Match, "MATCH"),
        (Token::Return, "RETURN"),
        (Token::Join, "JOIN"),
        (Token::Graph, "GRAPH"),
        (Token::Path, "PATH"),
        (Token::To, "TO"),
        (Token::Via, "VIA"),
        (Token::On, "ON"),
        (Token::As, "AS"),
        (Token::Is, "IS"),
        (Token::Null, "NULL"),
        (Token::Between, "BETWEEN"),
        (Token::Like, "LIKE"),
        (Token::In, "IN"),
        (Token::Order, "ORDER"),
        (Token::By, "BY"),
        (Token::Asc, "ASC"),
        (Token::Desc, "DESC"),
        (Token::Nulls, "NULLS"),
        (Token::First, "FIRST"),
        (Token::Last, "LAST"),
        (Token::Limit, "LIMIT"),
        (Token::Offset, "OFFSET"),
        (Token::Inner, "INNER"),
        (Token::Left, "LEFT"),
        (Token::Right, "RIGHT"),
        (Token::Outer, "OUTER"),
        (Token::Full, "FULL"),
        (Token::Cross, "CROSS"),
        (Token::Starts, "STARTS"),
        (Token::Ends, "ENDS"),
        (Token::With, "WITH"),
        (Token::Contains, "CONTAINS"),
        (Token::True, "TRUE"),
        (Token::False, "FALSE"),
        (Token::Enrich, "ENRICH"),
        (Token::Group, "GROUP"),
        (Token::Count, "COUNT"),
        (Token::Sum, "SUM"),
        (Token::Avg, "AVG"),
        (Token::Min, "MIN"),
        (Token::Max, "MAX"),
        (Token::Distinct, "DISTINCT"),
        (Token::Vector, "VECTOR"),
        (Token::Search, "SEARCH"),
        (Token::Similar, "SIMILAR"),
        (Token::Collection, "COLLECTION"),
        (Token::Metric, "METRIC"),
        (Token::Threshold, "THRESHOLD"),
        (Token::K, "K"),
        (Token::Hybrid, "HYBRID"),
        (Token::Fusion, "FUSION"),
        (Token::Rerank, "RERANK"),
        (Token::Rrf, "RRF"),
        (Token::Intersection, "INTERSECTION"),
        (Token::Union, "UNION"),
        (Token::Recursive, "RECURSIVE"),
        (Token::All, "ALL"),
        (Token::Weight, "WEIGHT"),
        (Token::L2, "L2"),
        (Token::Cosine, "COSINE"),
        (Token::InnerProduct, "INNER_PRODUCT"),
        (Token::Include, "INCLUDE"),
        (Token::Metadata, "METADATA"),
        (Token::Vectors, "VECTORS"),
        (Token::Explain, "EXPLAIN"),
        (Token::For, "FOR"),
        (Token::Format, "FORMAT"),
        (Token::Json, "JSON"),
        (Token::Insert, "INSERT"),
        (Token::Into, "INTO"),
        (Token::Values, "VALUES"),
        (Token::Update, "UPDATE"),
        (Token::Set, "SET"),
        (Token::Delete, "DELETE"),
        (Token::Truncate, "TRUNCATE"),
        (Token::Create, "CREATE"),
        (Token::Table, "TABLE"),
        (Token::Drop, "DROP"),
        (Token::Alter, "ALTER"),
        (Token::Add, "ADD"),
        (Token::Column, "COLUMN"),
        (Token::Primary, "PRIMARY"),
        (Token::Key, "KEY"),
        (Token::Default, "DEFAULT"),
        (Token::Compress, "COMPRESS"),
        (Token::Index, "INDEX"),
        (Token::Unique, "UNIQUE"),
        (Token::If, "IF"),
        (Token::Exists, "EXISTS"),
        (Token::Returning, "RETURNING"),
        (Token::Cascade, "CASCADE"),
        (Token::Rename, "RENAME"),
        (Token::Using, "USING"),
        (Token::Node, "NODE"),
        (Token::Edge, "EDGE"),
        (Token::Document, "DOCUMENT"),
        (Token::Kv, "KV"),
        (Token::Timeseries, "TIMESERIES"),
        (Token::Retention, "RETENTION"),
        (Token::Queue, "QUEUE"),
        (Token::Tree, "TREE"),
        (Token::Push, "PUSH"),
        (Token::Pop, "POP"),
        (Token::Peek, "PEEK"),
        (Token::Purge, "PURGE"),
        (Token::Ack, "ACK"),
        (Token::Nack, "NACK"),
        (Token::Priority, "PRIORITY"),
        (Token::Neighborhood, "NEIGHBORHOOD"),
        (Token::ShortestPath, "SHORTEST_PATH"),
        (Token::Centrality, "CENTRALITY"),
        (Token::Community, "COMMUNITY"),
        (Token::Components, "COMPONENTS"),
        (Token::Cycles, "CYCLES"),
        (Token::Traverse, "TRAVERSE"),
        (Token::Depth, "DEPTH"),
        (Token::Direction, "DIRECTION"),
        (Token::Algorithm, "ALGORITHM"),
        (Token::Strategy, "STRATEGY"),
        (Token::MaxIterations, "MAX_ITERATIONS"),
        (Token::MaxLength, "MAX_LENGTH"),
        (Token::Mode, "MODE"),
        (Token::Clustering, "CLUSTERING"),
        (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
        (Token::Properties, "PROPERTIES"),
        (Token::Text, "TEXT"),
        (Token::Fuzzy, "FUZZY"),
        (Token::MinScore, "MIN_SCORE"),
        (Token::Begin, "BEGIN"),
        (Token::Commit, "COMMIT"),
        (Token::Rollback, "ROLLBACK"),
        (Token::Savepoint, "SAVEPOINT"),
        (Token::Release, "RELEASE"),
        (Token::Start, "START"),
        (Token::Transaction, "TRANSACTION"),
        (Token::Work, "WORK"),
        (Token::Vacuum, "VACUUM"),
        (Token::Analyze, "ANALYZE"),
        (Token::Schema, "SCHEMA"),
        (Token::Sequence, "SEQUENCE"),
        (Token::Increment, "INCREMENT"),
        (Token::Copy, "COPY"),
        (Token::Header, "HEADER"),
        (Token::Delimiter, "DELIMITER"),
        (Token::View, "VIEW"),
        (Token::Materialized, "MATERIALIZED"),
        (Token::Refresh, "REFRESH"),
        (Token::Partition, "PARTITION"),
        (Token::Range, "RANGE"),
        (Token::List, "LIST"),
        (Token::Hash, "HASH"),
        (Token::Attach, "ATTACH"),
        (Token::Detach, "DETACH"),
        (Token::Of, "OF"),
        (Token::Policy, "POLICY"),
        (Token::Enable, "ENABLE"),
        (Token::Disable, "DISABLE"),
        (Token::Security, "SECURITY"),
        (Token::Row, "ROW"),
        (Token::Level, "LEVEL"),
        (Token::Foreign, "FOREIGN"),
        (Token::Server, "SERVER"),
        (Token::Wrapper, "WRAPPER"),
        (Token::Options, "OPTIONS"),
        (Token::Data, "DATA"),
        (Token::String("x".into()), "'x'"),
        (Token::Integer(7), "7"),
        (Token::Float(1.5), "1.5"),
        (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
        (Token::Ident("id".into()), "id"),
        (Token::Eq, "="),
        (Token::Ne, "<>"),
        (Token::Lt, "<"),
        (Token::Le, "<="),
        (Token::Gt, ">"),
        (Token::Ge, ">="),
        (Token::Plus, "+"),
        (Token::Minus, "-"),
        (Token::Star, "*"),
        (Token::Slash, "/"),
        (Token::Percent, "%"),
        (Token::LParen, "("),
        (Token::RParen, ")"),
        (Token::LBracket, "["),
        (Token::RBracket, "]"),
        (Token::LBrace, "{"),
        (Token::RBrace, "}"),
        (Token::Comma, ","),
        (Token::Dot, "."),
        (Token::Colon, ":"),
        (Token::Semi, ";"),
        (Token::Dollar, "$"),
        (Token::Arrow, "->"),
        (Token::ArrowLeft, "<-"),
        (Token::Dash, "-"),
        (Token::DotDot, ".."),
        (Token::Pipe, "|"),
        (Token::DoublePipe, "||"),
        (Token::Eof, "EOF"),
    ];

    for (token, rendered) in display_cases {
        let actual = token.to_string();
        assert_eq!(actual, rendered);
    }
}
2256
/// String escape decoding plus the unterminated-string error paths.
///
/// Covers `\n`, `\r`, `\t`, `\\`, an escaped quote of either kind, and an
/// unknown escape (`\z`), which is expected to be kept as written. Then checks
/// that a missing closing quote and a trailing backslash at end of input both
/// report "Unterminated string".
#[test]
fn test_string_escape_and_error_matrix() {
    let text = |value: &str| Token::String(value.into());
    let expected = vec![
        text("line\nrow"),
        text("carriage\rreturn"),
        text("tab\tstop"),
        text("slash\\"),
        text("quote'"),
        text("dq\""),
        text(r"raw\z"),
        Token::Eof,
    ];
    let actual = tokenize(
        r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
    );
    assert_eq!(actual, expected);

    // Both inputs reach end of input while still inside a string literal.
    for broken in ["'unterminated", r"'bad\"] {
        let mut lexer = Lexer::new(broken);
        let err = lexer.next_token().unwrap_err();
        assert!(err.message.contains("Unterminated string"));
    }
}
2290
/// Mixed coverage of less common lexer paths.
///
/// 1. Operators and punctuation (`!=`, `%`, `$`, `||`, `|`), a number followed
///    by `.ident`, an integer range, a signed exponent, both arrows, and a
///    `/* ... */` block comment that yields no tokens.
/// 2. `peek_token` returns the upcoming token without consuming it.
/// 3. A lone `!` is rejected with an "Expected '=' after '!'" error.
/// 4. A lexer built with `max_identifier_chars: 3` rejects a four-character
///    identifier and reports `LexerLimitHit::IdentifierTooLong`.
#[test]
fn test_operator_comment_peek_limit_and_tokenize_paths() {
    use crate::storage::query::parser::ParserLimits;

    let expected = vec![
        Token::Ne,
        Token::Percent,
        Token::Semi,
        Token::Dollar,
        Token::DoublePipe,
        Token::Pipe,
        Token::Integer(123),
        Token::Dot,
        Token::Ident("abc".into()),
        Token::Integer(1),
        Token::DotDot,
        Token::Integer(2),
        Token::Float(1e2),
        Token::ArrowLeft,
        Token::Arrow,
        Token::Select,
        Token::Eof,
    ];
    let actual = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
    assert_eq!(actual, expected);

    // Peeking must leave the token in place for the next `next_token` call.
    let mut peeking = Lexer::new("SELECT FROM");
    assert_eq!(peeking.peek_token().unwrap().token, Token::Select);
    assert_eq!(peeking.next_token().unwrap().token, Token::Select);
    assert_eq!(peeking.next_token().unwrap().token, Token::From);

    // `!` is only valid as the first half of `!=`.
    let mut bang = Lexer::new("!");
    let bang_err = bang.next_token().unwrap_err();
    assert!(bang_err.message.contains("Expected '=' after '!'"));

    // Lower the identifier length limit so that "abcd" exceeds it.
    let limits = ParserLimits {
        max_identifier_chars: 3,
        ..ParserLimits::default()
    };
    let mut limited = Lexer::with_limits("abcd", limits);
    assert_eq!(limited.max_identifier_chars(), 3);
    let limit_err = limited.next_token().unwrap_err();
    let hit_identifier_limit = matches!(
        limit_err.limit_hit,
        Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
    );
    assert!(hit_identifier_limit);
}
2341}