1use std::fmt;
15use std::iter::Peekable;
16use std::str::Chars;
17
/// Lexical tokens produced by [`Lexer`].
///
/// Keyword variants are matched case-insensitively in `scan_identifier`;
/// any word not in the keyword table becomes `Ident` (original casing kept).
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Core query keywords (SELECT/WHERE, joins, ordering, aggregates, ...).
    Select,
    From,
    Where,
    And,
    Or,
    Not,
    Match,
    Return,
    Join,
    Graph,
    Path,
    To,
    Via,
    On,
    As,
    Is,
    Null,
    Between,
    Like,
    In,
    Order,
    By,
    Asc,
    Desc,
    Nulls,
    First,
    Last,
    Limit,
    Offset,
    Inner,
    Left,
    Right,
    Outer,
    Full,
    Cross,
    Starts,
    Ends,
    With,
    Contains,
    True,
    False,
    Enrich,
    Group,
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,

    // Vector / hybrid search keywords.
    Vector,
    Search,
    Similar,
    Collection,
    Metric,
    Threshold,
    K,
    Hybrid,
    Fusion,
    Rerank,
    Rrf,
    Intersection,
    Union,
    Recursive,
    All,
    Weight,
    L2,
    Cosine,
    InnerProduct,
    Include,
    Metadata,
    Vectors,

    // DML / DDL keywords.
    Insert,
    Into,
    Values,
    Update,
    Set,
    Delete,
    Truncate,
    Create,
    Table,
    Drop,
    Alter,
    Add,
    Column,
    Primary,
    Explain,
    For,
    Format,
    Json,
    Key,
    Default,
    Compress,
    Index,
    Unique,
    If,
    Exists,
    Returning,
    Cascade,
    Rename,
    Using,

    // Data-model kind keywords.
    Node,
    Edge,
    Document,
    Kv,

    // Time-series and queue keywords.
    Timeseries,
    Retention,
    Queue,
    Tree,
    Push,
    Pop,
    Peek,
    Purge,
    Ack,
    Nack,
    Priority,

    // Graph-algorithm and text-search keywords.
    Neighborhood,
    ShortestPath,
    Centrality,
    Community,
    Components,
    Cycles,
    Traverse,
    Depth,
    Direction,
    Algorithm,
    Strategy,
    MaxIterations,
    MaxLength,
    Mode,
    Clustering,
    TopologicalSort,
    Properties,
    Text,
    Fuzzy,
    MinScore,

    // Transaction-control keywords.
    Begin,
    Commit,
    Rollback,
    Savepoint,
    Release,
    Start,
    Transaction,
    Work,

    // Maintenance keywords.
    Vacuum,
    Analyze,

    // Schema and sequence keywords.
    Schema,
    Sequence,
    Increment,

    // COPY keywords.
    Copy,
    Header,
    Delimiter,

    // View keywords.
    View,
    Materialized,
    Refresh,

    // Partitioning keywords.
    Partition,
    Range,
    List,
    Hash,
    Attach,
    Detach,
    Of,

    // Row-level-security keywords.
    Policy,
    Enable,
    Disable,
    Security,
    Row,
    Level,

    // Foreign-data-wrapper keywords.
    Foreign,
    Server,
    Wrapper,
    Options,
    Data,

    // Literals. `JsonLiteral` holds the raw `{...}` source text, braces included.
    String(String),
    Integer(i64),
    Float(f64),
    JsonLiteral(String),

    // Non-keyword identifier (original casing preserved).
    Ident(String),

    // Punctuation and operators; `Eof` marks end of input.
    Eq, Ne, Lt, Le, Gt, Ge, Plus, Minus, Star, Slash, Percent, LParen, RParen, LBracket, RBracket, LBrace, RBrace, Comma, Dot, Colon, Semi, Dollar, Question, Arrow, ArrowLeft, Dash, DotDot, Pipe, DoublePipe, Eof,
}
274
/// Renders each token as its canonical source form: keywords in upper case,
/// string literals re-quoted with single quotes, punctuation verbatim.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Select => write!(f, "SELECT"),
            Token::From => write!(f, "FROM"),
            Token::Where => write!(f, "WHERE"),
            Token::And => write!(f, "AND"),
            Token::Or => write!(f, "OR"),
            Token::Not => write!(f, "NOT"),
            Token::Match => write!(f, "MATCH"),
            Token::Return => write!(f, "RETURN"),
            Token::Join => write!(f, "JOIN"),
            Token::Graph => write!(f, "GRAPH"),
            Token::Path => write!(f, "PATH"),
            Token::To => write!(f, "TO"),
            Token::Via => write!(f, "VIA"),
            Token::On => write!(f, "ON"),
            Token::As => write!(f, "AS"),
            Token::Is => write!(f, "IS"),
            Token::Null => write!(f, "NULL"),
            Token::Between => write!(f, "BETWEEN"),
            Token::Like => write!(f, "LIKE"),
            Token::In => write!(f, "IN"),
            Token::Order => write!(f, "ORDER"),
            Token::By => write!(f, "BY"),
            Token::Asc => write!(f, "ASC"),
            Token::Desc => write!(f, "DESC"),
            Token::Nulls => write!(f, "NULLS"),
            Token::First => write!(f, "FIRST"),
            Token::Last => write!(f, "LAST"),
            Token::Limit => write!(f, "LIMIT"),
            Token::Offset => write!(f, "OFFSET"),
            Token::Inner => write!(f, "INNER"),
            Token::Left => write!(f, "LEFT"),
            Token::Right => write!(f, "RIGHT"),
            Token::Outer => write!(f, "OUTER"),
            Token::Full => write!(f, "FULL"),
            Token::Cross => write!(f, "CROSS"),
            Token::Starts => write!(f, "STARTS"),
            Token::Ends => write!(f, "ENDS"),
            Token::With => write!(f, "WITH"),
            Token::Contains => write!(f, "CONTAINS"),
            Token::True => write!(f, "TRUE"),
            Token::False => write!(f, "FALSE"),
            Token::Enrich => write!(f, "ENRICH"),
            Token::Group => write!(f, "GROUP"),
            Token::Count => write!(f, "COUNT"),
            Token::Sum => write!(f, "SUM"),
            Token::Avg => write!(f, "AVG"),
            Token::Min => write!(f, "MIN"),
            Token::Max => write!(f, "MAX"),
            Token::Distinct => write!(f, "DISTINCT"),
            Token::Vector => write!(f, "VECTOR"),
            Token::Search => write!(f, "SEARCH"),
            Token::Similar => write!(f, "SIMILAR"),
            Token::Collection => write!(f, "COLLECTION"),
            Token::Metric => write!(f, "METRIC"),
            Token::Threshold => write!(f, "THRESHOLD"),
            Token::K => write!(f, "K"),
            Token::Hybrid => write!(f, "HYBRID"),
            Token::Fusion => write!(f, "FUSION"),
            Token::Rerank => write!(f, "RERANK"),
            Token::Rrf => write!(f, "RRF"),
            Token::Intersection => write!(f, "INTERSECTION"),
            Token::Union => write!(f, "UNION"),
            Token::Recursive => write!(f, "RECURSIVE"),
            Token::All => write!(f, "ALL"),
            Token::Weight => write!(f, "WEIGHT"),
            Token::L2 => write!(f, "L2"),
            Token::Cosine => write!(f, "COSINE"),
            // Multi-word keywords render in their underscored canonical form.
            Token::InnerProduct => write!(f, "INNER_PRODUCT"),
            Token::Include => write!(f, "INCLUDE"),
            Token::Metadata => write!(f, "METADATA"),
            Token::Vectors => write!(f, "VECTORS"),
            Token::Explain => write!(f, "EXPLAIN"),
            Token::For => write!(f, "FOR"),
            Token::Format => write!(f, "FORMAT"),
            Token::Json => write!(f, "JSON"),
            Token::Insert => write!(f, "INSERT"),
            Token::Into => write!(f, "INTO"),
            Token::Values => write!(f, "VALUES"),
            Token::Update => write!(f, "UPDATE"),
            Token::Set => write!(f, "SET"),
            Token::Delete => write!(f, "DELETE"),
            Token::Truncate => write!(f, "TRUNCATE"),
            Token::Create => write!(f, "CREATE"),
            Token::Table => write!(f, "TABLE"),
            Token::Drop => write!(f, "DROP"),
            Token::Alter => write!(f, "ALTER"),
            Token::Add => write!(f, "ADD"),
            Token::Column => write!(f, "COLUMN"),
            Token::Primary => write!(f, "PRIMARY"),
            Token::Key => write!(f, "KEY"),
            Token::Default => write!(f, "DEFAULT"),
            Token::Compress => write!(f, "COMPRESS"),
            Token::Index => write!(f, "INDEX"),
            Token::Unique => write!(f, "UNIQUE"),
            Token::If => write!(f, "IF"),
            Token::Exists => write!(f, "EXISTS"),
            Token::Returning => write!(f, "RETURNING"),
            Token::Cascade => write!(f, "CASCADE"),
            Token::Rename => write!(f, "RENAME"),
            Token::Using => write!(f, "USING"),
            Token::Node => write!(f, "NODE"),
            Token::Edge => write!(f, "EDGE"),
            Token::Document => write!(f, "DOCUMENT"),
            Token::Kv => write!(f, "KV"),
            Token::Timeseries => write!(f, "TIMESERIES"),
            Token::Retention => write!(f, "RETENTION"),
            Token::Queue => write!(f, "QUEUE"),
            Token::Tree => write!(f, "TREE"),
            Token::Push => write!(f, "PUSH"),
            Token::Pop => write!(f, "POP"),
            Token::Peek => write!(f, "PEEK"),
            Token::Purge => write!(f, "PURGE"),
            Token::Ack => write!(f, "ACK"),
            Token::Nack => write!(f, "NACK"),
            Token::Priority => write!(f, "PRIORITY"),
            Token::Neighborhood => write!(f, "NEIGHBORHOOD"),
            Token::ShortestPath => write!(f, "SHORTEST_PATH"),
            Token::Centrality => write!(f, "CENTRALITY"),
            Token::Community => write!(f, "COMMUNITY"),
            Token::Components => write!(f, "COMPONENTS"),
            Token::Cycles => write!(f, "CYCLES"),
            Token::Traverse => write!(f, "TRAVERSE"),
            Token::Depth => write!(f, "DEPTH"),
            Token::Direction => write!(f, "DIRECTION"),
            Token::Algorithm => write!(f, "ALGORITHM"),
            Token::Strategy => write!(f, "STRATEGY"),
            Token::MaxIterations => write!(f, "MAX_ITERATIONS"),
            Token::MaxLength => write!(f, "MAX_LENGTH"),
            Token::Mode => write!(f, "MODE"),
            Token::Clustering => write!(f, "CLUSTERING"),
            Token::TopologicalSort => write!(f, "TOPOLOGICAL_SORT"),
            Token::Properties => write!(f, "PROPERTIES"),
            Token::Text => write!(f, "TEXT"),
            Token::Fuzzy => write!(f, "FUZZY"),
            Token::MinScore => write!(f, "MIN_SCORE"),
            Token::Begin => write!(f, "BEGIN"),
            Token::Commit => write!(f, "COMMIT"),
            Token::Rollback => write!(f, "ROLLBACK"),
            Token::Savepoint => write!(f, "SAVEPOINT"),
            Token::Release => write!(f, "RELEASE"),
            Token::Start => write!(f, "START"),
            Token::Transaction => write!(f, "TRANSACTION"),
            Token::Work => write!(f, "WORK"),
            Token::Vacuum => write!(f, "VACUUM"),
            Token::Analyze => write!(f, "ANALYZE"),
            Token::Schema => write!(f, "SCHEMA"),
            Token::Sequence => write!(f, "SEQUENCE"),
            Token::Increment => write!(f, "INCREMENT"),
            Token::Copy => write!(f, "COPY"),
            Token::Header => write!(f, "HEADER"),
            Token::Delimiter => write!(f, "DELIMITER"),
            Token::View => write!(f, "VIEW"),
            Token::Materialized => write!(f, "MATERIALIZED"),
            Token::Refresh => write!(f, "REFRESH"),
            Token::Partition => write!(f, "PARTITION"),
            Token::Range => write!(f, "RANGE"),
            Token::List => write!(f, "LIST"),
            Token::Hash => write!(f, "HASH"),
            Token::Attach => write!(f, "ATTACH"),
            Token::Detach => write!(f, "DETACH"),
            Token::Of => write!(f, "OF"),
            Token::Policy => write!(f, "POLICY"),
            Token::Enable => write!(f, "ENABLE"),
            Token::Disable => write!(f, "DISABLE"),
            Token::Security => write!(f, "SECURITY"),
            Token::Row => write!(f, "ROW"),
            Token::Level => write!(f, "LEVEL"),
            Token::Foreign => write!(f, "FOREIGN"),
            Token::Server => write!(f, "SERVER"),
            Token::Wrapper => write!(f, "WRAPPER"),
            Token::Options => write!(f, "OPTIONS"),
            Token::Data => write!(f, "DATA"),
            // Strings are displayed single-quoted without re-escaping the body.
            Token::String(s) => write!(f, "'{}'", s),
            Token::Integer(n) => write!(f, "{}", n),
            Token::Float(n) => write!(f, "{}", n),
            Token::JsonLiteral(s) => write!(f, "{}", s),
            Token::Ident(s) => write!(f, "{}", s),
            Token::Eq => write!(f, "="),
            Token::Ne => write!(f, "<>"),
            Token::Lt => write!(f, "<"),
            Token::Le => write!(f, "<="),
            Token::Gt => write!(f, ">"),
            Token::Ge => write!(f, ">="),
            Token::Plus => write!(f, "+"),
            Token::Minus => write!(f, "-"),
            Token::Star => write!(f, "*"),
            Token::Slash => write!(f, "/"),
            Token::Percent => write!(f, "%"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            // "{{" / "}}" are format-string escapes for literal braces.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::Colon => write!(f, ":"),
            Token::Semi => write!(f, ";"),
            Token::Dollar => write!(f, "$"),
            Token::Question => write!(f, "?"),
            Token::Arrow => write!(f, "->"),
            Token::ArrowLeft => write!(f, "<-"),
            Token::Dash => write!(f, "-"),
            Token::DotDot => write!(f, ".."),
            Token::Pipe => write!(f, "|"),
            Token::DoublePipe => write!(f, "||"),
            Token::Eof => write!(f, "EOF"),
        }
    }
}
488
/// A 1-based line/column location plus byte offset into the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct Position {
    /// 1-based line number.
    pub line: u32,
    /// 1-based column number.
    pub column: u32,
    /// Byte offset from the start of the input.
    pub offset: u32,
}
499
500impl Position {
501 pub fn new(line: u32, column: u32, offset: u32) -> Self {
503 Self {
504 line,
505 column,
506 offset,
507 }
508 }
509}
510
511impl fmt::Display for Position {
512 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
513 write!(f, "{}:{}", self.line, self.column)
514 }
515}
516
/// A token together with the source span it was lexed from.
#[derive(Debug, Clone)]
pub struct Spanned {
    /// The lexed token.
    pub token: Token,
    /// Position of the token's first character.
    pub start: Position,
    /// Position just past the token's last character.
    pub end: Position,
}
527
528impl Spanned {
529 pub fn new(token: Token, start: Position, end: Position) -> Self {
531 Self { token, start, end }
532 }
533}
534
/// Error produced when the input cannot be tokenized.
#[derive(Debug, Clone)]
pub struct LexerError {
    /// Human-readable description of the failure.
    pub message: String,
    /// Where in the input the failure was detected.
    pub position: Position,
    /// Set when the error was caused by exceeding a configured lexer limit.
    pub limit_hit: Option<LexerLimitHit>,
}
547
/// Identifies which configured lexer limit was exceeded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerLimitHit {
    /// An identifier exceeded the configured character limit.
    IdentifierTooLong {
        /// Name of the limit that was hit (e.g. "max_identifier_chars").
        limit_name: &'static str,
        /// The configured limit value.
        value: usize,
    },
}
557
558impl LexerError {
559 pub fn new(message: impl Into<String>, position: Position) -> Self {
561 Self {
562 message: message.into(),
563 position,
564 limit_hit: None,
565 }
566 }
567
568 pub(crate) fn with_limit(
570 message: impl Into<String>,
571 position: Position,
572 limit_hit: LexerLimitHit,
573 ) -> Self {
574 Self {
575 message: message.into(),
576 position,
577 limit_hit: Some(limit_hit),
578 }
579 }
580}
581
582impl fmt::Display for LexerError {
583 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
584 write!(f, "Lexer error at {}: {}", self.position, self.message)
585 }
586}
587
588impl std::error::Error for LexerError {}
589
/// Maximum size in bytes of an inline `{...}` JSON literal (16 MiB),
/// enforced by `scan_json_literal`.
pub const JSON_LITERAL_MAX_BYTES: usize = 16 * 1024 * 1024;
597
/// Streaming tokenizer over a query string.
pub struct Lexer<'a> {
    // Full source text; also used for raw byte-offset lookahead
    // (comment detection, JSON-literal extraction).
    input: &'a str,
    // Char iterator over `input`, kept in sync with `offset`.
    chars: Peekable<Chars<'a>>,
    // 1-based current line.
    line: u32,
    // 1-based current column.
    column: u32,
    // Byte offset into `input` of the next unread character.
    offset: u32,
    // One-token lookahead buffer filled by `peek_token`.
    peeked: Option<Spanned>,
    // One-character backtrack slot (see `unget`).
    putback: Option<(char, Position)>,
    // Maximum identifier length in characters, taken from `ParserLimits`.
    max_identifier_chars: usize,
}
616
617impl<'a> Lexer<'a> {
618 pub fn new(input: &'a str) -> Self {
620 Self::with_limits(
621 input,
622 crate::storage::query::parser::ParserLimits::default(),
623 )
624 }
625
626 pub fn with_limits(
628 input: &'a str,
629 limits: crate::storage::query::parser::ParserLimits,
630 ) -> Self {
631 Self {
632 input,
633 chars: input.chars().peekable(),
634 line: 1,
635 column: 1,
636 offset: 0,
637 peeked: None,
638 putback: None,
639 max_identifier_chars: limits.max_identifier_chars,
640 }
641 }
642
    /// Returns the configured maximum identifier length, in characters.
    pub(crate) fn max_identifier_chars(&self) -> usize {
        self.max_identifier_chars
    }
649
650 fn position(&self) -> Position {
652 Position::new(self.line, self.column, self.offset)
653 }
654
    /// Pushes `ch` back so the next `peek`/`advance` sees it again.
    /// Only one character can be buffered; `pos` must be the position at
    /// which `ch` was originally read.
    fn unget(&mut self, ch: char, pos: Position) {
        self.putback = Some((ch, pos));
    }
659
    /// Consumes and returns the next character, updating line/column/offset.
    fn advance(&mut self) -> Option<char> {
        // Drain the one-character putback slot first, restoring the
        // bookkeeping to just after `pos`. This branch does no newline
        // handling — callers only ever push back punctuation, never '\n'.
        if let Some((ch, pos)) = self.putback.take() {
            self.line = pos.line;
            self.column = pos.column + 1;
            self.offset = pos.offset + ch.len_utf8() as u32;
            return Some(ch);
        }

        let ch = self.chars.next()?;
        // Offset advances by the char's UTF-8 width so it always lands on a
        // char boundary of `input`.
        self.offset += ch.len_utf8() as u32;
        if ch == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        Some(ch)
    }
681
682 fn peek(&mut self) -> Option<char> {
684 if let Some((ch, _)) = &self.putback {
686 return Some(*ch);
687 }
688 self.chars.peek().copied()
689 }
690
691 fn skip_whitespace(&mut self) {
693 while let Some(ch) = self.peek() {
694 if ch.is_whitespace() {
695 self.advance();
696 } else if ch == '-' {
697 let pos = self.position();
699 self.advance();
700 if self.peek() == Some('-') {
701 self.advance();
703 while let Some(c) = self.peek() {
704 if c == '\n' {
705 break;
706 }
707 self.advance();
708 }
709 } else {
710 self.line = pos.line;
713 self.column = pos.column;
714 self.offset = pos.offset;
715 break;
718 }
719 } else {
720 break;
721 }
722 }
723 }
724
725 pub fn peek_token(&mut self) -> Result<&Spanned, LexerError> {
727 if self.peeked.is_none() {
728 self.peeked = Some(self.next_token_internal()?);
729 }
730 Ok(self.peeked.as_ref().unwrap())
731 }
732
733 pub fn next_token(&mut self) -> Result<Spanned, LexerError> {
735 if let Some(tok) = self.peeked.take() {
736 return Ok(tok);
737 }
738 self.next_token_internal()
739 }
740
    /// Lexes one token from the current position, bypassing the lookahead
    /// buffer. Dispatches on the first character to the `scan_*` helpers.
    fn next_token_internal(&mut self) -> Result<Spanned, LexerError> {
        self.skip_whitespace_simple();

        let start = self.position();

        let ch = match self.peek() {
            Some(c) => c,
            None => {
                // End of input: a zero-width Eof span at the final position.
                return Ok(Spanned::new(Token::Eof, start, start));
            }
        };

        let token = match ch {
            '\'' | '"' => self.scan_string()?,

            '0'..='9' => self.scan_number()?,

            'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier()?,

            '=' => {
                self.advance();
                Token::Eq
            }
            '<' => self.scan_less_than()?,
            '>' => self.scan_greater_than()?,
            '!' => {
                // Only "!=" is valid; a bare '!' is an error.
                self.advance();
                if self.peek() == Some('=') {
                    self.advance();
                    Token::Ne
                } else {
                    return Err(LexerError::new("Expected '=' after '!'", start));
                }
            }
            '+' => {
                self.advance();
                Token::Plus
            }
            '-' => self.scan_minus()?,
            '*' => {
                self.advance();
                Token::Star
            }
            '/' => {
                self.advance();
                Token::Slash
            }
            '%' => {
                self.advance();
                Token::Percent
            }
            '(' => {
                self.advance();
                Token::LParen
            }
            ')' => {
                self.advance();
                Token::RParen
            }
            '[' => {
                self.advance();
                Token::LBracket
            }
            ']' => {
                self.advance();
                Token::RBracket
            }
            '{' => {
                // A '{' followed (after whitespace) by '"' or '}' is treated
                // as an inline JSON object literal, which builds its own
                // Spanned and returns early; otherwise it is a plain LBrace.
                if self.looks_like_json_object_start() {
                    return self.scan_json_literal(start);
                }
                self.advance();
                Token::LBrace
            }
            '}' => {
                self.advance();
                Token::RBrace
            }
            ',' => {
                self.advance();
                Token::Comma
            }
            '.' => self.scan_dot()?,
            ':' => {
                self.advance();
                Token::Colon
            }
            ';' => {
                self.advance();
                Token::Semi
            }
            '$' => {
                self.advance();
                Token::Dollar
            }
            '?' => {
                self.advance();
                Token::Question
            }
            '|' => {
                self.advance();
                if self.peek() == Some('|') {
                    self.advance();
                    Token::DoublePipe
                } else {
                    Token::Pipe
                }
            }
            _ => {
                return Err(LexerError::new(
                    format!("Unexpected character: '{}'", ch),
                    start,
                ));
            }
        };

        let end = self.position();
        Ok(Spanned::new(token, start, end))
    }
872
    /// Skips whitespace, `--` line comments, and `/* ... */` block comments.
    ///
    /// Comment starts are detected by slicing `input` at the current byte
    /// offset, which always lies on a char boundary (offset advances by
    /// `len_utf8`). An unterminated block comment silently consumes the rest
    /// of the input.
    fn skip_whitespace_simple(&mut self) {
        while let Some(ch) = self.peek() {
            if ch.is_whitespace() {
                self.advance();
            } else if ch == '-' && self.input[self.offset as usize..].starts_with("--") {
                // Line comment: consume both dashes, then up to '\n'.
                self.advance();
                self.advance();
                while let Some(c) = self.peek() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else if ch == '/' && self.input[self.offset as usize..].starts_with("/*") {
                // Block comment: consume until "*/" or end of input.
                self.advance();
                self.advance();
                while let Some(c) = self.peek() {
                    self.advance();
                    if c == '*' && self.peek() == Some('/') {
                        self.advance();
                        break;
                    }
                }
            } else {
                break;
            }
        }
    }
902
    /// Scans a quoted string literal ('...' or "..."). Supports SQL-style
    /// doubled-quote escaping and backslash escapes (\n, \r, \t, \\ and the
    /// active quote char); any other backslash sequence is kept verbatim.
    fn scan_string(&mut self) -> Result<Token, LexerError> {
        // `quote` records which quote opened the literal; `start` (just past
        // the opening quote) is only used for error reporting.
        let quote = self.advance().unwrap();
        let start = self.position();
        let mut value = String::new();

        loop {
            match self.peek() {
                None => {
                    return Err(LexerError::new("Unterminated string", start));
                }
                Some(c) if c == quote => {
                    self.advance();
                    // A doubled quote ('' or "") is an escaped quote, not
                    // the end of the literal.
                    if self.peek() == Some(quote) {
                        self.advance();
                        value.push(quote);
                    } else {
                        break;
                    }
                }
                Some('\\') => {
                    self.advance();
                    match self.peek() {
                        Some('n') => {
                            self.advance();
                            value.push('\n');
                        }
                        Some('r') => {
                            self.advance();
                            value.push('\r');
                        }
                        Some('t') => {
                            self.advance();
                            value.push('\t');
                        }
                        Some('\\') => {
                            self.advance();
                            value.push('\\');
                        }
                        Some(c) if c == quote => {
                            self.advance();
                            value.push(quote);
                        }
                        Some(c) => {
                            // Unknown escape: keep both the backslash and
                            // the character.
                            value.push('\\');
                            value.push(c);
                            self.advance();
                        }
                        None => {
                            return Err(LexerError::new("Unterminated string", start));
                        }
                    }
                }
                Some(c) => {
                    self.advance();
                    value.push(c);
                }
            }
        }

        Ok(Token::String(value))
    }
967
    /// Scans an integer or float literal: digits, optional fraction,
    /// optional exponent. A trailing dot not followed by a digit is pushed
    /// back so `1..2` lexes as Integer DotDot Integer and `1.x` as
    /// Integer Dot Ident.
    fn scan_number(&mut self) -> Result<Token, LexerError> {
        let mut value = String::new();
        let mut is_float = false;

        // Integer part.
        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                value.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        if self.peek() == Some('.') {
            let dot_pos = self.position();
            self.advance();
            if self.peek() == Some('.') {
                // "1.." — the dot belongs to a DotDot token; push it back.
                self.unget('.', dot_pos);
            } else if self.peek().map(|c| c.is_ascii_digit()).unwrap_or(false) {
                // Fractional part.
                is_float = true;
                value.push('.');
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        value.push(ch);
                        self.advance();
                    } else {
                        break;
                    }
                }
            } else {
                // "1.foo" — member-access dot; push it back.
                self.unget('.', dot_pos);
            }
        }

        // Optional exponent. NOTE(review): a bare exponent such as "1e" or
        // "1e+" consumes the 'e'/sign and is reported as an invalid float
        // rather than re-lexing the 'e' as an identifier — confirm intended.
        if self.peek() == Some('e') || self.peek() == Some('E') {
            is_float = true;
            value.push(self.advance().unwrap());

            if self.peek() == Some('+') || self.peek() == Some('-') {
                value.push(self.advance().unwrap());
            }

            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    value.push(ch);
                    self.advance();
                } else {
                    break;
                }
            }
        }

        if is_float {
            match value.parse::<f64>() {
                Ok(n) => Ok(Token::Float(n)),
                Err(_) => Err(LexerError::new(
                    format!("Invalid float: {}", value),
                    self.position(),
                )),
            }
        } else {
            match value.parse::<i64>() {
                Ok(n) => Ok(Token::Integer(n)),
                Err(_) => Err(LexerError::new(
                    format!("Invalid integer: {}", value),
                    self.position(),
                )),
            }
        }
    }
1047
1048 fn scan_identifier(&mut self) -> Result<Token, LexerError> {
1050 let start_pos = self.position();
1051 let mut value = String::new();
1052 let max = self.max_identifier_chars;
1053
1054 while let Some(ch) = self.peek() {
1055 if ch.is_alphanumeric() || ch == '_' {
1056 if value.chars().count() >= max {
1057 return Err(LexerError::with_limit(
1061 format!(
1062 "identifier exceeds maximum length (max_identifier_chars = {})",
1063 max
1064 ),
1065 start_pos,
1066 LexerLimitHit::IdentifierTooLong {
1067 limit_name: "max_identifier_chars",
1068 value: max,
1069 },
1070 ));
1071 }
1072 value.push(ch);
1073 self.advance();
1074 } else {
1075 break;
1076 }
1077 }
1078
1079 let token = match value.to_uppercase().as_str() {
1081 "SELECT" => Token::Select,
1082 "FROM" => Token::From,
1083 "WHERE" => Token::Where,
1084 "AND" => Token::And,
1085 "OR" => Token::Or,
1086 "NOT" => Token::Not,
1087 "MATCH" => Token::Match,
1088 "RETURN" => Token::Return,
1089 "JOIN" => Token::Join,
1090 "GRAPH" => Token::Graph,
1091 "PATH" => Token::Path,
1092 "TO" => Token::To,
1093 "VIA" => Token::Via,
1094 "ON" => Token::On,
1095 "AS" => Token::As,
1096 "IS" => Token::Is,
1097 "NULL" => Token::Null,
1098 "BETWEEN" => Token::Between,
1099 "LIKE" => Token::Like,
1100 "IN" => Token::In,
1101 "ORDER" => Token::Order,
1102 "BY" => Token::By,
1103 "ASC" => Token::Asc,
1104 "DESC" => Token::Desc,
1105 "NULLS" => Token::Nulls,
1106 "FIRST" => Token::First,
1107 "LAST" => Token::Last,
1108 "LIMIT" => Token::Limit,
1109 "OFFSET" => Token::Offset,
1110 "INNER" => Token::Inner,
1111 "LEFT" => Token::Left,
1112 "RIGHT" => Token::Right,
1113 "OUTER" => Token::Outer,
1114 "FULL" => Token::Full,
1115 "CROSS" => Token::Cross,
1116 "STARTS" => Token::Starts,
1117 "ENDS" => Token::Ends,
1118 "WITH" => Token::With,
1119 "CONTAINS" => Token::Contains,
1120 "TRUE" => Token::True,
1121 "FALSE" => Token::False,
1122 "ENRICH" => Token::Enrich,
1123 "GROUP" => Token::Group,
1124 "COUNT" => Token::Count,
1125 "SUM" => Token::Sum,
1126 "AVG" => Token::Avg,
1127 "MIN" => Token::Min,
1128 "MAX" => Token::Max,
1129 "DISTINCT" => Token::Distinct,
1130 "VECTOR" => Token::Vector,
1131 "SEARCH" => Token::Search,
1132 "SIMILAR" => Token::Similar,
1133 "COLLECTION" => Token::Collection,
1134 "METRIC" => Token::Metric,
1135 "THRESHOLD" => Token::Threshold,
1136 "K" => Token::K,
1137 "HYBRID" => Token::Hybrid,
1138 "FUSION" => Token::Fusion,
1139 "RERANK" => Token::Rerank,
1140 "RRF" => Token::Rrf,
1141 "INTERSECTION" => Token::Intersection,
1142 "UNION" => Token::Union,
1143 "RECURSIVE" => Token::Recursive,
1144 "ALL" => Token::All,
1145 "WEIGHT" => Token::Weight,
1146 "L2" => Token::L2,
1147 "COSINE" => Token::Cosine,
1148 "INNER_PRODUCT" | "INNERPRODUCT" => Token::InnerProduct,
1149 "INCLUDE" => Token::Include,
1150 "METADATA" => Token::Metadata,
1151 "VECTORS" => Token::Vectors,
1152 "EXPLAIN" => Token::Explain,
1153 "FOR" => Token::For,
1154 "FORMAT" => Token::Format,
1155 "JSON" => Token::Json,
1156 "INSERT" => Token::Insert,
1157 "INTO" => Token::Into,
1158 "VALUES" => Token::Values,
1159 "UPDATE" => Token::Update,
1160 "SET" => Token::Set,
1161 "DELETE" => Token::Delete,
1162 "TRUNCATE" => Token::Truncate,
1163 "CREATE" => Token::Create,
1164 "TABLE" => Token::Table,
1165 "DROP" => Token::Drop,
1166 "ALTER" => Token::Alter,
1167 "ADD" => Token::Add,
1168 "COLUMN" => Token::Column,
1169 "PRIMARY" => Token::Primary,
1170 "KEY" => Token::Key,
1171 "DEFAULT" => Token::Default,
1172 "COMPRESS" => Token::Compress,
1173 "INDEX" => Token::Index,
1174 "UNIQUE" => Token::Unique,
1175 "IF" => Token::If,
1176 "EXISTS" => Token::Exists,
1177 "RETURNING" => Token::Returning,
1178 "CASCADE" => Token::Cascade,
1179 "RENAME" => Token::Rename,
1180 "USING" => Token::Using,
1181 "NODE" => Token::Node,
1182 "EDGE" => Token::Edge,
1183 "DOCUMENT" => Token::Document,
1184 "KV" => Token::Kv,
1185 "TIMESERIES" => Token::Timeseries,
1186 "RETENTION" => Token::Retention,
1187 "QUEUE" => Token::Queue,
1188 "TREE" => Token::Tree,
1189 "PUSH" => Token::Push,
1190 "POP" => Token::Pop,
1191 "PEEK" => Token::Peek,
1192 "PURGE" => Token::Purge,
1193 "ACK" => Token::Ack,
1194 "NACK" => Token::Nack,
1195 "PRIORITY" => Token::Priority,
1196 "LPUSH" => Token::Ident("LPUSH".to_string()),
1197 "RPUSH" => Token::Ident("RPUSH".to_string()),
1198 "LPOP" => Token::Ident("LPOP".to_string()),
1199 "RPOP" => Token::Ident("RPOP".to_string()),
1200 "NEIGHBORHOOD" => Token::Neighborhood,
1201 "SHORTEST_PATH" | "SHORTESTPATH" => Token::ShortestPath,
1202 "CENTRALITY" => Token::Centrality,
1203 "COMMUNITY" => Token::Community,
1204 "COMPONENTS" => Token::Components,
1205 "CYCLES" => Token::Cycles,
1206 "TRAVERSE" => Token::Traverse,
1207 "DEPTH" => Token::Depth,
1208 "DIRECTION" => Token::Direction,
1209 "ALGORITHM" => Token::Algorithm,
1210 "STRATEGY" => Token::Strategy,
1211 "MAX_ITERATIONS" | "MAXITERATIONS" => Token::MaxIterations,
1212 "MAX_LENGTH" | "MAXLENGTH" => Token::MaxLength,
1213 "MODE" => Token::Mode,
1214 "CLUSTERING" => Token::Clustering,
1215 "TOPOLOGICAL_SORT" | "TOPOLOGICALSORT" => Token::TopologicalSort,
1216 "PROPERTIES" => Token::Properties,
1217 "TEXT" => Token::Text,
1218 "FUZZY" => Token::Fuzzy,
1219 "MIN_SCORE" | "MINSCORE" => Token::MinScore,
1220 "BEGIN" => Token::Begin,
1221 "COMMIT" => Token::Commit,
1222 "ROLLBACK" => Token::Rollback,
1223 "SAVEPOINT" => Token::Savepoint,
1224 "RELEASE" => Token::Release,
1225 "START" => Token::Start,
1226 "TRANSACTION" => Token::Transaction,
1227 "WORK" => Token::Work,
1228 "VACUUM" => Token::Vacuum,
1229 "ANALYZE" => Token::Analyze,
1230 "SCHEMA" => Token::Schema,
1231 "SEQUENCE" => Token::Sequence,
1232 "INCREMENT" => Token::Increment,
1233 "COPY" => Token::Copy,
1234 "HEADER" => Token::Header,
1235 "DELIMITER" => Token::Delimiter,
1236 "VIEW" => Token::View,
1237 "MATERIALIZED" => Token::Materialized,
1238 "REFRESH" => Token::Refresh,
1239 "PARTITION" => Token::Partition,
1240 "RANGE" => Token::Range,
1241 "LIST" => Token::List,
1242 "HASH" => Token::Hash,
1243 "ATTACH" => Token::Attach,
1244 "DETACH" => Token::Detach,
1245 "OF" => Token::Of,
1246 "POLICY" => Token::Policy,
1247 "ENABLE" => Token::Enable,
1248 "DISABLE" => Token::Disable,
1249 "SECURITY" => Token::Security,
1250 "ROW" => Token::Row,
1251 "LEVEL" => Token::Level,
1252 "FOREIGN" => Token::Foreign,
1253 "SERVER" => Token::Server,
1254 "WRAPPER" => Token::Wrapper,
1255 "OPTIONS" => Token::Options,
1256 "DATA" => Token::Data,
1257 _ => Token::Ident(value),
1258 };
1259
1260 Ok(token)
1261 }
1262
1263 fn scan_less_than(&mut self) -> Result<Token, LexerError> {
1265 self.advance(); match self.peek() {
1267 Some('=') => {
1268 self.advance();
1269 Ok(Token::Le)
1270 }
1271 Some('>') => {
1272 self.advance();
1273 Ok(Token::Ne)
1274 }
1275 Some('-') => {
1276 self.advance();
1277 Ok(Token::ArrowLeft)
1278 }
1279 _ => Ok(Token::Lt),
1280 }
1281 }
1282
1283 fn scan_greater_than(&mut self) -> Result<Token, LexerError> {
1285 self.advance(); if self.peek() == Some('=') {
1287 self.advance();
1288 Ok(Token::Ge)
1289 } else {
1290 Ok(Token::Gt)
1291 }
1292 }
1293
    /// Scans a token starting with '-': "->" Arrow, "--" line comment
    /// (then the following token), or a bare Dash.
    fn scan_minus(&mut self) -> Result<Token, LexerError> {
        self.advance();
        match self.peek() {
            Some('>') => {
                self.advance();
                Ok(Token::Arrow)
            }
            Some('-') => {
                // Defensive: `skip_whitespace_simple` normally strips "--"
                // comments before dispatch reaches here. Consume to end of
                // line, then recurse for the next token. NOTE(review): the
                // returned token's span is computed by the outer caller, so
                // it also covers the skipped comment — confirm acceptable.
                self.advance();
                while let Some(c) = self.peek() {
                    if c == '\n' {
                        break;
                    }
                    self.advance();
                }
                self.skip_whitespace_simple();
                if self.peek().is_none() {
                    Ok(Token::Eof)
                } else {
                    let next = self.next_token_internal()?;
                    Ok(next.token)
                }
            }
            _ => Ok(Token::Dash),
        }
    }
1323
1324 fn scan_dot(&mut self) -> Result<Token, LexerError> {
1326 self.advance(); if self.peek() == Some('.') {
1328 self.advance();
1329 Ok(Token::DotDot)
1330 } else {
1331 Ok(Token::Dot)
1332 }
1333 }
1334
1335 fn looks_like_json_object_start(&self) -> bool {
1340 let bytes = self.input.as_bytes();
1341 let mut i = self.offset as usize;
1342 debug_assert!(bytes.get(i) == Some(&b'{'));
1344 i += 1;
1345 while i < bytes.len() {
1346 match bytes[i] {
1347 b' ' | b'\t' | b'\n' | b'\r' => i += 1,
1348 b'"' | b'}' => return true,
1349 _ => return false,
1350 }
1351 }
1352 false
1353 }
1354
    /// Scans a brace-delimited JSON object literal; `start` is the position
    /// of the not-yet-consumed '{'. Tracks brace depth plus in-string and
    /// escape state so braces inside JSON strings don't affect nesting, and
    /// enforces `JSON_LITERAL_MAX_BYTES`. Returns the raw source slice
    /// including both braces.
    fn scan_json_literal(&mut self, start: Position) -> Result<Spanned, LexerError> {
        let start_offset = self.offset as usize;
        self.advance();
        let mut depth: u32 = 1;
        let mut in_string = false;
        let mut escape = false;
        loop {
            let ch = match self.peek() {
                Some(c) => c,
                None => {
                    return Err(LexerError::new(
                        format!(
                            "unterminated JSON object literal (started at offset {})",
                            start.offset
                        ),
                        self.position(),
                    ));
                }
            };

            // Size guard, checked before consuming the next character.
            let scanned_bytes = self.offset as usize - start_offset;
            if scanned_bytes > JSON_LITERAL_MAX_BYTES {
                return Err(LexerError::new(
                    format!(
                        "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
                        JSON_LITERAL_MAX_BYTES
                    ),
                    start,
                ));
            }

            self.advance();

            if escape {
                // Previous char was a backslash inside a string; this char
                // is escaped and cannot close the string.
                escape = false;
                continue;
            }

            if in_string {
                match ch {
                    '\\' => escape = true,
                    '"' => in_string = false,
                    _ => {}
                }
                continue;
            }

            match ch {
                '"' => in_string = true,
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        let end = self.position();
                        let end_offset = self.offset as usize;
                        // Final size check, now including the closing brace.
                        if end_offset - start_offset > JSON_LITERAL_MAX_BYTES {
                            return Err(LexerError::new(
                                format!(
                                    "JSON object literal exceeds JSON_LITERAL_MAX_BYTES ({} bytes)",
                                    JSON_LITERAL_MAX_BYTES
                                ),
                                start,
                            ));
                        }
                        let raw = self.input[start_offset..end_offset].to_string();
                        return Ok(Spanned::new(Token::JsonLiteral(raw), start, end));
                    }
                }
                _ => {}
            }
        }
    }
1446
1447 pub fn tokenize(&mut self) -> Result<Vec<Spanned>, LexerError> {
1449 let mut tokens = Vec::new();
1450 loop {
1451 let tok = self.next_token()?;
1452 let is_eof = tok.token == Token::Eof;
1453 tokens.push(tok);
1454 if is_eof {
1455 break;
1456 }
1457 }
1458 Ok(tokens)
1459 }
1460}
1461
1462#[cfg(test)]
1467mod tests {
1468 use super::*;
1469
    /// Test helper: lexes `input` and strips the span info, panicking on
    /// lexer errors.
    fn tokenize(input: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(input);
        lexer
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|s| s.token)
            .collect()
    }

    #[test]
    fn test_keywords() {
        let tokens = tokenize("SELECT FROM WHERE AND OR NOT");
        assert_eq!(
            tokens,
            vec![
                Token::Select,
                Token::From,
                Token::Where,
                Token::And,
                Token::Or,
                Token::Not,
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_identifiers() {
        let tokens = tokenize("hosts users ip_address");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("hosts".into()),
                Token::Ident("users".into()),
                Token::Ident("ip_address".into()),
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_numbers() {
        let tokens = tokenize("42 2.5 1e10 2.5e-3");
        assert_eq!(
            tokens,
            vec![
                Token::Integer(42),
                Token::Float(2.5),
                Token::Float(1e10),
                Token::Float(2.5e-3),
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_strings() {
        // Covers single quotes, double quotes, and doubled-quote escaping.
        let tokens = tokenize("'hello' \"world\" 'it''s'");
        assert_eq!(
            tokens,
            vec![
                Token::String("hello".into()),
                Token::String("world".into()),
                Token::String("it's".into()),
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_operators() {
        // A standalone '-' lexes as Dash (not Minus).
        let tokens = tokenize("= <> < <= > >= != + - * /");
        assert_eq!(
            tokens,
            vec![
                Token::Eq,
                Token::Ne,
                Token::Lt,
                Token::Le,
                Token::Gt,
                Token::Ge,
                Token::Ne,
                Token::Plus,
                Token::Dash,
                Token::Star,
                Token::Slash,
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_delimiters() {
        // "{ a }" must NOT be treated as a JSON literal (first non-space
        // char after '{' is an identifier, not '"' or '}').
        let tokens = tokenize("( ) [ ] { a } , . : ;");
        assert_eq!(
            tokens,
            vec![
                Token::LParen,
                Token::RParen,
                Token::LBracket,
                Token::RBracket,
                Token::LBrace,
                Token::Ident("a".into()),
                Token::RBrace,
                Token::Comma,
                Token::Dot,
                Token::Colon,
                Token::Semi,
                Token::Eof
            ]
        );
    }

    #[test]
    fn test_json_literal_empty_object() {
        let tokens = tokenize("{ }");
        assert_eq!(tokens, vec![Token::JsonLiteral("{ }".into()), Token::Eof]);
    }

    #[test]
    fn test_json_literal_simple() {
        let tokens = tokenize(r#"{"a":1}"#);
        assert_eq!(
            tokens,
            vec![Token::JsonLiteral(r#"{"a":1}"#.into()), Token::Eof]
        );
    }

    #[test]
    fn test_json_literal_nested() {
        // A '}' inside a JSON string must not close the literal early.
        let raw = r#"{"a":{"b":[1,2,{"c":"}"}]}}"#;
        let tokens = tokenize(raw);
        assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
    }
1609
1610 #[test]
1611 fn test_json_literal_escaped_quote_in_string() {
1612 let raw = r#"{"path":"O\"Brien}"}"#;
1614 let tokens = tokenize(raw);
1615 assert_eq!(tokens, vec![Token::JsonLiteral(raw.into()), Token::Eof]);
1616 }
1617
1618 #[test]
1619 fn test_json_literal_unbalanced_eof() {
1620 let mut lexer = Lexer::new(r#"{"a":1"#);
1621 let err = lexer.tokenize().expect_err("expected unterminated error");
1622 assert!(
1623 err.message.contains("unterminated JSON object literal"),
1624 "got: {}",
1625 err.message
1626 );
1627 }
1628
1629 #[test]
1630 fn test_json_literal_property_bag_compatible() {
1631 let tokens = tokenize("{name: 'value'}");
1634 assert_eq!(tokens[0], Token::LBrace);
1635 assert_eq!(*tokens.last().unwrap(), Token::Eof);
1636 }
1637
1638 #[test]
1639 fn test_graph_syntax() {
1640 let tokens = tokenize("-> <- - ..");
1641 assert_eq!(
1642 tokens,
1643 vec![
1644 Token::Arrow,
1645 Token::ArrowLeft,
1646 Token::Dash,
1647 Token::DotDot,
1648 Token::Eof
1649 ]
1650 );
1651 }
1652
1653 #[test]
1654 fn test_table_query() {
1655 let tokens = tokenize("SELECT ip, hostname FROM hosts WHERE os = 'Linux' LIMIT 10");
1656 assert_eq!(
1657 tokens,
1658 vec![
1659 Token::Select,
1660 Token::Ident("ip".into()),
1661 Token::Comma,
1662 Token::Ident("hostname".into()),
1663 Token::From,
1664 Token::Ident("hosts".into()),
1665 Token::Where,
1666 Token::Ident("os".into()),
1667 Token::Eq,
1668 Token::String("Linux".into()),
1669 Token::Limit,
1670 Token::Integer(10),
1671 Token::Eof
1672 ]
1673 );
1674 }
1675
1676 #[test]
1677 fn test_graph_query() {
1678 let tokens = tokenize("MATCH (h:Host)-[:HAS_SERVICE]->(s:Service) RETURN h, s");
1679 assert_eq!(
1680 tokens,
1681 vec![
1682 Token::Match,
1683 Token::LParen,
1684 Token::Ident("h".into()),
1685 Token::Colon,
1686 Token::Ident("Host".into()),
1687 Token::RParen,
1688 Token::Dash,
1689 Token::LBracket,
1690 Token::Colon,
1691 Token::Ident("HAS_SERVICE".into()),
1692 Token::RBracket,
1693 Token::Arrow,
1694 Token::LParen,
1695 Token::Ident("s".into()),
1696 Token::Colon,
1697 Token::Ident("Service".into()),
1698 Token::RParen,
1699 Token::Return,
1700 Token::Ident("h".into()),
1701 Token::Comma,
1702 Token::Ident("s".into()),
1703 Token::Eof
1704 ]
1705 );
1706 }
1707
1708 #[test]
1709 fn test_join_query() {
1710 let tokens = tokenize("FROM hosts h JOIN GRAPH (h)-[:HAS_VULN]->(v) ON h.ip = v.id");
1711 assert_eq!(
1712 tokens,
1713 vec![
1714 Token::From,
1715 Token::Ident("hosts".into()),
1716 Token::Ident("h".into()),
1717 Token::Join,
1718 Token::Graph,
1719 Token::LParen,
1720 Token::Ident("h".into()),
1721 Token::RParen,
1722 Token::Dash,
1723 Token::LBracket,
1724 Token::Colon,
1725 Token::Ident("HAS_VULN".into()),
1726 Token::RBracket,
1727 Token::Arrow,
1728 Token::LParen,
1729 Token::Ident("v".into()),
1730 Token::RParen,
1731 Token::On,
1732 Token::Ident("h".into()),
1733 Token::Dot,
1734 Token::Ident("ip".into()),
1735 Token::Eq,
1736 Token::Ident("v".into()),
1737 Token::Dot,
1738 Token::Ident("id".into()),
1739 Token::Eof
1740 ]
1741 );
1742 }
1743
1744 #[test]
1745 fn test_path_query() {
1746 let tokens = tokenize("PATH FROM host('192.168.1.1') TO host('10.0.0.1') VIA [:AUTH]");
1747 assert_eq!(
1748 tokens,
1749 vec![
1750 Token::Path,
1751 Token::From,
1752 Token::Ident("host".into()),
1753 Token::LParen,
1754 Token::String("192.168.1.1".into()),
1755 Token::RParen,
1756 Token::To,
1757 Token::Ident("host".into()),
1758 Token::LParen,
1759 Token::String("10.0.0.1".into()),
1760 Token::RParen,
1761 Token::Via,
1762 Token::LBracket,
1763 Token::Colon,
1764 Token::Ident("AUTH".into()),
1765 Token::RBracket,
1766 Token::Eof
1767 ]
1768 );
1769 }
1770
1771 #[test]
1772 fn test_variable_length_pattern() {
1773 let tokens = tokenize("(a)-[*1..5]->(b)");
1774 assert_eq!(
1775 tokens,
1776 vec![
1777 Token::LParen,
1778 Token::Ident("a".into()),
1779 Token::RParen,
1780 Token::Dash,
1781 Token::LBracket,
1782 Token::Star,
1783 Token::Integer(1),
1784 Token::DotDot,
1785 Token::Integer(5),
1786 Token::RBracket,
1787 Token::Arrow,
1788 Token::LParen,
1789 Token::Ident("b".into()),
1790 Token::RParen,
1791 Token::Eof
1792 ]
1793 );
1794 }
1795
1796 #[test]
1797 fn test_case_insensitive_keywords() {
1798 let tokens = tokenize("select FROM Where AND");
1799 assert_eq!(
1800 tokens,
1801 vec![
1802 Token::Select,
1803 Token::From,
1804 Token::Where,
1805 Token::And,
1806 Token::Eof
1807 ]
1808 );
1809 }
1810
1811 #[test]
1812 fn test_comments() {
1813 let tokens = tokenize("SELECT -- this is a comment\nip FROM hosts");
1814 assert_eq!(
1815 tokens,
1816 vec![
1817 Token::Select,
1818 Token::Ident("ip".into()),
1819 Token::From,
1820 Token::Ident("hosts".into()),
1821 Token::Eof
1822 ]
1823 );
1824 }
1825
1826 #[test]
1827 fn test_escaped_strings() {
1828 let tokens = tokenize(r"'hello\nworld' 'tab\there'");
1829 assert_eq!(
1830 tokens,
1831 vec![
1832 Token::String("hello\nworld".into()),
1833 Token::String("tab\there".into()),
1834 Token::Eof
1835 ]
1836 );
1837 }
1838
1839 #[test]
1840 fn test_keyword_matrix_and_alias_spellings() {
1841 let cases = [
1842 ("SELECT", Token::Select),
1843 ("FROM", Token::From),
1844 ("WHERE", Token::Where),
1845 ("AND", Token::And),
1846 ("OR", Token::Or),
1847 ("NOT", Token::Not),
1848 ("MATCH", Token::Match),
1849 ("RETURN", Token::Return),
1850 ("JOIN", Token::Join),
1851 ("GRAPH", Token::Graph),
1852 ("PATH", Token::Path),
1853 ("TO", Token::To),
1854 ("VIA", Token::Via),
1855 ("ON", Token::On),
1856 ("AS", Token::As),
1857 ("IS", Token::Is),
1858 ("NULL", Token::Null),
1859 ("BETWEEN", Token::Between),
1860 ("LIKE", Token::Like),
1861 ("IN", Token::In),
1862 ("ORDER", Token::Order),
1863 ("BY", Token::By),
1864 ("ASC", Token::Asc),
1865 ("DESC", Token::Desc),
1866 ("NULLS", Token::Nulls),
1867 ("FIRST", Token::First),
1868 ("LAST", Token::Last),
1869 ("LIMIT", Token::Limit),
1870 ("OFFSET", Token::Offset),
1871 ("INNER", Token::Inner),
1872 ("LEFT", Token::Left),
1873 ("RIGHT", Token::Right),
1874 ("OUTER", Token::Outer),
1875 ("FULL", Token::Full),
1876 ("CROSS", Token::Cross),
1877 ("STARTS", Token::Starts),
1878 ("ENDS", Token::Ends),
1879 ("WITH", Token::With),
1880 ("CONTAINS", Token::Contains),
1881 ("TRUE", Token::True),
1882 ("FALSE", Token::False),
1883 ("ENRICH", Token::Enrich),
1884 ("GROUP", Token::Group),
1885 ("COUNT", Token::Count),
1886 ("SUM", Token::Sum),
1887 ("AVG", Token::Avg),
1888 ("MIN", Token::Min),
1889 ("MAX", Token::Max),
1890 ("DISTINCT", Token::Distinct),
1891 ("VECTOR", Token::Vector),
1892 ("SEARCH", Token::Search),
1893 ("SIMILAR", Token::Similar),
1894 ("COLLECTION", Token::Collection),
1895 ("METRIC", Token::Metric),
1896 ("THRESHOLD", Token::Threshold),
1897 ("K", Token::K),
1898 ("HYBRID", Token::Hybrid),
1899 ("FUSION", Token::Fusion),
1900 ("RERANK", Token::Rerank),
1901 ("RRF", Token::Rrf),
1902 ("INTERSECTION", Token::Intersection),
1903 ("UNION", Token::Union),
1904 ("RECURSIVE", Token::Recursive),
1905 ("ALL", Token::All),
1906 ("WEIGHT", Token::Weight),
1907 ("L2", Token::L2),
1908 ("COSINE", Token::Cosine),
1909 ("INNER_PRODUCT", Token::InnerProduct),
1910 ("INNERPRODUCT", Token::InnerProduct),
1911 ("INCLUDE", Token::Include),
1912 ("METADATA", Token::Metadata),
1913 ("VECTORS", Token::Vectors),
1914 ("EXPLAIN", Token::Explain),
1915 ("FOR", Token::For),
1916 ("FORMAT", Token::Format),
1917 ("JSON", Token::Json),
1918 ("INSERT", Token::Insert),
1919 ("INTO", Token::Into),
1920 ("VALUES", Token::Values),
1921 ("UPDATE", Token::Update),
1922 ("SET", Token::Set),
1923 ("DELETE", Token::Delete),
1924 ("TRUNCATE", Token::Truncate),
1925 ("CREATE", Token::Create),
1926 ("TABLE", Token::Table),
1927 ("DROP", Token::Drop),
1928 ("ALTER", Token::Alter),
1929 ("ADD", Token::Add),
1930 ("COLUMN", Token::Column),
1931 ("PRIMARY", Token::Primary),
1932 ("KEY", Token::Key),
1933 ("DEFAULT", Token::Default),
1934 ("COMPRESS", Token::Compress),
1935 ("INDEX", Token::Index),
1936 ("UNIQUE", Token::Unique),
1937 ("IF", Token::If),
1938 ("EXISTS", Token::Exists),
1939 ("RETURNING", Token::Returning),
1940 ("CASCADE", Token::Cascade),
1941 ("RENAME", Token::Rename),
1942 ("USING", Token::Using),
1943 ("NODE", Token::Node),
1944 ("EDGE", Token::Edge),
1945 ("DOCUMENT", Token::Document),
1946 ("KV", Token::Kv),
1947 ("TIMESERIES", Token::Timeseries),
1948 ("RETENTION", Token::Retention),
1949 ("QUEUE", Token::Queue),
1950 ("TREE", Token::Tree),
1951 ("PUSH", Token::Push),
1952 ("POP", Token::Pop),
1953 ("PEEK", Token::Peek),
1954 ("PURGE", Token::Purge),
1955 ("ACK", Token::Ack),
1956 ("NACK", Token::Nack),
1957 ("PRIORITY", Token::Priority),
1958 ("LPUSH", Token::Ident("LPUSH".into())),
1959 ("RPUSH", Token::Ident("RPUSH".into())),
1960 ("LPOP", Token::Ident("LPOP".into())),
1961 ("RPOP", Token::Ident("RPOP".into())),
1962 ("NEIGHBORHOOD", Token::Neighborhood),
1963 ("SHORTEST_PATH", Token::ShortestPath),
1964 ("SHORTESTPATH", Token::ShortestPath),
1965 ("CENTRALITY", Token::Centrality),
1966 ("COMMUNITY", Token::Community),
1967 ("COMPONENTS", Token::Components),
1968 ("CYCLES", Token::Cycles),
1969 ("TRAVERSE", Token::Traverse),
1970 ("DEPTH", Token::Depth),
1971 ("DIRECTION", Token::Direction),
1972 ("ALGORITHM", Token::Algorithm),
1973 ("STRATEGY", Token::Strategy),
1974 ("MAX_ITERATIONS", Token::MaxIterations),
1975 ("MAXITERATIONS", Token::MaxIterations),
1976 ("MAX_LENGTH", Token::MaxLength),
1977 ("MAXLENGTH", Token::MaxLength),
1978 ("MODE", Token::Mode),
1979 ("CLUSTERING", Token::Clustering),
1980 ("TOPOLOGICAL_SORT", Token::TopologicalSort),
1981 ("TOPOLOGICALSORT", Token::TopologicalSort),
1982 ("PROPERTIES", Token::Properties),
1983 ("TEXT", Token::Text),
1984 ("FUZZY", Token::Fuzzy),
1985 ("MIN_SCORE", Token::MinScore),
1986 ("MINSCORE", Token::MinScore),
1987 ("BEGIN", Token::Begin),
1988 ("COMMIT", Token::Commit),
1989 ("ROLLBACK", Token::Rollback),
1990 ("SAVEPOINT", Token::Savepoint),
1991 ("RELEASE", Token::Release),
1992 ("START", Token::Start),
1993 ("TRANSACTION", Token::Transaction),
1994 ("WORK", Token::Work),
1995 ("VACUUM", Token::Vacuum),
1996 ("ANALYZE", Token::Analyze),
1997 ("SCHEMA", Token::Schema),
1998 ("SEQUENCE", Token::Sequence),
1999 ("INCREMENT", Token::Increment),
2000 ("COPY", Token::Copy),
2001 ("HEADER", Token::Header),
2002 ("DELIMITER", Token::Delimiter),
2003 ("VIEW", Token::View),
2004 ("MATERIALIZED", Token::Materialized),
2005 ("REFRESH", Token::Refresh),
2006 ("PARTITION", Token::Partition),
2007 ("RANGE", Token::Range),
2008 ("LIST", Token::List),
2009 ("HASH", Token::Hash),
2010 ("ATTACH", Token::Attach),
2011 ("DETACH", Token::Detach),
2012 ("OF", Token::Of),
2013 ("POLICY", Token::Policy),
2014 ("ENABLE", Token::Enable),
2015 ("DISABLE", Token::Disable),
2016 ("SECURITY", Token::Security),
2017 ("ROW", Token::Row),
2018 ("LEVEL", Token::Level),
2019 ("FOREIGN", Token::Foreign),
2020 ("SERVER", Token::Server),
2021 ("WRAPPER", Token::Wrapper),
2022 ("OPTIONS", Token::Options),
2023 ("DATA", Token::Data),
2024 ("plain_ident", Token::Ident("plain_ident".into())),
2025 ];
2026
2027 for (input, expected) in cases {
2028 let tokens = tokenize(input);
2029 assert_eq!(tokens, vec![expected, Token::Eof], "{input}");
2030 }
2031 }
2032
2033 #[test]
2034 fn test_display_all_token_variants() {
2035 let cases = [
2036 (Token::Select, "SELECT"),
2037 (Token::From, "FROM"),
2038 (Token::Where, "WHERE"),
2039 (Token::And, "AND"),
2040 (Token::Or, "OR"),
2041 (Token::Not, "NOT"),
2042 (Token::Match, "MATCH"),
2043 (Token::Return, "RETURN"),
2044 (Token::Join, "JOIN"),
2045 (Token::Graph, "GRAPH"),
2046 (Token::Path, "PATH"),
2047 (Token::To, "TO"),
2048 (Token::Via, "VIA"),
2049 (Token::On, "ON"),
2050 (Token::As, "AS"),
2051 (Token::Is, "IS"),
2052 (Token::Null, "NULL"),
2053 (Token::Between, "BETWEEN"),
2054 (Token::Like, "LIKE"),
2055 (Token::In, "IN"),
2056 (Token::Order, "ORDER"),
2057 (Token::By, "BY"),
2058 (Token::Asc, "ASC"),
2059 (Token::Desc, "DESC"),
2060 (Token::Nulls, "NULLS"),
2061 (Token::First, "FIRST"),
2062 (Token::Last, "LAST"),
2063 (Token::Limit, "LIMIT"),
2064 (Token::Offset, "OFFSET"),
2065 (Token::Inner, "INNER"),
2066 (Token::Left, "LEFT"),
2067 (Token::Right, "RIGHT"),
2068 (Token::Outer, "OUTER"),
2069 (Token::Full, "FULL"),
2070 (Token::Cross, "CROSS"),
2071 (Token::Starts, "STARTS"),
2072 (Token::Ends, "ENDS"),
2073 (Token::With, "WITH"),
2074 (Token::Contains, "CONTAINS"),
2075 (Token::True, "TRUE"),
2076 (Token::False, "FALSE"),
2077 (Token::Enrich, "ENRICH"),
2078 (Token::Group, "GROUP"),
2079 (Token::Count, "COUNT"),
2080 (Token::Sum, "SUM"),
2081 (Token::Avg, "AVG"),
2082 (Token::Min, "MIN"),
2083 (Token::Max, "MAX"),
2084 (Token::Distinct, "DISTINCT"),
2085 (Token::Vector, "VECTOR"),
2086 (Token::Search, "SEARCH"),
2087 (Token::Similar, "SIMILAR"),
2088 (Token::Collection, "COLLECTION"),
2089 (Token::Metric, "METRIC"),
2090 (Token::Threshold, "THRESHOLD"),
2091 (Token::K, "K"),
2092 (Token::Hybrid, "HYBRID"),
2093 (Token::Fusion, "FUSION"),
2094 (Token::Rerank, "RERANK"),
2095 (Token::Rrf, "RRF"),
2096 (Token::Intersection, "INTERSECTION"),
2097 (Token::Union, "UNION"),
2098 (Token::Recursive, "RECURSIVE"),
2099 (Token::All, "ALL"),
2100 (Token::Weight, "WEIGHT"),
2101 (Token::L2, "L2"),
2102 (Token::Cosine, "COSINE"),
2103 (Token::InnerProduct, "INNER_PRODUCT"),
2104 (Token::Include, "INCLUDE"),
2105 (Token::Metadata, "METADATA"),
2106 (Token::Vectors, "VECTORS"),
2107 (Token::Explain, "EXPLAIN"),
2108 (Token::For, "FOR"),
2109 (Token::Format, "FORMAT"),
2110 (Token::Json, "JSON"),
2111 (Token::Insert, "INSERT"),
2112 (Token::Into, "INTO"),
2113 (Token::Values, "VALUES"),
2114 (Token::Update, "UPDATE"),
2115 (Token::Set, "SET"),
2116 (Token::Delete, "DELETE"),
2117 (Token::Truncate, "TRUNCATE"),
2118 (Token::Create, "CREATE"),
2119 (Token::Table, "TABLE"),
2120 (Token::Drop, "DROP"),
2121 (Token::Alter, "ALTER"),
2122 (Token::Add, "ADD"),
2123 (Token::Column, "COLUMN"),
2124 (Token::Primary, "PRIMARY"),
2125 (Token::Key, "KEY"),
2126 (Token::Default, "DEFAULT"),
2127 (Token::Compress, "COMPRESS"),
2128 (Token::Index, "INDEX"),
2129 (Token::Unique, "UNIQUE"),
2130 (Token::If, "IF"),
2131 (Token::Exists, "EXISTS"),
2132 (Token::Returning, "RETURNING"),
2133 (Token::Cascade, "CASCADE"),
2134 (Token::Rename, "RENAME"),
2135 (Token::Using, "USING"),
2136 (Token::Node, "NODE"),
2137 (Token::Edge, "EDGE"),
2138 (Token::Document, "DOCUMENT"),
2139 (Token::Kv, "KV"),
2140 (Token::Timeseries, "TIMESERIES"),
2141 (Token::Retention, "RETENTION"),
2142 (Token::Queue, "QUEUE"),
2143 (Token::Tree, "TREE"),
2144 (Token::Push, "PUSH"),
2145 (Token::Pop, "POP"),
2146 (Token::Peek, "PEEK"),
2147 (Token::Purge, "PURGE"),
2148 (Token::Ack, "ACK"),
2149 (Token::Nack, "NACK"),
2150 (Token::Priority, "PRIORITY"),
2151 (Token::Neighborhood, "NEIGHBORHOOD"),
2152 (Token::ShortestPath, "SHORTEST_PATH"),
2153 (Token::Centrality, "CENTRALITY"),
2154 (Token::Community, "COMMUNITY"),
2155 (Token::Components, "COMPONENTS"),
2156 (Token::Cycles, "CYCLES"),
2157 (Token::Traverse, "TRAVERSE"),
2158 (Token::Depth, "DEPTH"),
2159 (Token::Direction, "DIRECTION"),
2160 (Token::Algorithm, "ALGORITHM"),
2161 (Token::Strategy, "STRATEGY"),
2162 (Token::MaxIterations, "MAX_ITERATIONS"),
2163 (Token::MaxLength, "MAX_LENGTH"),
2164 (Token::Mode, "MODE"),
2165 (Token::Clustering, "CLUSTERING"),
2166 (Token::TopologicalSort, "TOPOLOGICAL_SORT"),
2167 (Token::Properties, "PROPERTIES"),
2168 (Token::Text, "TEXT"),
2169 (Token::Fuzzy, "FUZZY"),
2170 (Token::MinScore, "MIN_SCORE"),
2171 (Token::Begin, "BEGIN"),
2172 (Token::Commit, "COMMIT"),
2173 (Token::Rollback, "ROLLBACK"),
2174 (Token::Savepoint, "SAVEPOINT"),
2175 (Token::Release, "RELEASE"),
2176 (Token::Start, "START"),
2177 (Token::Transaction, "TRANSACTION"),
2178 (Token::Work, "WORK"),
2179 (Token::Vacuum, "VACUUM"),
2180 (Token::Analyze, "ANALYZE"),
2181 (Token::Schema, "SCHEMA"),
2182 (Token::Sequence, "SEQUENCE"),
2183 (Token::Increment, "INCREMENT"),
2184 (Token::Copy, "COPY"),
2185 (Token::Header, "HEADER"),
2186 (Token::Delimiter, "DELIMITER"),
2187 (Token::View, "VIEW"),
2188 (Token::Materialized, "MATERIALIZED"),
2189 (Token::Refresh, "REFRESH"),
2190 (Token::Partition, "PARTITION"),
2191 (Token::Range, "RANGE"),
2192 (Token::List, "LIST"),
2193 (Token::Hash, "HASH"),
2194 (Token::Attach, "ATTACH"),
2195 (Token::Detach, "DETACH"),
2196 (Token::Of, "OF"),
2197 (Token::Policy, "POLICY"),
2198 (Token::Enable, "ENABLE"),
2199 (Token::Disable, "DISABLE"),
2200 (Token::Security, "SECURITY"),
2201 (Token::Row, "ROW"),
2202 (Token::Level, "LEVEL"),
2203 (Token::Foreign, "FOREIGN"),
2204 (Token::Server, "SERVER"),
2205 (Token::Wrapper, "WRAPPER"),
2206 (Token::Options, "OPTIONS"),
2207 (Token::Data, "DATA"),
2208 (Token::String("x".into()), "'x'"),
2209 (Token::Integer(7), "7"),
2210 (Token::Float(1.5), "1.5"),
2211 (Token::JsonLiteral(r#"{"x":1}"#.into()), r#"{"x":1}"#),
2212 (Token::Ident("id".into()), "id"),
2213 (Token::Eq, "="),
2214 (Token::Ne, "<>"),
2215 (Token::Lt, "<"),
2216 (Token::Le, "<="),
2217 (Token::Gt, ">"),
2218 (Token::Ge, ">="),
2219 (Token::Plus, "+"),
2220 (Token::Minus, "-"),
2221 (Token::Star, "*"),
2222 (Token::Slash, "/"),
2223 (Token::Percent, "%"),
2224 (Token::LParen, "("),
2225 (Token::RParen, ")"),
2226 (Token::LBracket, "["),
2227 (Token::RBracket, "]"),
2228 (Token::LBrace, "{"),
2229 (Token::RBrace, "}"),
2230 (Token::Comma, ","),
2231 (Token::Dot, "."),
2232 (Token::Colon, ":"),
2233 (Token::Semi, ";"),
2234 (Token::Dollar, "$"),
2235 (Token::Arrow, "->"),
2236 (Token::ArrowLeft, "<-"),
2237 (Token::Dash, "-"),
2238 (Token::DotDot, ".."),
2239 (Token::Pipe, "|"),
2240 (Token::DoublePipe, "||"),
2241 (Token::Eof, "EOF"),
2242 ];
2243
2244 for (token, expected) in cases {
2245 assert_eq!(token.to_string(), expected);
2246 }
2247 }
2248
2249 #[test]
2250 fn test_string_escape_and_error_matrix() {
2251 let tokens = tokenize(
2252 r#"'line\nrow' 'carriage\rreturn' 'tab\tstop' 'slash\\' 'quote\'' "dq\"" 'raw\z'"#,
2253 );
2254 assert_eq!(
2255 tokens,
2256 vec![
2257 Token::String("line\nrow".into()),
2258 Token::String("carriage\rreturn".into()),
2259 Token::String("tab\tstop".into()),
2260 Token::String("slash\\".into()),
2261 Token::String("quote'".into()),
2262 Token::String("dq\"".into()),
2263 Token::String(r"raw\z".into()),
2264 Token::Eof
2265 ]
2266 );
2267
2268 let mut lexer = Lexer::new("'unterminated");
2269 assert!(lexer
2270 .next_token()
2271 .unwrap_err()
2272 .message
2273 .contains("Unterminated string"));
2274
2275 let mut lexer = Lexer::new(r"'bad\");
2276 assert!(lexer
2277 .next_token()
2278 .unwrap_err()
2279 .message
2280 .contains("Unterminated string"));
2281 }
2282
2283 #[test]
2284 fn test_operator_comment_peek_limit_and_tokenize_paths() {
2285 let tokens = tokenize("!= % ; $ || | 123.abc 1..2 1e+2 <- -> /* block */ SELECT");
2286 assert_eq!(
2287 tokens,
2288 vec![
2289 Token::Ne,
2290 Token::Percent,
2291 Token::Semi,
2292 Token::Dollar,
2293 Token::DoublePipe,
2294 Token::Pipe,
2295 Token::Integer(123),
2296 Token::Dot,
2297 Token::Ident("abc".into()),
2298 Token::Integer(1),
2299 Token::DotDot,
2300 Token::Integer(2),
2301 Token::Float(1e2),
2302 Token::ArrowLeft,
2303 Token::Arrow,
2304 Token::Select,
2305 Token::Eof,
2306 ]
2307 );
2308
2309 let mut lexer = Lexer::new("SELECT FROM");
2310 assert_eq!(lexer.peek_token().unwrap().token, Token::Select);
2311 assert_eq!(lexer.next_token().unwrap().token, Token::Select);
2312 assert_eq!(lexer.next_token().unwrap().token, Token::From);
2313
2314 let mut lexer = Lexer::new("!");
2315 assert!(lexer
2316 .next_token()
2317 .unwrap_err()
2318 .message
2319 .contains("Expected '=' after '!'"));
2320
2321 let limits = crate::storage::query::parser::ParserLimits {
2322 max_identifier_chars: 3,
2323 ..crate::storage::query::parser::ParserLimits::default()
2324 };
2325 let mut lexer = Lexer::with_limits("abcd", limits);
2326 assert_eq!(lexer.max_identifier_chars(), 3);
2327 let err = lexer.next_token().unwrap_err();
2328 assert!(matches!(
2329 err.limit_hit,
2330 Some(LexerLimitHit::IdentifierTooLong { value: 3, .. })
2331 ));
2332 }
2333}